-
Notifications
You must be signed in to change notification settings - Fork 0
/
iEN_analy.R
292 lines (224 loc) · 13.3 KB
/
iEN_analy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#by:Patrick O'Connell
### Attempt at predicting parameters which effictivly discriminate responders from non-responders
#in mice who recieved Ad-SF7-Fc injection into B16-F10 tumors.
#3/27/2021
#I am taking a machine learning-based approach here and using the iEN algorithm. This was developed
#specificially for high dimensional immune data and allows the user to weight based on known immunological facts.
#https://github.com/Teculos/immunological-EN
setwd("~/iEN_modeling")
#Packages
library(iEN)
library(tidyverse)
library(readxl)
library(ggplot2)
library(xlsx)
library(parallel)
library(caret)
###------------------------Analysis on Ad-SF7-Fc B16 data----------------------------###
#First I will collect and pool my data together in one giant df
##List of data included:
# 1) T cell phenotype data
# 2) Frequency of all immune cell types in the TME (only used cell types common to both analyses)
# 3) Median expression of all markers on all cell types in TME
# 4) Tumor lysate cytokine data
# 5) Plasma cytokine data
# 6) SLAMF7 activating/blocking predictions
#load in all data, aggregate, and identify where missing values are
#missing:
# 1936 and 1937 lysate cytokine data (both batch1 respodners)
setwd("~/iEN_modeling/data_dfs")
#load the data, and remove unnecessary columns and name columns appropiatly (must be unique)
batch2_pred_iEN <- read_excel("batch2_preds_iEN.xlsx") %>%
pivot_wider(names_from = c("cell_type", "prediction"), values_from = freq)
batch2_pred_iEN$sample_id <- gsub(batch2_pred_iEN$sample_id, pattern = "SLAMF7", replacement = "SF7")
batch1_pred_iEN <- read_excel("batch1_preds_iEN.xlsx") %>%
pivot_wider(names_from = c("cell_type", "prediction"), values_from = freq)
batch1_pred_iEN$sample_id <- gsub(batch1_pred_iEN$sample_id, pattern = "SLAMF7", replacement = "SF7")
#merg_pred data
merg_pred_data <- bind_rows(batch1_pred_iEN, batch2_pred_iEN)
batch2_clust_expr_iEN <- read_excel("batch2_clust_expr_iEN.xlsx") %>%
select(!one_of("responder")) %>%
pivot_wider(names_from = merg_clust, values_from = c("CD45", "CD11b", "CD40", "CD4",
"CD8", "NK1.1", "CD19", "SLAMF7", "F4/80", "Ly6C", "Ly6G",
"MHC-II", "PD-L1", "CD11c", "B220", "PD-1", "IgD", "CCR2",
"CD38", "CD206", "AF", "CD3", "Fascin", "Lyve-1", "CD90",
"CCR7"))
batch1_clust_expr_iEN <- read_excel("batch1_clust_expr_iEN.xlsx") %>%
select(!one_of("responder")) %>%
pivot_wider(names_from = merg_clust, values_from = c("CD45", "CD11b", "CD40", "CD4",
"CD8", "NK1.1", "CD19", "SLAMF7", "F4/80", "Ly6C", "Ly6G",
"MHC-II", "PD-L1", "CD11c", "B220", "PD-1", "IgD", "CCR2",
"CD38", "CD206", "AF", "CD3", "Fascin", "Lyve-1", "CD90",
"CCR7"))
batch1_clust_expr_iEN$sample_id <- gsub(batch1_clust_expr_iEN$sample_id, pattern = "SLAMF7", replacement = "SF7")
#merg expr data
merg_expr_data <- bind_rows(batch1_clust_expr_iEN, batch2_clust_expr_iEN)
batch2_clust_freq_iEN <- read_excel("batch2_clust_freq_iEN.xlsx") %>%
select(!one_of("responder")) %>%
spread(., merg_clust, freq)
batch2_clust_freq_iEN$sample_id <- gsub(batch2_clust_freq_iEN$sample_id, pattern = "SLAMF7", replacement = "SF7")
batch1_clust_freq_iEN <- read_excel("batch1_clust_freq_iEN.xlsx") %>%
select(!one_of("responder")) %>%
spread(., merg_clust, freq)
batch1_clust_freq_iEN$sample_id <- gsub(batch1_clust_freq_iEN$sample_id, pattern = "SLAMF7", replacement = "SF7")
#merg freq data
merg_freq_data <- bind_rows(batch1_clust_freq_iEN, batch2_clust_freq_iEN)
#batch1_2_lysate_cyto_df_iEN <- read_excel("batch1_2_lysate_cyto_df_iEN.xlsx")
batch1_2_plasma_cyto_df_iEN <- read_excel("batch1_2_plasma_cyto_df_iEN.xlsx") %>%
filter(., time_point == "Day_0")
colnames(batch1_2_plasma_cyto_df_iEN) <- paste(colnames(batch1_2_plasma_cyto_df_iEN), "plasma_day0", sep = "_")
batch1_2_plasma_cyto_df_iEN <- rename(batch1_2_plasma_cyto_df_iEN, sample_id = sample_id_plasma_day0)
batch1_2_plasma_cyto_df_iEN <- select(batch1_2_plasma_cyto_df_iEN, !one_of("location_plasma_day0", "batch_plasma_day0", "responder_plasma_day0", "location_plasma_day0", "time_point_plasma_day0"))
batch1_2_plasma_cyto_df_iEN2 <- read_excel("batch1_2_plasma_cyto_df_iEN.xlsx") %>%
filter(., time_point == "Day_1")
colnames(batch1_2_plasma_cyto_df_iEN2) <- paste(colnames(batch1_2_plasma_cyto_df_iEN2), "plasma_day1", sep = "_")
batch1_2_plasma_cyto_df_iEN2 <- rename(batch1_2_plasma_cyto_df_iEN2, sample_id = sample_id_plasma_day1)
batch1_2_plasma_cyto_df_iEN2 <- select(batch1_2_plasma_cyto_df_iEN2, !one_of("location_plasma_day1", "batch_plasma_day1", "responder_plasma_day1", "location_plasma_day1", "time_point_plasma_day1"))
#merge them
batch1_2_plasma_cyto_df_iEN_clean <- left_join(batch1_2_plasma_cyto_df_iEN2, batch1_2_plasma_cyto_df_iEN, by = "sample_id")
batch2_T_cell_data_iEN <- read_excel("batch2_T_cell_data_iEN.xlsx") %>%
select(!one_of("responder")) %>%
spread(., cell_type, freq)
batch1_T_cell_data_iEN <- read_excel("batch1_T_cell_data_iEN.xlsx") %>%
select(!one_of("responder")) %>%
spread(., cell_type, freq)
#merg_T cell data
merg_T_cell_data <- bind_rows(batch1_T_cell_data_iEN, batch2_T_cell_data_iEN)
batch1_2_lysate_cyto_df_iEN_preds <- read_excel("batch1_2_lysate_cyto_df_iENP_w_preds.xlsx") %>%
select(., !one_of("location", "batch", "responder")) #contains pred values
colnames(batch1_2_lysate_cyto_df_iEN_preds) <- paste(colnames(batch1_2_lysate_cyto_df_iEN_preds), "lysate", sep = "_")
batch1_2_lysate_cyto_df_iEN_preds <- rename(batch1_2_lysate_cyto_df_iEN_preds, sample_id = sample_id_lysate)
#change name of one mislabeled sample
batch1_2_lysate_cyto_df_iEN_preds$sample_id <- gsub(batch1_2_lysate_cyto_df_iEN_preds$sample_id, pattern = "1798", replacement = "0798")
#for missing tumor lysate samples, I'll take the median expression for each cytokine from responding mice from both batches (since they were responders)
med_pred <- batch1_2_lysate_cyto_df_iEN %>%
filter(responder == "yes")
med_pred <- med_pred[,5:27]
preds <- med_pred %>%
as.data.frame(.) %>%
summarise_all(median)
#since I'm lazy I just export this and manually copy it into the oroginal df and reload
write.xlsx(preds, "~/iEN_modeling/data_dfs/lysate_preds.xlsx")
#Now aggregate all data above into single df of proper form
#Check that sample_id names for all dfs match
unique(batch1_clust_expr_iEN$sample_id, batch1_2_plasma_cyto_df_iEN_clean$sample_id)
setequal(batch2_clust_expr_iEN$sample_id, batch2_T_cell_data_iEN$sample_id)
agg_data <- full_join(batch1_2_lysate_cyto_df_iEN_preds, batch1_2_plasma_cyto_df_iEN_clean, by = "sample_id")
agg_data <- full_join(agg_data, merg_T_cell_data, by = "sample_id")
agg_data <- full_join(agg_data, merg_freq_data, by = "sample_id")
agg_data <- full_join(agg_data, merg_expr_data, by = "sample_id")
agg_data <- full_join(agg_data, merg_pred_data, by = "sample_id")
agg_data <- column_to_rownames(agg_data, var = "sample_id")
agg_dat_mtx <- as.matrix(agg_data) #input this into iEN
saveRDS(agg_dat_mtx, "~/iEN_modeling/agg_dat_mtx")
agg_dat_mtx <- readRDS("~/iEN_modeling/agg_dat_mtx")
##-------------generate priors-------------------##
#this is just a named number vector (as.numeric) w/ the rownames matching the colnames of the data_matrix
#and the values for each one the weighting.
#My situation is a bit different from the example paper, since I have differnt types of readouts
#I will score the expression variables from TME panel and T cell panel just as they did in the paper.
#I will not score TME panel frequency data since I do not want to bias it. (all have value of 1)
#Day_0 plasma cytokines do not get scored (all have value of 1)
#Day_1 plasma cytokines get scored based on expected changes from day_0 (from an Adenovirus perspective)
#lysate cytokine data does not get scored since I do not want to bias. (all have value of 1)
#SF7 blocked/activated prediction data does not get scored since we have no ground truth data on SF7-Fc ability to block or activate specific cell types (all values set to 1)
prior_temp <- t(agg_dat_mtx)
write.xlsx(prior_temp, "~/iEN_modeling/prior_temp.xlsx")
#read priors back in
SF7_Fc_priors<- read_excel("prior_temp.xlsx") %>%
column_to_rownames(., var = "...1")
#get data in right format
numbs <- SF7_Fc_priors[[1]]
rownames(SF7_Fc_priors) -> SF7_names
names(numbs) <- SF7_names
numbs -> SF7_priors_clean
rm(numbs)
##----------Set up variables for iEN------------##
#Set response variable
resp_var <- c("yes", "no", "yes", "no", "yes", "no", "yes", "yes", "no", "no", "yes", "yes", "yes") %>%
as.factor()
alphaGrid <- seq(0,1, length.out=11)
phiGrid <- c(0, exp(seq(log(1),log(100), length.out=10)))
nlambda <- 10
lambdas=NULL
ncores <- detectCores()-1
eval <- "ROCAUC"
family <- "binomial"
intercept <- FALSE
standardize <- TRUE
center <- TRUE
K <- 10
repeats <- 100
set.seed(1994)
for(i in seq(repeats)){
foldid <- caret::createFolds(seq(nrow(agg_dat_mtx)),k=K,list=FALSE)
}
#Run the algorithm
ien.SF7_Fc_100=cv_iEN(agg_dat_mtx, resp_var, foldid, alphaGrid, phiGrid, nlambda, lambdas, SF7_priors_clean, ncores, eval, family, intercept, standardize, center)
#save model
saveRDS(ien.SF7_Fc_100, "~/iEN_modeling/ien.SF7_Fc_100")
#load the model back
ien.SF7_Fc_100 <- readRDS("~/iEN_modeling/ien.SF7_Fc_100")
#Run the algorithm w/o priors
ien.SF7_Fc_100_wo=cv_iEN(agg_dat_mtx, resp_var, foldid, alphaGrid, phiGrid=c(0), nlambda, lambdas, SF7_priors_clean, ncores, eval, family, intercept, standardize, center)
#save the model
saveRDS(ien.SF7_Fc_100_wo, "~/iEN_modeling/ien.SF7_Fc_100_wo")
#load the model back
ien.SF7_Fc_100_wo <- readRDS("~/iEN_modeling/ien.SF7_Fc_100_wo")
#both run fine and have solid prediciton strength. Adding in priors increases the CV from 0.8 to 0.9 showing that my priors are useful.
print(ien.SF7_Fc_100)
print(ien.SF7_Fc_100_wo)
#make ROC curves
#my data and plot
#predicitons are from the iEN object and re-code yes no classifier to 1,0
SF7_ROC_df <- data.frame(predictions = (ien.SF7_Fc_100@cv.preds),
labels = c(1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1))
SF7_ROC_df_wo <- data.frame(predictions = (ien.SF7_Fc_100_wo@cv.preds),
labels = c(1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1))
library(pROC)
#now my data
pROC_obj_SF7 <- roc(SF7_ROC_df$labels,SF7_ROC_df$predictions,
smoothed = T,
ci=F, ci.alpha=0.9, stratified=FALSE,
plot=TRUE, auc.polygon=F, max.auc.polygon=F, grid=F,
print.auc=TRUE, show.thres=TRUE)
pROC_obj_SF7_wo <- roc(SF7_ROC_df_wo$labels,SF7_ROC_df_wo$predictions,
smoothed = T,
ci=F, ci.alpha=0.9, stratified=FALSE,
plot=TRUE, auc.polygon=F, max.auc.polygon=F, grid=F,
print.auc=TRUE, show.thres=TRUE)
#export sensitivity and specifity data from pROC_ob_SF7 and make a ggplot from it.
#and repeat for non-prior analysis and plot them too on top
both <- list(pROC_obj_SF7, pROC_obj_SF7_wo)
plot1 <- ggroc(list(`With Priors (AUC=0.925)` = pROC_obj_SF7,`Without Priors (AUC=0.825)` = pROC_obj_SF7_wo), size = 1.0, aes= c("linetype", "color"))
plot2 <- plot1 + scale_color_manual(values = c("black", "#808080"))
plot2 + theme(axis.line = element_line(linetype = "solid", size = 1.0),
axis.ticks = element_line(colour = "black",
size = 0.9), panel.grid.major = element_line(colour = "khaki4",
linetype = "blank"), axis.title = element_text(size = 19),
axis.text = element_text(size = 16),
legend.text = element_text(size = 16),
legend.title = element_text(colour = NA),
panel.background = element_rect(fill = NA),
plot.background = element_rect(colour = "white",
size = 1.1), legend.key = element_rect(fill = NA),
legend.background = element_rect(fill = NA))
##--------------------------------------------------------------##
#run wilcoxin signed rank test on predictive values from iEN comparing responders to non-responders
#this gives P value statistic for how good the model can seperate the groups.
SF7_ROC_df$labels <- as.factor(SF7_ROC_df$labels)
levels(SF7_ROC_df$labels) <- c("Non-Responder", "Responder")
#calc p value
wilcox.test(SF7_ROC_df$predictions~SF7_ROC_df$labels)$p.value
#plot it!
p_val_plot <- ggplot(SF7_ROC_df, aes(x=labels, y=predictions)) +
stat_boxplot(geom ='errorbar') +
geom_boxplot(outlier.alpha = 0) +
geom_dotplot(binaxis = 'y', stackdir = 'center', position = position_dodge(1), dotsize = 0.5)
p_val_plot + theme(axis.line = element_line(linetype = "solid"),
axis.ticks = element_line(colour = "black",
size = 0.9), axis.title = element_text(size = 19),
axis.text = element_text(size = 16),
panel.background = element_rect(fill = NA)) +
labs(y = "iEN predicted values", x = "")
##--------------------------------------------------------------##