/
mbms_epigenetic_clock_generation.R
398 lines (327 loc) · 15.4 KB
/
mbms_epigenetic_clock_generation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
# epigenetic clocks creation
# MBMS
# written by Sarah Watkins
# September 2021
# here we calculate 9 of the epigenetic clocks
# (grimage has to be uploaded to website)
# then we combine them all together in a df
# then we add in batch vars (slide + row)
# then we add in cell counts
# this is the dataset to send to Harvard for the epigenetic clocks
# analyses.
# Set directories:
project.dir <- "/project_dir"
data.dir <- file.path(project.dir, "data_dir")
output.dir <- file.path(project.dir,"output_dir")
clocks.dir <- file.path(project.dir, "clocks_dir")
## http://github.com/perishky/meffil
library(meffil)
options(mc.cores=10)
## http://github.com/perishky/eval.save
library(eval.save)
eval.save.dir(output.dir)
library(readxl)
library(meffonym)
# load in data
# we need meth and we need age
# see dataset generation for creation of mbms.csv (phenotype data)
pheno <- read.csv(file=paste0(data.dir,"/mbms.csv"),header = T)
pheno <- pheno[!is.na(pheno$sentrix_id),]
rownames(pheno) <- pheno$sentrix_id
# see normalisation and QC for norm.beta_mbms_slide_fixed.rda
load(file=paste0(data.dir,"/norm.beta_mbms_slide_fixed.rda"))
meth <- ret
meth <- as.data.frame(meth)
meth_samples <- as.character(colnames(meth))
pheno <- pheno[meth_samples,]
# add the clocks
#####################
# epiToc (385 CpG sites). "The average DNAm over these 385 CpG sites"
## https://static-content.springer.com/esm/art%3A10.1186%2Fs13059-016-1064-3/MediaObjects/13059_2016_1064_MOESM2_ESM.xls
epiToc <- read_xls(paste0(clocks.dir,"/13059_2016_1064_MOESM2_ESM.xls"))
epiToc <- epiToc[[1]][-1]
# not all of the epitoc cpgs may be present in our daatset:
present_epitoc_cpgs <- epiToc[epiToc%in% rownames(meth)]
length(present_epitoc_cpgs)
meth_epiToc <- meth[present_epitoc_cpgs,]
dim(meth_epiToc)
sum.cpgs <- colSums(meth_epiToc)
length(sum.cpgs)
epiToc.clock <- sum.cpgs/length(epiToc)
length(epiToc.clock)
head(epiToc.clock)
epiToc.clock <- as.data.frame(epiToc.clock)
######################
# Zhang mortality (10 CpG sites)
# https://doi.org/10.1038/ncomms14617
# We can have a continuous or 0-10 risk score
# "We .. used the fourth quartile value of cg08362785 and first
# quartile values of other nine CpGs as the cutoff points, to define
# aberrant methylation for each CpG (the exact cutoff points are
# listed in Supplementary Table 4)"
zhang_cpgs <- c("cg01612140", "cg05575921", "cg06126421", "cg08362785",
"cg10321156", "cg14975410", "cg19572487", "cg23665802",
"cg24704287", "cg25983901")
zhang_present_on_epic <- zhang_cpgs[zhang_cpgs %in% rownames(meth)]
length(zhang_present_on_epic)
# 10
####### or make the continuous score
zhang_coefficients <- c(-0.38253, -0.92224, -1.70129, 2.71749,
-0.02073, -0.04156, -0.28069, -0.89440,
-2.98637, -1.80325)
meffonym.add.model("zhang.mortality", c("intercept", zhang_cpgs), c(0, zhang_coefficients), c("Zhang et al 2017 mortality clock from PMID 28303888. Clock details in supplement of the paper."))
########################
# MiAge (268 CpG sites)
# https://dx.doi.org/10.1080%2F15592294.2017.1389361
miage <- read.csv(paste0(clocks.dir,"/2017EPI0166R-s02.csv"), stringsAsFactors=F)
# miage has functions and a script to get the mitotic age estimates
miage_in_epic <- miage$CpG_site_ID[miage$CpG_site_ID %in% rownames(meth)]
length(miage_in_epic)
#268
source(paste0(clocks.dir,"/miage_function_library.r")) ### library of all functions used in the calculation of mitotic ages
#HM450.mat=as.matrix(read.table("input_HM450_methyl_data.txt",header=T))#### inputHM450.txt is a matrix of HM450 methylation values with each row representing each CpG site and each column representing each sample. The row names of CpG sites and column names corresponding to sample names must be given
clocksitesID=as.matrix(read.csv(paste0(clocks.dir,"/miage_Additional_File1.csv"),header=T))[,1] ### epigenetic clock CpG sites
beta= meth[ match(clocksitesID,rownames(meth)),]##select clock CpG sites
load(paste0(clocks.dir,"/miage_site_specific_parameters.Rdata")) #### site-specific parameters
b=methyl.age[[1]];c=methyl.age[[2]];d=methyl.age[[3]]
miage.clock=mitotic.age(beta,b,c,d) ### estimated mitotic age
names(miage.clock)=colnames(beta)
head(miage.clock)
miage.clock <- as.data.frame(miage.clock)
#########################
# Levine/PhenoAge (513 sites)
# https://dx.doi.org/10.18632%2Faging.101414
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5940111/bin/aging-10-101414-s002.csv
levine <- read.csv(paste0(clocks.dir,"/aging-10-101414-s002.csv"),stringsAsFactors=F)
levine_in_epic <- levine$CpG[levine$CpG %in% rownames(meth)]
length(levine_in_epic)
# 513
levine_cpgs <- as.character(levine$CpG)
levine_weights <- levine$Weight
class(levine_weights)
meffonym.add.model("phenoage", variables=levine_cpgs, coefficients=levine_weights, c("PhenoAge (Levine) clock"))
meffonym.models()
#########################
# Dunedin
## devtools::install_github("danbelsky/DunedinPoAm38")
library("DunedinPoAm38")
dunedin.sites <- getRequiredProbes()$DunedinPoAm_38
length(dunedin.sites)
# 46
dunedin_in_epic <- dunedin.sites[dunedin.sites %in% rownames(meth)]
length(dunedin_in_epic)
# 46
# rows=cpgs
dunedin_age <- PoAmProjector(meth)
dunedin_age <- data.frame(dunedin_age[[1]])
names(dunedin_age) <- c("dunedin_age")
head(dunedin_age)
########################
# DNAmTL (140 CpG sites)
# https://dx.doi.org/10.18632%2Faging.102173
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6738410/bin/aging-11-102173-s003.xlsx
# load the sites and coefficients
dnamtl <- read.csv(paste0(clocks.dir,"/dnamtl_coefficients.csv"),stringsAsFactors=F)
colnames(dnamtl) <- c("cpg","weight")
head(dnamtl)
dnamtl_cpgs <- as.character(dnamtl$cpg)
length(dnamtl_cpgs)
dnamtl_in_epic <- dnamtl_cpgs[dnamtl_cpgs %in% rownames(meth)]
length(dnamtl_in_epic)
# 140
dnamtl_weights <- dnamtl$weight
class(dnamtl_weights)
meffonym.add.model("dnamtl", variables=dnamtl_cpgs, coefficients=dnamtl_weights, c("DNAmTL clock"))
######################
# Zhang (514 sites)
# script taken from github page
# https://github.com/qzhang314/DNAm-based-age-predictor
# https://raw.githubusercontent.com/qzhang314/DNAm-based-age-predictor/master/en.coef
############# 2. data loading and QC ##################
print("1. Data loading and QC")
meth.t <- as.data.frame(t(meth))
addna<-function(methy){
methy[is.na(methy)]<-mean(methy,na.rm=T)
return(methy)
}
print("1.2 Replacing missing values with mean value")
#dataNona<-apply(data,2,function(x) addna(x)) ############### replace the NA with mean value for each probe
#dataNona<- dataNona[,apply(dataNona,2,function(x) sum(is.na(x)))!=nrow(dataNona)] ############ remove the probe when it has NA across all individuals
#print(paste0(ncol(data) - ncol(dataNona)," probe(s) is(are) removed since it has (they have) NA across all individuals"))
print("1.3 Standardizing")
meth.t<- apply(meth.t,1,scale) ############### standardize the DNA methylation within each individual, remove the mean and divided by the SD of each individual Probe * IND
rownames(meth.t)<-rownames(meth)
############# 3. get the coefficients of each probe from Elastic Net/BLUP method, !!!!WE HAVE TWO PREDICTORS!!!#############
print("2. Loading predictors")
read.table(paste0(clocks.dir,"/en.coef"),stringsAsFactor=F,header=T)->encoef
zhang_on_epic <- encoef$probe[encoef$probe %in% rownames(meth)]
length(zhang_on_epic)
#514
en_int<-encoef[1,2]
encoef<-encoef[-1,]
rownames(encoef)<-encoef$probe
############# 4. get common probes between predictors and data ##############
print("3. Checking misssing probes")
encomm<- intersect(rownames(encoef),rownames(meth.t))
endiff<- nrow(encoef) - length(encomm)
print(paste0(endiff," probe(s) in Elastic Net predictor is(are) not in the data"))
############# 5. extract the common probes and do age prediction ###############
print("4. Predicting")
encoef<-encoef[encomm,]
encoef$coef%*%meth.t[encomm,]+en_int->enpred
zhang_clock <- as.data.frame(t(enpred))
colnames(zhang_clock) <- c("zhang_clock")
head(zhang_clock)
summary(zhang_clock$zhang_clock)
# get Horvath and Hannum cpg numbers
horvath <- read.csv(paste0(clocks.dir,"/13059_2013_3156_MOESM3_ESM.csv"), stringsAsFactors=F, skip=2)
horvath <- horvath$CpGmarker[-1]
horvath_450k <- horvath[horvath %in% rownames(meth)]
length(horvath_450k)
hannum <- read_xlsx(paste0(clocks.dir,"/NIHMS418935-supplement-02.xlsx"))
hannum <- hannum$Marker
hannum_450k <- hannum[hannum %in% rownames(meth)]
length(hannum_450k)
## running meffonym clocks
identical(rownames(pheno),colnames(meth))
meffonym.models()
meth.m <- as.matrix(meth)
meffonym_clocks <- cbind(age=pheno$AgeYRS, ## check age colname
sapply(c("phenoage","zhang.mortality", "dnamtl","age.hannum","age.horvath"), function(model) {
meffonym.score(meth.m, model)$score
}))
head(meffonym_clocks)
meffonym_clocks <- as.data.frame(meffonym_clocks)
cor(meffonym_clocks)
summary(meffonym_clocks$phenoage)
summary(meffonym_clocks$zhang.mortality)
summary(meffonym_clocks$dnamtl)
rownames(meffonym_clocks) <- colnames(meth)
head(meffonym_clocks)
# add the other clocks
head(epiToc.clock)
head(miage.clock)
head(zhang_clock)
head(dunedin_age)
# epiToc.clock.450k, miage.clock.450k, dunedin_age.450k, zhang_clock.450k
all_clocks_mbms <- merge(meffonym_clocks, epiToc.clock, by="row.names")
head(all_clocks_mbms)
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
head(all_clocks_mbms)
all_clocks_mbms <- all_clocks_mbms[,-1]
all_clocks_mbms <- merge(all_clocks_mbms, miage.clock, by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
head(all_clocks_mbms)
all_clocks_mbms <- all_clocks_mbms[,-1]
all_clocks_mbms <- merge(all_clocks_mbms, dunedin_age, by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
head(all_clocks_mbms)
all_clocks_mbms <- all_clocks_mbms[,-1]
all_clocks_mbms <- merge(all_clocks_mbms, zhang_clock, by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
head(all_clocks_mbms)
all_clocks_mbms <- all_clocks_mbms[,-1]
head(all_clocks_mbms)
clock_seq <- colnames(all_clocks_mbms)
for(i in 1:ncol(all_clocks_mbms)){
print(i)
print(clock_seq[i])
print(summary(all_clocks_mbms[,i]))
print(sd(all_clocks_mbms[,i]))
}
save(all_clocks_mbms,file=paste0(output.dir,"/all_clocks_mbms.Rdata"))
cor(all_clocks_mbms)
# finally, load in batch info, add that to the clocks df,
# and add cell counts too
# load batch info
load(file=paste0(data.dir,"/mbms_samplesheet.Rdata"))
samplesheet <- samplesheet[,c("participant", "Slide", "sentrix_row","dna","good")]
samplesheet$Slide <- as.factor(as.character(samplesheet$Slide))
identical(rownames(samplesheet), colnames(meth))
identical(rownames(samplesheet), rownames(all_clocks_mbms))
all_clocks_mbms <- merge(all_clocks_mbms,samplesheet,by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
all_clocks_mbms <- all_clocks_mbms[,-1]
# create cell counts
cellcounts<-meffil.estimate.cell.counts.from.betas(meth.m,cell.type.reference="blood gse35069 complete",verbose=T)
cellcounts<-data.frame(IID=row.names(cellcounts),cellcounts)
cellcounts <- cellcounts[,-1]
# make sure ID are row names!
identical(rownames(all_clocks_mbms), rownames(cellcounts))
all_clocks_mbms <- merge(all_clocks_mbms,cellcounts,by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
names(all_clocks_mbms)[names(all_clocks_mbms) == "Row.names"] <- "mbms_sentrix_id"
save(all_clocks_mbms,file=paste0(output.dir,"/all_clocks_mbms.Rdata"))
write.csv(all_clocks_mbms,file = paste0(output.dir,"/all_clocks_mbms.csv"),quote = F,row.names = T)
save(cellcounts,file=paste0(output.dir,"/mbms_cell_counts.Rdata"))
# create plots to check estimates
jpeg(filename = paste0(output.dir,"/plots_of_clocks/zhang_clock_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=zhang_clock)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(zhang_clock)),col='red')+
labs(title="Zhang 2019 clock")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/chronological_age_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=age)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(age)),col='red')+
labs(title="chronological age")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/phenoage_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=phenoage)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(phenoage)),col='red')+
labs(title="Phenoage clock")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/zhang_mortality_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=zhang.mortality)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(zhang.mortality)),col='red')+
labs(title="Zhang mortality clock")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/dnamtl_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=dnamtl)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(dnamtl)),col='red')+
labs(title="DNAmTL")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/hannum_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=age.hannum)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(age.hannum)),col='red')+
labs(title="Hannum")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/horvath_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=age.horvath)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(age.horvath)),col='red')+
labs(title="Horvath")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/epiToc_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=epiToc.clock)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(epiToc.clock)),col='red')+
labs(title="EpiToc")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/miage_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=miage.clock)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(miage.clock)),col='red')+
labs(title="MiAge")+
theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/dunedin_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=dunedin_age)) +
geom_histogram(colour="black", fill="#440154FF")+
geom_vline(aes(xintercept = mean(dunedin_age)),col='red')+
labs(title="DunedinPoAm")+
theme_minimal()
dev.off()