mbms_epigenetic_clock_generation.R

# epigenetic clocks creation
# MBMS
# written by Sarah Watkins
# September 2021

# here we calculate 9 of the epigenetic clocks
# (grimage has to be uploaded to website)
# then we combine them all together in a df
# then we add in batch vars (slide + row)
# then we add in cell counts
# this is the dataset to send to Harvard for the epigenetic clocks
# analyses.

# Set directories:

project.dir <- "/project_dir"

data.dir <- file.path(project.dir, "data_dir")
output.dir <- file.path(project.dir,"output_dir")
clocks.dir <- file.path(project.dir, "clocks_dir")

## http://github.com/perishky/meffil
library(meffil)
options(mc.cores=10)

## http://github.com/perishky/eval.save
library(eval.save)
eval.save.dir(output.dir)

library(readxl)
library(meffonym)

# load in data
# we need meth and we need age
# see dataset generation for creation of mbms.csv (phenotype data)
pheno <- read.csv(file=paste0(data.dir,"/mbms.csv"),header = T)
pheno <- pheno[!is.na(pheno$sentrix_id),]
rownames(pheno) <- pheno$sentrix_id
# see normalisation and QC for norm.beta_mbms_slide_fixed.rda
load(file=paste0(data.dir,"/norm.beta_mbms_slide_fixed.rda"))
meth <- ret
meth <- as.data.frame(meth)
meth_samples <- as.character(colnames(meth))
pheno <- pheno[meth_samples,]

# add the clocks

#####################

# epiToc (385 CpG sites). "The average DNAm over these 385 CpG sites"
## https://static-content.springer.com/esm/art%3A10.1186%2Fs13059-016-1064-3/MediaObjects/13059_2016_1064_MOESM2_ESM.xls
epiToc <- read_xls(paste0(clocks.dir,"/13059_2016_1064_MOESM2_ESM.xls"))
epiToc <- epiToc[[1]][-1]
# not all of the epitoc cpgs may be present in our daatset:
present_epitoc_cpgs <- epiToc[epiToc%in% rownames(meth)]
length(present_epitoc_cpgs)

meth_epiToc <- meth[present_epitoc_cpgs,]
dim(meth_epiToc)
sum.cpgs <- colSums(meth_epiToc)
length(sum.cpgs)
epiToc.clock <- sum.cpgs/length(epiToc)
length(epiToc.clock)
head(epiToc.clock)
epiToc.clock <- as.data.frame(epiToc.clock)


######################

# Zhang mortality (10 CpG sites)
# https://doi.org/10.1038/ncomms14617
# We can have a continuous or 0-10 risk score
# "We .. used the fourth quartile value of cg08362785 and first 
# quartile values of other nine CpGs as the cutoff points, to define 
# aberrant methylation for each CpG (the exact cutoff points are 
# listed in Supplementary Table 4)"
zhang_cpgs <- c("cg01612140", "cg05575921", "cg06126421", "cg08362785",
                "cg10321156", "cg14975410", "cg19572487", "cg23665802",
                "cg24704287", "cg25983901")
zhang_present_on_epic <- zhang_cpgs[zhang_cpgs %in% rownames(meth)]
length(zhang_present_on_epic)
# 10
####### or make the continuous score
zhang_coefficients <- c(-0.38253, -0.92224, -1.70129, 2.71749,
                        -0.02073, -0.04156, -0.28069, -0.89440,
                        -2.98637, -1.80325)

meffonym.add.model("zhang.mortality", c("intercept", zhang_cpgs), c(0, zhang_coefficients), c("Zhang et al 2017 mortality clock from PMID 28303888. Clock details in supplement of the paper."))

########################

# MiAge (268 CpG sites)
# https://dx.doi.org/10.1080%2F15592294.2017.1389361
miage <- read.csv(paste0(clocks.dir,"/2017EPI0166R-s02.csv"), stringsAsFactors=F)
# miage has functions and a script to get the mitotic age estimates
miage_in_epic <- miage$CpG_site_ID[miage$CpG_site_ID %in% rownames(meth)]
length(miage_in_epic)
#268 

source(paste0(clocks.dir,"/miage_function_library.r")) ### library of all functions used in the calculation of mitotic ages                                                                                 
#HM450.mat=as.matrix(read.table("input_HM450_methyl_data.txt",header=T))#### inputHM450.txt is a matrix of HM450 methylation values with each row representing each CpG site and each column representing each sample. The row names of CpG sites and column names corresponding to sample names must be given                                                         
clocksitesID=as.matrix(read.csv(paste0(clocks.dir,"/miage_Additional_File1.csv"),header=T))[,1]   ### epigenetic clock CpG sites
beta=  meth[ match(clocksitesID,rownames(meth)),]##select clock CpG sites 

load(paste0(clocks.dir,"/miage_site_specific_parameters.Rdata")) #### site-specific parameters

b=methyl.age[[1]];c=methyl.age[[2]];d=methyl.age[[3]]

miage.clock=mitotic.age(beta,b,c,d) ### estimated mitotic age
names(miage.clock)=colnames(beta)
head(miage.clock)
miage.clock <- as.data.frame(miage.clock)

#########################

# Levine/PhenoAge (513 sites)
# https://dx.doi.org/10.18632%2Faging.101414
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5940111/bin/aging-10-101414-s002.csv
levine <- read.csv(paste0(clocks.dir,"/aging-10-101414-s002.csv"),stringsAsFactors=F)
levine_in_epic <- levine$CpG[levine$CpG %in% rownames(meth)]
length(levine_in_epic)
# 513
levine_cpgs <- as.character(levine$CpG)
levine_weights <- levine$Weight
class(levine_weights)

meffonym.add.model("phenoage", variables=levine_cpgs, coefficients=levine_weights, c("PhenoAge (Levine) clock"))

meffonym.models()

#########################

# Dunedin
## devtools::install_github("danbelsky/DunedinPoAm38")
library("DunedinPoAm38")
dunedin.sites <- getRequiredProbes()$DunedinPoAm_38
length(dunedin.sites)
# 46
dunedin_in_epic <- dunedin.sites[dunedin.sites %in% rownames(meth)]
length(dunedin_in_epic)
# 46
# rows=cpgs
dunedin_age <- PoAmProjector(meth)
dunedin_age <- data.frame(dunedin_age[[1]])
names(dunedin_age) <- c("dunedin_age")
head(dunedin_age)

########################

# DNAmTL (140 CpG sites)
# https://dx.doi.org/10.18632%2Faging.102173
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6738410/bin/aging-11-102173-s003.xlsx
# load the sites and coefficients
dnamtl <- read.csv(paste0(clocks.dir,"/dnamtl_coefficients.csv"),stringsAsFactors=F)
colnames(dnamtl) <- c("cpg","weight")
head(dnamtl)
dnamtl_cpgs <- as.character(dnamtl$cpg)
length(dnamtl_cpgs)
dnamtl_in_epic <- dnamtl_cpgs[dnamtl_cpgs %in% rownames(meth)]
length(dnamtl_in_epic)
# 140
dnamtl_weights <- dnamtl$weight
class(dnamtl_weights)

meffonym.add.model("dnamtl", variables=dnamtl_cpgs, coefficients=dnamtl_weights, c("DNAmTL clock"))


######################

# Zhang (514 sites)
# script taken from github page
# https://github.com/qzhang314/DNAm-based-age-predictor
# https://raw.githubusercontent.com/qzhang314/DNAm-based-age-predictor/master/en.coef

############# 2. data loading and QC ##################
print("1. Data loading and QC")

meth.t <- as.data.frame(t(meth))

addna<-function(methy){
  methy[is.na(methy)]<-mean(methy,na.rm=T)
  return(methy)
}

print("1.2 Replacing missing values with mean value")
#dataNona<-apply(data,2,function(x) addna(x))   ###############  replace the NA with mean value for each probe 

#dataNona<- dataNona[,apply(dataNona,2,function(x) sum(is.na(x)))!=nrow(dataNona)] ############ remove the probe when it has NA across all individuals
#print(paste0(ncol(data) - ncol(dataNona)," probe(s) is(are) removed since it has (they have) NA across all individuals"))

print("1.3 Standardizing")
meth.t<- apply(meth.t,1,scale)        ############### standardize the DNA methylation within each individual, remove the mean and divided by the SD of each individual     Probe * IND
rownames(meth.t)<-rownames(meth)

############# 3. get the coefficients of each probe from Elastic Net/BLUP method, !!!!WE HAVE TWO PREDICTORS!!!#############

print("2. Loading predictors")
read.table(paste0(clocks.dir,"/en.coef"),stringsAsFactor=F,header=T)->encoef

zhang_on_epic <- encoef$probe[encoef$probe %in% rownames(meth)]
length(zhang_on_epic)
#514
en_int<-encoef[1,2]

encoef<-encoef[-1,]

rownames(encoef)<-encoef$probe

############# 4. get common probes between predictors and data ##############
print("3. Checking misssing probes")

encomm<- intersect(rownames(encoef),rownames(meth.t))

endiff<- nrow(encoef) - length(encomm)

print(paste0(endiff," probe(s) in Elastic Net predictor is(are) not in the data"))

############# 5. extract the common probes and do age prediction ###############
print("4. Predicting")

encoef<-encoef[encomm,]
encoef$coef%*%meth.t[encomm,]+en_int->enpred

zhang_clock <- as.data.frame(t(enpred))
colnames(zhang_clock) <- c("zhang_clock")
head(zhang_clock)
summary(zhang_clock$zhang_clock)

# get Horvath and Hannum cpg numbers

horvath <- read.csv(paste0(clocks.dir,"/13059_2013_3156_MOESM3_ESM.csv"), stringsAsFactors=F, skip=2)
horvath <- horvath$CpGmarker[-1]
horvath_450k <- horvath[horvath %in% rownames(meth)]
length(horvath_450k)

hannum <- read_xlsx(paste0(clocks.dir,"/NIHMS418935-supplement-02.xlsx"))
hannum <- hannum$Marker
hannum_450k <- hannum[hannum %in% rownames(meth)]
length(hannum_450k)

## running meffonym clocks
identical(rownames(pheno),colnames(meth))
meffonym.models()
meth.m <- as.matrix(meth)

meffonym_clocks <- cbind(age=pheno$AgeYRS, ## check age colname
                         sapply(c("phenoage","zhang.mortality", "dnamtl","age.hannum","age.horvath"), function(model) {
                           meffonym.score(meth.m, model)$score
                         }))
head(meffonym_clocks)
meffonym_clocks <- as.data.frame(meffonym_clocks)
cor(meffonym_clocks)
summary(meffonym_clocks$phenoage)
summary(meffonym_clocks$zhang.mortality)
summary(meffonym_clocks$dnamtl)
rownames(meffonym_clocks) <- colnames(meth)
head(meffonym_clocks)

# add the other clocks
head(epiToc.clock)
head(miage.clock)
head(zhang_clock)
head(dunedin_age)

# epiToc.clock.450k, miage.clock.450k, dunedin_age.450k, zhang_clock.450k
all_clocks_mbms <- merge(meffonym_clocks, epiToc.clock, by="row.names")
head(all_clocks_mbms)
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
head(all_clocks_mbms)
all_clocks_mbms <- all_clocks_mbms[,-1]
all_clocks_mbms <- merge(all_clocks_mbms, miage.clock, by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
head(all_clocks_mbms)
all_clocks_mbms <- all_clocks_mbms[,-1]

all_clocks_mbms <- merge(all_clocks_mbms, dunedin_age, by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
head(all_clocks_mbms)
all_clocks_mbms <- all_clocks_mbms[,-1]

all_clocks_mbms <- merge(all_clocks_mbms, zhang_clock, by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
head(all_clocks_mbms)
all_clocks_mbms <- all_clocks_mbms[,-1]

head(all_clocks_mbms)
clock_seq <- colnames(all_clocks_mbms)
for(i in 1:ncol(all_clocks_mbms)){
  print(i)
  print(clock_seq[i])
  print(summary(all_clocks_mbms[,i]))
  print(sd(all_clocks_mbms[,i]))
}

save(all_clocks_mbms,file=paste0(output.dir,"/all_clocks_mbms.Rdata"))
cor(all_clocks_mbms)


# finally, load in batch info, add that to the clocks df,
# and add cell counts too

# load batch info
load(file=paste0(data.dir,"/mbms_samplesheet.Rdata"))
samplesheet <- samplesheet[,c("participant", "Slide", "sentrix_row","dna","good")]
samplesheet$Slide <- as.factor(as.character(samplesheet$Slide))
identical(rownames(samplesheet), colnames(meth)) 
identical(rownames(samplesheet), rownames(all_clocks_mbms))
all_clocks_mbms <- merge(all_clocks_mbms,samplesheet,by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
all_clocks_mbms <- all_clocks_mbms[,-1]

# create cell counts
cellcounts<-meffil.estimate.cell.counts.from.betas(meth.m,cell.type.reference="blood gse35069 complete",verbose=T)
cellcounts<-data.frame(IID=row.names(cellcounts),cellcounts)
cellcounts <- cellcounts[,-1]
# make sure ID are row names!
identical(rownames(all_clocks_mbms), rownames(cellcounts))
all_clocks_mbms <- merge(all_clocks_mbms,cellcounts,by="row.names")
rownames(all_clocks_mbms) <- all_clocks_mbms$Row.names
names(all_clocks_mbms)[names(all_clocks_mbms) == "Row.names"] <- "mbms_sentrix_id"

save(all_clocks_mbms,file=paste0(output.dir,"/all_clocks_mbms.Rdata"))
write.csv(all_clocks_mbms,file = paste0(output.dir,"/all_clocks_mbms.csv"),quote = F,row.names = T)
save(cellcounts,file=paste0(output.dir,"/mbms_cell_counts.Rdata"))

# create plots to check estimates

jpeg(filename = paste0(output.dir,"/plots_of_clocks/zhang_clock_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=zhang_clock)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(zhang_clock)),col='red')+
  labs(title="Zhang 2019 clock")+
  theme_minimal()
dev.off()

jpeg(filename = paste0(output.dir,"/plots_of_clocks/chronological_age_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=age)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(age)),col='red')+
  labs(title="chronological age")+
  theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/phenoage_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=phenoage)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(phenoage)),col='red')+
  labs(title="Phenoage clock")+
  theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/zhang_mortality_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=zhang.mortality)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(zhang.mortality)),col='red')+
  labs(title="Zhang mortality clock")+
  theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/dnamtl_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=dnamtl)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(dnamtl)),col='red')+
  labs(title="DNAmTL")+
  theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/hannum_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=age.hannum)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(age.hannum)),col='red')+
  labs(title="Hannum")+
  theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/horvath_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=age.horvath)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(age.horvath)),col='red')+
  labs(title="Horvath")+
  theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/epiToc_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=epiToc.clock)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(epiToc.clock)),col='red')+
  labs(title="EpiToc")+
  theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/miage_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=miage.clock)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(miage.clock)),col='red')+
  labs(title="MiAge")+
  theme_minimal()
dev.off()
jpeg(filename = paste0(output.dir,"/plots_of_clocks/dunedin_mbms.jpg"),width = 4, height = 3, units = "in", res = 600)
ggplot(data=all_clocks_mbms, aes(x=dunedin_age)) +
  geom_histogram(colour="black", fill="#440154FF")+
  geom_vline(aes(xintercept = mean(dunedin_age)),col='red')+
  labs(title="DunedinPoAm")+
  theme_minimal()
dev.off()