In [None]:
#1 Normalize the bulk data
#2 Extract DEGs and marker genes per cluster
#3 Calculate the Spearman correlation between gene expression and age, and correct for multiple testing
#4 Make heatmaps and perhaps show some curves as examples

In [None]:
library(data.table)
library(ggplot2)
library(readxl)
library(plyr)
library(tidyr)
library(dplyr)
library(extrafont)
library(reshape2)

library(viridis)
library(knitr)
library(stringr)
library(NMF)
library(rsvd)
library(lme4)
library(RColorBrewer)
library(purrr)
library(gtools)
library(gprofiler2)
# Switch the R working directory (placeholder path); later cells call setwd() again.
setwd(".../Test_Code")

In [None]:
# Load the serialized object from disk and make "RNA" its default assay.
# NOTE(review): `DefaultAssay<-` does not appear to be provided by any package attached
# in this notebook's library cell (presumably it comes from Seurat/SeuratObject) --
# confirm that package is attached before this cell runs on a fresh kernel.
# NOTE(review): `datasets` is not referenced again in the cells visible here.
datasets <- readRDS(".../RData/SEZ_seurat_object.rds")
DefaultAssay(datasets) <- "RNA"

In [None]:
# Sample metadata of the bulk data set.
data <- read.csv(".../RData/Bulk/bulk_data_samples.csv")
head(data)

# Keep the samples of age groups E-H and drop every row that has a missing value.
kept_age_groups <- c("E", "F", "G", "H")
data_adol_aging <- data[data$Age_group %in% kept_age_groups, ]
data_adol_aging <- drop_na(data_adol_aging)
rownames(data_adol_aging) <- data_adol_aging$Sample

# Column selector for the count table: the gene column followed by the kept samples.
data_adol_aging_samples <- c("Gene", data_adol_aging$Sample)
data_adol_aging_samples

In [None]:
# Read the bulk count table and drop every row that has a missing value.
# NOTE(review): this path starts with "/RData" while the other inputs use ".../RData" --
# confirm the location is intended.
bulk <- read.table("/RData/Bulk/bulk_counts_SVZ_Gene.tsv", header = TRUE)
bulk <- drop_na(bulk)

# Counts per million (CPM): scale every column after the first so that it sums to 1e6.
# The first column is skipped (presumably the gene identifier -- it is selected as "Gene" later).
# seq_along(...)[-1] is empty when there is only one column, whereas 2:length(x) would
# count backwards (2, 1) and try to rescale the first column.
bulk_2 <- bulk
for (i in seq_along(bulk_2)[-1]) {
    total_reads <- sum(bulk_2[[i]])
    bulk_2[[i]] <- bulk_2[[i]] * (1000000 / total_reads)
}

In [None]:
# Restrict the CPM table to the columns listed in data_adol_aging_samples, in that order.
bulk_2_AD_AG <- bulk_2[, data_adol_aging_samples, drop = FALSE]

In [None]:
# Switch to the DEG validation directory (placeholder path); the relative reads and writes
# in the following cells resolve against it until the next setwd().
setwd(".../validations/DEGs")

In [None]:
# Find every file below the working directory whose name matches "MAST".
deg_files <- list.files(pattern = "MAST", full.names = FALSE, recursive = TRUE)

# Read each table, record the relative file path it came from in `cluster`,
# and stack all tables row-wise.
deg_tables <- lapply(deg_files, function(path) {
    mutate(read.csv(path), cluster = path)
})
DEGs <- do.call(rbind, deg_tables)
head(DEGs)[1:2, ]

In [None]:
# Spearman correlation between the bulk expression of every gene (column of
# bulk_2_AD_AG_DEGs_rd) and age, followed by FDR adjustment of the p-values.
#
# NOTE(review): `bulk_2_AD_AG_DEGs_rd` and `data_adol_aging_age` are not created by any cell
# visible in this notebook, so this cell cannot run on a fresh kernel. Presumably
# `bulk_2_AD_AG_DEGs_rd` is a samples x genes matrix built the same way as
# `bulk_2_AD_AG_marker_genes_rd`, and `data_adol_aging_age` holds the age of the same
# samples in the same order -- restore the defining cells and confirm.
missing_inputs <- Filter(function(nm) !exists(nm),
                         c("bulk_2_AD_AG_DEGs_rd", "data_adol_aging_age"))
if (length(missing_inputs) > 0) {
    stop("Run the cell(s) that define: ", paste(missing_inputs, collapse = ", "))
}

# Not used below; kept because they were defined here originally.
nMetrics <- ncol(data_adol_aging_age)
numb_of_traits <- 1

# One test per gene.
gene_ids <- colnames(bulk_2_AD_AG_DEGs_rd)
corr_tests <- lapply(gene_ids, function(gene) {
    cor.test(bulk_2_AD_AG_DEGs_rd[, gene], data_adol_aging_age,
             method = "spearman", exact = FALSE)
})

# 1 x n matrices (one column per gene), the same shape the cell produced before.
pval_list <- matrix(vapply(corr_tests, function(ct) ct$p.value, numeric(1)),
                    nrow = 1, dimnames = list(NULL, gene_ids))
est_list <- matrix(vapply(corr_tests, function(ct) unname(ct$estimate), numeric(1)),
                   nrow = 1, dimnames = list(NULL, gene_ids))

# FDR (Benjamini-Hochberg) over all tested genes; rounded to 2 decimals as before.
all_adj <- round(p.adjust(c(pval_list), method = "fdr", n = length(pval_list)), 2)

# `results`: statistics in rows (p, adj_p, corr), genes in columns.
results <- rbind(pval_list, all_adj, est_list)
rownames(results) <- c("p", "adj_p", "corr")
results <- as.data.frame(results)

# `results_sig`: genes in rows, filtered on the raw p-value.
# NOTE(review): the filter uses `p`, not `adj_p` -- confirm this is intended.
# Genes whose test returned NA are excluded explicitly; a bare `p < 0.05` index would
# insert all-NA rows for them.
results_sig <- as.data.frame(t(results))
is_significant <- !is.na(results_sig$p) & results_sig$p < 0.05
results_sig <- results_sig[is_significant, ]
results_sig$dir_corr <- ifelse(results_sig$corr > 0, "increase", "decrease")
# TODO: include whether this matches the direction of the log2FC, and also indicate
# whether the gene is mentioned in the text.
results_sig$primerid <- rownames(results_sig)
head(results_sig)[1:2, ]

# Identifiers of the significant genes (there is no `Gene` column in results_sig,
# so the previous `results_sig$Gene` was always NULL).
sig <- results_sig$primerid

In [None]:
# Join the DEG table with the age-correlation results; merge() keeps only the genes
# present in both tables.
results_sig_all <- merge(DEGs, results_sig, by = "primerid")

# Direction implied by the sign of `coef`, and whether it agrees with the direction
# of the correlation with age.
results_sig_all$dir_DEGs <- with(results_sig_all,
                                 ifelse(coef > 0, "increase", "decrease"))
results_sig_all$consistent <- with(results_sig_all,
                                   ifelse(dir_corr == dir_DEGs, "Yes", "No"))

# Keep and order the columns that are reported.
kept_columns <- c("primerid", "cluster", "coef", "dir_DEGs", "corr",
                  "dir_corr", "p", "adj_p", "consistent")
results_sig_all <- results_sig_all[, kept_columns]
head(results_sig_all)

In [None]:
# Sort by cluster (still the source file path at this point), then by consistency,
# then by decreasing absolute coefficient.
row_order <- with(results_sig_all, order(cluster, consistent, -abs(coef)))
results_sig_all <- results_sig_all[row_order, ]

# Turn the file path into a cluster label. fixed() makes the suffix a literal string;
# as a regular expression the "." in ".csv" would match any character.
results_sig_all$cluster <- str_replace(results_sig_all$cluster,
                                       fixed("MAST-mixed-sign.csv"), "")
# NOTE(review): re-running this cell prepends "cluster_" a second time.
results_sig_all$cluster <- paste0("cluster_", results_sig_all$cluster)

write.csv(results_sig_all, "correlations_bulk_DEGs.csv")

In [None]:
# Genes whose DEG direction agrees with the direction of the age correlation.
is_consistent <- results_sig_all$consistent == "Yes"
to_plot <- results_sig_all[is_consistent, ]

# Group by cluster and sort by decreasing |coef| (no top-N cut is applied here),
# then re-order by cluster and correlation.
to_plot <- arrange(group_by(to_plot, cluster), desc(abs(coef)))
to_plot <- to_plot[order(to_plot$cluster, to_plot$corr), ]

# Freeze the current row order as the factor level order of the gene ids.
to_plot$primerid <- factor(to_plot$primerid, levels = unique(to_plot$primerid))

# Gene ids split by the sign of the correlation.
to_plot_lines_pos <- to_plot$primerid[to_plot$corr > 0]
to_plot_lines_neg <- to_plot$primerid[to_plot$corr < 0]

In [None]:
# Theme of the single-column heatmap: no grid, background or axis titles;
# italic y-axis labels; x-axis labels shrunk to size 0.
heatmap_theme <- theme(
    strip.text = element_text(size = 5),
    legend.text = element_text(size = 5),
    legend.title = element_text(size = 5),
    legend.key.size = unit(0.3, "cm"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "white"),
    axis.text = element_text(size = 7, color = "black"),
    text = element_text(size = 4),
    strip.background = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 0),
    axis.text.y = element_text(size = 7, vjust = 0.5, hjust = 1, face = "italic"),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
)

# One tile per gene: fill = correlation with age, label = p-value rounded to 3 decimals.
# Inside aes(), `p` is the p-value column of to_plot, not the plot object.
p <- ggplot(to_plot, aes(x = consistent, y = primerid)) +
    geom_tile(aes(fill = corr), color = "black") +
    scale_fill_gradient2(low = "blue", high = "red", limits = c(-1, 1)) +
    heatmap_theme +
    geom_text(data = to_plot,
              aes(x = consistent, y = primerid, label = round(p, 3)),
              size = 1.5, angle = 45) +
    xlab(" ") +
    ylab(" ")
p

# Same figure written to a TIFF file.
tiff("heatmap_consistent_corr_abs_all_italic.tiff", res = 400, units = "in", width = 2, height = 5.3)
p
dev.off()#fac

In [None]:
# Save the table behind the heatmap (relative path, resolved against the current working directory).
write.csv(to_plot,"to_plot_heatmap.csv")

In [None]:
# Expression-vs-age curves for the plotted genes that correlate positively with age.
# NOTE(review): `bulk_2_AD_AG_DEGs_rd` is not created by any cell visible in this notebook.
pos_genes <- to_plot[to_plot$corr > 0, "primerid"]$primerid
is_pos_gene <- colnames(bulk_2_AD_AG_DEGs_rd) %in% pos_genes

# drop = FALSE keeps the matrix shape, and therefore the gene name, when a single gene is selected.
to_plot_lines_pos <- as.data.frame(bulk_2_AD_AG_DEGs_rd[, is_pos_gene, drop = FALSE])
to_plot_lines_pos$Sample <- rownames(to_plot_lines_pos)

# Long format: one row per (sample, gene). The id column is named explicitly
# instead of being guessed by melt().
to_plot_lines_pos <- reshape2::melt(to_plot_lines_pos, id.vars = "Sample")
colnames(to_plot_lines_pos) <- c("Sample", "gene", "expression")
to_plot_lines_pos$gene <- factor(to_plot_lines_pos$gene, levels = unique(to_plot_lines_pos$gene))

# Attach the age of every sample.
age <- data_adol_aging["Age_full_year"]
age$Sample <- rownames(age)
to_plot_lines_pos <- merge(to_plot_lines_pos, age, by = "Sample")

the_plot <- ggplot(to_plot_lines_pos, aes(x = Age_full_year, y = expression)) +
    geom_point(aes(color = gene)) +
    geom_line(aes(group = gene)) +
    theme_minimal() +
    theme(legend.text = element_text(size = 7, face = "italic"),
          legend.title = element_text(size = 8),
          legend.key.size = unit(0.3, "cm"))

# print() draws the plot on the TIFF device explicitly instead of relying on autoprinting.
tiff("postive_corr_consistent_all_talic.tiff", res = 400, units = "in", width = 4, height = 3)
print(the_plot)
dev.off()

In [None]:
# Expression-vs-age curves for the plotted genes that correlate negatively with age.
# NOTE(review): `bulk_2_AD_AG_DEGs_rd` is not created by any cell visible in this notebook.
neg_genes <- to_plot[to_plot$corr < 0, "primerid"]$primerid
is_neg_gene <- colnames(bulk_2_AD_AG_DEGs_rd) %in% neg_genes

# drop = FALSE keeps the matrix shape, and therefore the gene name, when a single gene is selected.
to_plot_lines_neg <- as.data.frame(bulk_2_AD_AG_DEGs_rd[, is_neg_gene, drop = FALSE])
to_plot_lines_neg$Sample <- rownames(to_plot_lines_neg)

# Long format: one row per (sample, gene). The id column is named explicitly
# instead of being guessed by melt().
to_plot_lines_neg <- reshape2::melt(to_plot_lines_neg, id.vars = "Sample")
colnames(to_plot_lines_neg) <- c("Sample", "gene", "expression")
to_plot_lines_neg$gene <- factor(to_plot_lines_neg$gene, levels = unique(to_plot_lines_neg$gene))

# Attach the age of every sample.
age <- data_adol_aging["Age_full_year"]
age$Sample <- rownames(age)
to_plot_lines_neg <- merge(to_plot_lines_neg, age, by = "Sample")

the_plot <- ggplot(to_plot_lines_neg, aes(x = Age_full_year, y = expression)) +
    geom_point(aes(color = gene)) +
    geom_line(aes(group = gene)) +
    theme_minimal() +
    theme(legend.text = element_text(size = 7, face = "italic"),
          legend.title = element_text(size = 8),
          legend.key.size = unit(0.3, "cm"))

# print() draws the plot on the TIFF device explicitly instead of relying on autoprinting.
tiff("negtive_corr_consistent_all_italic.tiff", res = 400, units = "in", width = 5, height = 3)
print(the_plot)
dev.off()

In [None]:
# Count consistent / inconsistent genes per cluster (clusters in rows, "No"/"Yes" in columns).
cluster_by_consistency <- table(results_sig_all$cluster, results_sig_all$consistent)
consistency <- as.data.frame.matrix(cluster_by_consistency)


In [None]:
# Switch to the marker-gene validation directory (placeholder path); the relative reads and
# writes in the following cells resolve against it.
setwd(".../validations/Markers")

In [None]:
# Import the marker-gene table and build the list of distinct marker gene ids.
marker_genes <- read.csv("marker_genes.csv")
marker_genes_primerid <- unique(marker_genes$primerid)

In [None]:
# Check the marker genes for correlation with age: keep only the bulk rows whose gene
# is in the marker-gene list.
is_marker_row <- bulk_2_AD_AG$Gene %in% marker_genes_primerid
bulk_2_AD_AG_marker_genes <- bulk_2_AD_AG[is_marker_row, ]
# If a gene is duplicated, pick the one with the higher expression per sample.

In [None]:
# Collapse duplicated gene rows: for every gene and every sample keep the highest expression
# value among the duplicates, then transpose to a samples x genes matrix.
#
# The previous implementation sorted with `rlang::parse_expr("desc(V3)")`, which orders by
# the constant string stored in V3 rather than by the sample column, so slice(1) kept the
# first duplicate instead of the highest one. It also hard-coded the column range 2:19;
# every column except "Gene" is now treated as a sample column.
if (nrow(bulk_2_AD_AG_marker_genes) == 0) {
    stop("None of the marker genes is present in the bulk table.")
}

gene_column <- bulk_2_AD_AG_marker_genes$Gene
sample_columns <- setdiff(colnames(bulk_2_AD_AG_marker_genes), "Gene")
unique_genes <- unique(gene_column)

# Per-gene maximum of every sample column.
collapsed <- aggregate(bulk_2_AD_AG_marker_genes[sample_columns],
                       by = list(Gene = gene_column),
                       FUN = max)

# aggregate() sorts the genes; restore the order of first appearance.
collapsed <- collapsed[match(unique_genes, collapsed$Gene), , drop = FALSE]
stopifnot(!anyDuplicated(collapsed$Gene))

# Genes become the column names, samples the row names.
rownames(collapsed) <- collapsed$Gene
bulk_2_AD_AG_marker_genes_rd <- t(collapsed[sample_columns])
head(bulk_2_AD_AG_marker_genes_rd)

In [None]:
# Spearman correlation between the bulk expression of every marker gene (column of
# bulk_2_AD_AG_marker_genes_rd) and age, followed by FDR adjustment of the p-values.
#
# NOTE(review): `data_adol_aging_age` is not created by any cell visible in this notebook,
# so this cell cannot run on a fresh kernel. Presumably it holds the age of the samples in
# the row order of `bulk_2_AD_AG_marker_genes_rd` -- restore the defining cell and confirm.
if (!exists("data_adol_aging_age")) {
    stop("Run the cell that defines: data_adol_aging_age")
}

# Not used below; kept because they were defined here originally.
nMetrics <- ncol(data_adol_aging_age)
numb_of_traits <- 1

# One test per gene.
gene_ids <- colnames(bulk_2_AD_AG_marker_genes_rd)
corr_tests <- lapply(gene_ids, function(gene) {
    cor.test(bulk_2_AD_AG_marker_genes_rd[, gene], data_adol_aging_age,
             method = "spearman", exact = FALSE)
})

# 1 x n matrices (one column per gene), the same shape the cell produced before.
pval_list <- matrix(vapply(corr_tests, function(ct) ct$p.value, numeric(1)),
                    nrow = 1, dimnames = list(NULL, gene_ids))
est_list <- matrix(vapply(corr_tests, function(ct) unname(ct$estimate), numeric(1)),
                   nrow = 1, dimnames = list(NULL, gene_ids))

# FDR (Benjamini-Hochberg) over all tested genes; rounded to 2 decimals as before.
all_adj <- round(p.adjust(c(pval_list), method = "fdr", n = length(pval_list)), 2)

# `results`: statistics in rows (p, adj_p, corr), genes in columns.
results <- rbind(pval_list, all_adj, est_list)
rownames(results) <- c("p", "adj_p", "corr")
results <- as.data.frame(results)

# `results_sig`: genes in rows, filtered on the raw p-value.
# NOTE(review): the filter uses `p`, not `adj_p` -- confirm this is intended.
# Genes whose test returned NA are excluded explicitly; a bare `p < 0.05` index would
# insert all-NA rows for them.
results_sig <- as.data.frame(t(results))
is_significant <- !is.na(results_sig$p) & results_sig$p < 0.05
results_sig <- results_sig[is_significant, ]
results_sig$dir_corr <- ifelse(results_sig$corr > 0, "increase", "decrease")
# TODO: include whether this matches the direction of the log2FC, and also indicate
# whether the gene is mentioned in the text.
results_sig$primerid <- rownames(results_sig)
head(results_sig)[1:2, ]

# Identifiers of the significant genes (there is no `Gene` column in results_sig,
# so the previous `results_sig$Gene` was always NULL).
sig <- results_sig$primerid

In [None]:
# Join the marker-gene table with the age-correlation results; merge() keeps only the
# genes present in both tables.
results_sig_all <- merge(marker_genes, results_sig, by = "primerid")

# Direction assigned per cluster: "increase" for cluster 13, "decrease" for all others.
# NOTE(review): the cluster number is hard-coded -- confirm it matches the current clustering.
is_cluster_13 <- results_sig_all$cluster == 13
results_sig_all$dir_proportion <- "decrease"
results_sig_all$dir_proportion[is_cluster_13] <- "increase"

# A gene is consistent when its correlation with age points in the assigned direction.
results_sig_all$consistent <- with(results_sig_all,
                                   ifelse(dir_corr == dir_proportion, "Yes", "No"))

kept_columns <- c("primerid", "cluster", "FC", "dir_proportion", "corr",
                  "dir_corr", "p", "adj_p", "consistent")
results_sig_all <- results_sig_all[, kept_columns]

In [None]:
# Sort by cluster, then by consistency, then by decreasing absolute FC.
row_order <- order(results_sig_all$cluster,
                   results_sig_all$consistent,
                   -abs(results_sig_all$FC))
results_sig_all <- results_sig_all[row_order, ]


In [None]:
# Count consistent / inconsistent marker genes per cluster (clusters in rows, "No"/"Yes" in columns).
cluster_by_consistency <- table(results_sig_all$cluster, results_sig_all$consistent)
consistency <- as.data.frame.matrix(cluster_by_consistency)


In [None]:
# Marker genes whose correlation with age points in the direction assigned to their cluster.
is_consistent <- results_sig_all$consistent == "Yes"
to_plot <- results_sig_all[is_consistent, ]

# Per cluster, keep the 5 rows with the highest FC (top_n keeps ties),
# then re-order by cluster and correlation.
to_plot <- top_n(arrange(group_by(to_plot, cluster), desc(FC)), 5, FC)
to_plot <- to_plot[order(to_plot$cluster, to_plot$corr), ]

# Freeze the current row order as the factor level order of the gene ids.
to_plot$primerid <- factor(to_plot$primerid, levels = unique(to_plot$primerid))

# Gene ids split by the sign of the correlation.
to_plot_lines_pos <- to_plot$primerid[to_plot$corr > 0]
to_plot_lines_neg <- to_plot$primerid[to_plot$corr < 0]

In [None]:
# Theme of the single-column heatmap: no grid, background or axis titles;
# italic y-axis labels; x-axis labels shrunk to size 0.
heatmap_theme <- theme(
    strip.text = element_text(size = 4),
    legend.text = element_text(size = 5),
    legend.title = element_text(size = 5),
    legend.key.size = unit(0.3, "cm"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "white"),
    axis.text = element_text(size = 6, color = "black"),
    text = element_text(size = 4),
    strip.background = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 0),
    axis.text.y = element_text(size = 4.5, vjust = 0.5, hjust = 1, face = "italic"),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
)

# One tile per marker gene: fill = correlation with age, label = p-value rounded to 3 decimals.
# Inside aes(), `p` is the p-value column of to_plot, not the plot object.
p <- ggplot(droplevels(to_plot), aes(x = consistent, y = primerid)) +
    geom_tile(aes(fill = corr), color = "black") +
    scale_fill_gradient2(low = "blue", high = "red", limits = c(-1, 1)) +
    heatmap_theme +
    geom_text(data = droplevels(to_plot),
              aes(x = consistent, y = primerid, label = round(p, 3)),
              size = 1.5, angle = 0) +
    xlab(" ") +
    ylab(" ")
p

# Same figure written to a TIFF file.
tiff("heatmap_consistent_corr_italic.tiff", res = 400, units = "in", width = 1.5, height = 2)
p
dev.off()#fac

In [None]:
# Save the table behind the marker-gene heatmap. The file name is the same as the one used
# for the DEG heatmap table; the path is relative, so it resolves against the working
# directory set by the most recent setwd().
write.csv(to_plot,"to_plot_heatmap.csv")

In [None]:
# Expression-vs-age curves for the plotted marker genes that correlate positively with age.
pos_genes <- to_plot[to_plot$corr > 0, "primerid"]$primerid
is_pos_gene <- colnames(bulk_2_AD_AG_marker_genes_rd) %in% pos_genes

# drop = FALSE keeps the matrix shape, and therefore the gene name, when a single gene is selected.
to_plot_lines_pos <- as.data.frame(bulk_2_AD_AG_marker_genes_rd[, is_pos_gene, drop = FALSE])
to_plot_lines_pos$Sample <- rownames(to_plot_lines_pos)

# Long format: one row per (sample, gene). The id column is named explicitly
# instead of being guessed by melt().
to_plot_lines_pos <- reshape2::melt(to_plot_lines_pos, id.vars = "Sample")
colnames(to_plot_lines_pos) <- c("Sample", "gene", "expression")
to_plot_lines_pos$gene <- factor(to_plot_lines_pos$gene, levels = unique(to_plot_lines_pos$gene))

# Attach the age of every sample.
age <- data_adol_aging["Age_full_year"]
age$Sample <- rownames(age)
to_plot_lines_pos <- merge(to_plot_lines_pos, age, by = "Sample")

the_plot <- ggplot(to_plot_lines_pos, aes(x = Age_full_year, y = expression)) +
    geom_point(aes(color = gene)) +
    geom_line(aes(group = gene)) +
    theme_minimal() +
    theme(legend.text = element_text(size = 7, face = "italic"),
          legend.title = element_text(size = 8),
          legend.key.size = unit(0.3, "cm"))

# print() draws the plot on the TIFF device explicitly instead of relying on autoprinting.
tiff("postive_corr_consistent_italic.tiff", res = 400, units = "in", width = 4, height = 3)
print(the_plot)
dev.off()

In [None]:
# Expression-vs-age curves for the plotted marker genes that correlate negatively with age.
neg_genes <- to_plot[to_plot$corr < 0, "primerid"]$primerid
is_neg_gene <- colnames(bulk_2_AD_AG_marker_genes_rd) %in% neg_genes

# drop = FALSE keeps the matrix shape, and therefore the gene name, when a single gene is selected.
to_plot_lines_neg <- as.data.frame(bulk_2_AD_AG_marker_genes_rd[, is_neg_gene, drop = FALSE])
to_plot_lines_neg$Sample <- rownames(to_plot_lines_neg)

# Long format: one row per (sample, gene). The id column is named explicitly
# instead of being guessed by melt().
to_plot_lines_neg <- reshape2::melt(to_plot_lines_neg, id.vars = "Sample")
colnames(to_plot_lines_neg) <- c("Sample", "gene", "expression")
to_plot_lines_neg$gene <- factor(to_plot_lines_neg$gene, levels = unique(to_plot_lines_neg$gene))

# Attach the age of every sample.
age <- data_adol_aging["Age_full_year"]
age$Sample <- rownames(age)
to_plot_lines_neg <- merge(to_plot_lines_neg, age, by = "Sample")

the_plot <- ggplot(to_plot_lines_neg, aes(x = Age_full_year, y = expression)) +
    geom_point(aes(color = gene)) +
    geom_line(aes(group = gene)) +
    theme_minimal() +
    theme(legend.text = element_text(size = 7, face = "italic"),
          legend.title = element_text(size = 8),
          legend.key.size = unit(0.3, "cm"))

# print() draws the plot on the TIFF device explicitly instead of relying on autoprinting.
tiff("negtive_corr_consistent_italic.tiff", res = 400, units = "in", width = 4, height = 3)
print(the_plot)
dev.off()