### Differential Gene Expression Analysis (DGE)

**Notes on Running the Notebook:**
1) Make sure that you have R installed before running this notebook.
2) The required libraries that need to be installed are listed in the cell below.

DGE is performed with the limma package on:
1) real bulk data (not batch-corrected)
2) bootstrapped bulk samples
3) the bulk expression per cell type derived by deconvolution.

In [10]:
# ref: https://ucdavis-bioinformatics-training.github.io/2018-June-RNA-Seq-Workshop/thursday/DE.html
library(edgeR) # this will load limma as well
library(dplyr)

In [2]:
############################
# create the design matrix #
############################

# Differential expression of DPD vs H samples with limma.
#
# Args:
#   bulk:      genes x samples expression table (matrix or data frame). Values
#              are log2(x + 1)-transformed before fitting.
#   cond_tags: one condition label per column of `bulk`; every label must be
#              "DPD" or "H" and both conditions must be present.
#
# Returns:
#   The topTable() data frame for the DPD - H contrast (all genes, BH
#   adjustment), without genes whose AveExpr is exactly 0, and with P.Value
#   and adj.P.Val halved (see the note in the body).
DGE_limma <- function(bulk, cond_tags){
    cond_tags <- as.character(cond_tags)

    # Validate up front: a mismatch would otherwise surface as an obscure
    # model-fitting error, and unexpected labels would be silently renamed.
    if (is.null(dim(bulk)) || length(cond_tags) != ncol(bulk)) {
        stop("cond_tags must contain one label per column of bulk", call. = FALSE)
    }
    if (!all(cond_tags %in% c("DPD", "H"))) {
        stop("cond_tags may only contain \"DPD\" and \"H\"", call. = FALSE)
    }
    if (!all(c("DPD", "H") %in% cond_tags)) {
        stop("cond_tags must contain both \"DPD\" and \"H\" samples", call. = FALSE)
    }

    # Fix the level order explicitly so the design columns are guaranteed to be
    # (DPD, H) instead of relying on alphabetical ordering of the labels.
    condition <- factor(cond_tags, levels = c("DPD", "H"))
    design <- model.matrix(~0 + condition)
    colnames(design) <- levels(condition)

    # the input should be counts but for now it is rpkm and some steps of the documentation will be skipped
    fit <- lmFit(log2(bulk + 1), design)
    contrast.matrix <- makeContrasts(DPD-H, levels=design) # DPD vs H (affected vs non-affected)
    fit2 <- contrasts.fit(fit, contrast.matrix)
    fit2 <- eBayes(fit2, trend=TRUE)

    # number = Inf lists every gene
    de_df <- topTable(fit2, coef=1, adjust.method="BH", number=Inf)
    de_df <- de_df[de_df$AveExpr != 0, ]

    # topTable() reports two-sided p-values; both the raw and the BH-adjusted
    # values are halved here, regardless of the sign of logFC.
    # NOTE(review): halving is only a valid one-sided p-value for a direction
    # fixed in advance - confirm this is the intended test.
    de_df$P.Value <- de_df$P.Value/2
    de_df$adj.P.Val <- de_df$adj.P.Val/2
    de_df
}

# named list collecting one DE result table per data set / cell type
de_dfs <- list()

### DGE for real bulk data (w/o batch correction)

In [3]:
# read in the real bulk data (w/o batch correction), one table per condition
dir_real <- "dpd_results/bulk_preprocess/"
dpd_bulk_real <- read.table(paste0(dir_real, "dpd_bulk_real_notBC"), header = TRUE, sep = "\t")
h_bulk_real <- read.table(paste0(dir_real, "h_bulk_real_notBC"), header = TRUE, sep = "\t")

# set ensid as rownames, then drop the id column so only sample columns remain
# (drop = FALSE keeps a data frame, and its row names, even with one sample)
rownames(dpd_bulk_real) <- dpd_bulk_real$EnsID
dpd_bulk_real <- dpd_bulk_real[ , !(names(dpd_bulk_real) %in% c("EnsID")), drop = FALSE]

rownames(h_bulk_real) <- h_bulk_real$EnsID
h_bulk_real <- h_bulk_real[ , !(names(h_bulk_real) %in% c("EnsID")), drop = FALSE]

In [4]:
# bind_cols() joins the two tables by row position, so make sure both list the
# same genes in the same order before combining them.
stopifnot(identical(rownames(dpd_bulk_real), rownames(h_bulk_real)))
bulk_real <- bind_cols(dpd_bulk_real, h_bulk_real) # bind_cols from dplyr

# finer tags: the first four characters of every sample column name
inds_tags <- c(substr(colnames(dpd_bulk_real), 1, 4), substr(colnames(h_bulk_real), 1, 4))
table(inds_tags)

# coarser tags: one condition label per sample column, DPD columns first
cond_tags <- rep(c("DPD", "H"), times = c(ncol(dpd_bulk_real), ncol(h_bulk_real)))
table(cond_tags)

de_df <- DGE_limma(bulk_real, cond_tags)
# de_df <- de_df %>% rename_all(paste0, "_real")
de_dfs$real <- de_df

inds_tags
DPD1 DPD2 DPD3 DPD4 DPD6 
  32   24   24   24   16 

cond_tags
DPD   H 
 72  48 

### DGE for bootstrapped bulk data (generated from single cell data)

In [5]:
# read in the bootstrapped bulk data (the *_bs files)
# NOTE: the dpd_bulk_real / h_bulk_real variables are reused here, so the
# tables they held before this cell are overwritten.
dpd_bulk_real <- read.table(paste0(dir_real, "dpd_bulk_bs"), header = TRUE, sep = "\t")
h_bulk_real <- read.table(paste0(dir_real, "h_bulk_bs"), header = TRUE, sep = "\t")

# set ensid as rownames, then drop the id column
# (drop = FALSE keeps a data frame, and its row names, even with one sample)
rownames(dpd_bulk_real) <- dpd_bulk_real$EnsID
dpd_bulk_real <- dpd_bulk_real[ , !(names(dpd_bulk_real) %in% c("EnsID")), drop = FALSE]

rownames(h_bulk_real) <- h_bulk_real$EnsID
h_bulk_real <- h_bulk_real[ , !(names(h_bulk_real) %in% c("EnsID")), drop = FALSE]

# bind_cols() joins by row position: both tables must list the same genes in
# the same order, otherwise expression values end up on the wrong gene.
stopifnot(identical(rownames(dpd_bulk_real), rownames(h_bulk_real)))
bulk_real_bs <- bind_cols(dpd_bulk_real, h_bulk_real) # bind_cols from dplyr

# one condition label per sample column, DPD columns first
cond_tags <- rep(c("DPD", "H"), times = c(ncol(dpd_bulk_real), ncol(h_bulk_real)))
table(cond_tags)

de_df <- DGE_limma(bulk_real_bs, cond_tags)
# de_df <- de_df %>% rename_all(paste0, "_realBC")
de_dfs$BS <- de_df

cond_tags
DPD   H 
103 103 

### DGE for deconvoluted bulk data per cell type

In [6]:
ct_names <- c("CN_H", "AS_H", "CBC", "BRC", "INTER", "PGC", "11", "NEU", "NEC")
# the directory contains the bulk expression per cell type after deconvolution with BEDwARS
dir_ct <- "dpd_data/BEDwARS_bulk_dpdct/"
# combined (DPD + H) expression table per cell type, keyed by cell-type name
bulk_ct <- list()
for (ct in ct_names){
    dpd_bulk <- read.table(paste0(dir_ct, "bulk_dpd_", ct), header = TRUE, sep = "\t")
    h_bulk <- read.table(paste0(dir_ct, "bulk_h_", ct), header = TRUE, sep = "\t")

    # set ensid as rownames, then drop the id column
    # (drop = FALSE keeps a data frame, and its row names, even with one sample)
    rownames(dpd_bulk) <- dpd_bulk$EnsID
    dpd_bulk <- dpd_bulk[ , !(names(dpd_bulk) %in% c("EnsID")), drop = FALSE]

    rownames(h_bulk) <- h_bulk$EnsID
    h_bulk <- h_bulk[ , !(names(h_bulk) %in% c("EnsID")), drop = FALSE]

    # bind_cols() joins by row position: both tables must list the same genes
    # in the same order, otherwise expression values end up on the wrong gene.
    stopifnot(identical(rownames(dpd_bulk), rownames(h_bulk)))
    bulk <- bind_cols(dpd_bulk, h_bulk) # bind_cols from dplyr

    # one condition label per sample column, DPD columns first
    cond_tags <- rep(c("DPD", "H"), times = c(ncol(dpd_bulk), ncol(h_bulk)))
    # values are not auto-printed inside a for loop, so print explicitly
    print(table(cond_tags))

    de_df <- DGE_limma(bulk, cond_tags)

    de_dfs[[ct]] <- de_df
    bulk_ct[[ct]] <- bulk

}

In [7]:
# Pairs of cell types whose deconvolved bulk tables are summed element-wise and
# analysed as one merged cell type (name of the merged type -> its two parts).
merged_cts <- list(CN_NEU = c("CN_H", "NEU"), CBC_INTER = c("CBC", "INTER"))
for (merged in names(merged_cts)){
    parts <- merged_cts[[merged]]
    missing_cts <- setdiff(parts, names(bulk_ct))
    if (length(missing_cts) > 0) {
        stop("bulk_ct has no entry for: ", paste(missing_cts, collapse = ", "), call. = FALSE)
    }

    bulk_ct[[merged]] <- bulk_ct[[parts[1]]] + bulk_ct[[parts[2]]]

    # cond_tags is not rebuilt in this cell: it still holds whatever was
    # assigned last before it ran, so check that it matches the merged table.
    if (ncol(bulk_ct[[merged]]) != length(cond_tags)) {
        stop("cond_tags does not match the sample columns of ", merged, call. = FALSE)
    }

    de_df <- DGE_limma(bulk_ct[[merged]], cond_tags)
    de_dfs[[merged]] <- de_df
}

In [8]:
# Collect logFC and adj.P.Val of every analysis into one wide table.
# cbind() joins by row position, so every table is first re-indexed on the same
# sorted set of gene ids (the union over all analyses); a gene that is absent
# from one analysis gets NA there instead of shifting the rows below it.
cols <- c("logFC", "adj.P.Val")
all_genes <- sort(unique(unlist(lapply(de_dfs, rownames), use.names = FALSE)))
de_df_all <- lapply(de_dfs, function(df) {
    # match() is exact; character row indexing would partially match row names
    aligned <- df[match(all_genes, rownames(df)), cols]
    rownames(aligned) <- all_genes
    aligned
})
de_df_merge <- do.call("cbind", de_df_all)

In [9]:
# Make sure the output folder exists (no warning if it already does), then
# save the merged DGE table as a tab-separated file with gene ids as row names.
dir.create("dpd_results/bulk_DGE", showWarnings = FALSE, recursive = TRUE)
write.table(
    de_df_merge,
    file = "dpd_results/bulk_DGE/DGE_all",
    sep = "\t",
    quote = FALSE,
    row.names = TRUE,
    col.names = TRUE
)