In [145]:
library(dplyr)
library(tidyr)
library(tibble)
library(qvalue)
library(data.table)

source("/mnt/lareaulab/reliscu/code/fisher_test.R")

setwd("/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM")

Here I perform enrichment analysis to find modules enriched for cell type markers. 

These modules will later be used to correlate with exon PSI to define cell type-specific exons.

In [146]:
get_module_enrichments <- function(network_dir, ctype_genes, mod_def="PosFDR") {
    # Traverse networks to get cell type enrichments for each module
    networks <- list.dirs(file.path(getwd(), network_dir), full.names=TRUE, recursive=FALSE)
    networks <- networks[lengths(lapply(networks, list.files)) > 0]

    enrichments_list <- lapply(seq_along(networks), function(i) {
        kME_path <- list.files(networks[i])[grep("kME", list.files(networks[i]))]
        kME <- fread(file.path(networks[i], kME_path), data.table=FALSE)
        mod_col <- grep("PosFDR", colnames(kME))
        mod_genes <- tapply(kME$Gene, kME[,mod_col], list)
        
        if (length(mod_genes) > 0) {
            all_genes <- kME$Gene

            # For each module: calculate enrichment for DE genes from each cell type
            mod_enrichments_list <- lapply(mod_genes, function(mod) {
                lapply(unlist(lapply(ctype_genes, function(set) {
                    fisher_test(set, mod, all=all_genes)
                })), c)
            })
            
            # Save the network module came from
            network_id <- sapply(strsplit(networks[i], "/"), function(x) x[length(x)])
            mod_enrichments_df <- reshape2::melt(mod_enrichments_list)
            colnames(mod_enrichments_df) <- c("Pval", "Cell_type", "Module")
            
            # Save path to module eigengenes table for downstream analyses
            ME_path <- list.files(networks[i])[grep("eigengene", list.files(networks[i]))]
            
            data.frame(
                Network=network_id,
                kME_path=file.path(networks[i], kME_path),
                ME_path=file.path(networks[i], ME_path),
                mod_enrichments_df
            )
        }
    })
    enrichments_df <- do.call(rbind, enrichments_list)
    enrichments_df$Qval <- qvalue(enrichments_df$Pval)$qvalue

    enrichments_df
}

In [177]:
ME_df[,-1, drop=FALSE]

ERROR: Error: object 'ME_df' not found


In [147]:
network_dir <- "mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules"

### Get DE genes frome each cell type: 1 vs. pooled tests

In [148]:
pooled_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_1_vs_pooled_DE_genes_dream.RDS")

In [149]:
pval_threshold <- .05/length(pooled_res_list)
lfc_threshold <- 6

pooled_ctype_genes <- lapply(pooled_res_list, function(df) {
    mask <- (df['adj.P.Val'] < pval_threshold) & (abs(df['logFC']) > lfc_threshold)
    df[mask, 1]
})
names(pooled_ctype_genes) <- names(pooled_res_list) 

In [150]:
all_genes <- unlist(pooled_ctype_genes)
duplicates <- unique(names(table(all_genes)[table(all_genes) > 1]))

# Remove markers that are not unique to a cell type

pooled_ctype_genes <- lapply(pooled_ctype_genes, function(x) x[!(x %in% duplicates)])

In [151]:
data.frame(No.genes=lengths(pooled_ctype_genes)) %>% arrange(-`No.genes`)

Unnamed: 0_level_0,No.genes
Unnamed: 0_level_1,<int>
Peri,1568
Macrophage,395
Endo,304
Astro,282
VLMC,136
SMC,129
Oligo,105
Meis2,83
CR,61
L4,27


In [152]:
pooled_enrichments_df <- get_module_enrichments(network_dir, pooled_ctype_genes)

In [153]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, choose module with smallest p-value

pooled_top_mods_df <- pooled_enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    arrange(Network) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    arrange(Qval)

In [154]:
pooled_top_mods_df[,c("Cell_type", "Pval", "Qval", "Module", "Network")]

Cell_type,Pval,Qval,Module,Network
<chr>,<dbl>,<dbl>,<chr>,<chr>
Macrophage,1.365305e-255,1.049141e-250,blue,Bicor-None_signum0.794_minSize10_merge_ME_0.9_20151
Endo,1.225475e-241,1.569487e-237,blue,Bicor-None_signum0.325_minSize10_merge_ME_0.9_20151
Astro,1.2742620000000001e-204,3.415631e-201,turquoise,Bicor-None_signum0.325_minSize4_merge_ME_0.9_20151
Oligo,2.647845e-134,2.712912e-131,green,Bicor-None_signum0.325_minSize10_merge_ME_0.9_20151
VLMC,1.272234e-122,1.1237040000000001e-119,tan,Bicor-None_signum0.794_minSize3_merge_ME_0.9_20151
SMC,1.3595549999999999e-77,8.853582e-75,black,Bicor-None_signum0.794_minSize6_merge_ME_0.9_20151
Peri,9.299967999999999e-58,4.7015620000000005e-55,white,Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151
NP,1.671897e-37,7.097986e-35,brown,Bicor-None_signum0.255_minSize6_merge_ME_0.9_20151
L6_CT,5.362766e-18,1.753579e-15,darkgreen,Bicor-None_signum0.325_minSize4_merge_ME_0.9_20151
Sncg,1.314519e-15,3.713662e-13,brown2,Bicor-None_signum0.255_minSize10_merge_ME_0.9_20151


### Get DE genes frome each cell type: pairwise tests

In [155]:
pairwise_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_pairwise_DE_genes_dream.RDS")

In [156]:
ctypes <- unique(sapply(strsplit(names(pairwise_res_list), "_vs_"), "[", 1))

pairwise_ctype_genes <- lapply(ctypes, function(target) {
    # Subset to pairwise tests with target cell type
    ctype_res_list <- pairwise_res_list[grep(paste0("^", target), names(pairwise_res_list))]

    # For each pairwise test, return genes that meet p-value threshold:
    pval_threshold <- .05 /length(ctype_res_list)
    ctype_genes_list <- lapply(ctype_res_list, function(df) {
        mask <- df$adj.P.Val < pval_threshold
        df[mask, 1]
    }) 

    # Restrict to genes that were identified in EVERY pairwise test
    Reduce(intersect, ctype_genes_list)
})
names(pairwise_ctype_genes) <- ctypes

In [157]:
data.frame(No.genes=lengths(pairwise_ctype_genes)) %>% arrange(-`No.genes`)

Unnamed: 0_level_0,No.genes
Unnamed: 0_level_1,<int>
Macrophage,333
Astro,288
Endo,237
Oligo,131
VLMC,85
SMC,60
Pvalb,31
Lamp5,22
CR,21
Sst,21


In [158]:
pairwise_ctype_genes <- pairwise_ctype_genes[lengths(pairwise_ctype_genes) > 0]

In [159]:
all_genes <- unlist(pairwise_ctype_genes)
duplicates <- unique(names(table(all_genes)[table(all_genes) > 1]))

# Remove markers that are not unique to a cell type

pairwise_ctype_genes <- lapply(pairwise_ctype_genes, function(x) x[!(x %in% duplicates)])

In [160]:
pairwise_ctype_genes <- pairwise_ctype_genes[lengths(pairwise_ctype_genes) > 0]

In [161]:
pairwise_enrichments_df <- get_module_enrichments(network_dir, pairwise_ctype_genes)

In [162]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, choose module with smallest p-value

pairwise_top_mods_df <- pairwise_enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    arrange(Network) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    arrange(Qval)

    # Get most enriched cell type for each module

In [163]:
pairwise_top_mods_df[,c("Cell_type", "Pval", "Qval", "Module", "Network")]

Cell_type,Pval,Qval,Module,Network
<chr>,<dbl>,<dbl>,<chr>,<chr>
Astro,0.0,0.0,turquoise,Bicor-None_signum0.219_minSize10_merge_ME_0.9_20151
Endo,0.0,0.0,blue,Bicor-None_signum0.219_minSize12_merge_ME_0.9_20151
Macrophage,0.0,0.0,blue,Bicor-None_signum0.794_minSize10_merge_ME_0.9_20151
Oligo,2.121824e-204,1.856647e-201,green,Bicor-None_signum0.325_minSize10_merge_ME_0.9_20151
VLMC,4.27102e-139,2.853896e-136,tan,Bicor-None_signum0.794_minSize3_merge_ME_0.9_20151
SMC,1.527776e-95,9.516493e-93,black,Bicor-None_signum0.794_minSize6_merge_ME_0.9_20151
NP,5.7149080000000004e-27,2.258372e-24,brown,Bicor-None_signum0.325_minSize4_merge_ME_0.9_20151
Pvalb,7.313579000000001e-27,2.874667e-24,midnightblue,Bicor-None_signum0.325_minSize12_merge_ME_0.9_20151
Sncg,1.426546e-22,5.016938e-20,brown2,Bicor-None_signum0.255_minSize10_merge_ME_0.9_20151
L6_IT,1.11806e-18,3.542225e-16,yellowgreen,Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151


### Get DE genes frome each cell type: 1 vs. mean analysis

In [164]:
mean_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_1_vs_meanOthers_DE_genes_dream.RDS")

In [165]:
pval_threshold <- .05/length(mean_res_list)
lfc_threshold <- 6

mean_ctype_genes <- lapply(mean_res_list, function(df) {
    mask <- (df['adj.P.Val'] < pval_threshold) & (abs(df['logFC']) > lfc_threshold)
    df[mask, 1]
})
names(mean_ctype_genes) <- names(mean_res_list) 

In [166]:
all_genes <- unlist(mean_ctype_genes)
duplicates <- unique(names(table(all_genes)[table(all_genes) > 1]))

# Remove markers that are not unique to a cell type

mean_ctype_genes <- lapply(mean_ctype_genes, function(x) x[!(x %in% duplicates)])

In [167]:
data.frame(No.genes=lengths(mean_ctype_genes)) %>% arrange(-`No.genes`)

Unnamed: 0_level_0,No.genes
Unnamed: 0_level_1,<int>
Peri,1279
Macrophage,355
Endo,325
Astro,213
Oligo,119
VLMC,110
SMC,94
CR,31
Meis2,30
NP,25


In [168]:
mean_enrichments_df <- get_module_enrichments(network_dir, mean_ctype_genes)

In [169]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, choose module with smallest p-value

mean_top_mods_df <- mean_enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    arrange(Network) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    arrange(Qval)

In [170]:
mean_top_mods_df[,c("Cell_type", "Pval", "Qval", "Module", "Network")]

Cell_type,Pval,Qval,Module,Network
<chr>,<dbl>,<dbl>,<chr>,<chr>
Macrophage,1.042092e-284,8.007748999999999e-280,blue,Bicor-None_signum0.794_minSize10_merge_ME_0.9_20151
Astro,8.465823999999999e-259,1.084232e-254,turquoise,Bicor-None_signum0.325_minSize4_merge_ME_0.9_20151
Endo,2.425537e-237,1.035475e-233,blue,Bicor-None_signum0.325_minSize10_merge_ME_0.9_20151
Oligo,8.625306e-145,7.89041e-142,green,Bicor-None_signum0.919_minSize3_merge_ME_0.9_20151
VLMC,3.308877e-132,2.676463e-129,tan,Bicor-None_signum0.794_minSize3_merge_ME_0.9_20151
SMC,4.097873e-88,2.668583e-85,black,Bicor-None_signum0.794_minSize6_merge_ME_0.9_20151
Peri,7.1544870000000006e-68,3.983857e-65,white,Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151
NP,3.448277e-45,1.625619e-42,brown,Bicor-None_signum0.255_minSize6_merge_ME_0.9_20151
L6_CT,2.038744e-25,7.495848e-23,tan,Bicor-None_signum0.586_minSize6_merge_ME_0.9_20151
L6_IT,1.1125869999999999e-24,4.071167e-22,yellowgreen,Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151


### Compare enrichment results

In [171]:
pooled_top_mods_df$Network_short <- gsub("Bicor-None_", "", gsub("_merge_ME_0.9_20151", "", pooled_top_mods_df$Network))
pairwise_top_mods_df$Network_short <- gsub("Bicor-None_", "", gsub("_merge_ME_0.9_20151", "", pairwise_top_mods_df$Network))
mean_top_mods_df$Network_short <- gsub("Bicor-None_", "", gsub("_merge_ME_0.9_20151", "", mean_top_mods_df$Network))

cols <- c("Cell_type", "Qval", "Module", "Network_short")

all_top_mods_df <- merge(
    merge(
        pooled_top_mods_df[,cols], 
        pairwise_top_mods_df[,cols], 
        by="Cell_type", all=TRUE
    ), 
    mean_top_mods_df[,cols], by="Cell_type", all=TRUE
)

In [172]:
all_top_mods_df <- all_top_mods_df[,order(colnames(all_top_mods_df))]
all_top_mods_df

Cell_type,Module,Module.x,Module.y,Network_short,Network_short.x,Network_short.y,Qval,Qval.x,Qval.y
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
Astro,turquoise,turquoise,turquoise,signum0.325_minSize4,signum0.325_minSize4,signum0.219_minSize10,1.084232e-254,3.415631e-201,0.0
CR,tomato,sienna2,mistyrose1,signum0.586_minSize4,signum0.255_minSize10,signum0.219_minSize6,0.02756652,0.01780916,0.0005423606
Endo,blue,blue,blue,signum0.325_minSize10,signum0.325_minSize10,signum0.219_minSize12,1.035475e-233,1.569487e-237,0.0
L2_3_IT,greenyellow,lightgreen,grey60,signum0.794_minSize4,signum0.219_minSize6,signum0.586_minSize4,2.715087e-13,1.041691e-08,2.465739e-12
L4,salmon2,darkmagenta,,signum0.219_minSize8,signum0.255_minSize12,,0.0001168112,0.0120129,
L5_IT,mediumpurple4,darkseagreen4,mediumpurple4,signum0.325_minSize6,signum0.255_minSize8,signum0.325_minSize8,1.622267e-08,0.000131413,1.379342e-11
L5_PT,salmon,salmon,salmon,signum0.325_minSize6,signum0.325_minSize4,signum0.325_minSize4,1.098908e-06,4.183055e-08,2.011463e-05
L6_CT,tan,darkgreen,darkgreen,signum0.586_minSize6,signum0.325_minSize4,signum0.325_minSize4,7.495848e-23,1.753579e-15,9.246279e-14
L6_IT,yellowgreen,darkred,yellowgreen,signum0.586_minSize8,signum0.794_minSize3,signum0.586_minSize8,4.071167e-22,3.99335e-10,3.542225e-16
L6b,darkseagreen4,,lavenderblush2,signum0.586_minSize3,,signum0.325_minSize4,6.582637e-05,,1.033497e-06


### Select the module from the lowest Qval per cell type

In [173]:
# Select the module from the lowest Qval per cell type

pooled_top_mods_df$DE_Test <- "Pooled"
pairwise_top_mods_df$DE_Test <- "Pairwise"
mean_top_mods_df$DE_Test <- "Mean"

all_top_mods_df <- rbind(pooled_top_mods_df, pairwise_top_mods_df, mean_top_mods_df)

In [174]:
top_qval_mods_df <- all_top_mods_df %>%
    group_by(Cell_type) %>%
    slice_min(Qval)

top_qval_mods_df[,cols]

Cell_type,Qval,Module,Network_short
<chr>,<dbl>,<chr>,<chr>
Astro,0.0,turquoise,signum0.219_minSize10
CR,0.0005423606,mistyrose1,signum0.219_minSize6
Endo,0.0,blue,signum0.219_minSize12
L2_3_IT,2.715087e-13,greenyellow,signum0.794_minSize4
L4,0.0001168112,salmon2,signum0.219_minSize8
L5_IT,1.379342e-11,mediumpurple4,signum0.325_minSize8
L5_PT,4.183055e-08,salmon,signum0.325_minSize4
L6_CT,7.495848e-23,tan,signum0.586_minSize6
L6_IT,4.071167e-22,yellowgreen,signum0.586_minSize8
L6b,1.033497e-06,lavenderblush2,signum0.325_minSize4


In [175]:
write.csv(top_qval_mods_df, file="data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_DE_genes_dream_unique_markers_top_Qval_module.csv", row.names=FALSE, quote=FALSE)