In [None]:
library(dplyr)
library(tidyr)
library(tibble)
library(qvalue)
library(data.table)

# Load helper code from a separate script. The enrichment cells below call fisher_test(),
# which is not defined in this notebook -- presumably it comes from this file (confirm).
source("/mnt/lareaulab/reliscu/code/fisher_test.R")

# Relative paths used below (the "data/..." RDS files and the network directory) are
# resolved against this working directory.
setwd("/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM")

Here I perform enrichment analysis to find modules enriched for cell type markers. These modules will later be correlated with exon PSI to find cell type-specific exons.

In [39]:
# Name of the directory (inside the working directory) that holds one sub-directory per network.
# The enrichment cells below read this value as `network_dir`, which was never assigned in this
# notebook; define it here and keep `network` as an alias so code using the old name still works.
# NOTE(review): the working directory is .../mouse_ALM while this name starts with "mouse_ACA" --
# confirm this is the intended network set.
network_dir <- "mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules"
network <- network_dir

### Get DE genes from each cell type: 1 vs. pooled tests

In [None]:
# pooled_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_1_vs_pooled_DE_genes_dream.RDS")

In [None]:
# pval_threshold <- .05/length(pooled_res_list)

# ctype_genes <- lapply(pooled_res_list, function(df) {
#     mask <- (df['adj.P.Val'] < pval_threshold) & (abs(df['logFC']) > 2)
#     df[mask, 1]
# })
# names(ctype_genes) <- names(pooled_res_list) 

In [None]:
# Traverse networks to get cell type enrichments for each module
# NOTE: `ctype_genes` must already exist when this cell runs; the cell that builds it for the
# 1 vs. pooled analysis is currently commented out above.

# Each sub-directory of `network_dir` is treated as one network; directories without any
# files are dropped.
networks <- list.dirs(file.path(getwd(), network_dir), full.names=TRUE, recursive=FALSE)
networks <- networks[lengths(lapply(networks, list.files)) > 0]

# One element per network: a long data frame with one row per module x cell type test,
# or NULL when the kME table yields no modules (rbind() below ignores NULL elements).
network_enrichments <- lapply(seq_along(networks), function(i) {

    # List the directory once and reuse the listing for both file lookups
    network_files <- list.files(networks[i])

    # Require exactly one kME table so a missing/ambiguous match fails with a clear message
    kme_file <- network_files[grep("kME", network_files)]
    if (length(kme_file) != 1) {
        stop("Expected exactly one kME file in ", networks[i], ", found ", length(kme_file))
    }
    kme <- fread(file.path(networks[i], kme_file), data.table=FALSE)

    # Genes are grouped by the column whose name contains "PosFDR"
    # (presumably the module assignment -- confirm against the kME table format)
    mod_col <- grep("PosFDR", colnames(kme))
    mod_genes <- tapply(kme$Gene, kme[,mod_col], list)

    if (length(mod_genes) > 0) {
        # Background set: every gene listed in the kME table
        all_genes <- kme$Gene

        # For each module calculate enrichment for top kME genes
        # fisher_test() is defined outside this cell; its result is stored as `Pval` below,
        # so it is assumed to return a single p-value per gene set -- TODO confirm
        mod_enrichments <- lapply(mod_genes, function(mod) {
            lapply(unlist(lapply(ctype_genes, function(set) {
                fisher_test(set, mod, all=all_genes)
            })), c)
        })

        # The network ID is the name of the network's own directory
        network_id <- basename(networks[i])
        # melt() flattens the nested list (module -> cell type -> value) into long format
        mod_enrichments_df <- reshape2::melt(mod_enrichments)
        colnames(mod_enrichments_df) <- c("Pval", "Cell_type", "Module")

        # Several eigengene matches would otherwise be recycled silently across the rows
        me_file <- network_files[grep("eigengene", network_files)]
        if (length(me_file) != 1) {
            stop("Expected exactly one eigengene file in ", networks[i], ", found ", length(me_file))
        }

        data.frame(
            Network=network_id,
            kME_path=file.path(networks[i], kme_file),
            ME_path=file.path(networks[i], me_file),
            mod_enrichments_df
        )
    }

})
enrichments_df <- do.call(rbind, network_enrichments)
# q-values are computed over all tests from all networks together; use the full element
# name `qvalues` instead of relying on `$` partial matching
enrichments_df$Qval <- qvalue(enrichments_df$Pval)$qvalues

In [None]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, keep the module(s) with the smallest q-value
# (ties are kept, so a cell type can retain several rows, e.g. one per network)
# The result is ungrouped so later operations on top_mods_df are not silently done per cell type

top_mods_df <- enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval, n=1, with_ties=TRUE) %>%
    group_by(Cell_type) %>%
    slice_min(Qval, n=1, with_ties=TRUE) %>%
    filter(Qval < .05) %>%
    arrange(Qval) %>%
    ungroup()

In [None]:
# Display the modules selected in the previous cell
top_mods_df

### Get DE genes from each cell type: pairwise tests

In [3]:
# Load the pairwise DE results (path is relative to the working directory).
# The next cell treats this as a named list of result tables, one per comparison.
pairwise_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_pairwise_DE_genes_dream_processed.RDS")

In [None]:
# The cell type of each comparison is the text before the first "_" in its list name
comparison_ctype <- vapply(strsplit(names(pairwise_res_list), "_"), "[", character(1), 1)
ctypes <- unique(comparison_ctype)

# For every cell type, keep the genes that pass the cutoff in ALL of its comparisons
ctype_genes <- lapply(ctypes, function(target) {
    # Match the parsed label exactly: a regex prefix match ("^target") would also pick up
    # comparisons of any other cell type whose name merely starts with `target`, and would
    # misbehave on labels containing regex metacharacters
    ctype_res_list <- pairwise_res_list[comparison_ctype %in% target]
    pval_threshold <- .05
    # /length(ctype_res_list)
    ctype_genes_list <- lapply(ctype_res_list, function(ctype_res) {
        # which() drops rows with a missing adjusted p-value instead of returning NA entries;
        # column 1 is taken as the gene identifier -- TODO confirm against the RDS contents
        ctype_res[which(ctype_res[["adj.P.Val"]] < pval_threshold), 1]
    })
    Reduce(intersect, ctype_genes_list)
})
names(ctype_genes) <- ctypes

In [None]:
# Traverse networks to get cell type enrichments for each module

# Each sub-directory of `network_dir` is treated as one network; directories without any
# files are dropped.
networks <- list.dirs(file.path(getwd(), network_dir), full.names=TRUE, recursive=FALSE)
networks <- networks[lengths(lapply(networks, list.files)) > 0]

# One element per network: a long data frame with one row per module x cell type test,
# or NULL when the kME table yields no modules (rbind() below ignores NULL elements).
network_enrichments <- lapply(seq_along(networks), function(i) {

    # List the directory once and reuse the listing for both file lookups
    network_files <- list.files(networks[i])

    # Require exactly one kME table so a missing/ambiguous match fails with a clear message
    kme_file <- network_files[grep("kME", network_files)]
    if (length(kme_file) != 1) {
        stop("Expected exactly one kME file in ", networks[i], ", found ", length(kme_file))
    }
    kme <- fread(file.path(networks[i], kme_file), data.table=FALSE)

    # Genes are grouped by the column whose name contains "PosFDR"
    # (presumably the module assignment -- confirm against the kME table format)
    mod_col <- grep("PosFDR", colnames(kme))
    mod_genes <- tapply(kme$Gene, kme[,mod_col], list)

    if (length(mod_genes) > 0) {
        # Background set: every gene listed in the kME table
        all_genes <- kme$Gene

        # For each module calculate enrichment for top kME genes
        # fisher_test() is defined outside this cell; its result is stored as `Pval` below,
        # so it is assumed to return a single p-value per gene set -- TODO confirm
        mod_enrichments <- lapply(mod_genes, function(mod) {
            lapply(unlist(lapply(ctype_genes, function(set) {
                fisher_test(set, mod, all=all_genes)
            })), c)
        })

        # The network ID is the name of the network's own directory
        network_id <- basename(networks[i])
        # melt() flattens the nested list (module -> cell type -> value) into long format
        mod_enrichments_df <- reshape2::melt(mod_enrichments)
        colnames(mod_enrichments_df) <- c("Pval", "Cell_type", "Module")

        # Several eigengene matches would otherwise be recycled silently across the rows
        me_file <- network_files[grep("eigengene", network_files)]
        if (length(me_file) != 1) {
            stop("Expected exactly one eigengene file in ", networks[i], ", found ", length(me_file))
        }

        data.frame(
            Network=network_id,
            kME_path=file.path(networks[i], kme_file),
            ME_path=file.path(networks[i], me_file),
            mod_enrichments_df
        )
    }

})
enrichments_df <- do.call(rbind, network_enrichments)
# q-values are computed over all tests from all networks together; use the full element
# name `qvalues` instead of relying on `$` partial matching
enrichments_df$Qval <- qvalue(enrichments_df$Pval)$qvalues

In [24]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, keep the module(s) with the smallest q-value
# (ties are kept, so a cell type can retain several rows, e.g. one per network)
# The result is ungrouped so later operations on top_mods_df are not silently done per cell type

top_mods_df <- enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval, n=1, with_ties=TRUE) %>%
    group_by(Cell_type) %>%
    slice_min(Qval, n=1, with_ties=TRUE) %>%
    filter(Qval < .05) %>%
    arrange(Qval) %>%
    ungroup()

In [25]:
# Display the modules selected in the previous cell
top_mods_df

Network,kME_path,ME_path,Pval,Cell_type,Module,Qval
<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Bicor-None_signum0.219_minSize10_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize10_merge_ME_0.9_20151/kME_table_08-38-33.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize10_merge_ME_0.9_20151/Module_eigengenes_08-38-33.csv,0,Astro,turquoise,0
Bicor-None_signum0.219_minSize12_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize12_merge_ME_0.9_20151/kME_table_08-46-35.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize12_merge_ME_0.9_20151/Module_eigengenes_08-46-35.csv,0,Astro,turquoise,0
Bicor-None_signum0.219_minSize6_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize6_merge_ME_0.9_20151/kME_table_08-09-19.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize6_merge_ME_0.9_20151/Module_eigengenes_08-09-19.csv,0,Astro,turquoise,0
Bicor-None_signum0.219_minSize8_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize8_merge_ME_0.9_20151/kME_table_08-26-30.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize8_merge_ME_0.9_20151/Module_eigengenes_08-26-30.csv,0,Astro,turquoise,0
Bicor-None_signum0.255_minSize10_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.255_minSize10_merge_ME_0.9_20151/kME_table_07-58-32.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.255_minSize10_merge_ME_0.9_20151/Module_eigengenes_07-58-32.csv,0,Astro,turquoise,0
Bicor-None_signum0.255_minSize12_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.255_minSize12_merge_ME_0.9_20151/kME_table_08-04-54.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.255_minSize12_merge_ME_0.9_20151/Module_eigengenes_08-04-54.csv,0,Astro,turquoise,0
Bicor-None_signum0.255_minSize6_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.255_minSize6_merge_ME_0.9_20151/kME_table_07-34-15.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.255_minSize6_merge_ME_0.9_20151/Module_eigengenes_07-34-15.csv,0,Astro,turquoise,0
Bicor-None_signum0.255_minSize8_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.255_minSize8_merge_ME_0.9_20151/kME_table_07-48-35.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.255_minSize8_merge_ME_0.9_20151/Module_eigengenes_07-48-35.csv,0,Astro,turquoise,0
Bicor-None_signum0.325_minSize10_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.325_minSize10_merge_ME_0.9_20151/kME_table_07-26-22.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.325_minSize10_merge_ME_0.9_20151/Module_eigengenes_07-26-22.csv,0,Astro,turquoise,0
Bicor-None_signum0.325_minSize12_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.325_minSize12_merge_ME_0.9_20151/kME_table_07-30-53.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.325_minSize12_merge_ME_0.9_20151/Module_eigengenes_07-30-53.csv,0,Astro,turquoise,0


### Get DE genes from each cell type: 1 vs. mean analysis

In [49]:
# Load the 1 vs. mean-of-others DE results (path is relative to the working directory).
# The next cell iterates over this object and reuses its names, one result table per element.
mean_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_1_vs_meanOthers_DE_genes_dream.RDS")

In [50]:
# Significance cutoff: .05 divided by the number of result tables in mean_res_list
pval_threshold <- .05/length(mean_res_list)

# Per result table, keep the rows passing both the adjusted p-value cutoff and |logFC| > 5
ctype_genes <- lapply(mean_res_list, function(df) {
    # Extract the columns as plain vectors (not one-column data frames) and use which() so
    # rows with a missing p-value or logFC are dropped instead of yielding NA entries
    keep <- which(df[["adj.P.Val"]] < pval_threshold & abs(df[["logFC"]]) > 5)
    # Column 1 is taken as the gene identifier -- TODO confirm against the RDS contents
    df[keep, 1]
})
names(ctype_genes) <- names(mean_res_list)

In [51]:
# Number of selected genes per element of ctype_genes
lengths(ctype_genes)

In [None]:
# Traverse networks to get cell type enrichments for each module

# Each sub-directory of `network_dir` is treated as one network; directories without any
# files are dropped.
networks <- list.dirs(file.path(getwd(), network_dir), full.names=TRUE, recursive=FALSE)
networks <- networks[lengths(lapply(networks, list.files)) > 0]

# One element per network: a long data frame with one row per module x cell type test,
# or NULL when the kME table yields no modules (rbind() below ignores NULL elements).
network_enrichments <- lapply(seq_along(networks), function(i) {

    # List the directory once and reuse the listing for both file lookups
    network_files <- list.files(networks[i])

    # Require exactly one kME table so a missing/ambiguous match fails with a clear message
    kme_file <- network_files[grep("kME", network_files)]
    if (length(kme_file) != 1) {
        stop("Expected exactly one kME file in ", networks[i], ", found ", length(kme_file))
    }
    kme <- fread(file.path(networks[i], kme_file), data.table=FALSE)

    # Genes are grouped by the column whose name contains "PosFDR"
    # (presumably the module assignment -- confirm against the kME table format)
    mod_col <- grep("PosFDR", colnames(kme))
    mod_genes <- tapply(kme$Gene, kme[,mod_col], list)

    if (length(mod_genes) > 0) {
        # Background set: every gene listed in the kME table
        all_genes <- kme$Gene

        # For each module calculate enrichment for top kME genes
        # fisher_test() is defined outside this cell; its result is stored as `Pval` below,
        # so it is assumed to return a single p-value per gene set -- TODO confirm
        mod_enrichments <- lapply(mod_genes, function(mod) {
            lapply(unlist(lapply(ctype_genes, function(set) {
                fisher_test(set, mod, all=all_genes)
            })), c)
        })

        # The network ID is the name of the network's own directory
        network_id <- basename(networks[i])
        # melt() flattens the nested list (module -> cell type -> value) into long format
        mod_enrichments_df <- reshape2::melt(mod_enrichments)
        colnames(mod_enrichments_df) <- c("Pval", "Cell_type", "Module")

        # Several eigengene matches would otherwise be recycled silently across the rows
        me_file <- network_files[grep("eigengene", network_files)]
        if (length(me_file) != 1) {
            stop("Expected exactly one eigengene file in ", networks[i], ", found ", length(me_file))
        }

        data.frame(
            Network=network_id,
            kME_path=file.path(networks[i], kme_file),
            ME_path=file.path(networks[i], me_file),
            mod_enrichments_df
        )
    }

})
enrichments_df <- do.call(rbind, network_enrichments)
# q-values are computed over all tests from all networks together; use the full element
# name `qvalues` instead of relying on `$` partial matching
enrichments_df$Qval <- qvalue(enrichments_df$Pval)$qvalues

In [None]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, keep the module(s) with the smallest q-value
# (ties are kept, so a cell type can retain several rows, e.g. one per network)
# The result is ungrouped so later operations on top_mods_df are not silently done per cell type

top_mods_df <- enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval, n=1, with_ties=TRUE) %>%
    group_by(Cell_type) %>%
    slice_min(Qval, n=1, with_ties=TRUE) %>%
    filter(Qval < .05) %>%
    arrange(Qval) %>%
    ungroup()