In [1]:
library(dplyr)
library(tidyr)
library(tibble)
library(qvalue)
library(data.table)

source("/mnt/lareaulab/reliscu/code/fisher_test.R")

setwd("/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




Here I perform an enrichment analysis to identify modules enriched for cell type markers. These modules will later be correlated with exon PSI to find cell type-specific exons.

In [2]:
# Directory (relative to the working directory) whose subdirectories are the networks to scan.
# NOTE(review): the name says "mouse_ACA" while the working directory is ".../mouse_ALM" -- confirm this is the intended network set.
network_dir <- "mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules"

In [3]:
# Test every module of every network under `network_dir` for enrichment of
# each cell type's gene set.
#
# Args:
#   network_dir: Directory, relative to the working directory, whose immediate
#     subdirectories each hold one network's output files (a "kME" table and
#     an "eigengene" table). Empty subdirectories are skipped.
#   ctype_genes: Named list of gene sets (one per cell type); each set is
#     passed to fisher_test() together with a module's genes.
#   mod_def: Pattern identifying the kME table column that holds the module
#     assignments (default "PosFDR").
#
# Returns:
#   A data frame with one row per network/module/cell type combination and
#   columns Network, kME_path, ME_path, Pval, Cell_type, Module and Qval,
#   where Qval is computed by qvalue() over all Pval values.
get_module_enrichments <- function(network_dir, ctype_genes, mod_def="PosFDR") {
    # Traverse networks to get cell type enrichments for each module
    networks <- list.dirs(file.path(getwd(), network_dir), full.names=TRUE, recursive=FALSE)
    networks <- networks[lengths(lapply(networks, list.files)) > 0]

    enrichments_list <- lapply(networks, function(network) {
        # List the directory once and reuse it for both file lookups
        network_files <- list.files(network)
        kME_path <- network_files[grep("kME", network_files)]
        kME <- fread(file.path(network, kME_path), data.table=FALSE)

        # Honor the requested module definition instead of a hard-coded column
        mod_col <- grep(mod_def, colnames(kME))
        if (length(mod_col) != 1) {
            stop(
                "Expected exactly one kME column matching '", mod_def, "' in ",
                file.path(network, kME_path), " but found ", length(mod_col),
                call.=FALSE
            )
        }
        mod_genes <- tapply(kME$Gene, kME[,mod_col], list)

        if (length(mod_genes) > 0) {
            # All genes in the kME table serve as the background set
            all_genes <- kME$Gene

            # For each module: calculate enrichment for DE genes from each cell type
            mod_enrichments_list <- lapply(mod_genes, function(mod) {
                ctype_results <- unlist(lapply(ctype_genes, function(set) {
                    fisher_test(set, mod, all=all_genes)
                }))
                # One list element per cell type so melt() emits one row each
                lapply(ctype_results, c)
            })

            # melt() yields value / inner name (cell type) / outer name (module)
            mod_enrichments_df <- reshape2::melt(mod_enrichments_list)
            colnames(mod_enrichments_df) <- c("Pval", "Cell_type", "Module")

            # Save path to module eigengenes table for downstream analyses
            ME_path <- network_files[grep("eigengene", network_files)]

            data.frame(
                # Save the network module came from (last path component)
                Network=basename(network),
                kME_path=file.path(network, kME_path),
                ME_path=file.path(network, ME_path),
                mod_enrichments_df
            )
        }
    })

    # Networks without modules contribute NULL; fail clearly if nothing is left
    enrichments_list <- Filter(Negate(is.null), enrichments_list)
    if (length(enrichments_list) == 0) {
        stop("No module enrichments could be computed under '", network_dir, "'", call.=FALSE)
    }

    enrichments_df <- do.call(rbind, enrichments_list)
    enrichments_df$Qval <- qvalue(enrichments_df$Pval)$qvalue

    enrichments_df
}

### Get DE genes from each cell type: 1 vs. pooled tests

In [4]:
# pooled_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_1_vs_pooled_DE_genes_dream.RDS")

In [5]:
# pval_threshold <- .05/length(pooled_res_list)

# ctype_genes <- lapply(pooled_res_list, function(df) {
#     mask <- (df['adj.P.Val'] < pval_threshold) & (abs(df['logFC']) > 2)
#     df[mask, 1]
# })
# names(ctype_genes) <- names(pooled_res_list) 

In [6]:
# Number of genes in each cell type's gene set.
# NOTE(review): the code that defines ctype_genes for this section is commented out above, so this fails on a fresh run.
lengths(ctype_genes)

ERROR: Error: object 'ctype_genes' not found


In [7]:
# Compute module-by-cell-type enrichments for all networks under network_dir.
# NOTE(review): depends on ctype_genes, whose definition for this section is commented out above.
enrichments_df <- get_module_enrichments(network_dir, ctype_genes)

ERROR: Error: object 'ctype_genes' not found


In [8]:
# Get the most enriched cell type(s) for each module (lowest Qval; ties are kept).
# If a cell type is the top hit for multiple modules, keep only the module with
# the smallest Qval, then keep significant assignments (Qval < .05).

top_mods_df <- enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    # Drop the Cell_type grouping so later operations are not silently per-group
    ungroup() %>%
    arrange(Qval)

ERROR: Error: object 'enrichments_df' not found


In [9]:
# Display the selected module for each cell type.
top_mods_df

ERROR: Error: object 'top_mods_df' not found


### Get DE genes from each cell type: pairwise tests

In [10]:
# Load the pairwise DE results; the elements are named "<target>_vs_<other>" (names are split on "_vs_" below).
pairwise_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_pairwise_DE_genes_dream_processed.RDS")

In [23]:
# Target cell types: the part of each comparison name before "_vs_"
ctypes <- unique(vapply(strsplit(names(pairwise_res_list), "_vs_"), `[`, character(1), 1))

# For each target cell type, keep the genes that are significant in every
# pairwise comparison in which it is the target.
# NOTE(review): only adj.P.Val is filtered here, not the direction of change -- confirm that is intended.
ctype_genes <- lapply(ctypes, function(target) {
    # Match the full "<target>_vs_" prefix literally, so a cell type whose name
    # is a prefix of another's does not pick up the other's comparisons and
    # regex metacharacters in names are not interpreted.
    is_target <- startsWith(names(pairwise_res_list), paste0(target, "_vs_"))
    ctype_res_list <- pairwise_res_list[is_target]
    pval_threshold <- .05 # /length(ctype_res_list)
    ctype_genes_list <- lapply(ctype_res_list, function(ctype_res) {
        # which() drops rows with NA adjusted p-values instead of returning NA genes
        ctype_res[which(ctype_res$adj.P.Val < pval_threshold), 1]
    })
    Reduce(intersect, ctype_genes_list)
})
names(ctype_genes) <- ctypes

In [24]:
# Gene-set size per cell type, largest first
arrange(data.frame(No.genes=lengths(ctype_genes)), desc(No.genes))

Unnamed: 0_level_0,No.genes
Unnamed: 0_level_1,<int>
Macrophage,481
Astro,471
Endo,364
Oligo,234
VLMC,141
Pvalb,118
SMC,107
L5_IT,65
Lamp5,58
Sst,54


In [18]:
# Discard cell types that ended up with no genes
ctype_genes <- Filter(function(genes) length(genes) > 0, ctype_genes)

In [None]:
# Compute module-by-cell-type enrichments using the pairwise-derived gene sets.
enrichments_df <- get_module_enrichments(network_dir, ctype_genes)

In [20]:
# Get the most enriched cell type(s) for each module (lowest Qval; ties are kept).
# If a cell type is the top hit for multiple modules, keep only the module with
# the smallest Qval, then keep significant assignments (Qval < .05).

top_mods_df <- enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    # Drop the Cell_type grouping so later operations are not silently per-group
    ungroup() %>%
    arrange(Qval)

In [22]:
# Show only the enrichment columns (the network and file path columns are omitted).
top_mods_df[,c("Cell_type", "Pval", "Qval", "Module")]

Cell_type,Pval,Qval,Module
<chr>,<dbl>,<dbl>,<chr>
Astro,0.0,0.0,turquoise
Endo,0.0,0.0,blue
Macrophage,0.0,0.0,yellow
Oligo,1.516717e-284,1.21873e-281,green
VLMC,3.3335050000000003e-206,2.034688e-203,tan
SMC,3.1255469999999997e-134,1.695783e-131,black
NP,2.535164e-80,1.031601e-77,brown
Sncg,1.086617e-50,3.381244e-48,brown2
L2,1.6819e-49,5.13295e-47,lightcyan
Pvalb,1.655253e-43,4.79789e-41,darkorange


### Get DE genes from each cell type: 1 vs. mean analysis

In [34]:
# Load the 1-vs-mean-of-others DE results; the names of this list are reused as cell type names below.
mean_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_1_vs_meanOthers_DE_genes_dream.RDS")

In [50]:
# Significance threshold: .05 divided by the number of result tables
pval_threshold <- .05/length(mean_res_list)

# For each result table, keep genes (first column) passing both the adjusted
# p-value threshold and an absolute logFC cutoff of 5.
ctype_genes <- lapply(mean_res_list, function(df) {
    # Use plain vectors for the mask; which() drops rows where either statistic
    # is NA instead of returning NA gene entries
    keep <- which(df[["adj.P.Val"]] < pval_threshold & abs(df[["logFC"]]) > 5)
    df[keep, 1]
})
names(ctype_genes) <- names(mean_res_list)

In [54]:
# Compute module-by-cell-type enrichments using the 1-vs-mean gene sets.
enrichments_df <- get_module_enrichments(network_dir, ctype_genes)

In [None]:
# Get the most enriched cell type(s) for each module (lowest Qval; ties are kept).
# If a cell type is the top hit for multiple modules, keep only the module with
# the smallest Qval, then keep significant assignments (Qval < .05).

top_mods_df <- enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    # Drop the Cell_type grouping so later operations are not silently per-group
    ungroup() %>%
    arrange(Qval)

In [None]:
# Show only the enrichment columns (the network and file path columns are omitted).
top_mods_df[,c("Cell_type", "Pval", "Qval", "Module")]

Network,kME_path,ME_path,Pval,Cell_type,Module,Qval
<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Bicor-None_signum0.219_minSize6_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize6_merge_ME_0.9_20151/kME_table_08-09-19.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.219_minSize6_merge_ME_0.9_20151/Module_eigengenes_08-09-19.csv,6.154253e-277,Astro,turquoise,4.7291130000000004e-272
Bicor-None_signum0.794_minSize12_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.794_minSize12_merge_ME_0.9_20151/kME_table_06-18-14.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.794_minSize12_merge_ME_0.9_20151/Module_eigengenes_06-18-14.csv,1.4179680000000002e-196,Endo,green,7.264061999999999e-193
Bicor-None_signum0.919_minSize6_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.919_minSize6_merge_ME_0.9_20151/kME_table_06-12-22.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.919_minSize6_merge_ME_0.9_20151/Module_eigengenes_06-12-22.csv,1.0717789999999999e-193,Macrophage,yellow,4.5754830000000005e-190
Bicor-None_signum0.794_minSize12_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.794_minSize12_merge_ME_0.9_20151/kME_table_06-18-14.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.794_minSize12_merge_ME_0.9_20151/Module_eigengenes_06-18-14.csv,1.6916459999999999e-186,Oligo,red,5.416296e-183
Bicor-None_signum0.794_minSize3_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.794_minSize3_merge_ME_0.9_20151/kME_table_06-13-30.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.794_minSize3_merge_ME_0.9_20151/Module_eigengenes_06-13-30.csv,1.5451430000000002e-125,VLMC,tan,1.1096579999999999e-122
Bicor-None_signum0.919_minSize6_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.919_minSize6_merge_ME_0.9_20151/kME_table_06-12-22.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.919_minSize6_merge_ME_0.9_20151/Module_eigengenes_06-12-22.csv,3.085103e-121,SMC,brown,2.1357530000000003e-118
Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151/kME_table_06-40-23.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151/Module_eigengenes_06-40-23.csv,1.906383e-118,Peri,white,1.273845e-115
Bicor-None_signum0.325_minSize4_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.325_minSize4_merge_ME_0.9_20151/kME_table_06-44-39.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.325_minSize4_merge_ME_0.9_20151/Module_eigengenes_06-44-39.csv,9.153633e-89,NP,brown,5.6725209999999996e-86
Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151/kME_table_06-40-23.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.586_minSize8_merge_ME_0.9_20151/Module_eigengenes_06-40-23.csv,4.1799379999999996e-85,Vip,darkorange,2.489915e-82
Bicor-None_signum0.325_minSize12_merge_ME_0.9_20151,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.325_minSize12_merge_ME_0.9_20151/kME_table_07-30-53.csv,/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/mouse_ACA_20pcntCells_0.4pcntVar_200samples_log2_Modules/Bicor-None_signum0.325_minSize12_merge_ME_0.9_20151/Module_eigengenes_07-30-53.csv,5.348967e-82,Sncg,palevioletred3,3.1138689999999997e-79
