In [1]:
library(dplyr)
library(tidyr)
library(tibble)
library(qvalue)
library(data.table)

source("/mnt/lareaulab/reliscu/code/fisher_test.R")

setwd("/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




In [2]:
get_module_enrichments <- function(network_dir, ctype_genes, mod_def="PosFDR") {
    # Traverse networks to get cell type enrichments for each module
    networks <- list.dirs(file.path(getwd(), network_dir), full.names=TRUE, recursive=FALSE)
    networks <- networks[lengths(lapply(networks, list.files)) > 0]

    enrichments_list <- lapply(seq_along(networks), function(i) {
        kME_path <- list.files(networks[i])[grep("kME", list.files(networks[i]))]
        kME <- fread(file.path(networks[i], kME_path), data.table=FALSE)
        mod_col <- grep("PosFDR", colnames(kME))
        mod_genes <- tapply(kME$Gene, kME[,mod_col], list)
        
        if (length(mod_genes) > 0) {
            all_genes <- kME$Gene

            # For each module: calculate enrichment for DE genes from each cell type
            mod_enrichments_list <- lapply(mod_genes, function(mod) {
                lapply(unlist(lapply(ctype_genes, function(set) {
                    fisher_test(set, mod, all=all_genes)
                })), c)
            })
            
            # Save the network the module came from
            network_id <- sapply(strsplit(networks[i], "/"), function(x) x[length(x)])
            mod_enrichments_df <- reshape2::melt(mod_enrichments_list)
            colnames(mod_enrichments_df) <- c("Pval", "Cell_type", "Module")
            
            # Save path to module eigengenes table for downstream analyses
            ME_path <- list.files(networks[i])[grep("eigengene", list.files(networks[i]))]
            
            data.frame(
                Network=network_id,
                kME_path=file.path(networks[i], kME_path),
                ME_path=file.path(networks[i], ME_path),
                mod_enrichments_df
            )
        }
    })
    enrichments_df <- do.call(rbind, enrichments_list)
    enrichments_df$Qval <- qvalue(enrichments_df$Pval)$qvalue

    enrichments_df
}

Here I perform enrichment analysis to find modules enriched for cell type markers. 

These modules will later be used to correlate with exon PSI to define cell type-specific exons.

In [3]:
network_dir <- "tasic_2018_ALM_STAR_20pcntCells_40pcntVar_200samples_log2_pseudobulk_Modules"
network_dataset <- "20pcntCells_40pcntVar_200samples_log2_pseudobulk"

In [4]:
mod_def <- "PosBC"
unique <- TRUE

### Prep DE genes

#### 1 vs. pooled tests

In [5]:
pooled_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_1_vs_pooled_DE_genes_dream.RDS")

In [6]:
pval_threshold <- .05/length(pooled_res_list)
lfc_threshold <- 6

pooled_ctype_genes <- lapply(pooled_res_list, function(df) {
    mask <- (df['adj.P.Val'] < pval_threshold) & (abs(df['logFC']) > lfc_threshold)
    df[mask, 1]
})
names(pooled_ctype_genes) <- names(pooled_res_list) 

In [7]:
if (unique) {
    all_genes <- unlist(pooled_ctype_genes)
    duplicates <- unique(names(table(all_genes)[table(all_genes) > 1]))

    # Remove markers that are not unique to a cell type

    pooled_ctype_genes <- lapply(pooled_ctype_genes, function(x) x[!(x %in% duplicates)])
}

In [8]:
data.frame(No.genes=lengths(pooled_ctype_genes)) %>% arrange(-`No.genes`)

Unnamed: 0_level_0,No.genes
Unnamed: 0_level_1,<int>
Peri,1568
Macrophage,395
Endo,304
Astro,282
VLMC,136
SMC,129
Oligo,105
Meis2,83
CR,61
L4,27


#### Pairwise tests

In [9]:
pairwise_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_pairwise_DE_genes_dream.RDS")

In [10]:
ctypes <- unique(sapply(strsplit(names(pairwise_res_list), "_vs_"), "[", 1))

pairwise_ctype_genes <- lapply(ctypes, function(target) {
    # Subset to pairwise tests with target cell type
    ctype_res_list <- pairwise_res_list[grep(paste0("^", target), names(pairwise_res_list))]

    # For each pairwise test, return genes that meet p-value threshold:
    pval_threshold <- .05 /length(ctype_res_list)
    ctype_genes_list <- lapply(ctype_res_list, function(df) {
        mask <- df$adj.P.Val < pval_threshold
        df[mask, 1]
    }) 

    # Restrict to genes that were identified in EVERY pairwise test
    Reduce(intersect, ctype_genes_list)
})
names(pairwise_ctype_genes) <- ctypes

In [11]:
data.frame(No.genes=lengths(pairwise_ctype_genes)) %>% arrange(-`No.genes`)

Unnamed: 0_level_0,No.genes
Unnamed: 0_level_1,<int>
Macrophage,333
Astro,288
Endo,237
Oligo,131
VLMC,85
SMC,60
Pvalb,31
Lamp5,22
CR,21
Sst,21


In [12]:
pairwise_ctype_genes <- pairwise_ctype_genes[lengths(pairwise_ctype_genes) > 0]

In [13]:
if (unique) {
    all_genes <- unlist(pairwise_ctype_genes)
    duplicates <- unique(names(table(all_genes)[table(all_genes) > 1]))

    # Remove markers that are not unique to a cell type

    pairwise_ctype_genes <- lapply(pairwise_ctype_genes, function(x) x[!(x %in% duplicates)])
}

pairwise_ctype_genes <- pairwise_ctype_genes[lengths(pairwise_ctype_genes) > 0]

#### 1 vs. mean tests

In [14]:
mean_res_list <- readRDS("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_1_vs_meanOthers_DE_genes_dream.RDS")

In [15]:
pval_threshold <- .05/length(mean_res_list)
lfc_threshold <- 6

mean_ctype_genes <- lapply(mean_res_list, function(df) {
    mask <- (df['adj.P.Val'] < pval_threshold) & (abs(df['logFC']) > lfc_threshold)
    df[mask, 1]
})
names(mean_ctype_genes) <- names(mean_res_list) 

In [16]:
if (unique) {
    all_genes <- unlist(mean_ctype_genes)
    duplicates <- unique(names(table(all_genes)[table(all_genes) > 1]))

    # Remove markers that are not unique to a cell type

    mean_ctype_genes <- lapply(mean_ctype_genes, function(x) x[!(x %in% duplicates)])
}

In [17]:
data.frame(No.genes=lengths(mean_ctype_genes)) %>% arrange(-`No.genes`)

Unnamed: 0_level_0,No.genes
Unnamed: 0_level_1,<int>
Peri,1279
Macrophage,355
Endo,325
Astro,213
Oligo,119
VLMC,110
SMC,94
CR,31
Meis2,30
NP,25


### Enrichment results using 1 vs. pooled DE genes

In [18]:
pooled_enrichments_df <- get_module_enrichments(network_dir, pooled_ctype_genes, mod_def)

In [19]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, choose module with smallest p-value

pooled_top_mods_df <- pooled_enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    arrange(Network) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    arrange(Qval)

In [20]:
pooled_top_mods_df[,c("Cell_type", "Pval", "Qval", "Module", "Network")]

Cell_type,Pval,Qval,Module,Network
<chr>,<dbl>,<dbl>,<chr>,<chr>
Macrophage,5.066725e-290,4.291972e-285,black,Bicor-None_signum0.723_minSize8_merge_ME_0.9_20151
Endo,5.7058720000000006e-248,6.041734000000001e-244,cyan,Bicor-None_signum0.864_minSize3_merge_ME_0.9_20151
Astro,2.757588e-193,1.112345e-189,yellow,Bicor-None_signum0.723_minSize3_merge_ME_0.9_20151
Oligo,8.842039e-132,1.4132080000000001e-128,lightcyan,Bicor-None_signum0.723_minSize6_merge_ME_0.9_20151
VLMC,2.026801e-121,2.8145620000000003e-118,lightsteelblue1,Bicor-None_signum0.466_minSize10_merge_ME_0.9_20151
SMC,1.3275039999999999e-70,1.102466e-67,brown,Bicor-None_signum0.931_minSize3_merge_ME_0.9_20151
NP,2.387003e-33,1.123337e-30,darkorange,Bicor-None_signum0.723_minSize4_merge_ME_0.9_20151
Peri,4.729459e-33,2.2134129999999998e-30,blue,Bicor-None_signum0.864_minSize10_merge_ME_0.9_20151
Lamp5,1.337442e-21,4.49577e-19,cyan,Bicor-None_signum0.317_minSize5_merge_ME_0.9_20151
L6_CT,3.2673959999999996e-20,1.028914e-17,navajowhite2,Bicor-None_signum0.723_minSize4_merge_ME_0.9_20151


### Enrichment results using pairwise DE genes

In [21]:
pairwise_enrichments_df <- get_module_enrichments(network_dir, pairwise_ctype_genes, mod_def)

In [22]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, choose module with smallest p-value

pairwise_top_mods_df <- pairwise_enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    arrange(Network) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    arrange(Qval)

In [23]:
pairwise_top_mods_df[,c("Cell_type", "Pval", "Qval", "Module", "Network")]

Cell_type,Pval,Qval,Module,Network
<chr>,<dbl>,<dbl>,<chr>,<chr>
Macrophage,0.0,0.0,red,Bicor-None_signum0.723_minSize10_merge_ME_0.9_20151
Astro,2.925e-321,2.96238e-317,yellow,Bicor-None_signum0.466_minSize8_merge_ME_0.9_20151
Endo,9.438148e-312,8.497059e-308,cyan,Bicor-None_signum0.864_minSize3_merge_ME_0.9_20151
Oligo,2.284915e-191,3.192027e-188,lightcyan,Bicor-None_signum0.723_minSize6_merge_ME_0.9_20151
VLMC,8.648820999999999e-136,1.0011129999999999e-132,saddlebrown,Bicor-None_signum0.723_minSize4_merge_ME_0.9_20151
SMC,3.1545589999999997e-86,2.457705e-83,lightgreen,Bicor-None_signum0.466_minSize8_merge_ME_0.9_20151
Pvalb,1.347643e-40,5.902384e-38,blue,Bicor-None_signum0.723_minSize5_merge_ME_0.9_20151
Sst,2.594186e-30,9.731321e-28,red,Bicor-None_signum0.723_minSize8_merge_ME_0.9_20151
Lamp5,3.95667e-29,1.4572419999999998e-26,purple,Bicor-None_signum0.864_minSize4_merge_ME_0.9_20151
Vip,2.527162e-25,7.875608e-23,green,Bicor-None_signum0.317_minSize5_merge_ME_0.9_20151


In [24]:
# pairwise_top_mods_df[,c("Cell_type", "Pval", "Qval", "Module", "Network")]

### Enrichment results using 1 vs. mean DE genes

In [25]:
mean_enrichments_df <- get_module_enrichments(network_dir, mean_ctype_genes, mod_def)

In [26]:
# Get most enriched cell type for each module
# If cell type is most enriched in multiple modules, choose module with smallest p-value

mean_top_mods_df <- mean_enrichments_df %>%
    group_by(Network, Module) %>%
    slice_min(Qval) %>%
    group_by(Cell_type) %>%
    arrange(Network) %>%
    slice_min(Qval, with_ties=FALSE) %>%
    filter(Qval < .05) %>%
    arrange(Qval)

In [27]:
mean_top_mods_df[,c("Cell_type", "Pval", "Qval", "Module", "Network")]

Cell_type,Pval,Qval,Module,Network
<chr>,<dbl>,<dbl>,<chr>,<chr>
Macrophage,1.560208e-313,1.321637e-308,black,Bicor-None_signum0.723_minSize8_merge_ME_0.9_20151
Endo,2.3490019999999997e-230,2.210907e-226,cyan,Bicor-None_signum0.864_minSize3_merge_ME_0.9_20151
Astro,3.454826e-230,2.9265489999999997e-226,yellow,Bicor-None_signum0.466_minSize8_merge_ME_0.9_20151
Oligo,1.0017159999999999e-146,1.488674e-143,lightcyan,Bicor-None_signum0.723_minSize6_merge_ME_0.9_20151
VLMC,6.824551000000001e-120,8.89386e-117,lightsteelblue1,Bicor-None_signum0.466_minSize10_merge_ME_0.9_20151
SMC,5.6598479999999996e-77,4.892246e-74,brown,Bicor-None_signum0.931_minSize3_merge_ME_0.9_20151
Peri,1.0840669999999999e-38,5.370188e-36,skyblue,Bicor-None_signum0.723_minSize6_merge_ME_0.9_20151
NP,5.147565e-35,2.422473e-32,darkorange,Bicor-None_signum0.723_minSize4_merge_ME_0.9_20151
L6_CT,3.34521e-26,1.2707150000000001e-23,mediumpurple2,Bicor-None_signum0.37_minSize10_merge_ME_0.9_20151
Sst,2.127435e-24,7.668635e-22,green,Bicor-None_signum0.864_minSize3_merge_ME_0.9_20151


### Compare enrichment results

In [28]:
pooled_top_mods_df$Network_short <- gsub("Bicor-None_", "", gsub("_merge_ME_0.9_20151", "", pooled_top_mods_df$Network))
pairwise_top_mods_df$Network_short <- gsub("Bicor-None_", "", gsub("_merge_ME_0.9_20151", "", pairwise_top_mods_df$Network))
mean_top_mods_df$Network_short <- gsub("Bicor-None_", "", gsub("_merge_ME_0.9_20151", "", mean_top_mods_df$Network))

cols <- c("Cell_type", "Qval", "Module", "Network_short")

all_top_mods_df <- merge(
    merge(
        pooled_top_mods_df[,cols], 
        pairwise_top_mods_df[,cols], 
        by="Cell_type", all=TRUE
    ), 
    mean_top_mods_df[,cols], by="Cell_type", all=TRUE
)

In [29]:
all_top_mods_df <- all_top_mods_df[,order(colnames(all_top_mods_df))]
all_top_mods_df

Cell_type,Module,Module.x,Module.y,Network_short,Network_short.x,Network_short.y,Qval,Qval.x,Qval.y
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
Astro,yellow,yellow,yellow,signum0.466_minSize8,signum0.723_minSize3,signum0.466_minSize8,2.9265489999999997e-226,1.112345e-189,2.96238e-317
CR,lightskyblue,,lightskyblue,signum0.466_minSize4,,signum0.466_minSize4,0.02662553,,0.01507463
Endo,cyan,cyan,cyan,signum0.864_minSize3,signum0.864_minSize3,signum0.864_minSize3,2.210907e-226,6.041734000000001e-244,8.497059e-308
L2_3_IT,honeydew1,slateblue,steelblue,signum0.466_minSize8,signum0.317_minSize10,signum0.723_minSize5,3.010301e-08,2.670436e-11,2.503038e-10
L4,,dodgerblue4,,,signum0.466_minSize4,,,0.0006070336,
L5_IT,turquoise,navajowhite2,navajowhite2,signum0.466_minSize10,signum0.723_minSize3,signum0.723_minSize3,9.903185e-16,1.441855e-16,5.462602e-15
L5_PT,mistyrose4,mistyrose4,mistyrose4,signum0.317_minSize5,signum0.317_minSize5,signum0.317_minSize5,1.475558e-07,5.509422e-08,0.0007126516
L6_CT,mediumpurple2,navajowhite2,lightsteelblue,signum0.37_minSize10,signum0.723_minSize4,signum0.37_minSize5,1.2707150000000001e-23,1.028914e-17,1.744501e-08
L6_IT,mediumpurple4,plum3,mediumpurple4,signum0.466_minSize5,signum0.466_minSize6,signum0.466_minSize6,1.175888e-09,5.491443e-10,1.648542e-14
Lamp5,greenyellow,cyan,purple,signum0.723_minSize10,signum0.317_minSize5,signum0.864_minSize4,2.1547629999999998e-19,4.49577e-19,1.4572419999999998e-26


### Select the module from the lowest Qval per cell type

In [30]:
cols <- c("Cell_type", "Qval", "Module", "Network_short")

In [31]:
# Select the module from the lowest Qval per cell type

pooled_top_mods_df$DE_Test <- "Pooled"
pairwise_top_mods_df$DE_Test <- "Pairwise"
mean_top_mods_df$DE_Test <- "Mean"

all_top_mods_df <- rbind(pooled_top_mods_df, pairwise_top_mods_df, mean_top_mods_df)

In [32]:
top_qval_mods_df <- all_top_mods_df %>%
    group_by(Cell_type) %>%
    slice_min(Qval)

top_qval_mods_df[, c(cols, "DE_Test")]

Cell_type,Qval,Module,Network_short,DE_Test
<chr>,<dbl>,<chr>,<chr>,<chr>
Astro,2.96238e-317,yellow,signum0.466_minSize8,Pairwise
CR,0.01507463,lightskyblue,signum0.466_minSize4,Pairwise
Endo,8.497059e-308,cyan,signum0.864_minSize3,Pairwise
L2_3_IT,2.670436e-11,slateblue,signum0.317_minSize10,Pooled
L4,0.0006070336,dodgerblue4,signum0.466_minSize4,Pooled
L5_IT,1.441855e-16,navajowhite2,signum0.723_minSize3,Pooled
L5_PT,5.509422e-08,mistyrose4,signum0.317_minSize5,Pooled
L6_CT,1.2707150000000001e-23,mediumpurple2,signum0.37_minSize10,Mean
L6_IT,1.648542e-14,mediumpurple4,signum0.466_minSize6,Pairwise
Lamp5,1.4572419999999998e-26,purple,signum0.864_minSize4,Pairwise


In [33]:
if (unique) {
    write.csv(top_qval_mods_df, file=paste0("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_DE_genes_dream_unique_", network_dataset, "_", mod_def, "_top_Qval_modules.csv"), row.names=FALSE, quote=FALSE)
} else {
    write.csv(top_qval_mods_df, file=paste0("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_DE_genes_dream_", network_dataset, "_", mod_def, "_top_Qval_modules.csv"), row.names=FALSE, quote=FALSE)
}
