In [1]:
library(tidyverse)
library(broom)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Read the precomputed mouse brain ESmu table. The file ships with this repo and is
# also listed at https://github.com/perslab/CELLECT/wiki/Precomputed-CELLEX-datasets
# It has one `gene` column followed by one numeric column per cell-type label.
esmu <- read_csv('esmu/mousebrain.mu.csv.gz')
head(esmu)

[1mRows: [22m[34m15071[39m [1mColumns: [22m[34m266[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m   (1): gene
[32mdbl[39m (265): ABC, ACBG, ACMB, ACNT1, ACNT2, ACOB, ACTE1, ACTE2, CBGRC, CBINH1,...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


gene,ABC,ACBG,ACMB,ACNT1,ACNT2,ACOB,ACTE1,ACTE2,CBGRC,⋯,TEINH6,TEINH7,TEINH8,TEINH9,VECA,VECC,VECV,VLMC1,VLMC2,VSMCA
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000141668,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,⋯,0.8457562,0.7594462,0.9017036,0.08039406,0.0,0.0,0.0,0,0.0,0
ENSG00000204624,0.0,0,0.9498094,0.3162753,0.7277725,0.9034658,0.8603363,0.7163228,0,⋯,0.5137109,0.4615924,0.0,0.0,0.0,0.0,0.0,0,0.0,0
ENSG00000187848,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
ENSG00000171522,0.0,0,0.0,0.0,0.0,0.0,0.0,0.7736864,0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
ENSG00000183662,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
ENSG00000113504,0.9101624,0,0.0,0.0,0.0,0.0,0.0,0.0,0,⋯,0.0,0.0,0.0,0.0,0.5108949,0.416002,0.5872623,0,0.03407644,0


In [3]:
# Read the gene-to-module table from the WGCNA output folder.
# Downstream cells use the `name`, `ortholog_ensg` and `pkMs` columns.
kmes <- read_csv('wgcna_output/renamed_tables/wgcna_output_ds4_mcs20_pF_geneMod.csv')
head(kmes)

[1mRows: [22m[34m19770[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (8): data, run, cell_cluster, module, name, genes, ortholog_ensg, orthol...
[32mdbl[39m (2): pkMs, gene_loadings

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


data,run,cell_cluster,module,name,genes,ortholog_ensg,ortholog_name,pkMs,gene_loadings
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
dmb,ds4_mcs20_pF,HPF,orangered,HPF__23,ENSMUSG00000120785,,,0.8369424,0.1501479
dmb,ds4_mcs20_pF,HPF,orangered,HPF__23,ENSMUSG00000108990,,,0.8353232,0.1498574
dmb,ds4_mcs20_pF,HPF,orangered,HPF__23,ENSMUSG00000110233,,,0.7805153,0.1400248
dmb,ds4_mcs20_pF,HPF,orangered,HPF__23,ENSMUSG00000083911,,,0.7797708,0.1398913
dmb,ds4_mcs20_pF,HPF,orangered,HPF__23,ENSMUSG00000112311,,,0.7761567,0.1392429
dmb,ds4_mcs20_pF,HPF,orangered,HPF__23,ENSMUSG00000103485,,,0.7539321,0.1352558


In [4]:
# Compute the correlation between cell-type ESmu scores and kMEs.
# ESmu measures specificity of expression and kME measures module-belonging, so the
# correlation is a rough way to see whether a module corresponds closely to any
# cell type or not.

# One-sided Spearman test (alternative = "greater", i.e. positive association)
# for a single module x cell-type group.
#   x, y: numeric vectors of equal length (here: pkMs and esmu of one group).
# Returns a one-row tibble with `correlation` (rho) and `pvalue`, so that
# summarise() can unpack both columns from a single cor.test() call instead of
# running the test once per output column. If cor.test() raises an error, both
# values are NA. Warnings (e.g. zero standard deviation, which yields NA) are
# not suppressed.
# Spearman's rho is rank-based, so the inputs do not need to be ranked first.
spearman_greater <- function(x, y) {
  tryCatch(
    {
      test <- cor.test(x, y, method = "spearman", alternative = "greater")
      tibble(correlation = unname(test$estimate), pvalue = test$p.value)
    },
    error = function(e) tibble(correlation = NA_real_, pvalue = NA_real_)
  )
}

kme_esmu_corr <- kmes %>%
  select(name, ortholog_ensg, pkMs) %>%
  # genes without an ortholog id cannot be matched to the ESmu table
  filter(!is.na(ortholog_ensg)) %>%
  inner_join(esmu %>% rename(ortholog_ensg = gene), by = 'ortholog_ensg') %>%
  # every column except the three id/kME columns is a cell-type ESmu column;
  # selecting by name avoids depending on column positions
  pivot_longer(
    cols = -c(name, ortholog_ensg, pkMs),
    names_to = "esmu_cell_type",
    values_to = "esmu"
  ) %>%
  group_by(name, esmu_cell_type) %>%
  summarise(
    module_size_ensg = n(),
    spearman_greater(pkMs, esmu),
    .groups = 'drop'
  )


kme_esmu_corr %>% head

[1m[22m[36mℹ[39m In argument: `correlation = tryCatch(...)`.
[36mℹ[39m In group 1: `name = "HPF__01"`, `esmu_cell_type = "ABC"`.
[33m![39m the standard deviation is zero


name,esmu_cell_type,module_size_ensg,correlation,pvalue
<chr>,<chr>,<int>,<dbl>,<dbl>
HPF__01,ABC,3,,
HPF__01,ACBG,3,0.0,0.5
HPF__01,ACMB,3,-0.5,0.8333333
HPF__01,ACNT1,3,-1.0,1.0
HPF__01,ACNT2,3,-0.5,0.8333333
HPF__01,ACOB,3,-0.5,0.8333333


In [5]:
# Munge and save the correlation table.
# Rows are sorted by p-value, then p-values are Benjamini-Hochberg adjusted
# separately within each module (`name`), i.e. across the cell types tested
# for that module.
kme_esmu_corr = kme_esmu_corr %>%
  arrange(pvalue) %>%
  group_by(name) %>%
  mutate(pvalue_bh = p.adjust(pvalue, method = "BH")) %>%
  ungroup

kme_esmu_corr %>% head

# showWarnings = FALSE keeps re-runs quiet when the directory already exists.
# The path is defined once (without a trailing slash) and reused for the file.
esmu_corr_dir <- 'wgcna_output/renamed_tables/esmu_corr'
dir.create(esmu_corr_dir, recursive = TRUE, showWarnings = FALSE)
kme_esmu_corr %>% write_csv(file.path(esmu_corr_dir, 'kme_esmu_corr.csv'))

name,esmu_cell_type,module_size_ensg,correlation,pvalue,pvalue_bh
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>
ZI__18,MOL1,292,0.5801515,5.7896730000000005e-28,1.534263e-25
ZI__18,MOL2,292,0.5733104,3.253094e-27,4.310349e-25
ZI__18,MFOL1,292,0.5550409,2.695943e-25,2.3814160000000003e-23
ZI__18,MOL3,292,0.5501672,8.371101e-25,5.545854e-23
ZI__18,MFOL2,292,0.489874,2.480713e-19,1.3147780000000001e-17
HPF__32,DGGRC2,166,0.5410776,2.616289e-14,6.933167e-12


“'wgcna_output/renamed_tables/esmu_corr' already exists”


In [6]:
# Print the R version, platform, locale and attached/loaded package versions for this run.
sessionInfo()

R version 4.2.2 (2022-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux

Matrix products: default
BLAS/LAPACK: /nfsdata/tools/anaconda/envs/nmq407/dmb/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] broom_1.0.5     lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0  
 [5] dplyr_1.1.2     purrr_1.0.1     readr_2.1.4     tidyr_1.3.0    
 [9] tibble_3.2.1    ggplot2_3.4.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] pillar_1.9.0     compiler_4.2.2   base64enc_0.1-3  too