Need to Generate PK from:
CellCall; SignaLink; 

Read in links from:

CellCall: https://github.com/ShellyCoder/cellcall/blob/master/inst/extdata/new_ligand_receptor_TFs.txt

stMLnet: https://github.com/SunXQlab/stMLnet/blob/main/data/ex_databases.rda

scseqcomm: https://gitlab.com/sysbiobig/scseqcomm/-/tree/master/data

Check rough sizes for these, and make a new database from OmniPath where:

source (receptor), target (TF), pathway, database, shortest_path_length (as from OmniPath)

In [1]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


### stmlnet

In [7]:
load("~/Downloads/ex_databases.rda")
stmlnet <- ex_databases$RecTF.DB

In [12]:
stmlnet <- stmlnet %>% select(source, target) %>%
    mutate(database='STMLNet')

In [8]:
summary(stmlnet)

    source             target              score          
 Length:872490      Length:872490      Min.   :0.0000000  
 Class :character   Class :character   1st Qu.:0.0000009  
 Mode  :character   Mode  :character   Median :0.0000031  
                                       Mean   :0.0000380  
                                       3rd Qu.:0.0000140  
                                       Max.   :0.3967780  

### CellCall

In [4]:
cellcall <- read.table(url('https://raw.githubusercontent.com/ShellyCoder/cellcall/master/inst/extdata/new_ligand_receptor_TFs.txt'), header = TRUE,  sep = '	',  stringsAsFactors = FALSE)  %>%
    select(source=Receptor_Symbol, target=TF_Symbol, pathway=Pathway) %>% 
    distinct() %>% 
    mutate(source = stringr::str_replace_all(source, ',', '_')) %>%
    mutate(database='cellcall')
head(cellcall)

Unnamed: 0_level_0,source,target,pathway,database
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,OCLN,GATA4,"hsa04530_3,hsa04530_5",cellcall
2,OCLN,YBX3,hsa04530_2,cellcall
3,CCR1,FOXO3,hsa04062_5,cellcall
4,CCR1,IKBKB,hsa04062_5,cellcall
5,CCR1,NFKB1,hsa04062_7,cellcall
6,CCR1,NFKBIA,hsa04062_6,cellcall


In [94]:
summary(cellcall)

    source             target            pathway         
 Length:3878        Length:3878        Length:3878       
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  

### scseqcomm

#### KEGG

In [5]:
load("~/Downloads/TF_PPR_KEGG_human.rda")

csc_kegg <- TF_PPR_KEGG_human  %>% 
    select(source=receptor, target=tf, pathway) %>%
    mutate(source = stringr::str_replace_all(source, ',', '_')) %>% 
    distinct() %>%
    mutate(database='scseqcomm_KEGG')
head(csc_kegg)

Unnamed: 0_level_0,source,target,pathway,database
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,PKM,ENO1,Glycolysis / Gluconeogenesis,scseqcomm_KEGG
2,ALDOA,ENO1,Glycolysis / Gluconeogenesis,scseqcomm_KEGG
3,GPI,ENO1,Glycolysis / Gluconeogenesis,scseqcomm_KEGG
4,MINPP1,ENO1,Glycolysis / Gluconeogenesis,scseqcomm_KEGG
5,NPR2,NME2,Purine metabolism,scseqcomm_KEGG
6,PKM,NME2,Purine metabolism,scseqcomm_KEGG


In [98]:
summary(csc_kegg)

    source             target            pathway         
 Length:69300       Length:69300       Length:69300      
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  

#### Reactome

In [6]:
load("~/Downloads/TF_PPR_REACTOME_human.rda")
scs_reactome <- TF_PPR_REACTOME_human %>% 
    select(source=receptor, target=tf, pathway) %>%
    mutate(source = stringr::str_replace_all(source, ',', '_')) %>% 
    distinct() %>%
    mutate(database='scseqcomm_REACTOME')
head(csc_kegg)

Unnamed: 0_level_0,source,target,pathway,database
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,PKM,ENO1,Glycolysis / Gluconeogenesis,scseqcomm_KEGG
2,ALDOA,ENO1,Glycolysis / Gluconeogenesis,scseqcomm_KEGG
3,GPI,ENO1,Glycolysis / Gluconeogenesis,scseqcomm_KEGG
4,MINPP1,ENO1,Glycolysis / Gluconeogenesis,scseqcomm_KEGG
5,NPR2,NME2,Purine metabolism,scseqcomm_KEGG
6,PKM,NME2,Purine metabolism,scseqcomm_KEGG


In [102]:
summary(scs_reactome)

    source             target            pathway         
 Length:24306       Length:24306       Length:24306      
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  

## Generate Ours

In [1]:
import omnipath
import liana as li
import decoupler as dc
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pathways = omnipath.requests.Annotations.get(resources=['SignaLink_pathway', 'KEGG-PC', 'SIGNOR', 'NetPath'])

In [3]:
pathways = dict()
for res in ['SignaLink_pathway', 'KEGG-PC', 'SIGNOR', 'NetPath']:
    resource = dc.get_resource(res)
    # translate to mouse
    resource = dc.translate_net(resource,
                                target_organism = 'mouse',
                                unique_by = ('geneset', 'genesymbol'))
    
    pathways[res] = resource


In [4]:
pathways = pd.concat(pathways)
pathways = pathways.reset_index().drop(columns='level_1').rename(columns={'level_0': 'database'})

In [5]:
pathways.rename(columns={'genesymbol': 'target'}, inplace=True)

In [6]:
pathways.head()

Unnamed: 0,database,target,pathway
0,SignaLink_pathway,Cd247,T-cell receptor
1,SignaLink_pathway,Zap70,Receptor tyrosine kinase
2,SignaLink_pathway,Zap70,T-cell receptor
3,SignaLink_pathway,Tab2,Toll-like receptor
4,SignaLink_pathway,Tab2,Innate immune pathways


#### generate an lr geneset

In [7]:
resource = li.resource.select_resource('mouseconsensus')

In [8]:
lr_net = li.fun.generate_lr_geneset(resource, net=pathways, source='pathway', weight=None)
lr_net

Unnamed: 0,pathway,interaction,weight
1,NOTCH Signaling,Dll1^Notch1,1.0
61,NOTCH Signaling,Jag1^Notch1,1.0
73,NOTCH Signaling,Adam17^Notch1,1.0
81,NOTCH Signaling,Dlk1^Notch1,1.0
97,NOTCH Signaling,Dll4^Notch1,1.0
...,...,...,...
8068,Metabolic pathways,Ptgs2^Alox5,1.0
8069,Thyroid-stimulating hormone (TSH),Bsg^Slc16a1,1.0
8079,JAK/STAT,Il22^Il10ra,1.0
8083,JAK/STAT,Il22^Il20ra,1.0


In [9]:
# import dorothea
dorothea = dc.get_dorothea(organism='mouse')

In [10]:
dorothea

Unnamed: 0,source,confidence,target,weight
0,Myc,A,Tert,1.000000
1,E2f1,A,Casp7,1.000000
2,E2f1,A,Aurka,1.000000
3,Dnmt1,A,Timp3,-1.000000
4,Dnmt1,A,Sfrp5,1.000000
...,...,...,...,...
28414,Gata3,C,Prom1,0.333333
28415,Gata3,C,Ppt1,0.333333
28416,Gata3,C,Ppp6r3,0.333333
28417,Gata3,C,Ptprf,0.333333


In [11]:
# check for TF links within each pathway for which we have a ligand-receptor interaction

In [12]:
pathway_msk = pathways['pathway'].isin(lr_net['pathway'])
tf_msk = pathways['target'].isin(dorothea['source'])


In [14]:
pathways = pathways[pathway_msk & tf_msk]

In [17]:
receptor_tf = lr_net.merge(pathways, on=['pathway'], how='inner')
receptor_tf = receptor_tf[['interaction', 'target', 'pathway', 'database']]
receptor_tf

Unnamed: 0,interaction,target,pathway,database
0,Dll1^Notch1,Hes1,NOTCH Signaling,SIGNOR
1,Dll1^Notch1,Nfkb1,NOTCH Signaling,SIGNOR
2,Dll1^Notch1,Yy1,NOTCH Signaling,SIGNOR
3,Dll1^Notch1,Hif1a,NOTCH Signaling,SIGNOR
4,Dll1^Notch1,Myod1,NOTCH Signaling,SIGNOR
...,...,...,...,...
12714,Ghrl^Ghsr,Stat3,Ghrelin,NetPath
12715,Ghrl^Ghsr,Creb1,Ghrelin,NetPath
12716,Ghrl^Ghsr,Tp53,Ghrelin,NetPath
12717,Ghrl^Ghsr,Elk1,Ghrelin,NetPath


In [19]:
receptor_tf.to_csv('../../data/receptor_tf.csv', index=False)