# Systematic comparison of GRNs

## Libraries

In [1]:
import pandas as pd
import numpy as np

import decoupler as dc
import matplotlib.pyplot as plt

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# Report the decoupler version this notebook was run with
print(dc.__version__)

1.2.0


## Benchmark data

### Load knockTF benchmark data 

In [3]:
!wget 'https://zenodo.org/record/7035528/files/knockTF_expr.csv?download=1' -O '../../data/knockTF_expr.csv'
!wget 'https://zenodo.org/record/7035528/files/knockTF_meta.csv?download=1' -O '../../data/knockTF_meta.csv'

--2023-05-23 16:01:04--  https://zenodo.org/record/7035528/files/knockTF_expr.csv?download=1
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 146086808 (139M) [text/plain]
Saving to: ‘../../data/knockTF_expr.csv’


2023-05-23 16:01:25 (7,09 MB/s) - ‘../../data/knockTF_expr.csv’ saved [146086808/146086808]

--2023-05-23 16:01:25--  https://zenodo.org/record/7035528/files/knockTF_meta.csv?download=1
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 144861 (141K) [text/plain]
Saving to: ‘../../data/knockTF_meta.csv’


2023-05-23 16:01:26 (6,10 MB/s) - ‘../../data/knockTF_meta.csv’ saved [144861/144861]



In [3]:
# Read the downloaded knockTF files; the first CSV column becomes the index of each frame.
# `mat` comes from the expression file and `obs` from the metadata file.
mat, obs = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../../data/knockTF_expr.csv', '../../data/knockTF_meta.csv')
)

### Filter knockTF benchmark data 
Experiments are filtered on the log fold-change of the perturbed transcription factor: only those with logFC < -1 are kept.

In [4]:
# Boolean mask over experiments: True where the recorded logFC is below -1
msk = obs['logFC'].lt(-1)

# Apply the same mask to the expression matrix and to the metadata
mat, obs = mat[msk], obs[msk]

# Remaining matrix shape, metadata shape, and number of distinct perturbed TFs
mat.shape, obs.shape, obs['TF'].unique().shape

((388, 21985), (388, 13), (234,))

## GRNs 

### Load GRNs

In [5]:
# Pre-filtered networks stored under ../../data/networks/
doro_ABC, doro_ABCD, regnet, pathComp, chea3 = (
    pd.read_csv(net_path)
    for net_path in (
        '../../data/networks/filtered_dorothea_ABC.csv',
        '../../data/networks/filtered_dorothea_ABCD.csv',
        '../../data/networks/filtered_regnetwork.csv',
        '../../data/networks/filtered_pathwayCommons.csv',
        '../../data/networks/filtered_chea3.csv',
    )
)

# CollecTRI network produced elsewhere in this project
collecTRI = pd.read_csv('../../output/CollecTRI/CollecTRI_GRN.csv')

# Shuffled version of CollecTRI (target and weight columns passed to shuffle_net),
# reduced to one row per (source, target) pair.
# NOTE(review): no seed is passed, so this relies on shuffle_net's default — confirm it is deterministic.
collecTRI_rand = (
    dc.shuffle_net(collecTRI, target='target', weight='weight')
    .drop_duplicates(['source', 'target'])
)

In [6]:
# Split the ChEA3 table into one sub-network per value of its 'confidence' column
chea3_archs4 = chea3.loc[chea3['confidence'].eq('ARCHS4_Coexpression')]
chea3_encode = chea3.loc[chea3['confidence'].eq('ENCODE_ChIP-seq')]
chea3_enrich = chea3.loc[chea3['confidence'].eq('Enrichr_Queries')]
chea3_GTEx = chea3.loc[chea3['confidence'].eq('GTEx_Coexpression')]
chea3_lit = chea3.loc[chea3['confidence'].eq('Literature_ChIP-seq')]
chea3_remap = chea3.loc[chea3['confidence'].eq('ReMap_ChIP-seq')]

### Overview final GRNs

In [7]:
def _n_shared_tfs(net):
    """Count the distinct `source` entries of `net` that also occur in obs['TF']."""
    net_tfs = pd.unique(net['source'].values)
    perturbed_tfs = pd.unique(obs['TF'].values)
    return np.sum(np.isin(net_tfs, perturbed_tfs))


# Number of benchmark TFs covered by each network.
# n_doro is computed from doro_ABC; doro_ABCD is not counted here.
(n_doro, n_collectri, n_regnet, n_pathComp, n_cheaArch,
 n_cheaEncode, n_cheaEnrich, n_cheaGTEx, n_chealit, n_cheaRemap) = (
    _n_shared_tfs(net)
    for net in (doro_ABC, collecTRI, regnet, pathComp, chea3_archs4,
                chea3_encode, chea3_enrich, chea3_GTEx, chea3_lit, chea3_remap)
)
n_doro, n_collectri, n_regnet, n_pathComp, n_cheaArch, n_cheaEncode, n_cheaEnrich, n_cheaGTEx, n_chealit, n_cheaRemap

(125, 171, 158, 121, 156, 46, 146, 155, 66, 101)

## Run benchmark

In [8]:
# Networks to benchmark, keyed by the label reported in the results
nets = dict([
    ('ABC', doro_ABC),
    ('ABCD', doro_ABCD),
    ('collecTRI', collecTRI),
    ('regnet', regnet),
    ('pathComp', pathComp),
    ('chea3_archs4', chea3_archs4),
    ('chea3_encode', chea3_encode),
    ('chea3_enrich', chea3_enrich),
    ('chea3_GTEx', chea3_GTEx),
    ('chea3_lit', chea3_lit),
    ('chea3_remap', chea3_remap),
    ('rand', collecTRI_rand),
])

# Per-network decoupler arguments: every network is scored with the 'ulm' method
decouple_kws = {net_name: {'methods': 'ulm'} for net_name in nets}

# Run the benchmark pipeline on the filtered knockTF data.
# NOTE(review): sign=-1 presumably encodes that the perturbations are knockdowns — confirm in decoupler docs.
df = dc.benchmark(
    mat,
    obs,
    nets,
    perturb='TF',
    sign=-1,
    verbose=True,
    decouple_kws=decouple_kws,
)

Using ABC network...
Extracting inputs...
Formating net...
174 experiments without sources in net, they will be removed.
Running methods...
55 features of mat are empty, they will be removed.
Running ulm on mat with 214 samples and 21930 targets for 297 sources.
Calculating metrics...
Computing metrics...
Done.
Using ABCD network...
Extracting inputs...
Formating net...
146 experiments without sources in net, they will be removed.
Running methods...
52 features of mat are empty, they will be removed.
Running ulm on mat with 242 samples and 21933 targets for 411 sources.
Calculating metrics...
Computing metrics...
Done.
Using collecTRI network...
Extracting inputs...
Formating net...
109 experiments without sources in net, they will be removed.
Running methods...
52 features of mat are empty, they will be removed.
Running ulm on mat with 279 samples and 21933 targets for 766 sources.
Calculating metrics...
Computing metrics...
Done.
Using regnet network...
Extracting inputs...
Formating

### Save results

In [9]:
# Write the overall benchmark results to disk
df.to_csv('../../output/benchmark/benchmark_res.csv')

## Run benchmark per source
Only done for the three top-performing networks in the overall benchmark (DoRothEA ABC, RegNetwork, CollecTRI).

In [10]:
# Restrict the comparison to three networks (this rebinds `nets` and `decouple_kws`)
nets = dict([
    ('ABC', doro_ABC),
    ('regnet', regnet),
    ('collecTRI', collecTRI),
])

# Per-network decoupler arguments: every network is scored with the 'ulm' method
decouple_kws = {net_name: {'methods': 'ulm'} for net_name in nets}

# Same pipeline call as the overall benchmark, with by='source' added
df_source = dc.benchmark(
    mat,
    obs,
    nets,
    perturb='TF',
    sign=-1,
    by='source',
    verbose=True,
    decouple_kws=decouple_kws,
)

Using ABC network...
Extracting inputs...
Formating net...
174 experiments without sources in net, they will be removed.
Running methods...
55 features of mat are empty, they will be removed.
Running ulm on mat with 214 samples and 21930 targets for 297 sources.
Calculating metrics...
Computing metrics...
Done.
Using regnet network...
Extracting inputs...
Formating net...
156 experiments without sources in net, they will be removed.
Running methods...
54 features of mat are empty, they will be removed.
Running ulm on mat with 232 samples and 21931 targets for 638 sources.
Calculating metrics...
Computing metrics...
Done.
Using collecTRI network...
Extracting inputs...
Formating net...
109 experiments without sources in net, they will be removed.
Running methods...
52 features of mat are empty, they will be removed.
Running ulm on mat with 279 samples and 21933 targets for 766 sources.
Calculating metrics...
Computing metrics...
Done.


### Save results

In [11]:
# Write the per-source benchmark results to disk
df_source.to_csv('../../output/benchmark/benchmark_source_res.csv')

## Run benchmark: effect of sign in CollecTRI
Evaluation of different strategies for assigning signs to CollecTRI interactions.

In [5]:
# Load the CollecTRI GRN and the three signed variants stored under ../../output/signed_networks/
(
    collecTRI_signed,
    collecTRI_TFrole,
    collecTRI_PMID_TFrole,
    collecTRI_PMID_regulon_TFrole,
) = map(
    pd.read_csv,
    (
        '../../output/CollecTRI/CollecTRI_GRN.csv',
        '../../output/signed_networks/CollecTRI_signed_TFrole.csv',
        '../../output/signed_networks/CollecTRI_signed_PMID_TFrole.csv',
        '../../output/signed_networks/CollecTRI_signed_PMID_regulon_TFrole.csv',
    ),
)

In [6]:
# Each variant below is a new frame; the loaded networks themselves are not modified.
# NOTE(review): the column is 'sign_decision' in collecTRI_signed but 'sign.decision'
# in the TF-role files — confirm against the CSV headers.

# collecTRI agnostic: every interaction gets weight 1
collecTRI_agnostic = collecTRI_signed.assign(weight=1)

# collecTRI_PMID: keep the weight where sign_decision is 'PMID', otherwise set it to 1
collecTRI_PMID = collecTRI_signed.assign(
    weight=lambda net: net['weight'].where(net['sign_decision'].isin(['PMID']), 1)
)

# collecTRI_TF: keep the weight where sign.decision is 'TF role', otherwise set it to 1
collecTRI_TF = collecTRI_TFrole.assign(
    weight=lambda net: net['weight'].where(net['sign.decision'].isin(['TF role']), 1)
)

# collecTRI_PMID + collecTRI_TF: keep the weight for 'PMID' or 'TF role' decisions, otherwise 1
collecTRI_PMID_TF = collecTRI_PMID_TFrole.assign(
    weight=lambda net: net['weight'].where(net['sign.decision'].isin(['PMID', 'TF role']), 1)
)

# collecTRI_repression: rows whose sign_decision is "default activation" get weight -1
collecTRI_repression = collecTRI_signed.assign(
    weight=lambda net: net['weight'].where(net['sign_decision'].ne("default activation"), -1)
)

In [7]:
# Signing variants to compare (this rebinds `nets` and `decouple_kws`)
nets = dict([
    ('collecTRI', collecTRI_signed),
    ('collecTRI_agnostic', collecTRI_agnostic),
    ('collecTRI_PMID', collecTRI_PMID),
    ('collecTRI_TF', collecTRI_TF),
    ('collecTRI_PMID_TF', collecTRI_PMID_TF),
    ('collecTRI_PMID_regulon_TF', collecTRI_PMID_regulon_TFrole),
    ('collecTRI_repression', collecTRI_repression),
])

# Per-network decoupler arguments: every variant is scored with the 'ulm' method
decouple_kws = {net_name: {'methods': 'ulm'} for net_name in nets}

# Run the benchmark pipeline on the filtered knockTF data for each variant
df_sign = dc.benchmark(
    mat,
    obs,
    nets,
    perturb='TF',
    sign=-1,
    verbose=True,
    decouple_kws=decouple_kws,
)

Using collecTRI network...
Extracting inputs...
Formating net...
109 experiments without sources in net, they will be removed.
Running methods...
52 features of mat are empty, they will be removed.
Running ulm on mat with 279 samples and 21933 targets for 766 sources.
Calculating metrics...
Computing metrics...
Done.
Using collecTRI_agnostic network...
Extracting inputs...
Formating net...
109 experiments without sources in net, they will be removed.
Running methods...
52 features of mat are empty, they will be removed.
Running ulm on mat with 279 samples and 21933 targets for 766 sources.
Calculating metrics...
Computing metrics...
Done.
Using collecTRI_PMID network...
Extracting inputs...
Formating net...
109 experiments without sources in net, they will be removed.
Running methods...
52 features of mat are empty, they will be removed.
Running ulm on mat with 279 samples and 21933 targets for 766 sources.
Calculating metrics...
Computing metrics...
Done.
Using collecTRI_TF network...

### Save results

In [8]:
# Write the sign-strategy benchmark results to disk
df_sign.to_csv('../../output/benchmark/benchmark_sign_res.csv')