# Estimate TF activities with decoupler - Comparing the effect of the number of targets on the estimated activities

## Libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import decoupler as dc

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


## Benchmark data

### Load KnockTF data
The benchmark data can be downloaded from Zenodo as shown in scripts/benchmark.ipynb

In [2]:
# Read the KnockTF benchmark tables; the first CSV column becomes the index of each frame.
obs = pd.read_csv(
    '../../data/knockTF_meta.csv',
    index_col=0,
)
mat = pd.read_csv(
    '../../data/knockTF_expr.csv',
    index_col=0,
)

### Filter knockTF data
Only experiments in which the perturbed transcription factor has a log fold-change below -1 are kept.

In [3]:
# Boolean mask over experiments: True where the recorded logFC is below -1.
msk = obs['logFC'].lt(-1)

# Apply the same mask to the expression matrix and to its metadata so both stay in sync.
mat, obs = mat[msk], obs[msk]

## Load GRNs

In [4]:
# Load the three gene regulatory networks that are compared in this notebook.
collecTRI = pd.read_csv(
    '../../output/CollecTRI/CollecTRI.csv',
)
regnet = pd.read_csv(
    '../../data/networks/filtered_regnetwork.csv',
)
doro_ABC = pd.read_csv(
    '../../data/networks/filtered_dorothea_ABC.csv',
)

### Filter GRNs for highly correlated regulons
Activities of TFs with identical or highly correlated regulons cannot be reliably estimated with
multivariate linear models. For each pair of regulons with a correlation of at least 0.95, only one TF is kept.

In [5]:
# Remove correlated sources from regnet.
# Step 1: format the benchmark inputs for regnet so the correlation check can run on them.
# Note that `regnet` is rebound to the network returned by format_benchmark_inputs.
decouple_kws = dict(source='source', target='target', weight='weight', min_n=5)
mat_regnet, obs_regnet, var_regnet, regnet, groupby_regnet = dc.format_benchmark_inputs(
    mat=mat,
    obs=obs,
    sign=-1,
    net=regnet,
    by='experiment',
    perturb='TF',
    groupby=None,
    decouple_kws=decouple_kws,
)

# Step 2: correlation check between the sources of the formatted network.
corr_res = dc.check_corr(regnet)

In [6]:
# Source pairs whose correlation reaches the 0.95 cut-off.
idx_corr = corr_res['corr'].ge(0.95)
corr_tfs = corr_res.loc[idx_corr]

# TFs that were perturbed in the benchmark experiments.
perturbed_tfs = obs['TF'].values

# For every correlated pair exactly one member is dropped:
# if `source2` is a perturbed TF it is kept and `source1` is dropped,
# otherwise `source2` is dropped.
idx_tfs = np.isin(corr_tfs['source2'].values, perturbed_tfs)
tfs_to_remove1 = corr_tfs.loc[idx_tfs, 'source1']
tfs_to_remove2 = corr_tfs.loc[~idx_tfs, 'source2']

tfs_to_remove = pd.concat([tfs_to_remove1, tfs_to_remove2])

# Number of removed TFs that are themselves perturbed in the benchmark (shown as cell output).
np.isin(tfs_to_remove.values, perturbed_tfs).sum()

1

In [7]:
# Flag every regnet edge whose source is one of the TFs selected for removal ...
idx_remove = np.isin(regnet['source'].values, tfs_to_remove.values)

# ... and keep only the remaining edges.
regnet_filtered = regnet.loc[np.logical_not(idx_remove)]

## Data preparation before activity estimation 

In [8]:
# Keyword arguments handed to dc.format_benchmark_inputs via its `decouple_kws` parameter:
# column names of the network plus the minimum number of targets (`min_n`).
decouple_kws = dict(
    source='source',
    target='target',
    weight='weight',
    min_n=5,
)

In [9]:
# Arguments shared by all three networks; only `net` differs between the calls.
benchmark_args = dict(
    mat=mat,
    obs=obs,
    sign=-1,
    by='experiment',
    perturb='TF',
    groupby=None,
    decouple_kws=decouple_kws,
)

# Each call rebinds the network name (regnet, doro_ABC, collecTRI) to the returned network.
mat_regnet, obs_regnet, var_regnet, regnet, groupby_regnet = dc.format_benchmark_inputs(
    net=regnet_filtered, **benchmark_args
)
mat_doro_ABC, obs_doro_ABC, var_doro_ABC, doro_ABC, groupby_doro_ABC = dc.format_benchmark_inputs(
    net=doro_ABC, **benchmark_args
)
mat_collecTRI, obs_collecTRI, var_collecTRI, collecTRI, groupby_collecTRI = dc.format_benchmark_inputs(
    net=collecTRI, **benchmark_args
)

In [10]:
# Wrap each formatted matrix in a labelled DataFrame: rows take the index of the matching
# `obs_*` frame, columns the index of the matching `var_*` frame.
# NOTE(review): `.A` presumably densifies a SciPy sparse matrix; confirm the returned type.
mat_regnet_new = pd.DataFrame(
    data=mat_regnet.A,
    index=obs_regnet.index,
    columns=var_regnet.index,
)
mat_doro_ABC_new = pd.DataFrame(
    data=mat_doro_ABC.A,
    index=obs_doro_ABC.index,
    columns=var_doro_ABC.index,
)
mat_collecTRI_new = pd.DataFrame(
    data=mat_collecTRI.A,
    index=obs_collecTRI.index,
    columns=var_collecTRI.index,
)

## Activity estimation with decoupler

In [11]:
# Settings common to every decoupler run; only the matrix and the network change.
shared_decouple_args = dict(
    source='source',
    target='target',
    weight='weight',
    verbose=True,
    min_n=5,
)

# One run per network. These calls are slow (several minutes each in the recorded output).
results_regnet = dc.decouple(mat_regnet_new, net=regnet, **shared_decouple_args)
results_doro = dc.decouple(mat_doro_ABC_new, net=doro_ABC, **shared_decouple_args)
results_collecTRI = dc.decouple(mat_collecTRI_new, net=collecTRI, **shared_decouple_args)

54 features of mat are empty, they will be removed.
Running mlm on mat with 229 samples and 21931 targets for 622 sources.


100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.78s/it]


54 features of mat are empty, they will be removed.
Running ulm on mat with 229 samples and 21931 targets for 622 sources.
54 features of mat are empty, they will be removed.
Running wsum on mat with 229 samples and 21931 targets for 622 sources.
Infering activities on 1 batches.


100%|████████████████████████████████████████████| 1/1 [04:47<00:00, 287.84s/it]


55 features of mat are empty, they will be removed.
Running mlm on mat with 214 samples and 21930 targets for 297 sources.


100%|█████████████████████████████████████████████| 1/1 [00:04<00:00,  4.55s/it]


55 features of mat are empty, they will be removed.
Running ulm on mat with 214 samples and 21930 targets for 297 sources.
55 features of mat are empty, they will be removed.
Running wsum on mat with 214 samples and 21930 targets for 297 sources.
Infering activities on 1 batches.


100%|████████████████████████████████████████████| 1/1 [03:37<00:00, 217.16s/it]


52 features of mat are empty, they will be removed.
Running mlm on mat with 279 samples and 21933 targets for 774 sources.


100%|█████████████████████████████████████████████| 1/1 [00:16<00:00, 16.55s/it]


52 features of mat are empty, they will be removed.
Running ulm on mat with 279 samples and 21933 targets for 774 sources.
52 features of mat are empty, they will be removed.
Running wsum on mat with 279 samples and 21933 targets for 774 sources.
Infering activities on 1 batches.


100%|████████████████████████████████████████████| 1/1 [05:16<00:00, 316.65s/it]


### Save results
The consensus activity scores are saved.

In [12]:
# Write the consensus activity estimates of each network to CSV.
# The output directory is created first: to_csv does not create missing folders, and a
# failure here would discard the long-running activity estimation.
out_dir = Path('../../output/benchmark')
out_dir.mkdir(parents=True, exist_ok=True)

# Call to_csv on the frame itself instead of through the unbound pd.DataFrame method.
results_regnet['consensus_estimate'].to_csv(out_dir / 'regnet_activity.csv')
results_doro['consensus_estimate'].to_csv(out_dir / 'dorothea_activity.csv')
results_collecTRI['consensus_estimate'].to_csv(out_dir / 'collecTRI_activity.csv')