In [1]:
#! pip install anoexpress grenadine

In [4]:
import anoexpress as xpress
import pandas as pd
import numpy as np

### Gene regulatory networks

We infer the network with [grenadine](https://www.mdpi.com/2073-4425/14/2/269), which appears to be largely a collection of wrappers around scikit-learn functions.

In [24]:
from grenadine.Inference.inference import score_links
from grenadine.Inference.regression_predictors import Lasso_score, GENIE3, AdaBoost_regressor, Elastica
from grenadine.Preprocessing.standard_preprocessing import z_score 
from tqdm.autonotebook import tqdm

Load the fold-change data and some gene annotation data, and our list of transcription factors. 

In [25]:
# Fold-change table from AnoExpress, indexed by GeneID (the GeneName column is dropped)
fc_data = (
    xpress.data(data_type='fcs', analysis='gamb_colu_arab_fun', microarray=True)
    .drop(columns='GeneName')
    .set_index('GeneID')
)

# Gene annotation lookup: ID, name and free-text description
df_annot = pd.read_csv(
    "https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/results/fcs.gamb_colu.tsv",
    sep="\t",
)[['GeneID', 'GeneName', 'GeneDescription']]

# Load tfs, relabelling the three columns of the file
tf_list = (
    pd.read_csv("../../resources/AGAP_TFs.tsv", sep="\t")
    .set_axis(['GeneID', 'TFID', 'DrosophilaID'], axis=1)
)

# apply z score along axis=1, then discard rows that contain any NaN
fc_zdata = z_score(fc_data, axis=1).dropna()

fc_zdata.head(2)

Unnamed: 0_level_0,Tiefora_v_Ngousso,Gou_v_Moz,Ban_v_BanS,BanRe_v_BanS,Bak_v_Kisumu,VK7_v_Kisumu,Cameroon_v_Ngousso,Chad_v_Ngousso,Niger_v_Ngousso,Nigeria_v_Ngousso,...,VK72011,TiassaleMali,TiassaleOkyero,Tiassale2011,KovieOkyero,KovieMalanville,Youande,Hai,Muheza,Dar
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AGAP007364,-1.621473,0.536389,-0.232404,1.811886,1.02562,1.436226,-0.24114,0.763532,-0.599328,-0.214931,...,0.872929,0.329622,0.231335,-1.251614,0.771988,1.333761,-0.237334,-1.630161,-0.031314,-0.95419
AGAP010071,-2.179811,0.866916,-2.156642,0.079169,0.090754,-0.17569,-1.646923,-1.600585,-1.415233,-1.473156,...,1.114854,0.527519,0.281836,-0.33311,0.416524,0.398674,0.488602,0.420738,0.610534,0.547198


Run the algorithm. grenadine offers several score predictors to choose from (e.g. Lasso, GENIE3, AdaBoost); the cell below uses the AdaBoost regressor.

In [None]:
#extra_args = {'alpha':0.5}

# Score predictor handed to grenadine; options: Lasso_score, GENIE3, AdaBoost_regressor
algorithm = AdaBoost_regressor

score_matrix = score_links(
    gene_expression_matrix=fc_zdata,
    score_predictor=algorithm,
    tf_list=tf_list.GeneID.to_list(),
    progress_bar=True,
    # **extra_args,
)

# Transpose, expose the former index as a GeneID column, attach the TFID from the
# first two columns of tf_list, and move GeneID/TFID to the front of the frame.
score_matrix = (
    score_matrix.T
    .reset_index()
    .rename(columns={'index': 'GeneID'})
    .merge(tf_list.iloc[:, :2], how='left')
    .set_index(['GeneID', 'TFID'])
    .reset_index()
)

score_matrix.head(2)

  0%|          | 0/7842 [00:00<?, ?it/s]

In [9]:
# `score_matrix` has already been transposed and annotated with TFID at the end of
# the previous cell. Repeating that reshaping here would transpose the table a
# second time and corrupt it on a top-to-bottom run, so this cell only displays it.
score_matrix.head(2)

Unnamed: 0,GeneID,TFID,AGAP000002,AGAP000005,AGAP000007,AGAP000008,AGAP000009,AGAP000010,AGAP000011,AGAP000012,...,AGAP013527,AGAP013528,AGAP013529,AGAP013533,AGAP013535,AGAP013540,AGAP013543,AGAP013544,AGAP013545,AGAP028019
0,AGAP000005,Dsp1,0.003414,,0.000649,0.066196,0.000205,0.143268,0.001354,0.054877,...,0.000951,0.017228,0.00265,0.006354,0.002637,0.000489,0.016509,0.00373,0.002947,0.002525
1,AGAP000037,CG8909,0.016105,0.00151,0.002017,0.003618,0.076353,0.003602,0.003562,0.004551,...,0.001401,0.000904,0.001053,0.000417,0.001903,0.003653,0.00188,0.004467,0.022242,0.000165


Now let's find the associated genes for each transcription factor.

In [10]:
def tf_regulatees(score_matrix, tf, annot=None):
    """
    Return the annotated target genes of one transcription factor, best score first.

    Parameters
    ----------
    score_matrix : pd.DataFrame
        Table with `GeneID` and `TFID` columns identifying the transcription
        factor of each row; every other column holds the link score to the
        target gene named by that column.
    tf : str
        GeneID of the transcription factor to look up.
    annot : pd.DataFrame, optional
        Annotation table with a `GeneID` column, merged onto the targets.
        Defaults to the notebook-level `df_annot`.

    Returns
    -------
    pd.DataFrame or None
        Columns `GeneID` and `score` plus the annotation columns, restricted to
        targets whose score is > 0 and that are present in `annot`, sorted by
        descending score. None when `tf` has no row in `score_matrix`.
    """
    if annot is None:
        annot = df_annot

    # Plain boolean mask: avoids re-indexing the whole matrix on every call and
    # avoids interpolating the ID into a query string.
    tf_rows = score_matrix[score_matrix['GeneID'] == tf]
    if tf_rows.empty:
        return None

    # A GeneID that maps to several TFIDs appears once per TFID after the merge
    # with the TF list; one row is enough and prevents duplicated targets.
    scores = tf_rows.iloc[[0]].drop(columns=['GeneID', 'TFID'])

    # NaN compares False here, so missing scores are dropped along with zeros.
    positive = scores.loc[:, (scores.values > 0)[0]]

    return (
        positive
        .melt(var_name='GeneID', value_name='score')
        .merge(annot, on='GeneID')
        .sort_values(by='score', ascending=False)
    )

# get genes for each tf
tf = []
# iterrows() is a generator, so tqdm needs an explicit total to show real progress
for idx, row in tqdm(tf_list.iterrows(), total=len(tf_list)):
    df = tf_regulatees(score_matrix, row['GeneID'])
    if df is None:
        # this transcription factor has no row in the score matrix
        continue
    tf.append(df.assign(tf=row['TFID']))

# one long table of (target gene, score, annotation, tf) links
tf_df = pd.concat(tf).drop_duplicates()

0it [00:00, ?it/s]

### P450s

In [20]:
def list_tf_links(filter_column='GeneName', filter_pattern='CYP', tf=None, tf_links=None):
    """
    List genes associated with transcription factors, filtering by `filter_pattern`
    on `filter_column`, and optionally filtering by `tf` (transcription factor).

    Parameters
    ----------
    filter_column : str
        Column of the links table to search.
    filter_pattern : str
        Pattern passed to `Series.str.contains` (treated as a regular expression).
        Rows where `filter_column` is missing never match.
    tf : str, optional
        If given (and non-empty), keep only rows whose `tf` column equals it.
    tf_links : pd.DataFrame, optional
        Links table to search. Defaults to the notebook-level `tf_df`.

    Returns
    -------
    pd.DataFrame
        The matching rows, in their original order.
    """
    if tf_links is None:
        tf_links = tf_df

    # Boolean mask instead of an interpolated query string, so patterns that
    # contain quotes or backslashes are handled correctly.
    matches = tf_links[filter_column].str.contains(filter_pattern, na=False)
    df = tf_links[matches]
    if tf:
        df = df[df['tf'] == tf]
    return df

In [19]:
# Raise the row display limit so long result tables are not truncated
pd.set_option('display.max_rows', 20000)
# P450s: match on the gene description and keep only links scoring above 0.05
list_tf_links(filter_column='GeneDescription', filter_pattern='P450').query("score > 0.05")

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
1579,AGAP002419,0.29156,CYP4D22,cytochrome P450 [Source:VB Community Annotation],Dsp1
4652,AGAP007480,0.150373,CYP6AH1,cytochrome P450 [Source:VB Community Annotation],Dsp1
7437,AGAP012296,0.082147,CYP9J5,cytochrome P450 [Source:VB Community Annotation],Dsp1
3824,AGAP005992,0.108056,CYP302A1,cytochrome P450 [Source:VB Community Annotation],toy
1880,AGAP002870,0.050719,CYP6AD1,cytochrome P450 [Source:VB Community Annotation],toy
6750,AGAP010966,0.088584,CYP6AJ1,cytochrome P450 [Source:VB Community Annotation],CG34031
7367,AGAP005774,0.122384,CYP49A1,cytochrome P450 [Source:VB Community Annotation],CG2712
2856,AGAP002195,0.109487,CYP325F2,cytochrome P450 [Source:VB Community Annotation],CG2712
13500,AGAP010966,0.106183,CYP6AJ1,cytochrome P450 [Source:VB Community Annotation],CG2712
1908,AGAP001443,0.077586,CYP325J1,cytochrome P450 [Source:VB Community Annotation],CG2712


### GSTs

In [14]:
# GSTs: match on the gene name; all links are shown, with no score threshold
list_tf_links(filter_column='GeneName', filter_pattern='GST')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
5726,AGAP009193,0.050981,GSTE4,glutathione S-transferase epsilon class 4 [Sou...,Dsp1
2829,AGAP004382,0.049836,GSTD3,glutathione S-transferase delta class 3 [Sourc...,Dsp1
3675,AGAP005749,0.042399,GSTO1,glutathione S-transferase omega class 1 [Sourc...,Dsp1
5727,AGAP009194,0.025980,GSTE2,glutathione S-transferase epsilon class 2 [Sou...,Dsp1
5723,AGAP009190,0.013689,GSTE8,glutathione S-transferase epsilon class 8 [Sou...,Dsp1
...,...,...,...,...,...
10796,AGAP004163,0.001573,GSTD7,glutathione S-transferase delta class 7 [Sourc...,E(spl)mgamma-HLH
22908,AGAP009193,0.001240,GSTE4,glutathione S-transferase epsilon class 4 [Sou...,E(spl)mgamma-HLH
22925,AGAP009197,0.001114,GSTE3,glutathione S-transferase epsilon class 3 [Sou...,E(spl)mgamma-HLH
22900,AGAP009191,0.000795,GSTE6,glutathione S-transferase epsilon class 6 [Sou...,E(spl)mgamma-HLH


### COEs

In [22]:
# 'esterase' is matched as a substring of the description, so besides the
# carboxylesterases this also returns phosphodiesterases, thioesterases and
# acetylcholinesterase.
list_tf_links(filter_column='GeneDescription', filter_pattern='esterase')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
1444,AGAP002220,0.110741,,U6 snRNA phosphodiesterase [Source:UniProtKB/T...,Dsp1
5996,AGAP009695,0.09503,,palmitoyl-protein thioesterase [Source:VB Comm...,Dsp1
3952,AGAP006228,0.047366,COEAE2F,carboxylesterase [Source:VB Community Annotation],Dsp1
7070,AGAP011575,0.018283,COE15O,carboxylesterase [Source:VB Community Annotation],Dsp1
7256,AGAP011940,0.017821,,sphingomyelin phosphodiesterase [Source:VB Com...,Dsp1
3468,AGAP005371,0.012454,COEBE2C,carboxylesterase beta esterase [Source:VB Comm...,Dsp1
4401,AGAP007001,0.01183,,ubiquitin thioesterase OTU1 [Source:VB Communi...,Dsp1
890,AGAP001356,0.011342,ACE1,Acetylcholinesterase [Source:UniProtKB/Swiss-P...,Dsp1
1873,AGAP002863,0.008507,COEAE6O,carboxylesterase alpha esterase [Source:VB Com...,Dsp1
7029,AGAP011507,0.008087,COE13O,carboxylesterase [Source:VB Community Annotation],Dsp1


### UGTs

In [43]:
# Match on the description text, since the hits carry no GeneName.
# NOTE(review): the rows returned are assorted glucosyltransferases — confirm that
# this pattern actually captures the UGT family this section is about.
list_tf_links(filter_column='GeneDescription', filter_pattern='glucosyltransferase')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
178,AGAP009562,0.147917,,"UDP-glucose:O-linked fucose beta-1,3-glucosylt...",toy
177,AGAP004267,0.084487,,O-glucosyltransferase rumi homolog [Source:Uni...,row
89,AGAP002420,0.078126,,"alpha-1,2-glucosyltransferase [Source:VB Commu...",row
113,AGAP004267,0.016411,,O-glucosyltransferase rumi homolog [Source:Uni...,CG12162
7,AGAP002420,0.083652,,"alpha-1,2-glucosyltransferase [Source:VB Commu...",dalao
48,AGAP002420,0.135029,,"alpha-1,2-glucosyltransferase [Source:VB Commu...",pfk
43,AGAP009137,0.033898,,Ecdysteroid UDP-glucosyltransferase [Source:VB...,CG12605
34,AGAP003560,0.013405,,UDP-glucose:glycoprotein glucosyltransferase [...,ID1
34,AGAP003560,0.013405,,UDP-glucose:glycoprotein glucosyltransferase [...,emc
145,AGAP009562,0.030833,,"UDP-glucose:O-linked fucose beta-1,3-glucosylt...",pdm2


### chemosensory

In [47]:
# Chemosensory proteins: descriptions containing 'chemo'
list_tf_links(filter_column='GeneDescription', filter_pattern='chemo')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
142,AGAP008062,0.053289,CSP4,chemosensory protein 4 [Source:VB Community An...,toy
148,AGAP008055,0.057619,CSP3,chemosensory protein 3 [Source:VB Community An...,pfk
185,AGAP008062,0.00487,CSP4,chemosensory protein 4 [Source:VB Community An...,Oli
217,AGAP008062,0.340335,CSP4,chemosensory protein 4 [Source:VB Community An...,
88,AGAP008055,0.05532,CSP3,chemosensory protein 3 [Source:VB Community An...,Asciz


In [21]:
# Broader match: 'sensory' also picks up sensory appendage proteins in addition
# to the chemosensory proteins
list_tf_links(filter_column='GeneDescription', filter_pattern='sensory')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
5053,AGAP008051,0.007675,SAP1,sensory appendage protein 1 [Source:VB Communi...,Dsp1
5054,AGAP008055,0.001559,CSP3,chemosensory protein 3 [Source:VB Community An...,Dsp1
5056,AGAP008062,0.000517,CSP4,chemosensory protein 4 [Source:VB Community An...,Dsp1
5056,AGAP008062,0.00249,CSP4,chemosensory protein 4 [Source:VB Community An...,CG8909
5053,AGAP008051,0.001683,SAP1,sensory appendage protein 1 [Source:VB Communi...,CG8909
5054,AGAP008055,0.000775,CSP3,chemosensory protein 3 [Source:VB Community An...,CG8909
5056,AGAP008062,0.046325,CSP4,chemosensory protein 4 [Source:VB Community An...,onecut
5053,AGAP008051,0.002606,SAP1,sensory appendage protein 1 [Source:VB Communi...,onecut
5054,AGAP008055,0.001006,CSP3,chemosensory protein 3 [Source:VB Community An...,onecut
5056,AGAP008062,0.061308,CSP4,chemosensory protein 4 [Source:VB Community An...,toy
