<a href="https://colab.research.google.com/github/sanjaynagi/AnoExpress/blob/main/workflow/notebooks/gene-regulatory-network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q anoexpress grenadine tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.7/113.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.4/299.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.3/203.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [40]:
import anoexpress as xpress
import pandas as pd
import numpy as np

from grenadine.Inference.inference import score_links
from grenadine.Inference.regression_predictors import *
from grenadine.Preprocessing.standard_preprocessing import z_score 
from tqdm.autonotebook import tqdm

def gene_regulatory_network(algorithm, analysis, microarray, **extra_args):
    """
    Build a table of transcription-factor -> target links from AnoExpress fold changes.

    Fold-change data are fetched with ``xpress.data``, passed through grenadine's
    ``z_score`` and scored with ``score_links``; the score matrix is then turned into
    a long table by ``get_targets_from_tf_list``.

    Parameters
    ----------
    algorithm : callable
        Handed to ``score_links`` as ``score_predictor`` (e.g. ``Lasso_score``).
    analysis : str
        Forwarded to ``xpress.data`` as ``analysis``.
    microarray : bool
        Forwarded to ``xpress.data`` as ``microarray``.
    **extra_args
        Forwarded unchanged to ``score_links``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``get_targets_from_tf_list`` returns for the scored links.
    """
    # Fold changes, indexed by GeneID with the GeneName column removed
    raw_fcs = xpress.data(data_type='fcs', analysis=analysis, microarray=microarray)
    fc_data = raw_fcs.drop(columns='GeneName').set_index('GeneID')

    # Gene annotation; always read from the gamb_colu results file
    annotation_columns = ['GeneID', 'GeneName', 'GeneDescription']
    annotation = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/results/fcs.gamb_colu.tsv", sep="\t")
    df_annot = annotation[annotation_columns]

    # Transcription factor table, with its columns renamed to fixed labels
    tf_list = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/resources/AgamP4.tfs", sep="\t")
    tf_list.columns = ['GeneID', 'TFID', 'DrosophilaID']

    # Standardise with z_score (axis=1), then drop any row that still contains NaN
    fc_zdata = z_score(fc_data, axis=1)
    fc_zdata = fc_zdata.dropna()

    regulator_ids = tf_list.GeneID.to_list()
    score_matrix = score_links(
        gene_expression_matrix=fc_zdata,
        score_predictor=algorithm,
        tf_list=regulator_ids,
        progress_bar=True,
        **extra_args,
    )

    return get_targets_from_tf_list(tf_list=tf_list, score_matrix=score_matrix, df_annot=df_annot)

def tf_regulatees(score_matrix, tf, df_annot):
    """
    Return the annotated, positively scored targets of one transcription factor.

    Parameters
    ----------
    score_matrix : pandas.DataFrame
        Must contain 'GeneID' and 'TFID' columns; every other column is treated as a
        target whose column label is its GeneID.
    tf : str
        GeneID of the transcription factor to look up.
    df_annot : pandas.DataFrame
        Annotation table merged onto the result (on the shared columns).

    Returns
    -------
    pandas.DataFrame or None
        Targets with a score > 0, sorted by descending score, or None when no row of
        ``score_matrix`` matches ``tf``.
    """
    by_regulator = score_matrix.set_index(['GeneID', 'TFID'])
    tf_rows = by_regulator.query(f"GeneID == '{tf}'")
    if tf_rows.empty:
        return None

    # Only the first matching row decides which target columns are kept
    is_positive = tf_rows.values[0] > 0
    kept = tf_rows.loc[:, is_positive]

    long_scores = kept.melt(var_name='GeneID', value_name='score')
    annotated = long_scores.merge(df_annot)
    return annotated.sort_values(by='score', ascending=False)

def get_targets_from_tf_list(tf_list, score_matrix, df_annot):
    """
    Collect the positively scored targets of every transcription factor in ``tf_list``.

    Parameters
    ----------
    tf_list : pandas.DataFrame
        Must have 'GeneID' and 'TFID' as its first two columns.
    score_matrix : pandas.DataFrame
        Link scores with one column per transcription factor (labelled by GeneID);
        it is transposed here so that each row describes one transcription factor.
    df_annot : pandas.DataFrame
        Annotation table passed on to ``tf_regulatees``.

    Returns
    -------
    pandas.DataFrame
        Concatenated, de-duplicated output of ``tf_regulatees`` with an extra 'tf'
        column holding the TFID. Empty (columns only) when no transcription factor
        has any target.
    """
    # rename_axis works whether the transposed index is unnamed or already named,
    # unlike renaming the literal 'index' column after reset_index().
    score_matrix = score_matrix.T.rename_axis('GeneID').reset_index()
    score_matrix = (
        score_matrix
        .merge(tf_list.iloc[:, :2], how='left')
        .set_index(['GeneID', 'TFID'])
        .reset_index()  # moves the two key columns to the front
    )

    per_tf = []
    # iterrows() is a generator, so tqdm needs an explicit total to draw a real bar
    for _, row in tqdm(tf_list.iterrows(), total=len(tf_list)):
        targets = tf_regulatees(score_matrix, row['GeneID'], df_annot=df_annot)
        if targets is None:
            continue
        per_tf.append(targets.assign(tf=row['TFID']))

    if not per_tf:
        # pd.concat([]) raises ValueError; return an empty frame with the usual columns
        annot_cols = [col for col in df_annot.columns if col != 'GeneID']
        return pd.DataFrame(columns=['GeneID', 'score', *annot_cols, 'tf'])

    return pd.concat(per_tf).drop_duplicates()

def list_tf_links(df, filter_name=None, filter_desc=None, filter_tf=None, filter_score=None):
    """
    List genes associated with transcription factors, filtering by gene name,
    description, TF, and associated score.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'GeneName', 'GeneDescription', 'tf' and 'score' columns (only the
        columns needed by the filters that are actually used must exist).
    filter_name : str, optional
        Regular expression; keep rows whose GeneName contains a match.
    filter_desc : str, optional
        Regular expression; keep rows whose GeneDescription contains a match.
    filter_tf : str, optional
        Keep rows whose 'tf' equals this value exactly.
    filter_score : float, optional
        Keep rows whose score is strictly greater than this value. A threshold of 0
        is honoured.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass every supplied filter.
    """
    # Boolean masks instead of f-string queries: filters containing quotes or
    # backslashes are passed through untouched rather than breaking the expression.
    if filter_name:
        df = df[df['GeneName'].str.contains(filter_name, na=False)]
    if filter_desc:
        df = df[df['GeneDescription'].str.contains(filter_desc, na=False)]
    if filter_tf:
        df = df[df['tf'] == filter_tf]
    # `is not None` rather than truthiness so that filter_score=0 is applied
    if filter_score is not None:
        df = df[df['score'] > filter_score]
    return df

![image](https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/docs/logo.png)

# Gene regulatory networks

In this notebook, we build gene regulatory networks using the recent [grenadine](https://www.mdpi.com/2073-4425/14/2/269) package, which, although useful, does seem to be mostly a collection of scikit-learn functions under a new name.

Run the algorithm. We can choose from a few; see [here](https://grenadine.readthedocs.io/en/latest/grenadine.Inference.html#module-grenadine.Inference.regression_predictors). Any of the regression predictors should work. I like the LASSO as it's fast, but the GENIE3 algorithm is pretty well established for GRN building, so perhaps that's a good option?

In [22]:
# Keyword arguments beyond the first three are forwarded to the scoring algorithm.
tf_df = gene_regulatory_network(
    algorithm=Lasso_score,  # GENIE3
    analysis='gamb_colu_arab_fun',
    microarray=True,
    alpha=0.5,  # extra_args, these get passed to the scikit-learn algorithm, remove if not lasso.
)

  0%|          | 0/7842 [00:00<?, ?it/s]

0it [00:00, ?it/s]

### P450s

In [35]:
# Raise the row display limit so long result tables are not truncated
pd.set_option('display.max_rows', 20000)
# Links to genes whose description matches 'P450'
list_tf_links(df=tf_df, filter_desc='P450')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
81,AGAP005774,0.003935,CYP49A1,cytochrome P450 [Source:VB Community Annotation],CG2712
81,AGAP005774,0.003935,CYP49A1,cytochrome P450 [Source:VB Community Annotation],dwg
36,AGAP008020,0.080225,CYP12F2,cytochrome P450 [Source:VB Community Annotation],CG7556
28,AGAP002870,0.019412,CYP6AD1,cytochrome P450 [Source:VB Community Annotation],fd68A
125,AGAP006082,0.159135,CYP301A1,cytochrome P450 [Source:VB Community Annotation],sim
34,AGAP001443,0.126776,CYP325J1,cytochrome P450 [Source:VB Community Annotation],sim
279,AGAP013511,0.084882,CYP6AG2,cytochrome P450 [Source:VB Community Annotation],sim
6,AGAP000194,0.067411,CYP4C25,cytochrome P450 [Source:VB Community Annotation],sim
56,AGAP002429,0.047114,CYP314A1,cytochrome P450 [Source:VB Community Annotation],sim
42,AGAP001861,0.022173,CYP4H14,cytochrome P450 [Source:VB Community Annotation],sim


### GSTs

In [37]:
# Links to genes whose name matches 'GST'
list_tf_links(tf_df, filter_name='GST')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
176,AGAP009342,0.059965,GSTU3,glutathione S-transferase unclassified 3 [Sour...,toy
166,AGAP009191,0.016787,GSTE6,glutathione S-transferase epsilon class 6 [Sou...,toy
57,AGAP003257,0.009996,GSTU2,glutathione S-transferase unclassified 2 [Sour...,CG2712
57,AGAP003257,0.009996,GSTU2,glutathione S-transferase unclassified 2 [Sour...,dwg
58,AGAP009196,0.014347,GSTE7,glutathione S-transferase epsilon class 7 [Sou...,
92,AGAP004378,0.114347,GSTD11,glutathione S-transferase delta class 11 [Sour...,sim
91,AGAP004163,0.026686,GSTD7,glutathione S-transferase delta class 7 [Sourc...,sim
71,AGAP003257,0.014237,GSTU2,glutathione S-transferase unclassified 2 [Sour...,sim
107,AGAP009197,0.165881,GSTE3,glutathione S-transferase epsilon class 3 [Sou...,run
171,AGAP004164,0.079962,GSTD1-4,glutathione S-transferase delta class 1 [Sourc...,row


### COEs

In [41]:
# Links to genes whose description matches 'esterase'
list_tf_links(df=tf_df, filter_desc='esterase')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
93,AGAP008967,0.16286,,"calcium/calmodulin-dependent 3',5'-cyclic nucl...",CG8909
10,AGAP000466,0.118935,ACE2,acetylcholinesterase [Source:VB Community Anno...,CG8909
173,AGAP008096,0.284035,,Sphingomyelin phosphodiesterase [Source:UniPro...,onecut
202,AGAP009695,0.055901,,palmitoyl-protein thioesterase [Source:VB Comm...,onecut
198,AGAP009507,0.0317,,ubiquitin thioesterase protein OTUB1 [Source:V...,onecut
187,AGAP008967,0.002677,,"calcium/calmodulin-dependent 3',5'-cyclic nucl...",onecut
130,AGAP006956,0.052077,COE10O,carboxylesterase [Source:VB Community Annotation],toy
131,AGAP007001,0.029219,,ubiquitin thioesterase OTU1 [Source:VB Communi...,toy
257,AGAP010917,0.039701,COE09916,carboxylesterase [Source:VB Community Annotation],CG15073
257,AGAP010917,0.039701,COE09916,carboxylesterase [Source:VB Community Annotation],CG4282


### UGTs

In [42]:
# Links to genes whose description matches 'glucosyltransferase'
list_tf_links(df=tf_df, filter_desc='glucosyltransferase')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
178,AGAP009562,0.147917,,"UDP-glucose:O-linked fucose beta-1,3-glucosylt...",toy
177,AGAP004267,0.084487,,O-glucosyltransferase rumi homolog [Source:Uni...,row
89,AGAP002420,0.078126,,"alpha-1,2-glucosyltransferase [Source:VB Commu...",row
113,AGAP004267,0.016411,,O-glucosyltransferase rumi homolog [Source:Uni...,CG12162
7,AGAP002420,0.083652,,"alpha-1,2-glucosyltransferase [Source:VB Commu...",dalao
48,AGAP002420,0.135029,,"alpha-1,2-glucosyltransferase [Source:VB Commu...",pfk
43,AGAP009137,0.033898,,Ecdysteroid UDP-glucosyltransferase [Source:VB...,CG12605
34,AGAP003560,0.013405,,UDP-glucose:glycoprotein glucosyltransferase [...,ID1
34,AGAP003560,0.013405,,UDP-glucose:glycoprotein glucosyltransferase [...,emc
145,AGAP009562,0.030833,,"UDP-glucose:O-linked fucose beta-1,3-glucosylt...",pdm2


### chemosensory

In [43]:
# Links to genes whose description matches 'chemo'
list_tf_links(df=tf_df, filter_desc='chemo')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
142,AGAP008062,0.053289,CSP4,chemosensory protein 4 [Source:VB Community An...,toy
148,AGAP008055,0.057619,CSP3,chemosensory protein 3 [Source:VB Community An...,pfk
185,AGAP008062,0.00487,CSP4,chemosensory protein 4 [Source:VB Community An...,Oli
217,AGAP008062,0.340335,CSP4,chemosensory protein 4 [Source:VB Community An...,
88,AGAP008055,0.05532,CSP3,chemosensory protein 3 [Source:VB Community An...,Asciz


In [45]:
# Links to genes whose description matches 'sensory'
list_tf_links(df=tf_df, filter_desc='sensory')

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf
142,AGAP008062,0.053289,CSP4,chemosensory protein 4 [Source:VB Community An...,toy
211,AGAP008051,0.055988,SAP1,sensory appendage protein 1 [Source:VB Communi...,CG12162
148,AGAP008055,0.057619,CSP3,chemosensory protein 3 [Source:VB Community An...,pfk
185,AGAP008062,0.00487,CSP4,chemosensory protein 4 [Source:VB Community An...,Oli
217,AGAP008062,0.340335,CSP4,chemosensory protein 4 [Source:VB Community An...,
88,AGAP008055,0.05532,CSP3,chemosensory protein 3 [Source:VB Community An...,Asciz
