<a href="https://colab.research.google.com/github/sanjaynagi/AnoExpress/blob/main/workflow/notebooks/gene-regulatory-network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q anoexpress grenadine tqdm

In [17]:
import anoexpress as xpress
import pandas as pd
import numpy as np

from grenadine.Inference.inference import score_links
from grenadine.Inference.regression_predictors import *
from grenadine.Preprocessing.standard_preprocessing import z_score
from tqdm.autonotebook import tqdm

def gene_regulatory_network(algorithm, analysis, microarray, **extra_args):
    """
    Infer a gene regulatory network from AnoExpress fold-change data.

    Fold changes are loaded with `xpress.data`, z-scored along axis 1 (after which
    `.dropna()` is applied), and scored with grenadine's `score_links` using the
    transcription-factor gene IDs from the AnoExpress resources. The score matrix
    is then turned into a table of TF -> target links by `get_targets_from_tf_list`.

    Parameters
    ----------
    algorithm : callable
        Passed to `score_links` as `score_predictor`.
    analysis : str
        Forwarded to `xpress.data`.
    microarray : bool
        Forwarded to `xpress.data`.
    **extra_args
        Additional keyword arguments forwarded unchanged to `score_links`.

    Returns
    -------
    pandas.DataFrame
        The result of `get_targets_from_tf_list` for the inferred scores.
    """
    expression = xpress.data(data_type='fcs', analysis=analysis, microarray=microarray)
    annotations = pd.read_csv(
        "https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/resources/AgamP4.annots.tsv",
        sep="\t",
    )

    # Transcription factor table; columns are renamed to the names used downstream.
    transcription_factors = pd.read_csv(
        "https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/resources/AgamP4.tfs",
        sep="\t",
    )
    transcription_factors.columns = ['GeneID', 'TFID', 'DrosophilaID']

    # Standardise the expression values before inference.
    standardised = z_score(expression, axis=1).dropna()
    tf_gene_ids = transcription_factors.GeneID.to_list()

    link_scores = score_links(
        gene_expression_matrix=standardised,
        score_predictor=algorithm,
        tf_list=tf_gene_ids,
        progress_bar=True,
        **extra_args,
    )

    return get_targets_from_tf_list(
        tf_list=transcription_factors,
        score_matrix=link_scores,
        df_annot=annotations,
    )

def tf_regulatees(score_matrix, tf, df_annot):
    """
    Return the annotated, positively scored targets of one transcription factor.

    Parameters
    ----------
    score_matrix : pandas.DataFrame
        Wide table with 'GeneID' and 'TFID' identifier columns; every other
        column is treated as a candidate target holding a link score.
    tf : str
        Value looked up in the 'GeneID' column of `score_matrix`.
    df_annot : pandas.DataFrame
        Annotation table merged onto the targets via the columns it shares with
        the melted result (default inner merge, so unannotated targets are dropped).

    Returns
    -------
    pandas.DataFrame or None
        One row per target with 'GeneID', 'score' and the annotation columns,
        sorted by descending score; None when `tf` is not in `score_matrix`.
    """
    # Select the TF's rows with a plain comparison instead of an interpolated
    # query string: this works for IDs containing quotes and avoids re-indexing
    # the whole matrix on every call.
    tf_rows = score_matrix.loc[score_matrix['GeneID'] == tf]
    if tf_rows.empty:
        return None

    tf_score = tf_rows.set_index(['GeneID', 'TFID'])

    # Keep only targets with a positive score; the mask is taken from the first
    # matching row.
    mask = (tf_score.values > 0)[0]
    tf_score = tf_score.loc[:, mask]

    # Wide -> long: the remaining column labels become the target 'GeneID'.
    tf_score = tf_score.melt(var_name='GeneID', value_name='score').merge(df_annot)
    return tf_score.sort_values(by='score', ascending=False)

def get_targets_from_tf_list(tf_list, score_matrix, df_annot):
    """
    Build a long table of TF -> target links from a link score matrix.

    Parameters
    ----------
    tf_list : pandas.DataFrame
        Transcription factor table with 'GeneID' and 'TFID' columns.
    score_matrix : pandas.DataFrame
        Score matrix; it is transposed here so that each row can be looked up by
        the gene ID of a transcription factor.
    df_annot : pandas.DataFrame
        Annotation table passed through to `tf_regulatees`.

    Returns
    -------
    pandas.DataFrame
        Concatenated, de-duplicated targets of every TF found in the score
        matrix, with added 'tf' (TFID) and 'tf_gene_id' (GeneID) columns.

    Raises
    ------
    ValueError
        If none of the transcription factors in `tf_list` is present in
        `score_matrix`.
    """
    scores_by_tf = score_matrix.T.reset_index().rename(columns={'index': 'GeneID'})
    # Attach the TF name to each row; set_index/reset_index moves the two
    # identifier columns to the front of the frame.
    scores_by_tf = (
        scores_by_tf
        .merge(tf_list[['GeneID', 'TFID']], how='left', on='GeneID')
        .set_index(['GeneID', 'TFID'])
        .reset_index()
    )

    links = []
    tf_pairs = zip(tf_list['GeneID'], tf_list['TFID'])
    # `total` is required for a meaningful progress bar because the iterable
    # has no length of its own.
    for gene_id, tf_name in tqdm(tf_pairs, total=len(tf_list)):
        targets = tf_regulatees(scores_by_tf, gene_id, df_annot=df_annot)
        if targets is None:
            # This TF has no row in the score matrix.
            continue
        links.append(targets.assign(tf=tf_name, tf_gene_id=gene_id))

    if not links:
        raise ValueError(
            "None of the transcription factors in tf_list were found in score_matrix"
        )

    return pd.concat(links).drop_duplicates()

def list_tf_links(df, filter_name=None, filter_desc=None, filter_tf=None, filter_score=None):
    """
    List genes associated with transcription factors, filtering by gene name,
    description, TF, and associated score.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of TF -> target links with 'GeneName', 'GeneDescription', 'tf'
        and 'score' columns.
    filter_name : str, optional
        Pattern searched for in 'GeneName' with `Series.str.contains`; rows
        with a missing name never match.
    filter_desc : str, optional
        Pattern searched for in 'GeneDescription' with `Series.str.contains`;
        rows with a missing description never match.
    filter_tf : str, optional
        Keep only rows whose 'tf' equals this value.
    filter_score : float, optional
        Keep only rows whose 'score' is strictly greater than this value.
        A threshold of 0 is honoured.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that pass every requested filter.
    """
    # Boolean masks are used instead of interpolated query strings so that
    # filter values containing quotes or backslashes are handled correctly.
    if filter_name:
        df = df[df['GeneName'].str.contains(filter_name, na=False)]
    if filter_desc:
        df = df[df['GeneDescription'].str.contains(filter_desc, na=False)]
    if filter_tf:
        df = df[df['tf'] == filter_tf]
    # Compare against None explicitly: 0 is a valid threshold but is falsy.
    if filter_score is not None:
        df = df[df['score'] > filter_score]
    return df

![image](https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/docs/logo.png)

# Gene regulatory networks

In this notebook, we build gene regulatory networks using the recent [grenadine](https://www.mdpi.com/2073-4425/14/2/269) package, which, although useful, appears to be mostly a collection of scikit-learn functions under a new name.

Run the algorithm. We can choose from a few; see [here](https://grenadine.readthedocs.io/en/latest/grenadine.Inference.html#module-grenadine.Inference.regression_predictors). Any of the regression predictors should work. I like the LASSO as it's fast, but the GENIE3 algorithm is pretty well established for GRN building, so perhaps that's a good option?

In [24]:
# GENIE3 is used here; `Lasso_score` is an alternative predictor.
# Extra keyword arguments (e.g. **{'alpha': 0.5} for the LASSO) get passed on to
# the scikit-learn algorithm and should be left out for other predictors.
tf_df = gene_regulatory_network(
    algorithm=GENIE3,
    analysis='gamb_colu_arab_fun',
    microarray=True,
)

  0%|          | 0/7842 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [25]:
# Display the table of TF -> target links returned by gene_regulatory_network.
tf_df

Unnamed: 0,GeneID,score,GeneName,GeneDescription,tf,tf_gene_id
7370,AGAP012148,0.587917,,Mitogen-activated protein kinase [Source:UniPr...,Dsp1,AGAP000005
3674,AGAP005744,0.553407,LRIM26,leucine-rich immune protein (Coil-less) [Sourc...,Dsp1,AGAP000005
4626,AGAP007406,0.533781,,Elongation factor 1-alpha [Source:UniProtKB/Tr...,Dsp1,AGAP000005
4162,AGAP006614,0.513895,,transcription factor BTF3 homolog 4 [Source:VB...,Dsp1,AGAP000005
184,AGAP000281,0.508714,,,Dsp1,AGAP000005
...,...,...,...,...,...,...
6561,AGAP002502,0.000058,,translation initiation factor 4G [Source:VB Co...,E(spl)mgamma-HLH,AGAP012342
15055,AGAP005902,0.000053,,forkhead protein/ forkhead protein domain [Sou...,E(spl)mgamma-HLH,AGAP012342
12429,AGAP004811,0.000047,CTL1,C-type lectin (CTL) [Source:VB Community Annot...,E(spl)mgamma-HLH,AGAP012342
319,AGAP000123,0.000046,CTLSE2,C-type lectin (CTL) [Source:VB Community Annot...,E(spl)mgamma-HLH,AGAP012342


### P450s

In [None]:
# Raise pandas' row-display cap so the filtered tables are shown in full.
# The option is global, so it also applies to every cell run after this one.
pd.options.display.max_rows = 20000

# Links whose target gene description matches 'P450'.
list_tf_links(tf_df, filter_desc='P450')

### GSTs

In [None]:
# Links whose target gene name matches 'GST'.
list_tf_links(tf_df, filter_name='GST')

### COEs (carboxylesterases)

In [None]:
# Links whose target gene description matches 'esterase'.
list_tf_links(tf_df, filter_desc='esterase')

### UGTs

In [None]:
# Links whose target gene description matches 'glucosyltransferase'.
list_tf_links(tf_df, filter_desc='glucosyltransferase')

### chemosensory

In [None]:
# Links whose target gene description matches 'chemo'.
list_tf_links(tf_df, filter_desc='chemo')

In [None]:
# Links whose target gene description matches 'sensory'.
list_tf_links(tf_df, filter_desc='sensory')