<a href="https://colab.research.google.com/github/sanjaynagi/rna-seq-meta/blob/main/enrichment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

**Load fold change data to get top candidates**

In [17]:
fc_data = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fc_data.tsv", sep="\t")
fc_data = fc_data.set_index(['GeneID', 'GeneName', 'GeneDescription'])
fc_data *= -1 # i have reversed the polarity so need to do this

fc_medians = fc_data.apply(np.nanmedian, axis=1).to_frame().rename(columns={0:'median log2 Fold Change'})
fc_medians = fc_medians.sort_values('median log2 Fold Change', ascending=False)

fc_medians.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,median log2 Fold Change
GeneID,GeneName,GeneDescription,Unnamed: 3_level_1
AGAP028402,,,3.01
AGAP000047,CPR130,cuticular protein RR-2 family 130 [Source:VB Community Annotation],2.82
AGAP002865,CYP6P3,cytochrome P450 [Source:VB Community Annotation],2.75


In [42]:
def go_hypergeometric(target_gene_list):
    
  fc_data = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fc_data.tsv", sep="\t")
  fc_genes = fc_data.reset_index()['GeneID'].to_list()

  # load gene annotation file 
  gaf_df = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/resources/AgamP4.gaf", sep="\t")
  go_annotations = gaf_df[['go_term', 'descriptions']].rename(columns={'go_term':'annotation'}).drop_duplicates()
  gaf_df = gaf_df[['GeneID', 'go_term']].drop_duplicates()
  gaf_df = gaf_df.query("GeneID in @fc_genes")
  N = gaf_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
  k = np.isin(gaf_df.loc[:, 'GeneID'].unique(), target_gene_list).sum() 
  
  hyper_geo = _hypergeometric(
      annotation_df=gaf_df, 
      column_name='go_term', 
      target_gene_list=target_gene_list,
      N=N,
      k=k)    
  hyper_geo = hyper_geo.merge(go_annotations, how='left')
  return(hyper_geo)

def pfam_hypergeometric(target_gene_list):

  fc_data = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fc_data.tsv", sep="\t")
  fc_genes = fc_data.reset_index()['GeneID'].to_list()
    
  # load gene annotation file 
  pfam_df = pd.read_csv("https://github.com/sanjaynagi/rna-seq-meta/blob/main/resources/Anogam_long.pep_Pfamscan.seqs.gz?raw=true", sep="\s+", header=None, compression='gzip').iloc[:, [0,4]]
  pfam_df.loc[:, 0] = pfam_df.loc[:, 0].str.replace("Anogam_", "").str.replace("-R[A-Z]", "", regex=True)
  pfam_df.columns = ['GeneID', 'pfam']
  pfam_df = pfam_df.query("GeneID in @fc_genes")
  N = pfam_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
  k = np.isin(pfam_df.loc[:, 'GeneID'].unique(), target_gene_list).sum()  
  
  hyper_geo = _hypergeometric(
      annotation_df=pfam_df, 
      column_name='pfam', 
      target_gene_list=target_gene_list,
      N=N,
      k=k)
      
  return(hyper_geo)

def _hypergeometric(annotation_df, column_name, target_gene_list, N, k):
    from scipy.stats import hypergeom
    from tqdm import tqdm

    sig_list = []
    res_list = []
    for annot in tqdm(annotation_df.loc[:, column_name].unique()):

        annot_genes = annotation_df.query("{col} == @annot".format(col=column_name))['GeneID']
        m = len(annot_genes)

        x = annot_genes.isin(target_gene_list).sum()
        # Python
        res = hypergeom(M=N, 
                        n=m, 
                        N=k).sf(x-1)
        sig_list.append(annot)
        res_list.append(res)    

    hyper_geo =  pd.DataFrame({'annotation': sig_list, 'pval':res_list})
    return(hyper_geo.sort_values(by='pval'))
    

Take the top 5% percentile of median genes (429/8599)

In [39]:
top_median_geneIDs = fc_medians.reset_index().loc[:, 'GeneID'][:429] # first 429 geneIDs ranked by median 

# Go annotations

The only thing with the below is I havent corrected for multiple testing

In [40]:
median_go_hypergeo_df = go_hypergeometric(top_median_geneIDs)
median_go_hypergeo_df.query("pval < 0.05")

100%|██████████| 4737/4737 [00:23<00:00, 203.33it/s]


Unnamed: 0,annotation,pval,descriptions
0,GO:0042302,2.780871e-17,structural constituent of cuticle
1,GO:0004252,9.301260e-16,serine-type endopeptidase activity
2,GO:0016705,2.212413e-14,"oxidoreductase activity, acting on paired dono..."
3,GO:0020037,1.722360e-13,heme binding
4,GO:0005506,4.622948e-13,iron ion binding
...,...,...,...
71,GO:0004705,4.397863e-02,JUN kinase activity
72,GO:0004846,4.397863e-02,urate oxidase activity
73,GO:0050113,4.397863e-02,inositol oxygenase activity
74,GO:0007254,4.397863e-02,JNK cascade


# Pfam Domains

In [41]:
median_pfam_hypergeo_df  = pfam_hypergeometric(top_median_genes)
median_pfam_hypergeo_df.head(30)

100%|██████████| 3695/3695 [00:13<00:00, 266.49it/s]


Unnamed: 0,annotation,pval
2939,C_tripleX,2.026894e-37
24,Trypsin,8.731332e-17
445,CBM_14,1.308475e-15
495,Chitin_bind_4,7.532351e-15
588,p450,1.42149e-14
44,7tm_7,8.535437e-08
359,PBP_GOBP,1.466415e-07
203,Fibrinogen_C,1.641451e-06
610,GST_N_3,4.156112e-06
2101,GST_C,1.099233e-05


Lots of cool stuff here too - P450s, obps, ORs, GRs, GST, Elongases (ELO), chitin binding seems to be coming up alot. 

In [None]:
### FOR REFERENCE - TO MAP pfam domain to gene family name. Lots of other stuff coming up as significant I have no idea what that is 
 {'Cytochrome P450s':'p450', 
             'GSTs':['GST_N', 'GST_N_3', 'GST_C'], 
             'Carboxylesterases': 'COesterase', 
             'ABC-transporters':['ABC_membrane', 'ABC_tran'],
             'CSP': 'OS-D',
             'UGTs': 'UDPGT',
             'Odorant binding proteins':'PBP_GOBP', 
             'Olfactory receptors':'7tm_6', 
             'Ionotropic receptors':['Lig_chan','7tm_1'],
             'Gustatory receptors': '7tm_7',
             'Fatty acid synthases':'ketoacyl-synt',
             'FA Elongase':'ELO',
             'FA desaturase':'FA_desaturase',
             'FA reductase':'NAD_binding_4',
             }