<a href="https://colab.research.google.com/github/sanjaynagi/rna-seq-meta/blob/main/workflow/notebooks/enrichment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
import plotly.express as px

**Load fold change data to get top candidates**

In [15]:
fc_data = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fc_data.tsv", sep="\t")
fc_data = fc_data.set_index(['GeneID', 'GeneName', 'GeneDescription'])
fc_data *= -1 # i have reversed the polarity so need to do this

fc_medians = fc_data.apply(np.nanmedian, axis=1).to_frame().rename(columns={0:'median log2 Fold Change'})
fc_medians = fc_medians.sort_values('median log2 Fold Change', ascending=False)

fc_medians.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,median log2 Fold Change
GeneID,GeneName,GeneDescription,Unnamed: 3_level_1
AGAP028402,,,3.01
AGAP000047,CPR130,cuticular protein RR-2 family 130 [Source:VB Community Annotation],2.82
AGAP002865,CYP6P3,cytochrome P450 [Source:VB Community Annotation],2.75


In [26]:
def go_hypergeometric(target_gene_list):
    
  fc_data = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fc_data.tsv", sep="\t")
  fc_genes = fc_data.reset_index()['GeneID'].to_list()

  # load gene annotation file 
  gaf_df = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/resources/AgamP4.gaf", sep="\t")
  go_annotations = gaf_df[['go_term', 'descriptions']].rename(columns={'go_term':'annotation'}).drop_duplicates()
  gaf_df = gaf_df[['GeneID', 'go_term']].drop_duplicates()
  gaf_df = gaf_df.query("GeneID in @fc_genes")
  N = gaf_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
  k = np.isin(gaf_df.loc[:, 'GeneID'].unique(), target_gene_list).sum() 
  
  hyper_geo = _hypergeometric(
      annotation_df=gaf_df, 
      column_name='go_term', 
      target_gene_list=target_gene_list,
      N=N,
      k=k)    
  hyper_geo = hyper_geo.merge(go_annotations, how='left')
  return(hyper_geo)

def pfam_hypergeometric(target_gene_list):

  fc_data = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fc_data.tsv", sep="\t")
  fc_genes = fc_data.reset_index()['GeneID'].to_list()
    
  # load gene annotation file 
  pfam_df = pd.read_csv("https://github.com/sanjaynagi/rna-seq-meta/blob/main/resources/Anogam_long.pep_Pfamscan.seqs.gz?raw=true", sep="\s+", header=None, compression='gzip').iloc[:, [0,4]]
  pfam_df.loc[:, 0] = pfam_df.loc[:, 0].str.replace("Anogam_", "").str.replace("-R[A-Z]", "", regex=True)
  pfam_df.columns = ['GeneID', 'pfam']
  pfam_df = pfam_df.query("GeneID in @fc_genes")
  N = pfam_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
  k = np.isin(pfam_df.loc[:, 'GeneID'].unique(), target_gene_list).sum()  
  
  hyper_geo = _hypergeometric(
      annotation_df=pfam_df, 
      column_name='pfam', 
      target_gene_list=target_gene_list,
      N=N,
      k=k)
      
  return(hyper_geo)

def _hypergeometric(annotation_df, column_name, target_gene_list, N, k):
    from scipy.stats import hypergeom
    from tqdm import tqdm
    from statsmodels.stats.multitest import fdrcorrection

    unique_annots = annotation_df.loc[:, column_name].unique()

    sig_list = []
    res_list = []
    for annot in tqdm(unique_annots):

        annot_genes = annotation_df.query("{col} == @annot".format(col=column_name))['GeneID']
        m = len(annot_genes)

        x = annot_genes.isin(target_gene_list).sum()
        # Python
        res = hypergeom(M=N, 
                        n=m, 
                        N=k).sf(x-1)
        sig_list.append(annot)
        res_list.append(res)    

    hyper_geo =  pd.DataFrame({'annotation': sig_list, 'pval':res_list})
    hypo, hyper_geo.loc[:, 'padj'] =  fdrcorrection(hyper_geo['pval'])           #[np.min([padj, 1]) for padj in hyper_geo.loc[:, 'pval']*len(unique_annots)] 
    return(hyper_geo.sort_values(by='pval'))
    

Take the top 5% percentile of median genes (429/8599)

In [27]:
percentile_5 = fc_medians.reset_index()['GeneID'].unique().shape[0] * 0.05

top_median_geneIDs = fc_medians.reset_index().loc[:, 'GeneID'][:int(percentile_5)] # get first 5% percentile geneIDs ranked by median 

# Go annotations

The only thing with the below is I havent corrected for multiple testing

In [28]:
median_go_hypergeo_df = go_hypergeometric(top_median_geneIDs)
median_go_hypergeo_df.query("padj < 0.05")

100%|██████████| 4737/4737 [00:20<00:00, 233.96it/s]


Unnamed: 0,annotation,pval,padj,descriptions
0,GO:0042302,2.780871e-17,1.317299e-13,structural constituent of cuticle
1,GO:0004252,9.30126e-16,2.203003e-12,serine-type endopeptidase activity
2,GO:0016705,2.212413e-14,3.4934e-11,"oxidoreductase activity, acting on paired dono..."
3,GO:0020037,1.72236e-13,2.039705e-10,heme binding
4,GO:0005506,4.622948e-13,4.379781e-10,iron ion binding
5,GO:0006508,2.380475e-12,1.879385e-09,proteolysis
6,GO:0004497,3.757405e-12,2.54269e-09,onooxygenase activity
7,GO:0008236,3.257021e-11,1.928564e-08,serine-type peptidase activity
8,GO:0005576,1.044866e-10,5.49948e-08,xtracellular region
9,GO:0055114,1.465135e-09,6.940344e-07,obsolete oxidation-reduction process


# Pfam Domains

In [29]:
median_pfam_hypergeo_df  = pfam_hypergeometric(top_median_geneIDs)
median_pfam_hypergeo_df.head(30)

100%|██████████| 3695/3695 [00:12<00:00, 298.56it/s]


Unnamed: 0,annotation,pval,padj
2939,C_tripleX,2.026894e-37,7.489372e-34
24,Trypsin,8.731332e-17,1.613114e-13
445,CBM_14,1.308475e-15,1.611605e-12
495,Chitin_bind_4,7.532351e-15,6.958009e-12
588,p450,1.42149e-14,1.050481e-11
44,7tm_7,8.535437e-08,5.256407e-05
359,PBP_GOBP,1.466415e-07,7.740574e-05
203,Fibrinogen_C,1.641451e-06,0.000758145
610,GST_N_3,4.156112e-06,0.001706315
2101,GST_C,1.099233e-05,0.004061667


Lots of cool stuff here too - P450s, obps, ORs, GRs, GST, Elongases (ELO), chitin binding seems to be coming up alot. 

In [20]:
### FOR REFERENCE - TO MAP pfam domain to gene family name. Lots of other stuff coming up as significant I have no idea what that is 
pfam_domain_names = {'Cytochrome P450s':'p450', 
             'GSTs':['GST_N', 'GST_N_3', 'GST_C'], 
             'Carboxylesterases': 'COesterase', 
             'ABC-transporters':['ABC_membrane', 'ABC_tran'],
             'CSP': 'OS-D',
             'UGTs': 'UDPGT',
             'Odorant binding proteins':'PBP_GOBP', 
             'Olfactory receptors':'7tm_6', 
             'Ionotropic receptors':['Lig_chan','7tm_1'],
             'Gustatory receptors': '7tm_7',
             'Fatty acid synthases':'ketoacyl-synt',
             'FA Elongase':'ELO',
             'FA desaturase':'FA_desaturase',
             'FA reductase':'NAD_binding_4',
             }