<a href="https://colab.research.google.com/github/sanjaynagi/rna-seq-meta/blob/main/workflow/notebooks/enrichment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

**Load fold change data to get top candidates**

In [12]:
def load_candidates(database, name, func=np.nanmedian):
    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fcs.{analysis}.tsv", sep="\t")
    fc_data = fc_data.set_index(['GeneID', 'GeneName', 'GeneDescription'])

    fc_ranked = fc_data.apply(np.nanmedian, axis=1).to_frame().rename(columns={0:f'{name} log2 Fold Change'})
    fc_ranked = fc_ranked.sort_values(f'{name} log2 Fold Change', ascending=False)
    fc_ranked = fc_ranked.reset_index()
    fc_ranked.loc[:, f'{name} Fold Change'] = np.round(2**fc_ranked.loc[:, f'{name} log2 Fold Change'], 2)
    return(fc_ranked)

def go_hypergeometric(database, name, func, percentile=0.05):

    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fcs.{analysis}.tsv", sep="\t")
    fc_genes = fc_data.reset_index()['GeneID'].to_list()

    # get top 5% percentile genes ranked by median
    fc_ranked = load_candidates(analysis=database, name=name, func=func)
    percentile_idx = fc_ranked.reset_index()['GeneID'].unique().shape[0] * percentile
    top_geneIDs = fc_ranked.reset_index().loc[:, 'GeneID'][:int(percentile_idx)] 

    # load gene annotation file 
    gaf_df = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/resources/AgamP4.gaf", sep="\t")
    go_annotations = gaf_df[['go_term', 'descriptions']].rename(columns={'go_term':'annotation'}).drop_duplicates()
    gaf_df = gaf_df[['GeneID', 'go_term']].drop_duplicates()
    gaf_df = gaf_df.query("GeneID in @fc_genes")
    N = gaf_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
    k = np.isin(gaf_df.loc[:, 'GeneID'].unique(), top_geneIDs).sum() 

    hyper_geo = _hypergeometric(
        annotation_df=gaf_df, 
        column_name='go_term', 
        target_gene_list=top_geneIDs,
        N=N,
        k=k)    
    hyper_geo = hyper_geo.merge(go_annotations, how='left')
    return(hyper_geo)

def pfam_hypergeometric(database, name, func, percentile=0.05):

    # get all genes
    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fcs.{analysis}.tsv", sep="\t")
    fc_genes = fc_data.reset_index()['GeneID'].to_list()

    # get top 5% percentile genes ranked by median
    fc_ranked = load_candidates(analysis=database, name=name, func=func)
    percentile_idx = fc_ranked.reset_index()['GeneID'].unique().shape[0] * percentile
    top_geneIDs = fc_ranked.reset_index().loc[:, 'GeneID'][:int(percentile_idx)] 

    # load gene annotation file 
    pfam_df = pd.read_csv("https://github.com/sanjaynagi/rna-seq-meta/blob/main/resources/Anogam_long.pep_Pfamscan.seqs.gz?raw=true", sep="\s+", header=None, compression='gzip').iloc[:, [0,4]]
    pfam_df.loc[:, 0] = pfam_df.loc[:, 0].str.replace("Anogam_", "").str.replace("-R[A-Z]", "", regex=True)
    pfam_df.columns = ['GeneID', 'pfam']
    pfam_df = pfam_df.query("GeneID in @fc_genes")
    N = pfam_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
    k = np.isin(pfam_df.loc[:, 'GeneID'].unique(), top_geneIDs).sum()  

    # run hypergeometric test
    hyper_geo = _hypergeometric(
        annotation_df=pfam_df, 
        column_name='pfam', 
        target_gene_list=top_geneIDs,
        N=N,
        k=k)
        
    return(hyper_geo)

def _hypergeometric(annotation_df, column_name, target_gene_list, N, k):
    from scipy.stats import hypergeom
    from tqdm import tqdm
    from statsmodels.stats.multitest import fdrcorrection

    # get unique annotations
    unique_annots = annotation_df.loc[:, column_name].unique()

    sig_list = []
    res_list = []
    for annot in tqdm(unique_annots):

        annot_genes = annotation_df.query("{col} == @annot".format(col=column_name))['GeneID']
        m = len(annot_genes)

        x = annot_genes.isin(target_gene_list).sum()
        res = hypergeom(M=N, 
                        n=m, 
                        N=k).sf(x-1)
        sig_list.append(annot)
        res_list.append(res)    

    hyper_geo =  pd.DataFrame({'annotation': sig_list, 'pval':res_list})
    hypo, hyper_geo.loc[:, 'padj'] = fdrcorrection(hyper_geo['pval'])           #[np.min([padj, 1]) for padj in hyper_geo.loc[:, 'pval']*len(unique_annots)] 
    return(hyper_geo.sort_values(by='pval'))

Take the top 5% percentile of median genes (429/8599)

In [13]:
fc_medians = load_candidates(analysis='gamb_colu_arab_fun', name='median', func=np.nanmedian)
fc_medians.head(5)

Unnamed: 0,GeneID,GeneName,GeneDescription,median log2 Fold Change,median Fold Change
0,AGAP028402,,,2.77,6.82
1,AGAP002865,CYP6P3,cytochrome P450 [Source:VB Community Annotation],2.73,6.63
2,AGAP008817,CPLCP3,cuticular protein (putative) CPLCP3 [Source:VB...,2.56,5.9
3,AGAP002867,CYP6P4,cytochrome P450 [Source:VB Community Annotation],2.535,5.8
4,AGAP008448,,,2.41,5.31


# Go annotations

In [14]:
median_go_hypergeo_df = go_hypergeometric(analysis="gamb_colu_arab_fun", name="median", func=np.nanmedian)
median_go_hypergeo_df.query("padj < 0.05")

100%|██████████| 4737/4737 [00:15<00:00, 308.06it/s]


Unnamed: 0,annotation,pval,padj,descriptions
0,GO:0042302,2.354067e-27,1.1151210000000001e-23,structural constituent of cuticle
1,GO:0016705,3.3908979999999995e-19,8.031341e-16,"oxidoreductase activity, acting on paired dono..."
2,GO:0005506,3.013554e-18,4.758402e-15,iron ion binding
3,GO:0020037,5.748606e-17,6.807786e-14,heme binding
4,GO:0004497,9.530799000000001e-17,9.029479e-14,onooxygenase activity
5,GO:0005576,5.84648e-15,4.615796e-12,xtracellular region
6,GO:0004252,1.033755e-12,6.995568e-10,serine-type endopeptidase activity
7,GO:0008061,3.92804e-12,2.325891e-09,chitin binding
8,GO:0006030,2.449735e-11,1.289377e-08,chitin metabolic process
9,GO:0055114,3.824011e-11,1.811434e-08,obsolete oxidation-reduction process


# Pfam Domains

In [15]:
median_pfam_hypergeo_df = pfam_hypergeometric(analysis="gamb_colu_arab_fun", name="median", func=np.nanmedian)
median_pfam_hypergeo_df.head(30)

100%|██████████| 3695/3695 [00:11<00:00, 317.20it/s]


Unnamed: 0,annotation,pval,padj
2939,C_tripleX,1.654443e-37,6.113168e-34
495,Chitin_bind_4,1.755987e-23,3.244186e-20
445,CBM_14,1.088904e-21,1.341167e-18
588,p450,1.7513719999999999e-19,1.61783e-16
24,Trypsin,3.648581e-13,2.696301e-10
211,7tm_6,6.029056e-07,0.0003712893
610,GST_N_3,3.971089e-06,0.002096168
2101,GST_C,1.056013e-05,0.004877462
1766,Mucin-like,8.844017e-05,0.02999998
3010,CPCFC,8.844017e-05,0.02999998


Lots of cool stuff here too - P450s, obps, ORs, GRs, GST, Elongases (ELO), chitin binding seems to be coming up alot. 

In [20]:
### FOR REFERENCE - TO MAP pfam domain to gene family name. Lots of other stuff coming up as significant I have no idea what that is 
pfam_domain_names = {'Cytochrome P450s':'p450', 
             'GSTs':['GST_N', 'GST_N_3', 'GST_C'], 
             'Carboxylesterases': 'COesterase', 
             'ABC-transporters':['ABC_membrane', 'ABC_tran'],
             'CSP': 'OS-D',
             'UGTs': 'UDPGT',
             'Odorant binding proteins':'PBP_GOBP', 
             'Olfactory receptors':'7tm_6', 
             'Ionotropic receptors':['Lig_chan','7tm_1'],
             'Gustatory receptors': '7tm_7',
             'Fatty acid synthases':'ketoacyl-synt',
             'FA Elongase':'ELO',
             'FA desaturase':'FA_desaturase',
             'FA reductase':'NAD_binding_4',
             }