<a href="https://colab.research.google.com/github/sanjaynagi/rna-seq-meta/blob/main/enrichment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

**Load fold change data to get top candidates**

In [17]:
fc_data = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fc_data.tsv", sep="\t")
fc_data = fc_data.set_index(['GeneID', 'GeneName', 'GeneDescription'])
fc_data *= -1 # i have reversed the polarity so need to do this
fc_genes = fc_data.reset_index()['GeneID'].to_list()

fc_medians = fc_data.apply(np.nanmedian, axis=1).to_frame().rename(columns={0:'median log2 Fold Change'})
fc_medians = fc_medians.sort_values('median log2 Fold Change', ascending=False)

fc_medians.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,median log2 Fold Change
GeneID,GeneName,GeneDescription,Unnamed: 3_level_1
AGAP028402,,,3.01
AGAP000047,CPR130,cuticular protein RR-2 family 130 [Source:VB Community Annotation],2.82
AGAP002865,CYP6P3,cytochrome P450 [Source:VB Community Annotation],2.75


In [13]:
def go_hypergeometric(target_gene_list, fc_genes, min_annotation_group_size=100, path_to_annotation="https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/resources/AgamP4.gaf"):
    
    # load gene annotation file 
    gaf_df = pd.read_csv(path_to_annotation, sep="\t")
    go_annotations = gaf_df[['go_term', 'descriptions']].rename(columns={'go_term':'annotation'}).drop_duplicates()
    gaf_df = gaf_df[['GeneID', 'go_term']].drop_duplicates()
    gaf_df = gaf_df.query("GeneID in @fc_genes")
    N = gaf_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
    k = np.isin(gaf_df.loc[:, 'GeneID'].unique(), target_gene_list).sum() 
    
    hyper_geo = _hypergeometric(
        annotation_df=gaf_df, 
        column_name='go_term', 
        target_gene_list=target_gene_list,
        N=N,
        k=k,
        min_annotation_group_size=min_annotation_group_size)
    
    hyper_geo = hyper_geo.merge(go_annotations, how='left')
    return(hyper_geo)

def pfam_hypergeometric(target_gene_list, fc_genes, min_annotation_group_size=100):
    
    # load gene annotation file 
    pfam_df = pd.read_csv("https://github.com/sanjaynagi/rna-seq-meta/blob/main/resources/Anogam_long.pep_Pfamscan.seqs.gz?raw=true", sep="\s+", header=None, compression='gzip').iloc[:, [0,4]]
    pfam_df.loc[:, 0] = pfam_df.loc[:, 0].str.replace("Anogam_", "").str.replace("-R[A-Z]", "", regex=True)
    pfam_df.columns = ['GeneID', 'pfam']
    pfam_df = pfam_df.query("GeneID in @fc_genes")
    N = pfam_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
    k = np.isin(pfam_df.loc[:, 'GeneID'].unique(), target_gene_list).sum()  
    
    hyper_geo = _hypergeometric(
        annotation_df=pfam_df, 
        column_name='pfam', 
        target_gene_list=target_gene_list,
        N=N,
        k=k,
        min_annotation_group_size=min_annotation_group_size)
    
    return(hyper_geo)

def _hypergeometric(annotation_df, column_name, target_gene_list, N, k, min_annotation_group_size):
    from scipy.stats import hypergeom
    from tqdm import tqdm

    sig_list = []
    res_list = []
    for annot in tqdm(annotation_df.loc[:, column_name].unique()):

        annot_genes = annotation_df.query("{col} == @annot".format(col=column_name))['GeneID']
        m = len(annot_genes)
        if m < min_annotation_group_size: 
            continue

        x = annot_genes.isin(target_gene_list).sum()
        # Python
        res = hypergeom(M=N, 
                        n=m, 
                        N=k).sf(x-1)
        sig_list.append(annot)
        res_list.append(res)    

    hyper_geo =  pd.DataFrame({'annotation': sig_list, 'pval':res_list})
    return(hyper_geo.sort_values(by='pval'))
    

Take the top 5% percentile of median genes (429/8599)

In [32]:
top_median_genes = fc_medians.reset_index().loc[:, 'GeneID'][:429]

# Go annotations

In [34]:
median_go_hypergeo_df = go_hypergeometric(top_median_genes, fc_genes)
median_go_hypergeo_df.query("pval < 0.05")

100%|██████████| 4737/4737 [00:14<00:00, 317.14it/s]


Unnamed: 0,annotation,pval,descriptions
0,GO:0004252,9.30126e-16,serine-type endopeptidase activity
1,GO:0016705,2.212413e-14,"oxidoreductase activity, acting on paired dono..."
2,GO:0020037,1.72236e-13,heme binding
3,GO:0005506,4.622948e-13,iron ion binding
4,GO:0006508,2.380475e-12,proteolysis
5,GO:0004497,3.757405e-12,onooxygenase activity
6,GO:0008236,3.257021e-11,serine-type peptidase activity
7,GO:0005576,1.044866e-10,xtracellular region
8,GO:0055114,1.465135e-09,obsolete oxidation-reduction process
9,GO:0008233,8.060713e-09,peptidase activity


# Pfam Domains

In [11]:
median_pfam_hypergeo_df  = pfam_hypergeometric(top_median_genes, fc_genes)
median_pfam_hypergeo_df.head(30)

100%|██████████| 3695/3695 [00:08<00:00, 444.15it/s]


Unnamed: 0,annotation,pval
10,CBM_14,1.683055e-21
12,p450,8.404532e-11
0,Trypsin,1.705006e-08
5,LRR_8,0.1745628
2,WD40,0.9871309
1,zf-C2H2,1.0
3,Pkinase,1.0
4,Ank_2,1.0
6,Cadherin,1.0
7,I-set,1.0
