In [13]:
import numpy as np
import pandas as pd

In [None]:
###### configuration - metadata and parameters ######
# All inputs/parameters come from the `snakemake` object injected by Snakemake
# when it executes this notebook.
# `fread` is not defined in this notebook (only numpy and pandas are imported),
# so the tab-separated metadata table is read with pandas instead.
metadata = pd.read_csv(snakemake.input['metadata'], sep="\t")
gaffile = snakemake.input['gaf']
selection = snakemake.params['selection']

# Each DE contrast is a single string "<sus>_<res>"; split it on "_" into its
# two halves. Assigning exactly two column names makes pandas raise if any
# contrast does not split into exactly two parts.
comparisons = pd.DataFrame(snakemake.params['DEcontrasts'], columns=['contrast'])
comparisons = comparisons.contrast.str.split("_", expand=True)
comparisons.columns = ['sus', 'res']
# Final shape: a list of [sus, res] pairs, one per contrast.
comparisons = comparisons.to_numpy().tolist()

In [82]:
def load_go_descriptions(obo_path="https://purl.obolibrary.org/obo/go.obo"):
    import urllib.request
    ids = []
    descriptions = []
    with urllib.request.urlopen("http://current.geneontology.org/ontology/go.obo") as url:
        for line in url:
            if line.startswith(b"id"):
                value = line.lstrip(b"id: ").rstrip(b"\n")
                if value.startswith(b"GO"):
                    ids.append(value)
                    descriptions.append(next(url, '').lstrip(b"name:").lstrip().rstrip(b"\n"))
    return(pd.DataFrame({'go_term': [go.decode('utf8') for go in ids], 'descriptions':[desc.decode('utf8') for desc in descriptions]}))

def go_hypergeometric(target_gene_list, gaf_df):
    """
    Run a per-GO-term hypergeometric enrichment test for a list of genes.

    Parameters
    ----------
    target_gene_list : array-like
        Gene IDs of interest.
    gaf_df : pandas.DataFrame
        Must contain the columns 'GeneID', 'go_term' and 'descriptions'.

    Returns
    -------
    pandas.DataFrame
        The table produced by `_hypergeometric`, left-merged with the
        de-duplicated (annotation, descriptions) pairs from `gaf_df`.
    """
    # Lookup of GO term -> description, keyed like the result table ('annotation')
    term_descriptions = (
        gaf_df[['go_term', 'descriptions']]
        .rename(columns={'go_term': 'annotation'})
        .drop_duplicates()
    )

    # One row per distinct (gene, GO term) pair
    gene_terms = gaf_df[['GeneID', 'go_term']].drop_duplicates()

    # Population size: distinct genes present in the annotation table
    annotated_genes = gene_terms['GeneID'].unique()
    n_annotated = annotated_genes.shape[0]
    # Draws: how many of those annotated genes are in the target list
    n_target = np.isin(annotated_genes, target_gene_list).sum()

    enrichment = _hypergeometric(
        annotation_df=gene_terms,
        column_name='go_term',
        target_gene_list=target_gene_list,
        N=n_annotated,
        k=n_target,
    )
    return enrichment.merge(term_descriptions, how='left')

def _hypergeometric(annotation_df, column_name, target_gene_list, N, k):
    from scipy.stats import hypergeom
    from tqdm import tqdm

    sig_list = []
    res_list = []
    
    unique_annots = annotation_df.loc[:, column_name].unique()
    for annot in tqdm(unique_annots):

        annot_genes = annotation_df.query("{col} == @annot".format(col=column_name))['GeneID']
        m = len(annot_genes)

        x = annot_genes.isin(target_gene_list).sum()
        # Python
        res = hypergeom(M=N, 
                        n=m, 
                        N=k).sf(x-1)
        sig_list.append(annot)
        res_list.append(res)    

    hyper_geo = pd.DataFrame({'annotation': sig_list, 'pval':res_list})
    hyper_geo.loc[:, 'padj'] = np.min([hyper_geo.loc[:, 'pval']*len(unique_annots), 1])
    
    return(hyper_geo.sort_values(by='pval'))
    

In [69]:
# Read the GO annotation file, move the index into regular columns, and keep
# only the columns at positions 1 and 4, relabelled as gene ID and GO term.
gaf_df = (
    pd.read_csv("../../resources/reference/VectorBase-50_AgambiaePEST_GO.gaf.gz", sep="\t")
    .reset_index()
    .iloc[:, [1, 4]]
)
gaf_df.columns = ['GeneID', 'go_term']
# Attach the GO term names; terms without a match keep a missing description.
gaf_df = pd.merge(gaf_df, load_go_descriptions(), how='left')

In [86]:
# Per-gene Fst table (tab-separated)
fst_data = pd.read_csv(
    "../../results/variantAnalysis/selection/FstPerGene.tsv",
    sep="\t",
)

In [88]:
# Contrast name used below to select the '<comp>_zFst' column of the Fst table
comp = "Kisumu_BusiaParental"

In [114]:
# Rank genes by this contrast's zFst (highest first), dropping rows with missing values
fst_comp_df = (
    fst_data[['GeneID', f'{comp}_zFst']]
    .sort_values(by=f'{comp}_zFst', ascending=False)
    .dropna()
)
n_genes = len(fst_comp_df)
# Number of genes making up the top 5% of the ranking (rounded down)
percentile_5 = int(n_genes * 0.05)
fst_genes = fst_comp_df['GeneID'].head(percentile_5).to_numpy()

In [None]:
# GO enrichment for every DE contrast and, when `selection` is set, for the
# genes with the highest Fst in the same contrast.
if selection:
    # Loop-invariant: the per-gene Fst table is the same for every contrast.
    fst_data = pd.read_csv("../../results/variantAnalysis/selection/FstPerGene.tsv", sep="\t")

for contrast in comparisons:
    # `comparisons` holds [sus, res] pairs, while file and column names use the
    # joined "<sus>_<res>" contrast name. Interpolating the list itself would
    # put its repr (brackets and quotes) into the path.
    comp = contrast if isinstance(contrast, str) else "_".join(contrast)

    de_data = pd.read_csv(f"../../results/genediff/{comp}.csv")
    sig_genes = de_data.query("padj < 0.05 and FC > 2")['GeneID']

    gsea_df = go_hypergeometric(sig_genes, gaf_df)
    gsea_df.to_csv(f"../../results/gsea/{comp}_de.tsv", sep="\t")

    if selection:
        fst_comp_df = fst_data.loc[:, ['GeneID', f'{comp}_zFst']].sort_values(by=f'{comp}_zFst', ascending=False).dropna()
        n_genes = fst_comp_df.shape[0]
        percentile_5 = int(n_genes * 0.05)  # number of genes in the top 5% by zFst
        fst_genes = fst_comp_df.iloc[:percentile_5].loc[:, 'GeneID'].to_numpy()
        gsea_df = go_hypergeometric(fst_genes, gaf_df)
        gsea_df.to_csv(f"../../results/gsea/{comp}_fst.tsv", sep="\t")
        

In [73]:
# Differential-expression results for one contrast
de_data = pd.read_csv(
    "../../results/genediff/G24-BusiaParental_G28-BusiaSurvivors.csv"
)
# Gene IDs of the rows passing both the adjusted p-value and fold-change filters
sig_genes = de_data.query("padj < 0.05 and FC > 2").loc[:, 'GeneID']

In [79]:
# GO enrichment of the significant DE genes against the annotation table
gsea_df = go_hypergeometric(target_gene_list=sig_genes, gaf_df=gaf_df)

100%|██████████████████████████████████████| 5266/5266 [00:15<00:00, 348.61it/s]


In [80]:
# Display the GO terms whose unadjusted p-value ('pval') is below 0.05
gsea_df.query("pval < 0.05")

Unnamed: 0,annotation,pval,padj,descriptions
0,GO:0004252,2.979560e-14,1.569036e-10,serine-type endopeptidase activity
1,GO:0005576,3.742178e-11,1.970631e-07,extracellular region
2,GO:0006508,1.054009e-09,5.550413e-06,proteolysis
3,GO:0016747,1.458789e-06,7.681983e-03,"acyltransferase activity, transferring groups ..."
4,GO:0007586,3.595714e-06,1.893503e-02,digestion
...,...,...,...,...
130,GO:0004760,4.873144e-02,2.566198e+02,serine-pyruvate transaminase activity
131,GO:0019265,4.873144e-02,2.566198e+02,"glycine biosynthetic process, by transaminatio..."
132,GO:0008453,4.873144e-02,2.566198e+02,alanine-glyoxylate transaminase activity
133,GO:0005960,4.873144e-02,2.566198e+02,glycine cleavage complex
