# Calculating Enrichment of Hits within known Lovastatin and Terbinafine target genes

In [1]:
import pandas as pd
from scipy.stats import hypergeom

# Annotations for the ergosterol pathway variants, indexed by var_id.
# They are needed in this notebook to know which gene each variant targets.
annotations = pd.read_csv(
    '../data_tables/erg_annos.tsv',
    sep='\t',
    index_col='var_id',
)

# Fitness measurements for the ergosterol biosynthesis pathway variants.
var_fitness = pd.read_csv(
    '../data_tables/ergosterol_pathway_fitness.tsv',
    sep='\t',
)

# Inner join on var_id so fitness values and p values sit in the same table as
# the gene names; only variants present in both tables are kept.
annotated_fit = pd.merge(var_fitness, annotations, how='inner', on='var_id')


## Lovastatin

In [2]:
# Hypergeometric enrichment test: are lovastatin hits over-represented among the
# variants that target the known lovastatin target genes?
lovastatin_target_genes = ['HMG1','HMG2']

#rows measured in lovastatin, and the subset of those called as hits (padj < .01)
lovastatin_fit = annotated_fit[annotated_fit['condition']=='LOV']
lovastatin_hits = lovastatin_fit[lovastatin_fit['padj']<.01]

#number of variants targeting each gene for variants measured in lovastatin
lovastatin_variants_per_gene = lovastatin_fit['Gene'].value_counts()

#number of hits targeting each gene for variants measured in lovastatin
lovastatin_hits_per_gene = lovastatin_hits['Gene'].value_counts()

#total number of variants measured in lovastatin
lovastatin_total_variants = lovastatin_variants_per_gene.sum()

#total number of variants within the two known target genes for lovastatin, HMG1 and HMG2
#(.loc is kept here on purpose: a target gene with no measured variants at all
# should fail loudly rather than silently shrink the test)
lovastatin_target_variants = lovastatin_variants_per_gene.loc[lovastatin_target_genes].sum()

#total number of hits identified in lovastatin
lovastatin_total_hits = lovastatin_hits_per_gene.sum()

#total number of hits within the two known target genes for lovastatin, HMG1 and HMG2
#value_counts() omits genes with zero hits, and .loc with a list raises KeyError on
#any missing label, so reindex and count a gene without hits as 0 instead
lovastatin_target_hits = lovastatin_hits_per_gene.reindex(lovastatin_target_genes, fill_value=0).sum()

#P(X >= target_hits): sf(k) is P(X > k), hence the -1.
#scipy argument order is (k, M, n, N) = (k, population size, successes in the
#population, number of draws); the hypergeometric distribution is symmetric in
#n and N, so passing total hits as n and target variants as N is equivalent to
#the reverse.
hypergeom.sf(lovastatin_target_hits-1, lovastatin_total_variants, lovastatin_total_hits, lovastatin_target_variants)

6.624632946217848e-12

## Terbinafine

In [3]:
# Hypergeometric enrichment test: are terbinafine hits over-represented among the
# variants that target the known terbinafine target gene?
terbinafine_target = ['ERG1']

#rows measured in terbinafine, and the subset of those called as hits (padj < .01)
terbinafine_fit = annotated_fit[annotated_fit['condition']=='TBF']
terbinafine_hits = terbinafine_fit[terbinafine_fit['padj']<.01]

#number of variants targeting each gene for variants measured in terbinafine
terbinafine_variants_per_gene = terbinafine_fit['Gene'].value_counts()

#number of hits targeting each gene for variants measured in terbinafine
terbinafine_hits_per_gene = terbinafine_hits['Gene'].value_counts()

#total number of variants measured in terbinafine
terbinafine_total_variants = terbinafine_variants_per_gene.sum()

#total number of variants within the known target gene for terbinafine, ERG1
#(.loc is kept here on purpose: a target gene with no measured variants at all
# should fail loudly rather than silently shrink the test)
terbinafine_target_variants = terbinafine_variants_per_gene.loc[terbinafine_target].sum()

#total number of hits identified in terbinafine
terbinafine_total_hits = terbinafine_hits_per_gene.sum()

#total number of hits within the known target gene for terbinafine, ERG1
#value_counts() omits genes with zero hits, and .loc with a list raises KeyError on
#any missing label, so reindex and count a gene without hits as 0 instead
terbinafine_target_hits = terbinafine_hits_per_gene.reindex(terbinafine_target, fill_value=0).sum()

#P(X >= target_hits): sf(k) is P(X > k), hence the -1.
#scipy argument order is (k, M, n, N) = (k, population size, successes in the
#population, number of draws); the hypergeometric distribution is symmetric in
#n and N, so passing total hits as n and target variants as N is equivalent to
#the reverse.
hypergeom.sf(terbinafine_target_hits-1, terbinafine_total_variants, terbinafine_total_hits, terbinafine_target_variants)

1.5530284112449056e-16