In [None]:
import pandas as pd
import numpy as np
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
%matplotlib inline


# Pairwise GxE terms and per-condition fitness estimates (tab-separated).
gxe = pd.read_csv(
    '../data_tables/ergosterol_pathway_gxe_interactions.tsv',
    sep='\t',
)
fit = pd.read_csv(
    '../data_tables/Fitness_files/ergosterol_pathway_fitness.tsv',
    sep='\t',
)

# Variant annotations, indexed by variant id.
annotations = pd.read_csv(
    '../data_tables/erg_annos.tsv',
    sep='\t',
    index_col='var_id',
)

# Relabel in-frame deletions, stop-gains and frameshifts as 'missense_variant'
# so the consequence counts below treat them as one class.
relabel_as_missense = annotations['Consequence'].isin(
    ['inframe_deletion', 'stop_gained', 'frameshift_variant']
)
annotations.loc[relabel_as_missense, 'Consequence'] = 'missense_variant'


In [None]:
# Attach annotations to both tables; the inner join keeps only variants that
# have an annotation row.
annotated_fit = pd.merge(fit, annotations, how='inner', on='var_id')
annotated_gxe = pd.merge(gxe, annotations, how='inner', on='var_id')

In [None]:
def get_GxE_vars(annotated_gxe_df, annotated_fitness_df):
    '''Return the variants that show GxE under the stringent two-part definition:
    1) the variant has a GxE term with padj < 0.01, and 2) for that term, the
    fitness coefficients in the two conditions have opposite signs.

    A term only counts when a non-missing fitness coefficient is available for
    every condition named in it. Without this check, a variant measured in just
    one of the two conditions would be called GxE whenever that single
    coefficient happened to be negative (the product of one number is itself).

    inputs:
        annotated_gxe_df (pandas DataFrame): pairwise GxE terms. Uses the columns
        'var_id', 'padj' and 'gxe', where 'gxe' is the two condition names joined
        by '-'.
        annotated_fitness_df (pandas DataFrame): fitness values for variants in
        all relevant conditions. Uses the columns 'var_id', 'condition' and 'coef'.
    returns:
        gxe_vars (list of strings): one entry per qualifying GxE term, in the row
        order of annotated_gxe_df (not unique, a variant is repeated once for
        every qualifying term).
    '''
    significant = annotated_gxe_df[annotated_gxe_df['padj'] < 0.01]

    # Split the fitness table by variant once, instead of re-scanning the whole
    # table for every significant term. Rows without a coefficient cannot
    # establish a direction of effect, so they are dropped up front.
    measured = annotated_fitness_df.dropna(subset=['coef'])
    fitness_by_var = {
        var: rows for var, rows in measured.groupby('var_id', sort=False)
    }

    gxe_vars = []
    for var, term in zip(significant['var_id'], significant['gxe']):
        conds = term.split('-')
        var_rows = fitness_by_var.get(var)
        if var_rows is None:
            continue
        pair_rows = var_rows[var_rows['condition'].isin(conds)]
        # Both conditions must be measured before their signs can be compared.
        if not set(conds).issubset(pair_rows['condition']):
            continue
        # A negative product means the two effects point in opposite directions.
        if pair_rows['coef'].prod() < 0:
            gxe_vars.append(var)
    return gxe_vars


In [None]:
# One entry per qualifying GxE term, so a variant can appear more than once.
gxe_vars = get_GxE_vars(
    annotated_gxe_df=annotated_gxe,
    annotated_fitness_df=annotated_fit,
)

In [None]:
# Collapse repeated entries so each GxE variant is counted once.
all_hits = [*set(gxe_vars)]

In [None]:
# Background set: unique variant ids present in the annotated GxE table, and
# the number of annotation rows per consequence class among them.
# NOTE(review): the following cell repeats these two statements verbatim.
all_vars = annotated_gxe['var_id'].unique()
all_annos = annotations.loc[
    annotations.index.isin(all_vars), 'Consequence'
].value_counts()

In [None]:
# Background set: unique variant ids present in the annotated GxE table.
all_vars = annotated_gxe['var_id'].unique()

# Consequence classes compared in the enrichment analysis, in plotting order.
consequence_classes = [
    'synonymous_variant',
    'missense_variant',
    'upstream_gene_variant',
    'downstream_gene_variant',
]

# reindex (rather than indexing with the label list) so a class with no
# variants is reported as 0 instead of raising KeyError.
all_annos = (
    annotations.loc[annotations.index.isin(all_vars), 'Consequence']
    .value_counts()
    .reindex(consequence_classes, fill_value=0)
)

# Fraction of the background set that falls in each class.
all_var_anno_fracs = all_annos / len(all_vars)

In [None]:
# Consequence classes compared in the enrichment analysis, in plotting order.
consequence_classes = [
    'synonymous_variant',
    'missense_variant',
    'upstream_gene_variant',
    'downstream_gene_variant',
]

# Annotation rows per class among the GxE hits. reindex (rather than indexing
# with the label list) so a class with no hits is reported as 0 instead of
# raising KeyError.
hit_annos = (
    annotations.loc[annotations.index.isin(all_hits), 'Consequence']
    .value_counts()
    .reindex(consequence_classes, fill_value=0)
)

# Fraction of the GxE hits that falls in each class.
hit_fracs = hit_annos / len(all_hits)

In [None]:
# Per-class ratio of the hit fraction to the background fraction
# (values above 1 mean the class is over-represented among GxE hits).
enrich = hit_fracs.div(all_var_anno_fracs)

In [None]:

plt.rcParams['font.family'] = "sans-serif"
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rcParams['font.size']  = 7

# Display name for each consequence class. Tick labels are looked up from
# enrich.index rather than hard-coded by position, so a change in the order
# of the index cannot silently mislabel a bar.
tick_names = {
    'synonymous_variant': 'Synonymous',
    'missense_variant': 'Missense',
    'upstream_gene_variant': 'Upstream',
    'downstream_gene_variant': 'Downstream',
}

fig, ax = plt.subplots(figsize=(1.4, 1.4))
positions = list(range(len(enrich)))
ax.bar(positions, enrich.values)

# Reference line at 1 = same frequency among hits as in the background set.
ax.axhline(1, color='red', linestyle='dashed', linewidth=1)
ax.set_ylabel('Enrichment over Library')
ax.set_xlabel('Variant Annotation')
ax.set_xticks(positions)
ax.set_xticklabels(
    [tick_names.get(label, label) for label in enrich.index],
    rotation=45,
)

fig.savefig('../GxE_Figures/Figure_3/fig3f_FC.svg')

In [None]:
# Annotation rows per consequence class among the GxE variants (all classes).
annotations.loc[annotations.index.isin(gxe_vars), 'Consequence'].value_counts()

In [None]:
# Annotation rows per consequence class among the background variants (all classes).
annotations['Consequence'][annotations.index.isin(all_vars)].value_counts()

In [None]:
# Bonferroni correction: 4 tests x 2 sides = factor of 8 (probably overkill).
# NOTE(review): the counts are hard-coded, presumably copied from the
# value_counts() outputs of the earlier cells -- confirm after any data change.

# Missense enrichment: sf(k) is P(X > k), so k = 98 - 1 gives P(X >= 98).
8 * hypergeom.sf(
    98 - 1,  # observed count minus one
    1432,    # population size (M)
    376,     # successes in the population (n)
    256,     # number of draws (N)
)

In [None]:
# Upstream enrichment: sf(k) is P(X > k), so k = 69 - 1 gives P(X >= 69).
# The factor of 8 is the same multiple-testing multiplier used for the other tests.
8 * hypergeom.sf(
    69 - 1,  # observed count minus one
    1432,    # population size (M)
    336,     # successes in the population (n)
    256,     # number of draws (N)
)

In [None]:
#downstream depletion
# cdf(k) is already the inclusive lower tail P(X <= k), so the observed count
# is passed as-is. (Only sf needs the k-1 shift; cdf(12+1) would be P(X <= 13).)
# Arguments: observed count, population size (M), successes in the
# population (n), number of draws (N). The factor of 8 is the same
# multiple-testing multiplier used for the other tests.
downstream_depletion_p = hypergeom.cdf(12, 1432, 159, 256) * 8
downstream_depletion_p

In [None]:
#synonymous depletion
# cdf(k) is already the inclusive lower tail P(X <= k), so the observed count
# is passed as-is. (Only sf needs the k-1 shift; cdf(59+1) would be P(X <= 60).)
# Arguments: observed count, population size (M), successes in the
# population (n), number of draws (N). The factor of 8 is the same
# multiple-testing multiplier used for the other tests.
synonymous_depletion_p = hypergeom.cdf(59, 1432, 570, 256) * 8
synonymous_depletion_p