In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Render all figure text in Arial, including the custom mathtext
# 'cal' font slot.
plt.rcParams.update({
    'font.family': "sans-serif",
    'font.sans-serif': 'Arial',
    'mathtext.fontset': 'custom',
    'mathtext.cal': 'Arial',
})

%matplotlib inline

## Pre-processing

In [None]:
def _read_tsv(path):
    """Read the tab-separated file at ``path`` into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Full variant annotations
annos = _read_tsv('../data_tables/QTL_pool_annotations.tsv')

# QTL information from Bloom 2019
qtl_list = _read_tsv('../data_tables/qtl_list.tsv')

# Assignment of each well to each library
# NOTE(review): this is an absolute, user-specific path, unlike the relative
# '../data_tables' paths used for every other input; confirm where the
# spreadsheet should live so the notebook runs on other machines.
libraries = pd.read_excel("/home/shian/CRISPEY3_fraserserver/tables/crispey3_libraries_by_pool_number.xlsx")

# Fitness measurement tables for the QTL pools
p9 = _read_tsv('../data_tables/Fitness_files/pool9_fitness_fc.tsv')
p4 = _read_tsv('../data_tables/Fitness_files/pool4_fitness_fc.tsv')
p7 = _read_tsv('../data_tables/Fitness_files/pool7_fitness_fc.tsv')

# Oligo information mastersheet
oli_info = _read_tsv('../data_tables/oligos_nonuniq_crispey3_GG_9bp_OLIGO_with_seq_primers.txt')



In [None]:
# Split each pool's fitness table by the value of its 'condition' column:
# one frame per condition code found in pools 9, 4 and 7.
fluc = p9.loc[p9['condition'].eq('FLUC')]
scp9 = p9.loc[p9['condition'].eq('P9SC')]
cocl = p4.loc[p4['condition'].eq('COCL')]
scp4 = p4.loc[p4['condition'].eq('P4SC')]
caff = p7.loc[p7['condition'].eq('CAFF')]
scp7 = p7.loc[p7['condition'].eq('P7SC')]

## Figure 2a

In [None]:
def get_annotated_df(df, condition, annotations=None, qtl_table=None):
    """Annotate fitness measurements and keep variants in a trait's QTL genes.

    Parameters
    ----------
    df : pandas.DataFrame
        Fitness table containing a 'var_id' column.
    condition : str
        Value of the 'trait' column that selects the QTL genes of interest.
    annotations : pandas.DataFrame, optional
        Variant annotations with 'var_id' and 'Gene' columns. Defaults to the
        notebook-level ``annos`` table.
    qtl_table : pandas.DataFrame, optional
        QTL table with 'trait' and 'Gene ID' columns. Defaults to the
        notebook-level ``qtl_list`` table.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``df`` with the annotations on 'var_id', restricted to
        rows whose 'Gene' is one of the 'Gene ID' values listed for
        ``condition``. Empty if the trait is not present in the QTL table.
    """
    # Fall back to the notebook globals only when no table is passed, so
    # existing two-argument calls behave exactly as before.
    if annotations is None:
        annotations = annos
    if qtl_table is None:
        qtl_table = qtl_list

    cond_genes = qtl_table.loc[qtl_table['trait'] == condition, 'Gene ID'].unique().tolist()
    df_anno = df.merge(annotations, how='inner', on='var_id')
    return df_anno[df_anno['Gene'].isin(cond_genes)]

In [None]:
#Getting designed variants for each pool which are in the relevant QTLs

def _designed_qtl_variants(trait, library_column):
    """Collect one library's designed 'gxe' variants that lie in a trait's QTL genes.

    Returns a tuple ``(genes, variant_ids, annotated)``:
    the unique 'Gene ID' values listed for ``trait`` in ``qtl_list``;
    the unique var_ids of 'gxe' oligos in ``oli_info`` whose pool appears in
    the ``library_column`` column of ``libraries``; and the rows of ``annos``
    for those var_ids whose 'Gene' is one of those genes.
    """
    genes = qtl_list.loc[qtl_list['trait'] == trait, 'Gene ID'].unique().tolist()
    # Pool numbers are read from the library sheet; blanks are dropped
    # before casting to int.
    library_pools = libraries[library_column].dropna().astype(int)
    in_library = (oli_info['set_name'] == 'gxe') & oli_info['pool'].isin(library_pools)
    variant_ids = oli_info.loc[in_library, 'var_id'].unique()
    in_qtl_gene = annos['var_id'].isin(variant_ids) & annos['Gene'].isin(genes)
    return genes, variant_ids, annos[in_qtl_gene]


caff_genes, designed_caffeine_vars, designed_caff_qtl_vars = _designed_qtl_variants(
    'Caffeine;15mM;2', 'GxE_Caffeine')

cocl2_genes, designed_cocl2_vars, designed_cocl2_qtl_vars = _designed_qtl_variants(
    'Cobalt_Chloride;2mM;2', 'GxE_CoCl2')

fluc_genes, designed_fluconazole_vars, designed_fluconazole_qtl_vars = _designed_qtl_variants(
    'Fluconazole;100uM;2', 'GxE_Fluconazole')

In [None]:
#Printing data to use for figure 2A

def _print_unique(label, frame, column):
    """Print ``label`` followed by the number of distinct values in ``frame[column]``."""
    print(label, frame[column].nunique())


# Designed variants that fall in each condition's QTL genes
_print_unique('Designed Caffeine Variants:', designed_caff_qtl_vars, 'var_id')
_print_unique('Designed CoCl\u2082 Variants:', designed_cocl2_qtl_vars, 'var_id')
_print_unique('Designed Fluconazole Variants:', designed_fluconazole_qtl_vars, 'var_id')

print('############################################################################')
# QTL genes covered by those designed variants
_print_unique('Designed Caffeine Gene QTLs:', designed_caff_qtl_vars, 'Gene')
_print_unique('Designed CoCl\u2082 Gene QTLs:', designed_cocl2_qtl_vars, 'Gene')
_print_unique('Designed Fluconazole Gene QTLs:', designed_fluconazole_qtl_vars, 'Gene')

# Annotated fitness tables: only variants with a fitness measurement
# survive the inner join inside get_annotated_df.
# Stress conditions
caff_ann = get_annotated_df(caff, 'Caffeine;15mM;2')
fluc_ann = get_annotated_df(fluc, 'Fluconazole;100uM;2')
cocl_ann = get_annotated_df(cocl, 'Cobalt_Chloride;2mM;2')

# SC frames of the same pools, filtered by the same trait's QTL genes
p7sc_ann = get_annotated_df(scp7, 'Caffeine;15mM;2')
p9sc_ann = get_annotated_df(scp9, 'Fluconazole;100uM;2')
p4sc_ann = get_annotated_df(scp4, 'Cobalt_Chloride;2mM;2')

print('############################################################################')
_print_unique('Measured Caffeine Variants:', caff_ann, 'var_id')
_print_unique('Measured CoCl\u2082 Variants:', cocl_ann, 'var_id')
_print_unique('Measured Fluconazole Variants:', fluc_ann, 'var_id')

## Calling Hits

In [None]:
# A variant row is called a hit when its adjusted p-value ('padj') is
# below this FDR cutoff.
FDR_CUTOFF = .01

# Hits in the SC frames of each pool
p7sc_hits = p7sc_ann.loc[p7sc_ann['padj'].lt(FDR_CUTOFF)]
p9sc_hits = p9sc_ann.loc[p9sc_ann['padj'].lt(FDR_CUTOFF)]
p4sc_hits = p4sc_ann.loc[p4sc_ann['padj'].lt(FDR_CUTOFF)]

# Hits in the stress conditions
caff_hits = caff_ann.loc[caff_ann['padj'].lt(FDR_CUTOFF)]
fluc_hits = fluc_ann.loc[fluc_ann['padj'].lt(FDR_CUTOFF)]
cocl_hits = cocl_ann.loc[cocl_ann['padj'].lt(FDR_CUTOFF)]

In [None]:
# One shared point size for every text element in the figures
SIZE = 7

plt.rcParams.update({
    'font.size': SIZE,          # default text size
    'axes.titlesize': SIZE,     # axes title
    'axes.labelsize': SIZE,     # x and y axis labels
    'xtick.labelsize': SIZE,    # x tick labels
    'ytick.labelsize': SIZE,    # y tick labels
    'legend.fontsize': SIZE,    # legend entries
    'figure.titlesize': SIZE,   # figure title
})


## Figure 2c


In [None]:
# Number of distinct hit variants per pool, in the stress condition and in
# the baseline SC media
num_caffpool_caff_hits, num_caffpool_sc_hits = (
    hits['var_id'].nunique() for hits in (caff_hits, p7sc_hits))

num_coclpool_cocl_hits, num_coclpool_sc_hits = (
    hits['var_id'].nunique() for hits in (cocl_hits, p4sc_hits))

num_flucpool_fluc_hits, num_flucpool_sc_hits = (
    hits['var_id'].nunique() for hits in (fluc_hits, p9sc_hits))

fig, ax = plt.subplots(figsize=(1.5, 1.5))

# One row per variant set (caffeine, fluconazole, CoCl2 pools), one column
# per medium
numhits = pd.DataFrame(
    {
        'SC': [num_caffpool_sc_hits, num_flucpool_sc_hits, num_coclpool_sc_hits],
        'Stress': [num_caffpool_caff_hits, num_flucpool_fluc_hits, num_coclpool_cocl_hits],
    },
    index=['CAFF', 'FLC', r'CoCl$_2$'],
)

# Grouped bars: grey for SC, black for the stress condition
numhits.plot.bar(ax=ax, color=['grey', 'black'])
ax.set_ylabel('Variants with FDR <.01')
ax.set_xlabel('Variant Set')
plt.xticks(rotation=0)
ax.legend(loc='upper left', handlelength=.5, handleheight=.5)
fig.savefig('../GxE_Figures/Figure_2/figure2c_sc_v_condition_hit_comp_fc.svg')

## Figure 2d

In [None]:
#Calculating variant annotation enrichments for hit variants
#relative to the measured variants.

# Rare coding consequences (very few variants) that are relabelled as
# 'missense_variant'. The same list is applied to every pool so the
# annotation categories are comparable across pools.
CONSEQUENCES_FOLDED_INTO_MISSENSE = [
    'frameshift_variant', 'stop_gained', 'inframe_insertion',
    'inframe_deletion', 'stop_lost',
]


def _consequence_proportions(measured, hits):
    """Relabel rare consequences and tabulate annotation proportions.

    Parameters
    ----------
    measured : pandas.DataFrame
        Annotated measured variants with a 'Consequence' column.
    hits : pandas.DataFrame
        Annotated hit variants with a 'Consequence' column.

    Returns
    -------
    tuple
        ``(measured_for_plot, hits_for_plot, proportions)``. The first two are
        copies of the inputs with the rare consequences relabelled as
        'missense_variant'. ``proportions`` is indexed by consequence with a
        'Library Proportions' column (share of rows in ``measured``) and a
        'Hit Proportions' column (share of rows in ``hits``; NaN for a
        consequence that has no hits).
    """
    def _fold(frame):
        rare = frame['Consequence'].isin(CONSEQUENCES_FOLDED_INTO_MISSENSE)
        # assign() returns a new frame, so the caller's table is untouched
        return frame.assign(Consequence=frame['Consequence'].where(~rare, 'missense_variant'))

    measured_for_plot = _fold(measured)
    hits_for_plot = _fold(hits)

    # Name the column explicitly: value_counts() labels its result after the
    # source column on pandas < 2.0 but 'count' on pandas >= 2.0, so looking
    # the column up as 'Consequence' fails on current pandas.
    library_share = measured_for_plot['Consequence'].value_counts() / len(measured_for_plot)
    proportions = library_share.rename('Library Proportions').to_frame()
    # Index-aligned assignment keeps only the consequences seen in the library
    proportions['Hit Proportions'] = hits_for_plot['Consequence'].value_counts() / len(hits_for_plot)
    return measured_for_plot, hits_for_plot, proportions


(p4sc_all_var_annos_for_plot,
 p4sc_hit_annos_for_plot,
 p4sc_consequence_counts) = _consequence_proportions(p4sc_ann, p4sc_hits)

(p7sc_all_var_annos_for_plot,
 p7sc_hit_var_annos_for_plot,
 p7sc_consequence_counts) = _consequence_proportions(p7sc_ann, p7sc_hits)

(p9sc_all_var_annos_for_plot,
 p9sc_hit_var_annos_for_plot,
 p9sc_consequence_counts) = _consequence_proportions(p9sc_ann, p9sc_hits)


In [None]:
# Rare coding consequences (very few variants) that are relabelled as
# 'missense_variant'. The same list is applied to every condition so the
# annotation categories are comparable across conditions.
CONSEQUENCES_FOLDED_INTO_MISSENSE = [
    'frameshift_variant', 'stop_gained', 'inframe_insertion',
    'inframe_deletion', 'stop_lost',
]


def _consequence_proportions(measured, hits):
    """Relabel rare consequences and tabulate annotation proportions.

    Parameters
    ----------
    measured : pandas.DataFrame
        Annotated measured variants with a 'Consequence' column.
    hits : pandas.DataFrame
        Annotated hit variants with a 'Consequence' column.

    Returns
    -------
    tuple
        ``(measured_for_plot, hits_for_plot, proportions)``. The first two are
        copies of the inputs with the rare consequences relabelled as
        'missense_variant'. ``proportions`` is indexed by consequence with a
        'Library Proportions' column (share of rows in ``measured``) and a
        'Hit Proportions' column (share of rows in ``hits``; NaN for a
        consequence that has no hits).
    """
    def _fold(frame):
        rare = frame['Consequence'].isin(CONSEQUENCES_FOLDED_INTO_MISSENSE)
        # assign() returns a new frame, so the caller's table is untouched
        return frame.assign(Consequence=frame['Consequence'].where(~rare, 'missense_variant'))

    measured_for_plot = _fold(measured)
    hits_for_plot = _fold(hits)

    # Name the column explicitly: value_counts() labels its result after the
    # source column on pandas < 2.0 but 'count' on pandas >= 2.0, so looking
    # the column up as 'Consequence' fails on current pandas.
    library_share = measured_for_plot['Consequence'].value_counts() / len(measured_for_plot)
    proportions = library_share.rename('Library Proportions').to_frame()
    # Index-aligned assignment keeps only the consequences seen in the library
    proportions['Hit Proportions'] = hits_for_plot['Consequence'].value_counts() / len(hits_for_plot)
    return measured_for_plot, hits_for_plot, proportions


(cocl_all_var_annos_for_plot,
 cocl_hit_annos_for_plot,
 cocl_consequence_counts) = _consequence_proportions(cocl_ann, cocl_hits)

(caff_all_var_annos_for_plot,
 caff_hit_var_annos_for_plot,
 caff_consequence_counts) = _consequence_proportions(caff_ann, caff_hits)

(fluc_all_var_annos_for_plot,
 fluc_hit_var_annos_for_plot,
 fluc_consequence_counts) = _consequence_proportions(fluc_ann, fluc_hits)


In [None]:
from scipy.stats import fisher_exact

In [None]:
#Calculating enrichment of missense annotation for fluconazole hits
# All counts are numbers of distinct var_ids.
num_fluc_missense_hits = fluc_hit_var_annos_for_plot[fluc_hit_var_annos_for_plot['Consequence']=='missense_variant'].var_id.nunique()
num_fluc_missense_variants = fluc_all_var_annos_for_plot[fluc_all_var_annos_for_plot['Consequence']=='missense_variant'].var_id.nunique()
num_fluc_hits = fluc_hits.var_id.nunique()
num_fluc_variants = fluc_all_var_annos_for_plot.var_id.nunique()
# 2x2 contingency table: rows are missense / non-missense variants,
# columns are hits / non-hits.
fluc_table = np.array([
    [num_fluc_missense_hits,
     num_fluc_missense_variants - num_fluc_missense_hits],
    [num_fluc_hits - num_fluc_missense_hits,
     num_fluc_variants - num_fluc_hits - num_fluc_missense_variants + num_fluc_missense_hits],
])
odd_r, p = fisher_exact(fluc_table, alternative='two-sided')
# Bonferroni correction for the four annotations tested, capped at 1 so the
# adjusted value is still a valid probability.
print(min(p*4, 1.0))

In [None]:
#Calculating enrichment of missense annotation for cocl hits
# All counts are numbers of distinct var_ids.
num_cocl_missense_hits = cocl_hit_annos_for_plot[cocl_hit_annos_for_plot['Consequence']=='missense_variant'].var_id.nunique()
num_cocl_missense_variants = cocl_all_var_annos_for_plot[cocl_all_var_annos_for_plot['Consequence']=='missense_variant'].var_id.nunique()
num_cocl_hits = cocl_hits.var_id.nunique()
num_cocl_variants = cocl_all_var_annos_for_plot.var_id.nunique()
# 2x2 contingency table: rows are missense / non-missense variants,
# columns are hits / non-hits.
cocl_table = np.array([
    [num_cocl_missense_hits,
     num_cocl_missense_variants - num_cocl_missense_hits],
    [num_cocl_hits - num_cocl_missense_hits,
     num_cocl_variants - num_cocl_hits - num_cocl_missense_variants + num_cocl_missense_hits],
])
odd_r, p = fisher_exact(cocl_table, alternative='two-sided')
# Bonferroni correction for the four annotations tested, capped at 1 so the
# adjusted value is still a valid probability.
print(min(p*4, 1.0))

In [None]:
#Calculating enrichment of missense annotation for caffeine hits
# All counts are numbers of distinct var_ids.
num_caff_missense_hits = caff_hit_var_annos_for_plot[caff_hit_var_annos_for_plot['Consequence']=='missense_variant'].var_id.nunique()
num_caff_missense_variants = caff_all_var_annos_for_plot[caff_all_var_annos_for_plot['Consequence']=='missense_variant'].var_id.nunique()
num_caff_hits = caff_hits.var_id.nunique()
num_caff_variants = caff_all_var_annos_for_plot.var_id.nunique()
# 2x2 contingency table: rows are missense / non-missense variants,
# columns are hits / non-hits.
caff_table = np.array([
    [num_caff_missense_hits,
     num_caff_missense_variants - num_caff_missense_hits],
    [num_caff_hits - num_caff_missense_hits,
     num_caff_variants - num_caff_hits - num_caff_missense_variants + num_caff_missense_hits],
])
odd_r, p = fisher_exact(caff_table, alternative='two-sided')
# Bonferroni correction for the four annotations tested, capped at 1 so the
# adjusted value is still a valid probability.
print(min(p*4, 1.0))

In [None]:
# Enrichment of each annotation = its share among hits divided by its share
# among all measured variants.
fluc_consequence_counts['Enrichment'] = fluc_consequence_counts['Hit Proportions'].div(fluc_consequence_counts['Library Proportions'])
caff_consequence_counts['Enrichment'] = caff_consequence_counts['Hit Proportions'].div(caff_consequence_counts['Library Proportions'])
cocl_consequence_counts['Enrichment'] = cocl_consequence_counts['Hit Proportions'].div(cocl_consequence_counts['Library Proportions'])

fig, ax = plt.subplots(figsize=(1.5, 1.5))
plt.rcParams['font.size'] = 7

# Join the three tables on their consequence index. Suffixes are applied only
# to colliding column names, so the CoCl2 column stays plain 'Enrichment'.
# The resulting column order (fluconazole, caffeine, CoCl2) is what the
# legend labels below rely on.
fluc_and_caff = fluc_consequence_counts.merge(
    caff_consequence_counts, left_index=True, right_index=True, suffixes=('fluc', 'caff'))
allconds = fluc_and_caff.merge(cocl_consequence_counts, left_index=True, right_index=True)
allconds = allconds.filter(regex='Enrichment')

# Only these four annotations are plotted, in this order
plotted_annotations = ['synonymous_variant', 'missense_variant', 'upstream_gene_variant', 'downstream_gene_variant']
allconds.loc[plotted_annotations].plot.bar(ax=ax)
lgnd = ax.legend(['FLC', 'CAFF', r'CoCl$_2$'], loc='upper right', handlelength=.5, handleheight=.5)

# Dashed reference line at enrichment 1 (hit share equals library share)
ax.axhline(1, color='darkgrey', linestyle='dashed')
ax.set_ylabel('Enrichment over Library')
ax.set_xlabel('Variant Annotation')
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(['Synonymous', 'Missense', 'Upstream', 'Downstream'], rotation=45)
fig.savefig('../GxE_Figures/Figure_2/figure2d_annotation_enrichments_fc.svg')