In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import matplotlib.lines as mlines
from statsmodels.robust.scale import mad




warnings.filterwarnings('ignore')

plt.rcParams['font.family'] = "sans-serif"
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rcParams['mathtext.fontset'] = 'custom'
plt.rcParams['mathtext.cal'] = 'Arial' 

%matplotlib inline




# Tab-separated input tables; paths are relative to the notebook's working directory.
OLIGO_TABLE_PATH = '../data_tables/oligos_nonuniq_crispey3_GG_9bp_OLIGO_with_seq_primers.txt'
gxe_df = pd.read_csv('../data_tables/ergosterol_pathway_gxe_interactions.tsv', sep='\t')
var_fitness_df = pd.read_csv('../data_tables/Fitness_files/ergosterol_pathway_fitness.tsv', sep='\t')
# `sep` has to be a keyword argument: pandas >= 2.0 accepts only the path positionally.
oli_info = pd.read_csv(OLIGO_TABLE_PATH, sep='\t')


annotations = pd.read_csv('../data_tables/ergosterol_annotations.tsv', sep='\t')
# Inner join on `var_id`: only variants present in both tables are kept.
annotated_vars = var_fitness_df.merge(annotations, how='inner', on='var_id')

In [None]:
def _load_umi_table(tsv_path):
    '''Reads one tab-separated DESeq2 UMI-level result table and adds a
    'barcode_id' column derived from the row labels.
    inputs:
        tsv_path (String): path of the table to read
    returns:
        DataFrame with the extra 'barcode_id' column
    '''
    umi_table = pd.read_csv(tsv_path, sep='\t')
    # Each row label is split on '_' and fields 1 and 2 are re-joined with '_'.
    # NOTE(review): relies on read_csv producing a string row index for these
    # files (no explicit index_col is given) -- confirm against the TSV layout.
    label_fields = umi_table.index.str.split('_')
    umi_table['barcode_id'] = label_fields.str[1:3].str.join('_')
    return umi_table


lova = _load_umi_table('../data_tables/Deseq_outputs/deseq2_res_bar_P1_LOV_competitiontime_SCM_umi_level.tsv')
nacl = _load_umi_table('../data_tables/Deseq_outputs/deseq2_res_bar_P1_NACL_competitiontime_SCM_umi_level.tsv')
caff = _load_umi_table('../data_tables/Deseq_outputs/deseq2_res_bar_P1_CAFF_competitiontime_SCM_umi_level.tsv')
sc = _load_umi_table('../data_tables/Deseq_outputs/deseq2_res_bar_P1_SCM_competitiontime_SCM_umi_level.tsv')
cocl = _load_umi_table('../data_tables/Deseq_outputs/deseq2_res_bar_P1_COCL_competitiontime_SCM_umi_level.tsv')
tbf = _load_umi_table('../data_tables/Deseq_outputs/deseq2_res_bar_P1_TBF_competitiontime_SCM_umi_level.tsv')

def robust_outlier_removal(umi_df, fc_cutoff=3.5):
    '''
    Accepts a pandas DataFrame of deseq2 results for a given barcode
    and removes outlier UMIs based on log2FoldChange.
    Every UMI gets a robust score |log2FoldChange - median| / MAD, computed
    over all UMIs of the barcode; UMIs scoring above fc_cutoff are dropped.
    Default cutoff chosen to be conservative.
    Special cases:
        - an empty frame is returned unchanged (as a copy)
        - a barcode with exactly one UMI yields a frame that keeps the index
          but has no columns
        - when the MAD is 0, UMIs equal to the median are kept (score is NaN)
          and every other UMI is dropped (score is inf)
    inputs:
        umi_df (pandas DataFrame): deseq results for a given barcode, must contain
            a 'log2FoldChange' column
        fc_cutoff (float): cutoff for how many MAD away from the median a UMI must be to be called an outlier
    returns:
        DataFrame for barcode with outliers removed
    '''
    if len(umi_df) == 0:
        return umi_df.copy()
    if len(umi_df) == 1:
        return pd.DataFrame(index=umi_df.index)

    fcs = umi_df['log2FoldChange'].values
    # Median and MAD are properties of the whole barcode: compute them once
    # instead of once per UMI.
    fc_median = np.median(fcs)
    fc_mad = mad(fcs)
    # A zero MAD gives inf/NaN scores; silence numpy's floating-point warnings
    # for this division only, the comparison below handles both values.
    with np.errstate(divide='ignore', invalid='ignore'):
        fc_outlier_stats = np.abs(fcs - fc_median) / fc_mad

    # Positional boolean mask (rather than dropping by label) so that rows
    # sharing an index label with an outlier are not removed along with it.
    is_outlier = fc_outlier_stats > fc_cutoff
    return umi_df.loc[~is_outlier]



        
def remove_outliers(df, axis, title=None):
    '''removes outliers for all barcodes using robust_outlier_removal function
    returns dataframe with outliers removed and plots a scatter plot
    with the log2FoldChange and baseMean of each UMI: every UMI is drawn in red
    first, then the retained UMIs are drawn over them in black, so only the
    removed ones stay red. The x axis is log-scaled and the per-axis labels
    are hidden.
    inputs:
        df (pandas DataFrame): deseq2 results for a given condition, with
            'barcode_id', 'baseMean' and 'log2FoldChange' columns
        axis (matplotlib Axis): axis on which to plot the outlier graphs
        title (String): optional title for outlier plot
    returns:
        DataFrame for condition with outliers removed
    '''
    df.plot.scatter('baseMean', 'log2FoldChange', c = 'r', ax = axis, s = .5)
    # group_keys=False keeps the original row labels on the combined result.
    # Dropping a 'barcode_id' index level afterwards instead fails with a
    # KeyError whenever apply returns an un-keyed frame (which some pandas
    # versions do when no group had a row removed).
    retained = df.groupby('barcode_id', group_keys=False).apply(robust_outlier_removal)
    retained.plot.scatter('baseMean', 'log2FoldChange', c = 'black', ax = axis, s = .5)
    axis.set_xscale('log')
    axis.set_title(title)
    axis.yaxis.get_label().set_visible(False)
    axis.xaxis.get_label().set_visible(False)
    return retained

def bc_filter(df, cutoff, axis, cond):
    '''Drops barcodes whose UMIs disagree too much with each other.
    The standard deviation of log2FoldChange is computed per barcode, shown as a
    100-bin grey histogram with a dashed red line at the cutoff, and only barcodes
    whose standard deviation is strictly below the cutoff are kept. Barcodes with
    an undefined standard deviation (NaN) fail the comparison and are dropped too.
    Intended as a conservative step for barcodes so spread out that per-UMI
    outlier detection is ineffective.
    inputs:
        df (pandas DataFrame): deseq2 results for a given condition post outlier removal
        cutoff (float): standard deviation of UMI fitness at or above which a barcode is removed
        axis (matplotlib Axis): axis upon which to plot histogram of barcode standard deviations
        cond (String): condition name for title of graph
    returns:
        DataFrame with high variance barcodes removed
    '''
    per_barcode_sd = df['log2FoldChange'].groupby(df['barcode_id']).std()

    # Diagnostic panel: spread distribution plus the cutoff marker.
    per_barcode_sd.hist(color = 'grey', bins = 100, ax = axis)
    axis.axvline(cutoff, c = 'red',linestyle = 'dashed')
    axis.set_title(cond)

    passes_cutoff = per_barcode_sd < cutoff
    kept_barcodes = per_barcode_sd.loc[passes_cutoff].index
    row_is_kept = df['barcode_id'].isin(kept_barcodes)
    return df.loc[row_is_kept]


In [None]:

# One text size (7 pt) and Arial for every figure element.
SIZE = 7
plt.rcParams.update({
    'font.size': SIZE,
    'font.family': 'Arial',
    'axes.titlesize': SIZE,
    'axes.labelsize': SIZE,
    'xtick.labelsize': SIZE,
    'ytick.labelsize': SIZE,
    'legend.fontsize': SIZE,
    'figure.titlesize': SIZE,
})

# 3 x 2 grid, one panel per condition.
fig, axes = plt.subplots(3, 2, figsize=(7.74, 7*1.5))
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.04,
                    wspace=0.2, hspace=.2)
# Column-major unpacking: ax, ax2, ax3 are the left column (top to bottom),
# ax4, ax5, ax6 the right column.
ax, ax2, ax3, ax4, ax5, ax6 = axes.flatten(order='F')

# Per-condition UMI outlier removal; each call also draws its diagnostic scatter.
sc_filt = remove_outliers(sc, axis=ax, title='SC')
lova_filt = remove_outliers(lova, axis=ax2, title='LOV')
tbf_filt = remove_outliers(tbf, axis=ax3, title='TBF')
cocl_filt = remove_outliers(cocl, axis=ax4, title='COCL')
caff_filt = remove_outliers(caff, axis=ax5, title='CAFF')
nacl_filt = remove_outliers(nacl, axis=ax6, title='NACL')

# Shared axis labels and a figure-level legend built from proxy markers.
fig.supxlabel('baseMean')
fig.supylabel('log2FoldChange')
nonoutlier_legend_marker, outlier_legend_marker = (
    mlines.Line2D([], [], color=marker_colour, marker='o', linestyle='None',
                  markersize=2, label=marker_label)
    for marker_colour, marker_label in (('black', 'Retained UMIs'),
                                        ('red', 'Removed Outliers'))
)
fig.legend(handles=[nonoutlier_legend_marker, outlier_legend_marker],
           bbox_to_anchor=(1.05, 0.5))



In [None]:

# One text size (7 pt) and Arial for every figure element.
SIZE = 7
plt.rcParams['font.size'] = SIZE
plt.rcParams['font.family'] = 'Arial'

plt.rc('font', size=SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SIZE)    # legend fontsize
plt.rc('figure', titlesize=SIZE)  # fontsize of the figure title

# Barcodes whose UMI log2FoldChange standard deviation is not below this value
# are discarded by bc_filter; the same cutoff is applied to every condition.
BC_STD_CUTOFF = 0.05

fig,axes = plt.subplots(3,2, figsize = (7,7*1.5))
plt.subplots_adjust(wspace = 0.2,hspace = .2)
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.04)
ax = axes[0,0]
ax2 = axes[1,0]
ax3 = axes[2,0]
ax4 = axes[0,1]
ax5 = axes[1,1]
ax6 = axes[2,1]

# Attach oligo information to the outlier-filtered UMIs of each condition
# (inner join: UMIs whose barcode is missing from oli_info are dropped).
sc_inf = sc_filt.merge(oli_info, how='inner', on='barcode_id')
lova_inf = lova_filt.merge(oli_info, how='inner', on='barcode_id')
# Uses tbf_filt like every other condition; the raw `tbf` table would bypass
# UMI outlier removal for TBF only.
tbf_inf = tbf_filt.merge(oli_info, how='inner', on='barcode_id')
caff_inf = caff_filt.merge(oli_info, how='inner', on='barcode_id')
cocl_inf = cocl_filt.merge(oli_info, how='inner', on='barcode_id')
nacl_inf = nacl_filt.merge(oli_info, how='inner', on='barcode_id')

# Remove high-variance barcodes; each call draws its histogram on the given axis.
sc_inf = bc_filter(sc_inf, BC_STD_CUTOFF, ax, 'SC')
lova_inf = bc_filter(lova_inf, BC_STD_CUTOFF, ax2, 'LOV')
tbf_inf = bc_filter(tbf_inf, BC_STD_CUTOFF, ax3, 'TBF')
caff_inf = bc_filter(caff_inf, BC_STD_CUTOFF, ax4, 'CAFF')
cocl_inf = bc_filter(cocl_inf, BC_STD_CUTOFF, ax5, 'COCL')
nacl_inf = bc_filter(nacl_inf, BC_STD_CUTOFF, ax6, 'NACL')



In [None]:
#Plots the distribution of fitness effects for each gene within the ergosterol pathway as a swarm plot,
#with variants with significant fitness effects in red and others in grey. Blue and white
#panels are meant to aid in separating the genes visually.

import seaborn as sns


plt.rc('font', family='Arial')
# Panel titles, looked up by condition code.
condition_dict = {
    'SC': 'SC',
    'LOV': 'Lovastatin',
    'TBF': 'Terbinafine',
    'COCL': 'CoCl$_2$',
    'CAFF': 'Caffeine',
    'NACL': 'NaCl',
}



def gene_DFE_plotter(condition, axis, target = None, show_legend = False, show_ylabel = False):
    '''Draws the per-gene distribution of variant fitness effects for one condition
    as a swarm plot: variants with padj < .01 in red, all others in grey. The y axis
    is made symmetric around 0 and every other gene column is shaded light blue to
    separate the genes visually.
    inputs:
        condition (String): value of the 'condition' column of annotated_vars to plot;
            must also be a key of condition_dict (used for the panel title)
        axis (matplotlib Axis): axis on which to draw
        target (sequence of two floats): optional pair of x positions at which
            dashed vertical lines are drawn
        show_legend (bool): whether the FDR legend is visible on this axis
        show_ylabel (bool): whether the 'Variant Fitness' y label is visible
    returns:
        None
    '''
    # Plotting order of the genes along the x axis.
    genes = ['ERG10','ERG13','HMG1','HMG2','ERG12', 'ERG8','MVD1','IDI1','ERG20','ERG9','ERG1','ERG7','ERG11','NCP1','ERG24','ERG25','ERG26','ERG27','ERG28','ERG29','ERG6','ERG2','ERG3','ERG5','ERG4']

    # .copy() so the flag column is added to an independent frame rather than
    # to a slice of annotated_vars.
    cond_dat = annotated_vars[annotated_vars['condition']==condition].copy()
    cond_dat['Significant?'] = cond_dat['padj']<.01
    # hue_order and palette are paired positionally: False -> grey, True -> red.
    sns.swarmplot(x = 'Gene', y = 'coef', hue = 'Significant?', data = cond_dat, ax = axis,
                  palette=['grey','red'], order = genes, s = 2, hue_order = [False, True])

    # Symmetric y range around zero.
    ymin,ymax = axis.get_ylim()
    span = max(abs(ymin),abs(ymax))
    axis.set_ylim(-span,span)
    axis.set_xticklabels(genes, rotation = 90, fontsize =6)
    axis.set_title(condition_dict[condition], fontsize = 7)

    # Explicit proxy handles tie each label to its colour. Passing only
    # `labels=` pairs them with whichever artists come first on the axis, and
    # resizing through the legend's handle list relies on private attributes.
    marker_diameter = 30 ** 0.5  # points; same footprint as a scatter marker of area 30
    legend_handles = [
        mlines.Line2D([], [], color='red', marker='o', linestyle='None',
                      markersize=marker_diameter, label='FDR<.01'),
        mlines.Line2D([], [], color='grey', marker='o', linestyle='None',
                      markersize=marker_diameter, label='FDR\u2265.01'),
    ]
    # Always created (it replaces the legend seaborn added) and then shown or
    # hidden; the anchor is in this axis' coordinates.
    lgnd = axis.legend(handles = legend_handles, loc='center left',bbox_to_anchor=(.89, -4.5), fontsize = 7)
    lgnd.set_visible(show_legend)

    # Shade the columns at odd tick positions.
    for tick in axis.get_xticks():
        if tick%2:
            axis.axvspan(tick-.5,tick+.5,facecolor = 'lightblue', alpha = .6)

    if target:
        axis.axvline(target[0], linestyle = 'dashed', color = 'black', linewidth = .5)
        axis.axvline(target[1], linestyle = 'dashed',color = 'black', linewidth = .5)
    axis.set_ylabel('Variant Fitness',fontsize=7)
    axis.yaxis.get_label().set_visible(show_ylabel)
    axis.xaxis.get_label().set_visible(False)
    

In [None]:
# 3 x 2 grid of gene-level fitness-effect panels, one per condition.
fig, axes = plt.subplots(3, 2, figsize=(5, 4))
fig.tight_layout()
plt.subplots_adjust(wspace=0.2, hspace=.8)

# One text size (7 pt) for every figure element.
SIZE = 7
plt.rcParams.update({
    'font.size': SIZE,
    'axes.titlesize': SIZE,
    'axes.labelsize': SIZE,
    'xtick.labelsize': SIZE,
    'ytick.labelsize': SIZE,
    'legend.fontsize': SIZE,
    'figure.titlesize': SIZE,
})

# Column-major unpacking: ax, ax2, ax3 are the left column (top to bottom),
# ax4, ax5, ax6 the right column.
ax, ax2, ax3, ax4, ax5, ax6 = axes.flatten(order='F')

# (condition code, panel axis, extra options). `target` gives the x positions
# of the two dashed lines: 1.5/3.5 bracket tick positions 2-3 and 9.5/10.5
# bracket tick position 10 in gene_DFE_plotter's gene order.
panel_specs = [
    ('SC', ax, dict(show_legend=True)),
    ('COCL', ax5, dict()),
    ('LOV', ax2, dict(target=[1.5, 3.5], show_ylabel=True)),
    ('CAFF', ax4, dict()),
    ('TBF', ax3, dict(target=[9.5, 10.5])),
    ('NACL', ax6, dict()),
]
for condition_code, panel_axis, panel_options in panel_specs:
    gene_DFE_plotter(condition_code, axis=panel_axis, **panel_options)

plt.savefig('../GxE_Figures/Figure_3/fig3c_FC.svg')