### Edit over time plots

Author: Ruth Hanna

This notebook contains the code to create plots showing the % of reads containing a given edit at specified timepoints (e.g. Figure 3G). 

The inputs are: 
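1. an input file (csv) containing the required metainformation: for each plot, the sgRNA, primer, edit position and identity, and plot settings; for each sample, its timepoint and treatment
2. the processed allele frequency tables, which are produced by the BEV_allele_frequencies notebook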

The outputs are:
1. a pdf file containing a plot that shows the % of all reads containing the specified edit at each timepoint / drug condition
2. a csv file containing the plotted data

In [4]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

# Default plot styling. The seaborn theme is applied first because it
# rewrites several rcParams; the matplotlib overrides below must come after it.
sns.set_style('ticks')
sns.set_context('paper')
mpl.rcParams.update({
    # Font type 42 writes text into the pdf as TrueType fonts
    'pdf.fonttype': 42,
    # Thin axes and short, thin major ticks for small figure panels
    'axes.linewidth': 0.5,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'xtick.major.size': 2,
    'ytick.major.size': 2,
    # Use Arial for all text
    'font.sans-serif': "Arial",
    'font.family': "sans-serif",
})
current_palette = sns.color_palette('Set2')

In [5]:
def filter_out_indels(row):
    '''
    Flag a read that contains an indel.

    An indel is indicated by the presence of '-' in either the
    Aligned_Sequence (deletions) or the Reference_Sequence (insertions).
    This function only reports the flag; the caller is responsible for
    dropping the flagged reads.

    row: mapping / Series with string fields 'Aligned_Sequence' and
         'Reference_Sequence'

    Returns True if the read contains an indel, False otherwise.
    '''
    return ('-' in row['Aligned_Sequence']) or ('-' in row['Reference_Sequence'])

def make_plot_v2(df,output_name,aa,title,hue_order,xlabel='days post-transduction',ylims=(0,100),figsize=(1.2,1.8)):
    '''
    Plot the values in df[aa] per day and treatment and save the figure as a pdf.

    Bars show the average for each day / treatment combination and black dots
    show the individual rows (replicates) on top of the bars.

    df: DataFrame with columns 'day', 'treatment' and the column named by aa
    output_name: file name (without extension) of the pdf, which is written to
                 '../Data/Validation_CRISPResso_results/aa_over_time/'
    aa: name of the column in df that holds the values to plot
    title: plot title
    hue_order: order in which the treatments are drawn within each day
    xlabel: x-axis label
    ylims: (lower, upper) limits of the y-axis
    figsize: (width, height) of the figure

    Returns None.
    '''
    # Set2 colors, reordered so that the first treatment gets the palette's last color
    color_nums = [7,0,1,2,3,4,5,6]
    palette = sns.color_palette('Set2')
    colors = [palette[n] for n in color_nums]
    fig,ax = plt.subplots(figsize=figsize)

    # Plot average of 2 replicates as bar
    sns.barplot(x='day',y=aa,data=df,palette=colors,zorder=1,linewidth=0,hue='treatment',ci=None,
                hue_order=hue_order,ax=ax)

    # Plot individual replicates as dots
    # (x / y are passed by keyword: newer seaborn releases no longer accept them positionally)
    sns.stripplot(x='day',y=aa,data=df,color='black',zorder=2,s=3,hue='treatment',dodge=True,
                  hue_order=hue_order,ax=ax)
    sns.despine(top=True,right=True)
    plt.xlabel(xlabel,fontsize=6)

    # Tidy up plot
    ax.set_ylim(ylims)
    plt.xticks(fontsize=6)
    plt.yticks(fontsize=6)
    ax.set_title(title,fontsize=6)
    plt.ylabel('percentage of reads with edit',fontsize=6)
    fig.savefig('../Data/Validation_CRISPResso_results/aa_over_time/'+output_name+'.pdf',transparent=True,bbox_inches='tight')
    # Close this figure explicitly so figures do not accumulate across calls
    plt.close(fig)

def run_v2(row,df):
    '''
    Compute the % of indel-free reads carrying one edit in every sample, then
    plot the result and save it as a csv.

    row: one plot definition; the fields read are 'sg', 'primer', 'pos_type',
         'pos', 'edit_to', 'edit_name', 'title', 'ylim', 'figwidth', 'xlabel'
    df: DataFrame with one row per sample of this plot; the columns read are
        'bev', 'day', 'treatment' and 'order'. The allele file must contain a
        '%Reads_<bev>' column for every sample.

    Writes '<sg>_<edit_name>.pdf' (via make_plot_v2) and '<sg>_<edit_name>.csv'
    to '../Data/Validation_CRISPResso_results/aa_over_time/'.
    If pos_type is neither 'aa' nor 'nuc', a message is printed and nothing
    is written.

    Returns None.
    '''
    # Read in full (unfiltered) allele file
    alleles = pd.read_csv('../Data/Validation_CRISPResso_results/allele_freq/'+row['sg']+'_'+str(row['primer'])+
                          '_allele_frequency_table_around_sgRNA.csv')
    # Drop reads with indels. The copy makes the filtered table independent of
    # the original, so adding the 'pos' column below does not write to a slice.
    alleles['contains_indel'] = alleles.apply(filter_out_indels,axis=1)
    alleles = alleles.loc[alleles['contains_indel'] == False,:].copy()

    # Get identity of edits at specified position
    # Handles either coding edits (pos_type == 'aa') or non-coding edits (pos_type == 'nuc')
    if row['pos_type'] == 'aa':
        sequence_col = 'Translated'
    elif row['pos_type'] == 'nuc':
        sequence_col = 'Aligned_Sequence'
    else:
        print('pos_type must be "aa" or "nuc"')
        return
    pos = int(row['pos'])
    alleles['pos'] = alleles[sequence_col].apply(lambda seq: seq[pos])

    # Assign BEVs to timepoints and treatments
    bev_dict = {'%Reads_'+r['bev']: (r['day'],r['treatment'],r['order']) for _,r in df.iterrows()}

    # Sum up %Reads for all rows with the same value for the specified residue
    # (list(...) is needed because dict keys cannot be concatenated to a list on Python 3)
    cols = sorted(list(bev_dict) + ['pos'])
    grouped = alleles[cols].groupby(['pos']).sum()

    # Transpose and filter for edit of interest
    # After the transpose there is one row per '%Reads_<bev>' sample
    grouped = grouped[grouped.index == row['edit_to']].transpose()

    # Annotate every sample with its day, treatment and plotting order
    samples = list(grouped.index)
    for i,col in enumerate(['day','treatment','order']):
        grouped[col] = [bev_dict[sample][i] for sample in samples]

    # Make sure the x-axis values are in the correct order
    grouped = grouped.sort_values(by='order')

    # Get hue order from sorted dataframe
    hue_order = grouped.loc[:,'treatment'].drop_duplicates(keep='first').tolist()
    output_name = row['sg'] + '_' + row['edit_name']
    make_plot_v2(df=grouped,title=row['title'],output_name=output_name,aa=row['edit_to'],ylims=(0,row['ylim']),
                 figsize=(row['figwidth'],1.5),xlabel=row['xlabel'],hue_order=hue_order)
    grouped.to_csv('../Data/Validation_CRISPResso_results/aa_over_time/'+output_name+'.csv')

In [6]:
input_file = pd.read_csv('../Data/Validation_CRISPResso_results/aa_over_time/aa_over_time_input_v3.csv')

# Columns that together define one plot. Rows of input_file that agree on all
# of them are the samples that belong to that plot.
key_cols = ['sg','primer','pos_type','pos','edit_to','edit_name','ylim','title','figwidth','xlabel']
sg_file = input_file[key_cols].dropna().drop_duplicates()

for i,row in sg_file.iterrows():
    # Select the samples whose key columns all match this plot definition
    matches = pd.Series(True,index=input_file.index)
    for col in key_cols:
        matches &= (input_file[col] == row[col])
    df = input_file.loc[matches,:]
    # Progress output: one line per plot
    print(row['sg'])
    run_v2(row,df)


sg5
sg1
sg12
sg3
sg10
sg10
sg3
sg29
sg30
sg31
sg32
sg32
sg33
sg34
sg35
sg36
sg24
sg25
sg26
sg27
sg28
sg28
sg28
sg16
sg15
sg17
sg21
sg19
sg18
sg20
