# Nucleotide percentage plots

This notebook contains code to process the "Nucleotide_percentage_table" file from the CRISPResso output into a file containing the percentage of each nucleotide at each target nucleotide, numbered relative to the sgRNA. It also produces a plot showing the same information (as in Figure S4A, D, G, J).

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import base_edit_functions as be
from math import log
from os import path
#import aa_guideseq_visualization as guideseq
# Global matplotlib settings for every figure saved by this notebook.
# pdf.fonttype 42 makes the PDF backend write Type 42 (TrueType) fonts.
mpl.rc('pdf', fonttype=42)
# Use Arial as the sans-serif face and make sans-serif the default font family.
mpl.rcParams['font.sans-serif'] = "Arial"
mpl.rcParams['font.family'] = "sans-serif"

In [2]:
def get_bev_str(bev):
    """Return the BEV number as a zero-padded string of at least three characters.

    `bev` may be anything `int()` accepts (e.g. 7 or '7'). Values below 10 get
    two leading zeros, values below 100 get one, larger values are unchanged.
    """
    number = int(bev)
    if number >= 100:
        padding = ''
    elif number >= 10:
        padding = '0'
    else:
        padding = '00'
    return padding + str(number)

def check_filepath(filepath,bev,primer):
    """Locate the nucleotide percentage table of one CRISPResso run.

    The candidate path is
    <filepath>/CRISPResso_on_BEV_<bev>_<primer>/Nucleotide_percentage_table.txt.
    Returns that path if it exists on disk, otherwise returns 0.
    """
    run_dir = 'CRISPResso_on_BEV_' + bev + '_' + primer
    candidate = '/'.join([filepath, run_dir, 'Nucleotide_percentage_table.txt'])
    return candidate if path.exists(candidate) else 0
        
def get_complement(val):
    """Return the complementary DNA base for 'A', 'C', 'G' or 'T'.

    Any other value raises KeyError.
    """
    pairs = dict(zip('ACGT', 'TGCA'))
    return pairs[val]

def get_bev_df(bev_list,rev_com,offset,output_name,primer):
    """Combine CRISPResso nucleotide percentage tables for a set of replicates.

    For every entry of `bev_list` the matching Nucleotide_percentage_table.txt
    is read and transposed so that each row is one position, then merged with
    the other replicates on ('WT', 'position', 'offset_position'). A
    per-nucleotide average over the replicates is added, the full table is
    written to
    ../Data/Validation_CRISPResso_results/nucleotide_percentage/BEV_<output_name>.csv
    and only the rows whose 'WT' value is 'C' are returned.

    Parameters
    ----------
    bev_list : list
        BEV identifiers (str or int); one CRISPResso result is read per entry.
    rev_com : bool
        If true, 'WT' and the nucleotide columns are complemented and
        'position' is counted from the opposite end.
    offset : int
        Manually determined, sgRNA-specific shift;
        offset_position = position - offset + 1.
    output_name : str
        Stem of the CSV file that is written.
    primer : str
        Primer name, part of the CRISPResso output folder name.

    Returns
    -------
    pandas.DataFrame
        Rows with WT == 'C', holding the columns <nuc>_<bev> and <nuc>_avg for
        nuc in A, C, G, T, N, del, plus 'WT', 'position' and 'offset_position'.

    Raises
    ------
    ValueError
        If `bev_list` is empty.
    IOError
        If the table of one of the replicates does not exist.
    """
    if len(bev_list) == 0:
        raise ValueError('bev_list must contain at least one BEV number')

    nucleotides = ['A','C','G','T','N','del']
    # Same input directory for every replicate, so it is set once here.
    filepath = '../Data/Validation_CRISPResso_results/Completed_CRISPResso_files_v2'

    existing_df = None
    for BEV in bev_list:
        file_loc = check_filepath(filepath,get_bev_str(BEV),primer)
        if not file_loc:
            # check_filepath returns 0 for a missing file; fail with a clear message
            # instead of handing that 0 to pandas.
            raise IOError('Nucleotide_percentage_table.txt not found for BEV '
                          + get_bev_str(BEV) + ' / primer ' + str(primer) + ' under ' + filepath)
        data = pd.read_table(file_loc,header=None)
        data = data.transpose()
        # Set first row to be column names
        data.columns = data.iloc[0]
        data = data.drop(data.index[0])
        # The column with an empty (NaN) header is renamed to 'WT'
        # (presumably the reference base at each position -- TODO confirm against CRISPResso output)
        data = data.rename(columns = {np.nan:'WT'})
        # Add column showing the 0-based position (row 0 was consumed as the header)
        data['position'] = data.index - 1
        data = data.rename(columns = {'-':'del'}) # to display better in Excel
        # Handle reverse complement cases
        if rev_com:
            # Complement both the 'WT' values and the nucleotide column labels
            data['WT'] = data['WT'].apply(get_complement)
            data = data.rename(columns = {'A':'T','C':'G','G':'C','T':'A'})
            # Reverse: count positions from the other end
            data['position'] = -1 * (data['position'] - max(data['position']))

        # Calculate "offset_position" relative to sgRNA using manually-determined offset (custom for each sgRNA)
        data['offset_position'] = data['position'] - offset + 1

        # Tag the nucleotide columns with the replicate they came from
        data = data.rename(columns = {nuc: nuc+'_'+str(BEV) for nuc in nucleotides})

        if existing_df is None:
            existing_df = data
        else:
            # Merge onto existing dataframe
            existing_df = pd.merge(existing_df,data,on=['WT','position','offset_position'],how='outer')

    # Average columns across replicates
    for nuc in nucleotides:
        # str() keeps these names identical to the ones created in the loop above,
        # also when bev_list holds integers.
        cols = [nuc+'_'+str(bev) for bev in bev_list]
        existing_df[cols] = existing_df[cols].astype(float)
        existing_df[nuc+'_avg'] = existing_df[cols].mean(axis=1)

    # Write out file (all positions, before filtering)
    existing_df.to_csv('../Data/Validation_CRISPResso_results/nucleotide_percentage/BEV_'+output_name+'.csv',index=False)

    # Filter for rows where WT is C
    existing_df = existing_df.loc[existing_df['WT'] == 'C']

    return existing_df


In [3]:
def make_plot_v2(data,left_lim,right_lim,bevs,primer,width,height):
    """Plot nucleotide percentages per position and save the figure as a PDF.

    Averages over replicates are drawn as bars (one hue per nucleotide) and the
    individual replicates as black dots on top. The figure is written to
    ../Data/Validation_CRISPResso_results/nucleotide_percentage/BEV_<primer>.pdf.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with the columns 'WT', 'offset_position', <nuc>_<bev> and
        <nuc>_avg for nuc in A, T, G, N, del (as returned by get_bev_df).
    left_lim, right_lim : number
        Exclusive bounds; only rows with left_lim < offset_position < right_lim
        are plotted.
    bevs : list
        Replicate identifiers (str or int) used in the column names.
    primer : str
        Stem of the output file name. The caller in this notebook passes its
        output name (BEV numbers plus primer) here, not the bare primer.
    width, height : number
        Figure size handed to `figsize`.
    """
    # Get list of value_vars for pd.melt
    value_vars = []
    for nuc in ['A','T','G','N','del']:
        for bev in bevs:
            value_vars.append(nuc+'_'+str(bev))
        value_vars.append(nuc+'_avg')

    # Make tidy data
    data = data.melt(id_vars=['WT','offset_position'],value_vars=value_vars,
                    var_name='nucleotide',value_name='percentage')

    # Filter for nucleotides to include on plot
    data = data[(data['offset_position'] > left_lim) & (data['offset_position'] < right_lim)]
    data = data.sort_values(by='offset_position')
    # Positions become strings so that they are plotted as categories
    data['offset_position'] = data['offset_position'].astype(str)

    # Convert to percentage (input values are presumably fractions -- TODO confirm)
    data['percentage'] = data['percentage'] * 100

    # Split "nucleotide" column into two: one with BEV number (or average) and one with nucleotide
    data['BEV'] = data['nucleotide'].apply(lambda x: x.split('_')[1]) # returns BEV number or "avg"
    data['nucleotide'] = data['nucleotide'].apply(lambda x: x.split('_')[0]) # returns nucleotide

    # Get order of offset positions for plotting
    order = data.loc[(data['nucleotide'] == 'A') & (data['BEV'] == 'avg'),'offset_position'].tolist() # require that 'BEV' == 'avg' just to deduplicate list
    hue_order = ['A','G','T','N','del']

    # Make plot
    fig,ax = plt.subplots(figsize=(width,height))
    sns.set_context(rc = {'patch.linewidth': 0.0})

    # Plot average of the replicates as bar
    sns.barplot(x='offset_position',y='percentage',hue='nucleotide',data=data.loc[data['BEV'] == 'avg',:],order=order,hue_order=hue_order,palette=sns.color_palette('Set2'),
               linewidth=0,ax=ax)

    # Plot individual replicates as dots
    for bev in bevs:
        # str() matches the labels built for value_vars above, so integer
        # identifiers select their rows as well.
        sns.stripplot(x='offset_position',y='percentage',hue='nucleotide',data=data.loc[data['BEV'] == str(bev),:],order=order,hue_order=hue_order,color='black',s=1,dodge=True,ax=ax)

    ax.set_xlabel('position of C',fontsize=6)
    ax.set_ylabel('percentage',fontsize=6)

    # Set y axis to start at 0
    ylim_upper = ax.get_ylim()[1]
    ax.set_ylim(0,ylim_upper)

    # Clean up axes
    sns.despine()
    for axis in ['bottom','left']:
        ax.spines[axis].set_linewidth(0.5)
    ax.tick_params(axis='both',labelsize=6,width=0.5,length=2)

    # Plot legend: keep only the last five entries, one per hue level.
    # NOTE(review): assumes the bar handles come last in the returned list -- confirm
    # with the installed matplotlib/seaborn versions.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[-5:], labels[-5:],loc='upper right',frameon=False,fontsize=6)
    # Use the function argument for the file name; the original read a global
    # variable called output_name that only exists while the driver loop runs.
    fig.savefig('../Data/Validation_CRISPResso_results/nucleotide_percentage/BEV_'+primer+'.pdf',transparent=True,bbox_inches = "tight")
    # Close this specific figure so figures do not pile up across loop iterations
    plt.close(fig)

    return

In [4]:
# Get input data: one row per plot, with the columns BEV (replicate numbers
# separated by ';'), primer, rev_com, offset, left_lim, right_lim, width, height
input_df = pd.read_csv('../Data/Validation_CRISPResso_results/nucleotide_percentage/nucleotide_percentage_input_v2.csv')

for _,row in input_df.iterrows():
    # Progress output; the parenthesised form prints the same on Python 2 and Python 3
    print(row['BEV'])
    # str() guards against pandas parsing the BEV column as numeric
    bev_list = str(row['BEV']).split(';')
    # Shared stem for the CSV and PDF written for this row
    output_name = '_'.join(bev_list)+'_'+row['primer']
    bev_df = get_bev_df(bev_list,row['rev_com'],row['offset'],output_name,row['primer'])
    make_plot_v2(bev_df,row['left_lim'],row['right_lim'],bev_list,output_name,row['width'],row['height'])
    

364;365
366;367
286;287
288;289
290;291
230;231;372;373
380;381
244;245;376;377
248;249;378;379
300;301
302;303
305
314;315
316;317
319
328;329
330;331
333
342;343
348;349
350;351
352;353
354;355
356;357
358;359
360;361
362;363
368;369
370;371
384;385
388;389
10;11
16;17
22;23
34;35
40;41
42;43
44;45
46;47
52;53
58;59
64;65
70;71
76;77
82;83
88;89
