"""script to generate subplots showing read depth across each of the four amplicons"""

In [62]:
#import libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
# use pacbio conda environment


In [63]:
#set matplotlib rc parameters
def set_rc_params():
    """Apply the shared matplotlib rc settings and return the base font size.

    Returns:
        The font size (14) that was written to ``rcParams['font.size']``.
    """
    fontsize = 14
    settings = {
        # thicker ticks and axis lines
        'xtick.major.width': 2,
        'ytick.major.width': 2,
        'axes.linewidth': 2,
        # hide the top and right spines
        'axes.spines.top': False,
        'axes.spines.right': False,
        'font.size': fontsize,
        # To get the Microsoft font Arial working, follow this guide:
        # https://alexanderlabwhoi.github.io/post/2021-03-missingfont/
        'font.family': 'sans-serif',
        'font.sans-serif': ['Arial'],
        # keep text editable when the svg/pdf is opened in an editor
        'svg.fonttype': 'none',
        'pdf.fonttype': 42,
        # align the top-most y tick with the end of the axis
        'axes.autolimit_mode': 'round_numbers',
        # margins so that any error bars fit
        'axes.xmargin': 0.2,
        'axes.ymargin': 0.2,
    }
    for key, value in settings.items():
        rcParams[key] = value
    return fontsize
    

In [64]:
# process data function
def process_data(file):
    """Load the demultiplexing CSV and label each row with its gene.

    The gene is the second ``_``-separated field of ``barcode_pair_gene``
    (e.g. ``bc1017_ARF18_SW346_SW442.trim.fasta`` gives ``ARF18``).

    Args:
        file: path or file-like object accepted by ``pd.read_csv``; the
            first row is used as the header.

    Returns:
        A DataFrame with an added ``gene`` column, without any row that
        contains a missing value in any column.
    """
    raw = pd.read_csv(file, header=0)
    gene_names = raw['barcode_pair_gene'].str.split('_').str[1]
    return raw.assign(gene=gene_names).dropna()



In [65]:
# make plot showing read depth for each gene
def plot_read_depth(df, output_location):
    """Plot the ``count`` column against ``gene`` and save it as SVG and PDF.

    The figure is written to ``<output_location>/reads_mapped.svg`` and
    ``<output_location>/reads_mapped.pdf``.

    Args:
        df: DataFrame with a ``gene`` column and a ``count`` column.
            It is not modified.
        output_location: existing directory the two files are written to.
    """
    order = ['ARF9', 'ARF18', 'DREB26', 'NLP7']
    # Impose the gene order through a Categorical on a derived frame, so the
    # caller's ``gene`` column is left untouched (pd.Categorical replaces any
    # value that is not listed in ``order`` with NaN).
    plot_df = df.assign(gene=pd.Categorical(df['gene'], order))

    figure = plt.figure(figsize=(10, 10))
    try:
        sns.histplot(x='gene', y='count', data=plot_df)
        plt.xlabel('Amplicon')
        plt.ylabel('Number of mapped reads')
        # make the y axis start at 0
        plt.ylim(0)
        # save svg and pdf
        plt.savefig(f'{output_location}/reads_mapped.svg', format='svg', bbox_inches='tight')
        plt.savefig(f'{output_location}/reads_mapped.pdf', format='pdf', bbox_inches='tight')
    finally:
        # release the figure even when drawing or saving fails
        plt.close(figure)


In [66]:
# make plot showing reads at each mutated site
def plot_mutated_sites(files, output_location):
    """Plot ``read_number`` against gene for four tab-separated tables.

    The figure is written to ``<output_location>/mutation_read_counts.svg``
    and ``<output_location>/mutation_read_counts.pdf``.

    Args:
        files: sequence whose first four entries are the tables for ARF9,
            ARF18, DREB26 and NLP7, in that order.
        output_location: existing directory the two files are written to.
    """
    gene_labels = ('ARF9', 'ARF18', 'DREB26', 'NLP7')
    tables = []
    # index into ``files`` so a list with fewer than four entries still fails
    for position, label in enumerate(gene_labels):
        table = pd.read_csv(files[position], header=0, sep='\t')
        # tag every row with the gene its table belongs to
        table['gene'] = label
        tables.append(table)

    # stack the four tables on top of each other
    combined = pd.concat(tables, axis=0)
    print(combined.columns)
    combined = combined.reset_index(drop=True)

    plt.figure(figsize=(10, 10))
    sns.histplot(x='gene', y='read_number', data=combined)
    plt.xlabel('Gene')
    plt.ylabel('Mutation read count')
    # make the y axis start at 0
    plt.ylim(0)
    # save svg and pdf
    for extension in ('svg', 'pdf'):
        plt.savefig(f'{output_location}/mutation_read_counts.{extension}', format=extension, bbox_inches='tight')
    # remove plot
    plt.clf()
    plt.close()

   

In [67]:
#main function
def main():
    """Generate the read-depth and mutation read-count plots.

    Reads the per-gene demultiplexing CSV, applies the matplotlib rc
    settings, creates ``<output_location>/sequencing_plots`` if it does not
    exist yet, writes both plots into it and prints the median ``count``
    for each of the four genes.
    """
    # input locations (relative to the directory the script is run from)
    data_package = '../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022'
    file_loc = f'{data_package}/CB-PPBFX-994_Sam_Witham_EI_SW_ENQ-5142_A_01_demux_by_gene.csv'
    variant_dir = f'{data_package}/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call'
    # One table per gene, in the order plot_mutated_sites expects
    # (ARF9, ARF18, DREB26, NLP7). ARF18 uses the "_filtered" table.
    mutation_files = [
        f'{variant_dir}/ARF9/ARF9_TFBSoverlapping_genotyped_only_mutated.tsv',
        f'{variant_dir}/ARF18/ARF18_TFBSoverlapping_genotyped_only_mutated_filtered.tsv',
        f'{variant_dir}/DREB26/DREB26_TFBSoverlapping_genotyped_only_mutated.tsv',
        f'{variant_dir}/NLP7/NLP7_TFBSoverlapping_genotyped_only_mutated.tsv',
    ]
    output_location = '../../data/CRISPR_library'

    # read in and process data
    df = process_data(file_loc)
    # set matplotlib rc parameters (the returned font size is not needed here)
    set_rc_params()

    # make new folder for the plots
    dirName = f'{output_location}/sequencing_plots'
    try:
        os.mkdir(dirName)
        print("Directory " , dirName ,  " created")
    except FileExistsError:
        print("Directory " , dirName , " already exists")

    # plot read depth, then report the median read count per gene
    plot_read_depth(df, dirName)
    for gene in ('ARF9', 'ARF18', 'DREB26', 'NLP7'):
        median = df.loc[df['gene'] == gene, 'count'].median()
        print(f'median reads mapped for {gene} is {median}')

    # plot reads at mutation sites
    plot_mutated_sites(mutation_files, dirName)
    








In [68]:
#run main
# Only run the analysis when this file is executed directly, not on import.
if __name__ == '__main__':
    main()

Directory  ../../data/CRISPR_library/sequencing_plots  already exists
median reads mapped for ARF9 is 314.5
median reads mapped for ARF18 is 680.0
median reads mapped for DREB26 is 342.5
median reads mapped for NLP7 is 660.0
Index(['chr', 'plant_ID', 'platename', 'library', 'first_reaction_primers',
       'second_reaction_primers', 'guide', 'guide_number', 'aligned_sequence',
       'reference_sequence', 'mutation_type', 'genotype', 'read_number',
       'read_percentage', 'insertion_positions', 'deletion_positions',
       'substitution_positions', 'insertion_cut_site_distance',
       'deletion_cut_site_distance', 'substitution_cut_site_distance',
       'cut_site_promoter_position', 'insertion_positions_relative_to_TSS',
       'insertion_genomic_positions', 'deletion_positions_relative_to_TSS',
       'deletion_genomic_positions', 'substitution_positions_relative_to_TSS',
       'substitution_genomic_positions', 'insertion_overlapping_TFBS_family',
       'insertion_overlapping_TF