In [395]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from matplotlib import rcParams
from statannotations.Annotator import Annotator
import matplotlib.patches as mpatches
rcParams.update({'figure.autolayout': True})

In [396]:
#function to read in csv file as pandas df
def read_csv(file_name):
    """Load a qPCR results table and keep only the columns used downstream.

    Args:
        file_name: passed unchanged to ``pd.read_csv``.

    Returns:
        DataFrame holding the columns Well, Sample, Target, Cq and
        Amp Status, in that order. A ``KeyError`` is raised by pandas if
        any of them is absent from the file.
    """
    wanted_columns = ['Well', 'Sample', 'Target', 'Cq', 'Amp Status']
    raw_table = pd.read_csv(file_name)
    return raw_table[wanted_columns]

In [397]:
#function to filter out certain data points
def filter_data(df, amp_status, cq):
    """Keep wells with the requested amplification status and a Cq at or below a cut-off.

    Args:
        df: table with at least the columns 'Amp Status' and 'Cq'.
        amp_status: value of 'Amp Status' to keep (exact match).
        cq: inclusive upper bound for 'Cq'.

    Returns:
        New DataFrame with 'Cq' converted by ``pd.to_numeric``; the input
        frame is left unchanged.
    """
    #work on an independent copy of the rows with the wanted status
    selected = df[df['Amp Status'] == amp_status].copy()
    #Cq may have been read as text, so convert before comparing
    selected['Cq'] = pd.to_numeric(selected['Cq'])
    return selected[selected['Cq'] <= cq]

In [398]:
#function to make new columns and sort the data
def sort_data(df):
    """Average technical replicates, normalise to EF1a and attach the 1 mM nitrate reference.

    Steps:
      1. keep rows whose 'Amp Status' is 'Amp';
      2. average Cq over rows sharing Sample and Target (technical replicates);
      3. subtract the sample's mean EF1a Cq ('MeanCq_ECnormalised', i.e. delta Cq);
      4. parse the Sample name suffixes: a trailing 'NRT' flags a control,
         a trailing 'H' / 'L' sets condition to '10mM_nitrate' / '1mM_nitrate',
         and a trailing replicate letter A, B or C is removed ('Sample_old'
         keeps the name with the replicate letter);
      5. drop NRT controls, then average delta Cq of the 1 mM rows per
         Sample/Target ('1mMnitrate_Cq_mean') and merge it back on
         Sample_old and Target;
      6. drop rows with any missing value.

    Args:
        df: table with the columns 'Sample', 'Target', 'Cq' (numeric) and
            'Amp Status'.

    Returns:
        DataFrame with the columns Sample, Target, Cq_mean,
        MeanCq_ECnormalised, NRT, condition, Sample_old and
        1mMnitrate_Cq_mean.
    """
    #copy so the new columns are written to an independent frame, not a slice
    amplified = df.loc[df['Amp Status'] == 'Amp'].copy()

    #mean of technical replicates for each sample/target
    amplified['Cq_mean'] = amplified.groupby(['Sample', 'Target'])['Cq'].transform('mean')

    #one EF1a (housekeeping gene) mean per sample
    ef1a = (
        amplified.loc[amplified['Target'] == 'EF1a', ['Sample', 'Cq_mean']]
        .rename(columns={'Cq_mean': 'EF1a_Cq_mean'})
        .drop_duplicates()
    )
    merged = pd.merge(amplified, ef1a, on=['Sample'], how='left')

    #normalise based on EF1a gene (rows without an EF1a value are dropped)
    merged = normalise_data(merged, 'Cq_mean', 'EF1a_Cq_mean', 'MeanCq_ECnormalised')
    #collapse the technical replicates to one row per sample/target
    merged = merged[['Sample', 'Target', 'Cq_mean', 'MeanCq_ECnormalised']].drop_duplicates().copy()

    #flag no-reverse-transcriptase controls, then remove the suffix.
    #All suffix removals are anchored to the end of the name so that the same
    #letters elsewhere in a sample name (e.g. the C of "Col-0") are kept.
    merged['NRT'] = merged['Sample'].str.endswith('NRT').astype(bool)
    merged['Sample'] = merged['Sample'].str.replace(r'NRT$', '', regex=True)

    #object column (None), so writing the condition strings needs no dtype upcast
    merged['condition'] = None
    merged.loc[merged['Sample'].str.endswith('H'), 'condition'] = '10mM_nitrate'
    merged['Sample'] = merged['Sample'].str.replace(r'H$', '', regex=True)
    merged.loc[merged['Sample'].str.endswith('L'), 'condition'] = '1mM_nitrate'
    merged['Sample'] = merged['Sample'].str.replace(r'L$', '', regex=True)

    #keep the name including the biological replicate letter, then strip the letter
    merged['Sample_old'] = merged['Sample']
    merged['Sample'] = merged['Sample'].str.replace(r'[ABC]\s*$', '', regex=True).str.strip()

    #remove NRT controls BEFORE building the reference so that they can never
    #contribute to the 1 mM nitrate mean
    merged = merged.loc[~merged['NRT']].copy()

    #mean delta Cq across biological replicates of the 1 mM nitrate samples
    low_nitrate = merged.loc[merged['condition'] == '1mM_nitrate'].copy()
    low_nitrate['1mMnitrate_Cq_mean'] = (
        low_nitrate.groupby(['Sample', 'Target', 'condition'])['MeanCq_ECnormalised'].transform('mean')
    )
    low_nitrate = low_nitrate[['Sample_old', 'Target', '1mMnitrate_Cq_mean']].drop_duplicates()

    #NOTE(review): the merge key is Sample_old, so a 10 mM replicate whose
    #letter has no 1 mM counterpart gets no reference and is dropped below
    merged = pd.merge(merged, low_nitrate, on=['Sample_old', 'Target'], how='left')

    #remove rows with missing values (no condition or no 1 mM reference)
    return merged.dropna()


In [399]:
#function to normalise the data based on a column of Cq values (either to housekeeping or based on nitrate or wild type plant)
def normalise_data(df, orig_col,normalisation_col, new_column_name):
    """Subtract one Cq column from another and drop rows where the result is missing.

    Args:
        df: input table; it is NOT modified (the difference is written to a copy).
        orig_col: name of the column to normalise.
        normalisation_col: name of the column that is subtracted.
        new_column_name: name of the column receiving
            ``df[orig_col] - df[normalisation_col]``.

    Returns:
        New DataFrame with the extra column, restricted to rows where that
        column is not NaN.
    """
    #copy first: writing the column into `df` itself would silently change
    #the caller's frame
    normalised = df.copy()
    normalised[new_column_name] = normalised[orig_col] - normalised[normalisation_col]
    #remove nan values in the new column
    return normalised[normalised[new_column_name].notna()]

In [400]:
#make individual plots
def make_plots(df,location):
    """Draw one bar + swarm plot per plant line and target gene (EF1a is skipped).

    Each figure compares relative expression at 1 mM and 10 mM nitrate,
    annotated with an independent t-test via statannotations.

    Args:
        df: table with the columns 'Sample', 'Target', 'condition'
            ('1mM_nitrate' / '10mM_nitrate') and 'relative_expression'.
        location: output directory; ``{location}/individual`` must exist.

    Side effects:
        Writes ``{plantline}_{target}.pdf`` and ``.svg`` into
        ``{location}/individual`` and APPENDS the test results to
        ``{location}/individual/stats.txt`` (the file is opened in 'a' mode,
        so re-running adds further lines).
    """
    #plot height and width
    height = 5
    width = 2.4
    #x axis order and the single pair compared by the t-test
    order = ['1','10']
    pair = [('1', '10')]

    for plantline in df['Sample'].unique():
        line_df = df[df.Sample == plantline]
        for target in line_df['Target'].unique():
            #EF1a is the normalisation reference, so it gets no plot
            if target == 'EF1a':
                continue

            #explicit copy: relabelling a filtered slice in place triggers
            #pandas' SettingWithCopyWarning
            plot_df = line_df[line_df.Target == target].copy()
            plot_df.loc[plot_df['condition'] == '10mM_nitrate', 'condition'] = '10'
            plot_df.loc[plot_df['condition'] == '1mM_nitrate', 'condition'] = '1'

            #make plot on an explicit figure/axes pair
            fig, ax = plt.subplots(figsize=(width,height))
            sns.barplot(x='condition', y='relative_expression', data=plot_df, order=order, linewidth=2,  errcolor="black", edgecolor="black", ci=68, errwidth=1,capsize=0.4,color='cyan', ax=ax)
            sns.swarmplot(x='condition', y='relative_expression', data=plot_df, order=order,color='black', ax=ax)

            #add stats
            annotator = Annotator(ax, pair, data=plot_df, x='condition', y='relative_expression',order=order,verbose=False)
            annotator.configure(test='t-test_ind', text_format='star',pvalue_thresholds=[[1e-3, "***"],[1e-2, "**"],[0.05, "*"],[1, "ns"]])

            #save stats to file
            _, test_results = annotator.apply_and_annotate()
            with open(f'{location}/individual/stats.txt', 'a') as f:
                for res in test_results:
                    f.write(f'{str(plantline)},{target},{pair},{str(res.data)}\n')

            #axis labels and title
            ax.set_ylabel('Relative expression (a.u.)')
            ax.set_title(f'{plantline} {target}')
            ax.set_xlabel('Nitrate concentration (mM)')

            #save plot to file
            fig.savefig(
                            f'{location}/individual/{plantline}_{target}.pdf',
                            format="pdf",
                            bbox_inches="tight",transparent=True)
            fig.savefig(
                            f'{location}/individual/{plantline}_{target}.svg',
                            format="svg",
                            bbox_inches="tight",transparent=True)
            #close this figure so figures do not accumulate over the loop
            plt.close(fig)




In [401]:
#make combined plots with all samples
def make_combined_plots(df,location,sample_order=None):
    """Draw one grouped bar + swarm plot per target gene with all plant lines on the same axes.

    Within each plant line the 1 mM and 10 mM nitrate groups are compared
    with an independent t-test via statannotations. EF1a is skipped.

    Args:
        df: table with the columns 'Sample', 'Target', 'condition'
            ('1mM_nitrate' / '10mM_nitrate') and 'relative_expression'.
        location: output directory; ``{location}/targets`` must exist.
        sample_order: plant lines to plot, in x-axis order. Lines absent
            from the data are ignored and lines absent from this list are
            not plotted. Defaults to
            ['col-0','125-4','130-4','142-4','142-8','144-5'].

    Side effects:
        Writes ``{target}.pdf`` and ``{target}.svg`` into ``{location}/targets``.
    """
    #plot height and width
    height = 5
    bar_width = 0.3
    if sample_order is None:
        sample_order = ['col-0','125-4','130-4','142-4','142-8','144-5']

    #set width of bars
    def change_width(ax, new_value) :
        for patch in ax.patches :
            current_width = patch.get_width()
            diff = current_width - new_value
            # we change the bar width
            patch.set_width(new_value)
            # we recenter the bar
            patch.set_x(patch.get_x() + diff * .5)

    configuration = {'test':'t-test_ind', 'text_format':'star', 'pvalue_thresholds':[[1e-3, "***"],[1e-2, "**"],[0.05, "*"],[1, "ns"]]}

    for target in df['Target'].unique():
        #EF1a is the normalisation reference, so it gets no plot
        if target == 'EF1a':
            continue

        #explicit copy: relabelling a filtered slice in place triggers
        #pandas' SettingWithCopyWarning
        target_df = df[df.Target == target].copy()
        target_df.loc[target_df['condition'] == '10mM_nitrate', 'condition'] = '10'
        target_df.loc[target_df['condition'] == '1mM_nitrate', 'condition'] = '1'

        #plant lines present for this target, in the requested order
        present = set(target_df['Sample'].unique())
        order = [sample for sample in sample_order if sample in present]

        #compare 1 mM against 10 mM nitrate within each plant line
        box_pairs = [((sample,'1'), (sample,'10')) for sample in order]

        fig_args = {'x':'Sample', 'y':'relative_expression','hue':'condition', 'hue_order':['1','10'],'data':target_df, 'order':order, 'dodge':True}

        #figure gets wider with the number of plant lines
        fig, ax = plt.subplots(figsize=((3+(len(order)-1)*2),height))
        sns.barplot(**fig_args, color='cyan',linewidth=2,  errcolor="black", edgecolor="black", ci=68, errwidth=1,capsize=0.4, ax=ax)
        sns.swarmplot(**fig_args, color='black', ax=ax)

        #set width of bars
        change_width(ax, bar_width)

        #add stats
        annotator = Annotator(ax, box_pairs, **fig_args,verbose=False)
        annotator.configure(**configuration)
        annotator.apply_and_annotate()

        #axis labels and title
        ax.set_ylabel('Relative expression (a.u.)')
        ax.set_title(f'{target}')
        #the x axis shows plant lines (column 'Sample'); nitrate is the hue
        #and is named in the legend
        ax.set_xlabel('Plant line')

        #make xticks diagonal
        plt.xticks(rotation=45, ha='center')

        #plot legend, excluding legend from swarm plot
        #NOTE(review): relies on the bar handles being entries 2 and 3 of the
        #combined handle list - confirm if the seaborn/matplotlib version changes
        h,l = ax.get_legend_handles_labels()
        l[3] = "10 mM nitrate"
        l[2] = "1 mM nitrate"
        ax.legend(h[2:4],l[2:4],bbox_to_anchor=(0.3,0.87), loc='lower left',fontsize=10)

        #save plot to file
        fig.savefig(
                        f'{location}/targets/{target}.pdf',
                        format="pdf",
                        bbox_inches="tight",transparent=True)
        fig.savefig(
                        f'{location}/targets/{target}.svg',
                        format="svg",
                        bbox_inches="tight",transparent=True)
        #close this figure so figures do not accumulate over the loop
        plt.close(fig)
    

In [402]:
#set matplotlib rc parameters
def set_rc_params():
    """Apply the figure styling used by all plots to matplotlib's global ``rcParams``.

    Takes no arguments and returns nothing; the settings stay in effect for
    every figure created afterwards in the same session.
    """
    fontsize = 14
    rcParams.update({
        #thicker ticks and axes lines
        'xtick.major.width': 2,
        'ytick.major.width': 2,
        'axes.linewidth': 2,
        #remove top and right lines
        'axes.spines.top': False,
        'axes.spines.right': False,
        #font size
        'font.size': fontsize,
        #for getting the microsoft font Arial working, please follow this guide: https://alexanderlabwhoi.github.io/post/2021-03-missingfont/
        'font.family': 'sans-serif',
        'font.sans-serif': ['Arial'],
        #keep text editable in svg / pdf editors
        'svg.fonttype': 'none',
        'pdf.fonttype': 42,
        #align y-axis top most tick with end of axis
        'axes.autolimit_mode': 'round_numbers',
        #set margins to ensure any error bars fit
        'axes.xmargin': 0.2,
        'axes.ymargin': 0.2,
    })
    #define bar width
    #bar_width = 0.65
    

In [403]:
# main workflow, called from the `if __name__ == "__main__"` guard below
def main():
    """Run the whole qPCR analysis: read, filter, normalise, export tables and plot.

    Reads ``10.8.22_platelayout_19310threshold.csv`` from the hard-coded
    ``location`` and writes:
      * two TSV tables next to the ``location`` directory (relative
        expression against the 1 mM nitrate mean, and delta Cq relative to
        the lowest delta Cq in the data);
      * ``{location}/mean_normalised.csv``;
      * figures and a stats file under ``{location}/plots``.
    """
    location = '../../data/CRISPR_library/qPCR/10.08.22'
    csv_file = f'{location}/10.8.22_platelayout_19310threshold.csv'
    #read in file
    df = read_csv(csv_file)
    #keep the data points with amp_status = Amp and Cq of at most 40
    df = filter_data(df, 'Amp', 40)
    #sort the data
    df = sort_data(df)
    #copy taken before any further columns are added
    df_norm_lowest_sample = df.copy()
    #normalise based on 1mM_nitrate Cq values, mean between all 3 biological reps
    df = normalise_data(df, 'MeanCq_ECnormalised','1mMnitrate_Cq_mean','MeanCq_EC_1mM_nitrate_normalised')
    #now filter columns; copy so the new column below is not written to a slice
    df = df[['Sample','Target','MeanCq_EC_1mM_nitrate_normalised','condition']].copy()
    #inverse log transformation (2^-ddCq method). The normalised column is
    #ddCq = dCq(sample) - mean dCq(1 mM nitrate); a LOWER Cq means MORE
    #transcript, so relative expression is 2 to the power of MINUS ddCq.
    df['relative_expression'] = 2**(-df['MeanCq_EC_1mM_nitrate_normalised'])
    #save df to tsv
    df.to_csv('../../data/CRISPR_library/qPCR/10.8.22_platelayout_19310threshold_normEC1mMnitrate_relative_expression.tsv', sep='\t', index=False)

    #now normalise the df_norm_lowest_sample to the sample with the lowest MeanCq_ECnormalised value
    lowest_mean_cq = df_norm_lowest_sample['MeanCq_ECnormalised'].min()
    #get sample with lowest MeanCq_ECnormalised value
    lowest_mean_cq_sample = df_norm_lowest_sample[df_norm_lowest_sample['MeanCq_ECnormalised'] == lowest_mean_cq].iloc[0]['Sample']
    print(f'normalising to lowest mean cq sample: {lowest_mean_cq_sample}')
    #normalise df_norm_lowest_sample to lowest mean cq sample
    df_norm_lowest_sample.loc[:,'MeanCq_EClowestsample'] = df_norm_lowest_sample['MeanCq_ECnormalised'] - lowest_mean_cq
    #remove nan values in the new column
    df_norm_lowest_sample = df_norm_lowest_sample[df_norm_lowest_sample['MeanCq_EClowestsample'].notna()]
    #save df to tsv
    df_norm_lowest_sample.to_csv('../../data/CRISPR_library/qPCR/10.8.22_platelayout_19310threshold_normEClowest_sample_relative_expression.tsv', sep='\t', index=False)

    #make the directories for the plots to be exported to. Every directory is
    #attempted on every run, so 'targets' is created even when 'individual'
    #did not exist before.
    for dirName in (f'{location}/plots', f'{location}/plots/individual', f'{location}/plots/targets'):
        try:
            # Create target Directory
            os.mkdir(dirName)
            print("Directory " , dirName ,  " created")
        except FileExistsError:
            print("Directory " , dirName ,  " already exists")

    #save df to csv
    df.to_csv(f'{location}/mean_normalised.csv')

    #set matplotlib rc parameters
    set_rc_params()
    #individual plots
    make_plots(df,f'{location}/plots')
    #combined plots
    make_combined_plots(df,f'{location}/plots')




In [404]:
# Entry point: run the full analysis only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

normalising to lowest mean cq sample: 125-4
Directory  ../../data/CRISPR_library/qPCR/10.08.22/plots  already exists
Directory  ../../data/CRISPR_library/qPCR/10.08.22/plots/individual  already exists
Directory  ../../data/CRISPR_library/qPCR/10.08.22/plots/targets  already exists
