In [1328]:
#script to read in .csv output files from SmartRoot analysis, concatenate them and then analyse and make plots
#use qpcr conda environment

In [1329]:
import pandas as pd
import numpy as np
#allow exporting to latex as pgf
# import matplotlib as mpl
# # Use the pgf backend (must be set before pyplot imported)
# mpl.use('pgf')
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import os
import glob 
# import sys
# import argparse
import statsmodels.api as sm
# stats annotations
from statannotations.Annotator import Annotator

import statsmodels.formula.api as smf
from bioinfokit.analys import stat

import pingouin as pg
#allow changing of axes label floats
from matplotlib.ticker import FormatStrFormatter
import matplotlib.patches as mpatches
from scipy import stats
#allow changing of axes label floats
from matplotlib.ticker import FormatStrFormatter
import math
#cycle through alphabet
from string import ascii_uppercase as alc

rcParams.update({'figure.autolayout': True})
#installed statannotations from different branch to master too allow the annotator flag "show_non_significant=False": pip install -e git+https://github.com/sepro/statannotations#egg=statannotations

In [1330]:
#function to read in csv file as pandas df
def read_csv(file_name):
    df = pd.read_csv(file_name)
    #keep the Well, Sample, Target, Cq and Amp Status columns
    df = df[['Well', 'Sample', 'Target', 'Cq', 'Amp Status']]
    
    return df

In [1331]:
#test for normality of the data
def test_normality(df, location):
    """run Shapiro-Wilk test"""
    #iterate over rows
    #get sample names
    sample_names = df['Sample'].unique()
    #get Target names
    target_names = df['Target'].unique()
    #iterate over samples and targets
    #make empty df to store p-values
    p_values = pd.DataFrame(columns=['Sample', 'Target', 'pvalue'])
    for sample in sample_names:
        for target in target_names:
            # #run Shapiro-Wilk test
           # print(sample,'{}: {}'.format(target, stats.shapiro(df['relative_expression'][df.Target == target])))
            #write a df with the results of the Shapiro-Wilk test
            #shapiro_df = pd.DataFrame(columns=['Sample', 'Target', 'p-value'])
            #results = sample,target,'{}'.format(stats.shapiro(df['relative_expression'][df.Target == target]))
            results = sample,target,stats.shapiro(df['relative_expression'][df.Target == target])
            #('125-4', 'NLP7: ShapiroResult(statistic=0.707939088344574, pvalue=9.890874935081229e-05)')
            shapiro_df = pd.DataFrame([results], columns=['Sample','Target', 'shapiro_test']).reset_index(drop=True)
            #get statistic and pvalue
            shapiro_df['statistic'] = shapiro_df['shapiro_test'].apply(lambda x: x[0])
            shapiro_df['pvalue'] = shapiro_df['shapiro_test'].apply(lambda x: x[1])
            #filter columns
            shapiro_df = shapiro_df[['Sample', 'Target', 'pvalue', 'statistic',]]
            #append to p_values df
            p_values = pd.concat([p_values, shapiro_df], axis=0, ignore_index=True)

    #write to tsv
    p_values.to_csv(f'{location}/shapiro_normality.tsv', sep='\t', index=False)

    return p_values





            


In [1332]:
#function to filter out certain data points
def filter_data(df, amp_status, cq):
    #filter out the data points with amp_status = Amp using .loc
    df = df.loc[df['Amp Status'] == amp_status]
    #make Cq column numerical
    dfcopy = df.copy()
    dfcopy['Cq'] = pd.to_numeric(dfcopy['Cq'])  
    #filter out the data points with cq < cq_threshold using .loc
    dfcopy = dfcopy.loc[dfcopy['Cq'] <= cq]

    return dfcopy

In [1333]:
# idea from https://www.nature.com/articles/s41598-021-99727-6#Sec2
# To detect outliers, the CT standard deviation (Cq-SD) of the technical replicates for a given sample is calculated, 
# if the Cq-SD is greater than the cut-off (the default value is 0.3), then the technical replicate furthest 
# from the sample mean is removed. The process occurs recursively until the Cq-SD is less than the cut-off 
# or the value of “max outliers” is reached. This is determined by the parameter ‘Max Proportion’, the 0.5 default 
# means that outliers will be removed until two technical replicates remain. The ‘preserve highly variable replicates’: 
# If the Cq-SD is higher than 0.3, but the absolute (mean-median)/median is less than 0.1, replicates are preserved. 
# This helps to account for a lack of a clear outlier, where two of three replicates are close to equally distributed 
# around the mean.
def remove_outliers(df, max_outliers, ct_sd_threshold):
    
    #copy the dataframe
    dfcopy = df.copy()
    # Add filter columns
    dfcopy['Ignore'] = False
    #dfcopy['Cq-SD'] = int()
    
    #make Cq column numerical
    dfcopy['Cq'] = pd.to_numeric(dfcopy['Cq'])
    #calculate the Cq-SD of the technical replicates for a given sample
    f = (dfcopy['Ignore'].eq(False))
    dfcopy1 = dfcopy[f].groupby(['Sample','Target']).agg({'Cq':['std']})
    #dfcopy = dfcopy[f].groupby(['Sample','Target']).agg({'Cq-SD':['std']})#['Cq'].transform(lambda x: x.std() / np.sqrt(x.count()))
    #make df containing all samples with outliers
    f = dfcopy1['Cq']['std'] > ct_sd_threshold
    dfcopy_outliers = dfcopy1[f]

    
    # dfcopy_outliers = dfcopy[dfcopy['Cq-SD'] > ct_sd_threshold]
    if not dfcopy_outliers.empty:
        #mark all outliers
        #iterate over rows as tuples (sample, target)
        for i, row in enumerate(dfcopy_outliers.itertuples(name = None),1):
            #example row: (('144-5AH', 'DREB26'), 0.6497278066418295)
         
           
            #check that the dfcopy sample name is the same as the sample name in the current row
            f = (dfcopy.Sample == row[0][0]) & (dfcopy.Target == row[0][1]) & (dfcopy['Ignore'].eq(False))
            dx_idx = dfcopy[f].index
            group_size = len(dx_idx)
            min_size = round(group_size * (1-max_outliers))
            size = group_size
            if min_size < 2:
                min_size = 2
                print('Warning: minimum size of technical replicate group is 2')
            while True:
                f = (dfcopy.Sample == row[0][0]) & (dfcopy.Target == row[0][1])
                dx = dfcopy[f].copy()
                dxg = dfcopy[f].groupby(['Sample', 'Target']).agg({'Cq': [np.size, 'std', 'mean']})
                if dxg['Cq']['std'].iloc[0] <= ct_sd_threshold:
                    #Cq std is under threshold, so no outliers
                    break
                size -= 1
                if size < min_size:
                    #not enough technical replicates to remove outliers
                    break
                #remove the technical replicate furthest from the mean
                dx['Distance'] = (dx['Cq'] - dxg['Cq']['mean'].iloc[0])**2
                dx_sorted = dx.sort_values(by = 'Distance', ascending=False).index[0]
                #print()
                #dfcopy = dfcopy.loc[dx_sorted].assign(Ignore=True)
                #dfcopy.loc[dx_sorted].loc(:, 'Ignore') = True
                #print(dx_sorted)
                dfcopy.loc[[dx_sorted], 'Ignore'] = True
                #dfcopy['Ignore'].loc[dx_sorted] = True
                #rint(dx_sorted)
                
    return dfcopy


            


   
    # #remove the highly variable replicates
    # if preserve_highly_variable_replicates == True:
    #     dfcopy = dfcopy.loc[dfcopy['Cq-SD'] > ct_sd_threshold]
    #     dfcopy = dfcopy.loc[dfcopy['Cq-SD'] < (dfcopy['Cq-SD'].mean() - dfcopy['Cq-SD'].median())/dfcopy['Cq-SD'].median()]
    # #remove the outliers until the number of outliers is less than max_outliers
    # while dfcopy['Cq-SD'].count() > max_outliers:
    #     dfcopy = dfcopy.loc[dfcopy['Cq-SD'] < ct_sd_threshold]
    #     if preserve_highly_variable_replicates == True:
    #         dfcopy = dfcopy.loc[dfcopy['Cq-SD'] > ct_sd_threshold]
    #         dfcopy = dfcopy.loc[dfcopy['Cq-SD'] < (dfcopy['Cq-SD'].mean() - dfcopy['Cq-SD'].median())/dfcopy['Cq-SD'].median()]
    # return dfcopy

In [1334]:
#function to recursively find all .csv files in a directory and concatenate them into a single dataframe
def concat_csv_recursive(PATH, EXT):
    #find all .csv files in the directory
    csv_files = [file for path, subdir, fname in os.walk(PATH) 
                for file in glob.glob(os.path.join(path, EXT))]
        #glob.glob(f'{directory}/{EXT}', recursive=True)
    #print(csv_files)
    #initialise empty dataframe
    df = pd.DataFrame()
    #loop through all files and concatenate them into a single dataframe
    for file in csv_files:
        df = pd.concat([df, pd.read_csv(file)], ignore_index=True)
    return df

In [1335]:
def sort_data(df,output_location):
    #sort dataframe by sample name
    df = df.sort_values(by=['image'])
    #remove duplicate rows
    df = df.drop_duplicates(keep='first')
    #make nitrate concentration column using image column
    df['nitrate_concentration'] = df['image'].str.split('_').str[1]
    #make sample name column using image column
    df['sample_name'] = df['image'].str.split('_').str[0]
    #make plate column
    df['plate'] = df['sample_name']+'_'+df['image'].str.split('_').str[2]
    #remove spaces from column names
    df.columns = df.columns.str.replace(' ', '')
   
    
    #make several new columns
    #first make new df which will contain one row per plant
    df_plant = df[df.root_order == 0]
    #remove all lines which have no length or which are NaN
    df_plant = df_plant[df_plant.length.notnull()]
    #df_plant = df_plant[df_plant.length != 0]
    # df_plant = df.groupby(['sample_name', 'plate', 'nitrate_concentration',root_ontology]).agg({'image':'count', 'nitrate_concentration':'first', 'sample_name':'first', 'plate':'first'})
    #print(df_plant)
    ## PR = primary root length (cm)
    #change length column to PR
    df_plant['PR'] = df_plant['length']
    # LR = lateral root number (visible from scan)
    #for each root in df_plant, count the number of rows whose parent root in df is the same as the root id in df_plant
    df_plant['LR'] = df_plant.apply(lambda row: df[(df.parent == row.root) & (df.root_order == 1)].shape[0], axis=1)
    #make list of first order lateral root ids
    df_plant['LR_ids'] = df_plant.apply(lambda row: df[(df.parent == row.root) & (df.root_order == 1)].root.tolist(), axis=1)
    #for each id in LR_ids, count the number of rows whose parent root in df is the same as the root id 
    df_plant['LR_2nd_order'] = df_plant.apply(lambda row: df[(df.parent.isin(row.LR_ids)) & (df.root_order == 2)].shape[0], axis=1)
    # LRL = total lateral root length (all LRs added together - cm). Have separate column for 2nd order lateral roots
    
    df_plant['LRL_1st_order'] = df_plant.apply(lambda row: df[(df.parent == row.root) & (df.root_order == 1)].length.sum(), axis=1)
    df_plant['LRL_2nd_order'] = df_plant.apply(lambda row: df[(df.parent.isin(row.LR_ids)) & (df.root_order == 2)].length.sum(), axis=1)
    #add LRL and 2nd order LRL to get total LRL
    df_plant['LRL'] = df_plant['LRL_1st_order'] + df_plant['LRL_2nd_order']
    # ALRL = average lateral root length (LRL/LR - cm)
    df_plant['ALRL'] = (df_plant.LRL) / (df_plant.LR)
   # df_plant['ALRL'] = df_plant.apply(lambda row: (row.LRL / row.LR, axis=1)
    # TRL = total root length (PR + LRL)
    df_plant['TRL'] = df_plant.PR + df_plant.LRL
    # LRD = lateral root density (LR/PR)
    df_plant['LRD'] = df_plant.LR / df_plant.PR
    # LRL_div_TRL = percentage of LRL contributing to TRL (LRL/TRL)
    df_plant['LRL_div_TRL'] = (df_plant.LRL) / df_plant.TRL

    #add genotype column
    df_plant['genotype'] = df_plant['root_name'].str.split('_').str[0]
    #remove spaces from genotype
    df_plant['genotype'] = df_plant['genotype'].str.replace(' ', '')

    #add log columns for PR, LR, LR_2nd_order, LRL, LRL_2nd_order. ALRL, TRL, LRD, LRL_div_TRL
    df_plant['log_PR'] = np.log(df_plant.PR)
    df_plant['log_LR'] = np.log(df_plant.LR)
    df_plant['log_LR_2nd_order'] = np.log(df_plant.LR_2nd_order)
    df_plant['log_LRL'] = np.log(df_plant.LRL)
    df_plant['LRL_1st_order'] = df_plant.LRL_1st_order
    df_plant['log_LRL_2nd_order'] = np.log(df_plant.LRL_2nd_order)
    df_plant['log_ALRL'] = np.log(df_plant.ALRL)
    df_plant['log_TRL'] = np.log(df_plant.TRL)
    df_plant['log_LRD'] = np.log(df_plant.LRD)
    df_plant['log_LRL_div_TRL'] = np.log(df_plant.LRL_div_TRL)
    






    #save df as tsv file
    # df_plant.to_csv(f'{output_location}/single_plant_data.tsv', sep='\t', index=False)
    #count number of plants for each plant line

    #partition variation across mutants relative to wild type using principal component analysis of all RSA traits
    #do stats: Using a two-way ANOVA, three phenotypic categories: genotype effects in both nitrogen conditions (genotype-dependent), genotype effects in only one condition (nitrogen-condition-dependent) or genotype by nitrogen condition-dependent effects 
    

    #print(len(df))
    return df, df_plant

In [1336]:
#set matplotlib rc parameters
def set_rc_params():
    #set matplotlib default parameters
    rcParams['xtick.major.width'] = 2
    rcParams['ytick.major.width'] = 2
    rcParams['axes.linewidth'] = 2
    rcParams['lines.linewidth'] = 2
    #remove top and right lines
    rcParams['axes.spines.top'] = False
    rcParams['axes.spines.right'] = False
    #font size
    fontsize = 20
    rcParams['font.size'] = fontsize
    #for getting the microsoft font Arial working, please follow this guide: https://alexanderlabwhoi.github.io/post/2021-03-missingfont/
    rcParams['font.family'] = 'sans-serif'
    rcParams['font.sans-serif'] = ['Arial']
    #allow font to be edited later in pdf editor
    #make svg text editable
    rcParams['svg.fonttype'] = 'none'
    rcParams ['pdf.fonttype'] = 42 
    #align y-axis top most tick with end of axis
    rcParams['axes.autolimit_mode'] = 'round_numbers'
    #set margins to ensure any error bars fit
    rcParams['axes.xmargin'] = 0.2
    rcParams['axes.ymargin'] = 0.2
    #define bar width
    #bar_width = 0.65
    #allow math text to be displayed
    #rcParams['mathtext.default'] = 'regular'
    return fontsize
    

In [1337]:
def boxplot(df,var,y_label,sample_name,box_pair_p_values, fontsize, ax):
    """function to make box plots"""
    #plot height and width
   # height = 5
   # width = 4
    
    order = ['1mM','10mM']
    fig_args = {'x':'nitrate_concentration', 'y':var,'data':df, 'order':order, 'dodge':True,'hue':'genotype','hue_order':['col0',sample_name]}#'ax':ax
    configuration = {'test':None, 'text_format':'star', 'pvalue_thresholds':[[1e-3, "***"],[1e-2, "**"],[0.05, "*"],[1, "ns"]]}#"pairs":list(box_pairs_significant.keys()),"pvalues":list(box_pairs_significant.values()), 'loc':'inside'
    #_ = plt.figure(figsize=(width,height))

    fig = sns.boxplot(**fig_args, linewidth=2, palette=["white", "grey"],ax = ax, boxprops={"edgecolor":"black"},flierprops={"markeredgecolor":"black"},whiskerprops={"color":"black"},capprops={"color":"black"}, medianprops={"color":"black"})
    fig = sns.swarmplot(**fig_args, color='black', palette=["black", "black"],size=4, ax=ax)
    #get pairs and pvalues
    #print(box_pairs_significant)
    pairs=list(box_pair_p_values.keys())
    #print(f'pairs={pairs}')
    
    pvalues=list(box_pair_p_values.values())
    #print(f'pvalues={pvalues}')
    #add statsannotator = Annotator(fig, pairs, **fig_args,verbose=False)
    annotator = Annotator(ax, pairs, **fig_args,verbose=False, show_non_significant=False)#show_non_significant=False will be added in the next version of statsannotator
    #annotator.set_pvalues(pvalues)
    annotator.configure(**configuration)
    
    annotator.set_pvalues_and_annotate(pvalues)

    ax.set_xlabel(r'KNO$_{3}$ concentration (mM)')
    ax.set_ylabel(y_label)
    #set y axis limit to start at 0
    _ = ax.set_ylim(0,None)

    

    ##plot legend, excluding legend from swarm plot
    h,l = ax.get_legend_handles_labels()
    #change name of label
    l[0] = "Col-0"
    l[1] = sample_name
    #set edge color
    h[0].set_edgecolor('black')
    h[1].set_edgecolor('black')
    #l[2] = "1 mM nitrate"     
    leg = ax.legend(h[0:2],l[0:2],frameon=False,edgecolor='black')#.set_linewidth(2)#,bbox_to_anchor=(0,0.85), loc='best',fontsize=fontsize,

    #set linewith of each legend object
    for legobj in leg.legendHandles:
        legobj.set_linewidth(2)
    # change axes labels
    #_ = plt.ylabel('Relative expression (a.u.)')
    #rename x axis labels
    _ = ax.set_xticklabels( ('1','10') )

    # change x axis labels
    # _ = ax.set_xticklabels([0,1],['1','10'])
    #max 1 decimal place y tick labels
    # ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))

    # #save plot to file
    # plt.savefig(
    #                 f'{output_location}/{var}_{sample_name}_boxplot.pdf',
    #                 format="pdf",
    #                 bbox_inches="tight",transparent=True)
    # plt.savefig(
    #                 f'{output_location}/{var}_{sample_name}_boxplot.svg',
    #                 format="svg",
    #                 bbox_inches="tight",transparent=True)
    # plt.savefig(
    #                 f'{output_location}/{var}_{sample_name}_boxplot.pgf',
    #                 format="pgf",
    #                 bbox_inches="tight",transparent=True)
    plt.cla()  # clear axis              
    plt.close('all')   
      

In [1338]:
def marginal_effects(df,variables,sample_name, fontsize, qpcr_df,combined_output_dir,normal,sample_order):

    """function to calculate marginal means for interaction genotype*nitrate_concentration"""

    #determine how many subplots needed - 7 + number of genes in qpcr_df
    #look at subset of qpcr_df with sample_name in it
    qpcr_df_subset = qpcr_df[qpcr_df['Sample']==sample_name]
    #count number of unique targets
    number_of_targets = len(qpcr_df_subset['Target'].unique())
    #add 7 to number of targets
    number_of_subplots = number_of_targets + 7
    print(f'number of subplots for {sample_name }= {number_of_subplots}')


    #divide number of subplots by 3 to get number of rows
    number_of_rows = math.ceil(number_of_subplots/3) 
    height = 4.8*number_of_rows
    width = 12
    #make subplots equal to number_of_subplots
    fig, axes = plt.subplots(nrows=number_of_rows, ncols=3, figsize=(width, height), sharex=False)
    #flatten axis array
    axes = axes.flatten()
    

    #subset df by and sample name
    qpcr_df_subset = qpcr_df[(qpcr_df['Sample']==sample_name) | (qpcr_df['Sample']=='Col-0')]

    #make ax number count
    ax_num = 0

    #add qpcr data
    for target in qpcr_df_subset['Target'].unique():
    #if target is not EF1a, make plot
        if target != 'EF1a':            
            temp_df = qpcr_df_subset[qpcr_df_subset.Target == target]
            #change condition values
            temp_df.loc[temp_df['condition'] == '10mM_nitrate', 'condition'] = '10'
            temp_df.loc[temp_df['condition'] == '1mM_nitrate', 'condition'] = '1'


            #get list of samples
            samples_unique = temp_df['Sample'].unique()

            

            #sort based on custom order
            samples = []
            for i in range(len(sample_order)):
                if sample_order[i] in samples_unique:
                    samples+=[sample_order[i]]
            #create new df with only sample of interest and Col-0 based on plateID so that Col-0 from the same plate is prioritised
            #if temp_df plateID is "22.08.22_plate1"

            for sample in samples:
                if sample != 'Col-0':
                    plateID_value = temp_df.loc[temp_df['Sample'] == sample, 'plateID'].values[0]
                # print(plateID_value)
                    if plateID_value == '22.08.22_plate1':   
                        #print(f'platevalueis{plateID_value}')                
                        df_new = temp_df[(temp_df.Sample == sample) | ((temp_df.Sample == 'Col-0') & (temp_df.plateID == plateID_value))]

                    if plateID_value == '22.08.22_plate2':

                        df_new = temp_df[(temp_df.Sample == sample) | ((temp_df.Sample == 'Col-0') & (temp_df.plateID == plateID_value))]
                    if plateID_value == '06.09.22_plate1':

                        df_new = temp_df[(temp_df.Sample == sample) | ((temp_df.Sample == 'Col-0') & (temp_df.plateID == plateID_value))]
                    if plateID_value == '06.09.22_plate2':

                        df_new = temp_df[(temp_df.Sample == sample) | ((temp_df.Sample == 'Col-0') & (temp_df.plateID == plateID_value))]
                    if plateID_value == '06.09.22_plate3':

                        df_new = temp_df[(temp_df.Sample == sample) | ((temp_df.Sample == 'Col-0') & (temp_df.plateID == plateID_value))]
                    if plateID_value == '10.8.22':
                        #include all col-0 values from both plates       
                        df_new = temp_df[(temp_df.Sample == sample) | (temp_df.Sample == 'Col-0')]

                    #print(ax_num)
                    make_plots(df_new, normal,sample, fig, axes[ax_num], target)

                    
                    #add 1 to ax_num
                    ax_num += 1

    #make boxplots for each variable
    for var, y_label in variables.items():

        #remove NaN values from dataframe
        df = df.dropna(subset=[var]).copy()
        
        #only run if not empty array
        #try:

        #run anova
        anova = smf.ols(f'{var} ~ genotype*nitrate_concentration + plate', data=df).fit()
        #get marginal means, save as txt file            
        #anova.summary_frame().to_csv(f'{output_location}/marginal_means_{var}.tsv', sep='\t')
        #save anova summary to tsv
        #use type 1 anova to test interaction term. If not significant, refit without the interaction term and use Type-II to test the main effects
        table = sm.stats.anova_lm(anova, type=1)
       
        #check if interaction term genotype:nitrate_concentration is significant
        if table.loc['genotype:nitrate_concentration']['PR(>F)'] >= 0.05:
            #print(table.loc['genotype:nitrate_concentration']['PR(>F)'])
            #if not significant, refit without the interaction term and use Type-II to test the main effects

            anova = smf.ols(f'{var} ~ genotype+nitrate_concentration + plate', data=df).fit()
            #filter columns of df
            df_filtered = df[['genotype','nitrate_concentration',var]].copy()
            #drop na
            df_filtered = df_filtered.dropna(subset=['genotype','nitrate_concentration',var]).copy()
            
            genotypes_unique = df['genotype'].unique()
            length_samples = len(genotypes_unique)
            
            box_pair_p_values = {}
            for x in range (0, (length_samples)):                        
                if genotypes_unique[x] != 'col0':
                    #perform estimated marginal means contrasts between genotype*nitrate interaction
                    #create model, dropping the plate effects because it seems non-significant in most/all cases
                    #test assumption that variances are all equal
                    #split df by nitrate concentration
                    df_10mM = df_filtered[df_filtered['nitrate_concentration'] == '10mM']
                    df_1mM = df_filtered[df_filtered['nitrate_concentration'] == '1mM']
                    var_10mM = pg.homoscedasticity(df_10mM, group='genotype', dv=f'{var}')#Levene test
                    var_1mM = pg.homoscedasticity(df_1mM, group='genotype', dv=f'{var}')

                        #print(f'df_1mM_genotype variance {var} {sample_name}: {var_1mM}')
                    # print(f'df_10mM_genotype variance {var} {sample_name}: {var_10mM}')
                    #followed this https://www.reneshbedre.com/blog/anova.html
                    res = stat()
                    res.anova_stat(df=df_filtered, res_var=var, anova_model=f'{var}~C(genotype)+C(nitrate_concentration)')
                  
                    # for interaction effect between genotype and nitrate_concentration
                    res.tukey_hsd(df=df_filtered, res_var=var, xfac_var=['genotype','nitrate_concentration'], anova_model=f'{var}~C(genotype)+C(nitrate_concentration)')
                    genotype_nitrate_concentration_tukey = res.tukey_summary

                    #get p values
                    p_values = genotype_nitrate_concentration_tukey.copy()


                    # #write stats to file
                    # with open(f'{output_location}/stats/marginal_means_{var}_{sample_name}.txt', 'w', encoding="utf-8", ) as f:
            
                    #     f.write(f'first test assumption that variances are all equal\ndf_1mM_genotype variance {var} {sample_name}:\n {var_1mM}\ndf_10mM_genotype variance {var} {sample_name}:\n {var_10mM}\nanova_type_1:\n{table}\ngenotype:nitrate_concentration is not significant so use type 2 anova excluding interaction term\nanova_type_2:\n{res.anova_summary}\nTukey_post_hocs:\ngenotype_nitrate_concentration_tukey\n{p_values}')
                    

                    #add to box_pair_p_values dictionary the box pair as the key and the p value as the value
                    box_pair_p_values[(('1mM','col0'),('1mM',genotypes_unique[x]))] = p_values.loc[((p_values['group1'] == (f'col0', '1mM')) & (p_values['group2'] == (f'{genotypes_unique[x]}', '1mM')))|((p_values['group1'] == (f'{genotypes_unique[x]}', '1mM')) & (p_values['group2'] == (f'col0', '1mM'))),'p-value'].values[0]
                    box_pair_p_values[(('10mM','col0'),('10mM',genotypes_unique[x]))] = p_values.loc[((p_values['group1'] == (f'col0', '10mM')) & (p_values['group2'] == (f'{genotypes_unique[x]}', '10mM')))|((p_values['group1'] == (f'{genotypes_unique[x]}', '10mM')) & (p_values['group2'] == (f'col0', '10mM'))),'p-value'].values[0]
                    # box_pair_p_values[(('1mM','col0'),('1mM',genotypes_unique[x]))] = p_values.loc[((p_values['group1'] == f'(col0, 1mM)') & (p_values['group2'] == f'({genotypes_unique[x]}, 1mM)'))|((p_values['group1'] == f'({genotypes_unique[x]}, 1mM)') & (p_values['group2'] == f'(col0, 1mM)')),'p-value'].values[0]
                    # box_pair_p_values[(('10mM','col0'),('10mM',genotypes_unique[x]))] = p_values.loc[((p_values['group1'] == f'(col0, 10mM)') & (p_values['group2'] == f'({genotypes_unique[x]}, 10mM)'))|((p_values['group1'] == f'({genotypes_unique[x]}, 10mM)') & (p_values['group2'] == f'(col0, 10mM)')),'p-value'].values[0]
                    

            
            
        if table.loc['genotype:nitrate_concentration']['PR(>F)'] < 0.05:
            #if significant interaction effect, analyse nitrate concentrations separately using one-way ANOVA
            #first split dataframe into separate dataframes for each nitrate concentration
            print(f'{var}{sample_name} is significant for genotype*nitrate_concentration interaction')
            df_low = df[df.nitrate_concentration == '1mM']
            df_high = df[df.nitrate_concentration == '10mM']
            anova_low_nitrate = smf.ols(f'{var} ~ genotype + plate', data=df_low).fit()
            anova_high_nitrate = smf.ols(f'{var} ~ genotype + plate', data=df_high).fit()
            table_low = sm.stats.anova_lm(anova_low_nitrate, type=2)
            table_high = sm.stats.anova_lm(anova_high_nitrate, type=2)
            #print(table_high)
            # #write stats to file
            # with open(f'{output_location}/stats/marginal_means_{var}_{sample_name}.txt', 'w', encoding="utf-8") as f:
            
            #     f.write(f'anova_type_1:\n{table}\ngenotype*nitrate_concentration is significant so analyse each nitrate concentration separately\nanova_type_2_1mM_nitrate:\n{table_low}\nanova_type_2_10mM_nitrate:\n{table_high} \n{anova.summary()}')

            

        
            #get p values
            p_value_low_nitrate_df = pd.DataFrame(data=table_low)
            p_value_high_nitrate_df = pd.DataFrame(data=table_high)
            
            #get box pairs and p values for adding stats annotations
            genotypes_unique = df['genotype'].unique()
            length_samples = len(genotypes_unique)
            box_pair_p_values = {}
            for x in range (0, (length_samples)):                        
                if genotypes_unique[x] != 'col0':                            
                    box_pair_p_values[(('1mM','col0'),('1mM',genotypes_unique[x]))] = p_value_low_nitrate_df.loc['genotype','PR(>F)']
                    box_pair_p_values[(('10mM','col0'),('10mM',genotypes_unique[x]))] = p_value_high_nitrate_df.loc['genotype','PR(>F)']
                    
        
        #PR(>F)
        #make boxplots
        #remove all string before the first underscore in the variable name, and return all subsequent string     

        #split var string on _
        no_log_var = var.split('_')[1:]
        no_log_var = '_'.join(no_log_var)
        #print(no_log_var)
        #make boxplots
        #first filter df
        boxplot_df = df.filter(items=[no_log_var, 'nitrate_concentration','genotype']).dropna().copy()
        #get column types
        #print(boxplot_df.dtypes)

        # #save box_pair_p_values
        # with open(f'{output_location}/stats/box_pair_p_values_{var}_{sample_name}.txt', 'w', encoding="utf-8") as f:
        #     f.write(f'{box_pair_p_values}')

        
        
        boxplot(boxplot_df,no_log_var,y_label,sample_name,box_pair_p_values, fontsize ,axes[ax_num])
        #add 1 to ax_num
        ax_num += 1
    #add A B C labels to all subplots

    #


    letter_count = 1


    for ax in axes:
        
        #if letter_count is higher than number of subplots, remove axis
        if letter_count >= number_of_subplots:
            ax.axis('off')
            letter_count += 1
        else:
            letter = alc[letter_count]
            ax.text(-0.1, 1.1, letter, transform=ax.transAxes, fontsize=18,  va='top', ha='right')#fontweight='bold'
            letter_count += 1

        #ax.label_outer()
    

            


    #save figure
    fig.savefig(f'{combined_output_dir}/{sample_name}_characteristics_plot.pdf', format="pdf", bbox_inches="tight",transparent=True)
    fig.savefig(f'{combined_output_dir}/{sample_name}_characteristics_plot.svg', format="svg", bbox_inches="tight",transparent=True)
    

In [1339]:
#function to normalise the data based on a column of Cq values (either to housekeeping or based on nitrate or wild type plant)
def normalise_data(df, orig_col,normalisation_col, new_column_name):
   
    #normalise Cq values to the EF1a housekeeping gene mean Cq value for each sample
    df.loc[:,new_column_name] = df[orig_col] - df[normalisation_col]
    #remove nan values in the new column
    df = df[df[new_column_name].notna()]


    return df

In [1340]:
#function to make new columns and sort the data
def sort_data_qpcr(df, location, name):
    #make new column called EF1a_Cq, make the value in this column for a particular sample equal to the Cq value for the EF1a Target for that sample
    ##remove if not amplified
    df = df.loc[df['Amp Status'] == 'Amp']
    

    #first remove outliers
    df = remove_outliers(df, 0.5, 0.3)
    #save outliers df to tsv
    df.to_csv(f'{location}/including_outliers_{name}.tsv', sep='\t', index=False)
    #remove outliers
    df = df.loc[df['Ignore'] == False]    
    #get the mean of each sample/target (take mean of technical replicates)
    df['Cq_mean'] = df.groupby(['Sample','Target'])['Cq'].transform('mean')
    #make a df containing only EF1a target (housekeeping gene)
    df_EF1a = df.loc[df['Target'] == 'EF1a'].copy()

    #rename the Cq_mean column to EF1a_Cq_mean
    df_EF1a.rename(columns={'Cq_mean': 'EF1a_Cq_mean'}, inplace=True)
    #filter other df_EF1a columns
    df_EF1a = df_EF1a[['Sample','EF1a_Cq_mean']]
    #remove duplicates from df_EF1a
    df_EF1a = df_EF1a.drop_duplicates()
    
    #merge the two dfs together
    df = pd.merge(df, df_EF1a, on=['Sample'], how='left')
    

    #normalise based on eEF1a gene
    df = normalise_data(df, 'Cq_mean','EF1a_Cq_mean','MeanCq_ECnormalised')
    #filter columns
    df = df[['Sample','Target','Cq_mean','MeanCq_ECnormalised']]
    #remove duplicates
    df = df.drop_duplicates()


    #normalise to sample 125-4AH on each plate (plate normalisation so can compare between plates)
    #get 125-4AH NIR1 10mM nitrate sample MeanCq_ECnormalised
    #sample_125_4AH = df[(df.Sample == '125-4AH')&(df.Target == "NIR1")].MeanCq_ECnormalised.values[0]
    #print(f'{sample_125_4AH}')
    #normalise
    #df.loc[:,'MeanCq_EC_plate_calibrated'] = df['MeanCq_ECnormalised'] - sample_125_4AH
    #remove nan values in the new column
    #df = df[df['MeanCq_EC_plate_calibrated'].notna()]
    #not doing a plate calibration step for now
    df = df.rename(columns={'MeanCq_ECnormalised': 'MeanCq_EC_plate_calibrated'})
    #add column with name
    df['plateID'] = name
   
   # print(df)
    #if Sample column ends with NRT, add NRT_Cq column
    df['NRT'] = False
    df.loc[df['Sample'].str.endswith('NRT'), 'NRT'] = True
    #remove NRT string from Sample columns ending with NRT
    df['Sample'] = df['Sample'].str.replace('NRT', '')
    
    
    #if Sample column ends with H, add condition column with 10mM_nitrate
    df['condition'] = np.nan
    df.loc[df['Sample'].str.endswith('H'), 'condition'] = '10mM_nitrate'
    #remove H string from Sample columns ending with H
    df['Sample'] = df['Sample'].str.replace('H', '')

    #if Sample column ends with L, add condition column with 1mM_nitrate
    df.loc[df['Sample'].str.endswith('L'), 'condition'] = '1mM_nitrate'
    #remove L string from Sample columns ending with L
    df['Sample'] = df['Sample'].str.replace('L', '')
    #remove A, B or C string from Sample columns ending with A, B or C
    df.loc[:, 'Sample_old'] = df['Sample']
    df['Sample'] = df['Sample'].str.replace('A', '')
    df['Sample'] = df['Sample'].str.replace('B', '')
    #remove C string from Sample columns ending with C after the dash
    df['Sample'] = df['Sample'].str.replace('C', '')
    #remove whitespace from Sample columns
    df['Sample'] = df['Sample'].str.strip()
    #if sample is "ol-0", rename to Col-0
    df.loc[df['Sample'] == 'ol-0', 'Sample'] = 'Col-0'
    
 
    #now make a df containing only Samples with 1mM_nitrate condition
    df_1mM_nitrate = df.loc[df['condition'] == '1mM_nitrate'].copy()

    #make new column that is the Mean expression across all biological replicates
    df_1mM_nitrate['Mean_biological_Cq_ECnormalised'] = df_1mM_nitrate.groupby(['Sample','Target', 'condition'])['MeanCq_EC_plate_calibrated'].transform('mean')


    #rename Mean_biological_Cq_ECnormalised column to 1mMnitrate_Cq_mean
    df_1mM_nitrate.rename(columns={'Mean_biological_Cq_ECnormalised': '1mMnitrate_Cq_mean'}, inplace=True)
    #filter other columns
    df_1mM_nitrate = df_1mM_nitrate[['Sample_old','Target','1mMnitrate_Cq_mean']]
    #remove duplicates from df_1mM_nitrate
    df_1mM_nitrate = df_1mM_nitrate.drop_duplicates()
    #merge the dfs
    df = pd.merge(df, df_1mM_nitrate, on=['Sample_old','Target'], how='left')


    

    #remove nan
    #df = df.dropna()

    #remove NRT values
    df = df.loc[df['NRT'] == False]
    #filter out sample 125-4AH
    #f = df.loc[df['Sample'] != '125-4']
    #filter out target ARF18
    #df = df.loc[df['Target'] != 'ARF18']

    
    


    return df


In [1341]:
#function to analyse data and make plots
def analyse_data(df_plant,sample_name, fontsize,qpcr_df,combined_output_dir,normal,sample_order):
    """function to run anovas and make boxplots"""
    #anova_PR <- lm(logPR ~ Genotype*NO3_Level + Plate, data = Roots1)
    #change -inf values to NaN using .loc
    df_plant.loc[df_plant['log_PR'] == -np.inf, 'log_PR'] = np.nan
    df_plant.loc[df_plant['log_LR'] == -np.inf, 'log_LR'] = np.nan
    df_plant.loc[df_plant['log_LRL'] == -np.inf, 'log_LRL'] = np.nan
    df_plant.loc[df_plant['log_ALRL'] == -np.inf, 'log_ALRL'] = np.nan
    df_plant.loc[df_plant['log_TRL'] == -np.inf, 'log_TRL'] = np.nan
    df_plant.loc[df_plant['log_LRD'] == -np.inf, 'log_LRD'] = np.nan
    df_plant.loc[df_plant['log_LRL_div_TRL'] == -np.inf, 'log_LRL_div_TRL'] = np.nan
    df_plant.loc[df_plant['log_LR_2nd_order'] == -np.inf, 'log_LR_2nd_order'] = np.nan
    df_plant.loc[df_plant['log_LRL_2nd_order'] == -np.inf, 'log_LRL_2nd_order'] = np.nan
   

    # anova_PR = smf.ols('PR ~ genotype*nitrate_concentration + plate', data=df_plant).fit()
    #check anova assumptions
    # print(anova_PR.summary())
    # fig = sm.qqplot(anova_PR.resid, line='s')
    #save figure
    #make directory for the plots to be exported to
    # output_dir = f'{output_location}/qqplots'
    
        
    # fig.savefig(f'{output_location}/qqplots/qqplot_PR.png')
    #log_PR residuals look mainly normal from the qqplot, (points at the extreme ends can be discounted)
    variables = ['PR','log_PR','LR','log_LR','LRL','log_LRL','ALRL','log_ALRL','TRL','log_TRL','LRD','log_LRD','LRL_div_TRL','log_LRL_div_TRL','LR_2nd_order','log_LR_2nd_order','LRL_2nd_order','log_LRL_2nd_order']
    #variables_logs = ['log_PR','log_LR','log_LRL','log_ALRL','log_TRL','log_LRD','log_LRL_div_TRL','log_LR_2nd_order','log_LRL_2nd_order']
    variables_logs_dict = {'log_PR':'Primary root length (cm)','log_LR':'Number of lateral roots','log_LRL':'Total lateral root length (cm)','log_ALRL':'Average lateral root length (cm)','log_TRL':'Total root length (cm)','log_LRD':'Lateral root density','log_LRL_div_TRL':'Ratio of lateral root length to\ntotal root length (LRL/TRL)',}#'log_LR_2nd_order':'Number of second order lateral roots','log_LRL_2nd_order':'Second order lateral root length (cm)'
    # qqplots(df_plant,variables,sample_name, output_dir, fontsize)
    #I will only use log transformed data
    #run anovas and calculate marginal effects for interaction genotype*nitrate_concentration
    
    marginal_effects(df_plant,variables_logs_dict, sample_name ,fontsize, qpcr_df,combined_output_dir,normal,sample_order)
    return df_plant



    # ANOVA table using bioinfokit v1.0.3 or later (it uses wrapper script for anova_lm)

    # res = stat()
    # res.anova_stat(df=df_plant, res_var='PR', anova_model='PR ~ genotype*nitrate_concentration + plate')
    # res.anova_summary
    # #generate QQ-plot from standardized residuals
    # # res.anova_std_residuals are standardized residuals obtained from ANOVA (check above)
    # # sm.qqplot(res.anova_std_residuals, line='45')
    # # plt.xlabel("Theoretical Quantiles")
    # # plt.ylabel("Standardized Residuals")
    # # plt.show()
    # res.qq_plot(df=df_plant, res_var='PR', anova_model='PR ~ genotype*nitrate_concentration + plate')
        





In [1342]:
#make individual plots
def make_plots(df_new, normal,sample, fig, ax, target):
    """function to make barplots of relative expression of each target gene in each plant line"""
    # #plot height and width
    # height = 4.5
    # width = 3
   # print(df_new)

    #decide on stats test
    if normal is True:
        stats_test = 't-test_ind'
    if normal is False:
        stats_test = 't-test_welch'


    #make individual plots
     

    # create box pairs
    # pairs = [(('Col-0','1'),(sample,'1')),(('Col-0','10'),(sample,'10')), (('Col-0','1'),('Col-0','10')), ((sample,'1'),(sample,'10'))]
    pairs = [(('1','Col-0'),('1',sample)),(('10','Col-0'),('10',sample)), (('1','Col-0'),('10','Col-0')), (('1',sample),('10',sample))]

    #order = ['Col-0',sample]
    order = ['1','10']




    #make plot
    fig_args = {'x':'condition', 'y':'relative_expression','hue':'Sample', 'hue_order':['Col-0',sample],'data':df_new, 'order':order, 'dodge':True}#'ax':ax

    configuration = {'test':stats_test, 'text_format':'star', 'pvalue_thresholds':[[1e-3, "***"],[1e-2, "**"],[0.05, "*"],[1, "ns"]]}

   # _ = plt.figure(figsize=(width,height))

    sns.barplot(**fig_args, palette=["white", "grey"],linewidth=2,  errcolor="black", edgecolor="black", ci=68, errwidth=1,capsize=0.4,ax=ax)
    sns.swarmplot(**fig_args, color='black',ax=ax, palette=['black','black'])
    
    #fig = sns.barplot(x='Sample', y='relative_expression', data=temp_df, order=order, linewidth=2,  errcolor="black", edgecolor="black", ci=68, errwidth=1,capsize=0.4,color='cyan')

    #fig = sns.swarmplot(x='condition', y='relative_expression', data=temp_df, order=order,color='black')

    #add stats
    annotator = Annotator(ax, pairs, **fig_args,verbose=False, show_non_significant=False)
    annotator.configure(**configuration)
    # # annotator = Annotator(fig, pairs, data=temp_df, x='condition', y='relative_expression',order=order,verbose=False)
    # # annotator.configure(test=stats_test, text_format='star',pvalue_thresholds=[[1e-3, "***"],[1e-2, "**"],[0.05, "*"],[1, "ns"]])
    
    # #save stats to file
    ax, test_results = annotator.apply_and_annotate()
    # with open(f'{location}/individual/stats.txt', 'a') as f:                            
    #     for res in test_results:
    #         f.write(f'{str(sample)},{target},{pairs},{str(res.data)}\n')
    
    # # change axes labels
    _ = ax.set_ylabel('Relative expression (a.u.)')
    
    #max 1 decimal place y tick labels
    # fig.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
    #add plot title
    _ = ax.set_title(f'{target}')

    #rename x axis labels
    _ = ax.set_xticklabels( ('1','10') )
    #change x axis name
    _ = ax.set_xlabel(r'KNO$_{3}$ concentration (mM)')

    #make xticks diagonal
    # _ = plt.xticks(rotation=90, ha='center')

    #plot legend, excluding legend from swarm plot
    h,l = ax.get_legend_handles_labels()
    #change name of label
    #l[3] = "10 mM nitrate"
    l[3] = sample
    l[2] = "Col-0"
    #l[2] = "20 mM KNO\u2083 + 20 mM NH\u2083NO\u2083"   
    #l[2] = "1 mM nitrate"     
    ax.legend(h[2:4],l[2:4],frameon=False,loc='best')#fontsize=fontsize,,bbox_to_anchor=(0,0.85), loc='best',,bbox_to_anchor=(0.6,0.95) ,ncol=len(df_new.Sample.unique()), columnspacing=0.8




In [1343]:
#main function
def main():
    #read in arguments
    #input_dir = args.input
    input_dir = '../../data/CRISPR_library/images/rsa_output'
    #output_dir = args.output
    output_dir = '../../data/CRISPR_library'
    #make directory for the plots to be exported to
    combined_output_dir = f'{output_dir}/characteristics_plots'
    try:
        # Create target Directory
        os.mkdir(combined_output_dir)
        print("Directory " , combined_output_dir ,  " created") 
    except FileExistsError:
        print("Directory " , combined_output_dir ,  " already exists")



    #read in and concatenate .csv files
    df_rsa = concat_csv_recursive(input_dir, '*.csv')
    #print(df.head())

    
    #sort data
    df_rsa,df_plant = sort_data(df_rsa,output_dir)
    #analyse dataframe and make plots
    #analyse_data(output_dir)
    #set matplotlib rc parameters
    fontsize = set_rc_params()
    #first split into separate dataframes for each sample_name
    #then analyse each dataframe and make plots

    # if __name__ == "__main__" function

    location = '../../data/CRISPR_library/qPCR'
    csv_file1 = f'{location}/06.09.22/06.09.22_plate1_19310threshold.csv'
    csv_file2 = f'{location}/06.09.22/06.09.22_plate2_19310threshold.csv'
    csv_file3 = f'{location}/06.09.22/06.09.22_plate3_19310threshold.csv'
    #read in files
    df1 = read_csv(csv_file1)
    df2 = read_csv(csv_file2)
    df3 = read_csv(csv_file3)
    #filter, sort and normalise to plate calibrator
    def filter_sort_normalise(df,location,name):
        #filter out the data points with amp_status = Amp and cq above 32
        df = filter_data(df, 'Amp', 40)

        
        #sort the data, and normalise to the eEF1a gene
        df = sort_data_qpcr(df,location,name)
        

        return df
    #filter, sort and normalise to plate calibrator
    df1 = filter_sort_normalise(df1,location,"06.09.22_plate1")
    #save df1 to file
    #df1.to_csv(f'{location}/10.08.22/10.8.22_plate_test.csv')
    #print(df1)

    df2 = filter_sort_normalise(df2,location,"06.09.22_plate2")
    #df2.to_csv(f'{location}/22.08.22_plate1_test.tsv',sep='\t')
    #print(df2)
    df3 = filter_sort_normalise(df3,location,"06.09.22_plate3")

    #merge the dfs
    df = pd.concat([df1,df2,df3])
    #sort by Sample, Target and condition
    df = df.sort_values(by=['Sample','Target','condition'])
    #save df to file
    # df.to_csv(f'{location}/merged_plates.tsv', sep='\t')
    #make a copy of the df
    df_col_norm = df.copy()

    

    #normalise based on 1mM_nitrate Cq values, mean between all 3 biological reps 
    df = normalise_data(df, 'MeanCq_EC_plate_calibrated','1mMnitrate_Cq_mean','MeanCq_EC_1mM_nitrate_normalised')
    #now filter columns
    df = df[['Sample','Target','MeanCq_EC_1mM_nitrate_normalised','condition']]

    #first do inverse log transformation
    #(fold change of GOI in treated sample if delta delta Ct value  = X then relative expression  = 2 ( to the power of X))
    #df['relative_expression'] = 2**(df['MeanCq_ECnormalised'])
    df['relative_expression'] = 2**(df['MeanCq_EC_1mM_nitrate_normalised'])
    #save df to tsv
    # df.to_csv('../../data/CRISPR_library/qPCR/merged_plates_19310threshold_normEC1mMnitrate_relative_expression.tsv', sep='\t', index=False)

    #Normalise to 1mM_nitrate Col-0 within each original plate
    #now make a df containing only Col-0 Samples with 1mM_nitrate condition
    df_col_1mM_nitrate = df_col_norm.loc[(df_col_norm['condition'] == '1mM_nitrate') & (df_col_norm.Sample=='Col-0')].copy()

    #make new column that is the Mean Col-0 1mM expression across biological replicates for that target on each plate
    df_col_1mM_nitrate['Mean_biological_Cq_ECnormalised'] = df_col_1mM_nitrate.groupby(['Sample','Target', 'condition','plateID'])['MeanCq_EC_plate_calibrated'].transform('mean')


    #rename Mean_biological_Cq_ECnormalised column to Col0_1mMnitrate_Cq_mean
    df_col_1mM_nitrate.rename(columns={'Mean_biological_Cq_ECnormalised': 'Col0_1mMnitrate_Cq_mean'}, inplace=True)
    #filter other columns
    df_col_1mM_nitrate = df_col_1mM_nitrate[['Sample_old','Target','Col0_1mMnitrate_Cq_mean','plateID']]
    #remove duplicates from df_col_1mM_nitrate
    df_col_1mM_nitrate = df_col_1mM_nitrate.drop_duplicates()
    #merge the dfs, putting the Col-0 1mM nitrate mean values for each target across all plant lines within each plate
    df_col_norm = pd.merge(df_col_norm, df_col_1mM_nitrate, on=['Target', 'plateID'], how='left')
    #save df to file
    df_col_norm.to_csv('../../data/CRISPR_library/qPCR/merged_plates_19310threshold_norm_col0_1mMnitrate.tsv', sep='\t', index=False)
    #normalise based on Col0_1mMnitrate_Cq_mean Cq values, mean between all 3 biological reps 
    df_col_norm = normalise_data(df_col_norm, 'MeanCq_EC_plate_calibrated','Col0_1mMnitrate_Cq_mean','MeanCq_EC_Col0_1mM_nitrate_normalised')
    #print(df_col_norm)
    #now filter columns
    df_col_norm = df_col_norm[['Sample','Target','MeanCq_EC_Col0_1mM_nitrate_normalised','condition', 'plateID']]


    # #first do inverse log transformation
    # #(fold change of GOI in treated sample if delta delta Ct value  = X then relative expression  = 2 ( to the power of X))
    # #df['relative_expression'] = 2**(df['MeanCq_ECnormalised'])
    df_col_norm['relative_expression'] = 2**(df_col_norm['MeanCq_EC_Col0_1mM_nitrate_normalised'])
    #remove duplicates
    df_col_norm = df_col_norm.drop_duplicates()

    #set matplotlib rc parameters
    fontsize = set_rc_params()


    

    #test for normality of data - Shapiro-Wilk test
    #test_normality(df)
    normality = test_normality(df_col_norm, location)

    #check if any of the p values are less than 0.05 (not normal)
    significant = normality[normality['pvalue'] < 0.05]
    if significant.empty:
        print('all p values are greater than 0.05, data is normal, using independent t-test')
        normal = True
    if not significant.empty:
        print('some p values are less than 0.05, data is not normal, using welchs t-test')
        normal = False

   
    

    
    #sample order in the plots
    sample_order = ['Col-0','69-9','125-4','127-10','130-4','134-3','139-9','142-4','142-8','144-5','154-4']
    sample_order_col0 = ['Col-0','69-9','125-4','127-10','130-4','134-3','139-9','142-4','142-8','144-5','154-4','Col-0_highnitrate','69-9_highnitrate','125-4_highnitrate','127-10_highnitrate','130-4_highnitrate','134-3_highnitrate','139-9_highnitrate','142-4_highnitrate','142-8_highnitrate','144-5_highnitrate','154-4_highnitrate']
    #individual plots compare between 1 and 10mM nitrate, and only show Col-0 from that plate if present on that plate, otherwise show all Col-0 samples from other two plates

    #compare between 1 and 10mM nitrate
    # make_combined_plots(df_col_norm,f'{location}/plots', normal,sample_order,fontsize)
    # #make plots compared to Col-0 1 and 10mM nitrate
    # make_combined_plots_col0(df_col_norm,f'{location}/plots', normal,sample_order_col0,fontsize)




    
    qpcr_df = df_col_norm.copy()
    for sample_name in df_plant['sample_name'].unique():
        #get dataframe for each sample_name
        df_sample = df_plant[df_plant['sample_name'] == sample_name].copy()
        #analyse dataframe and make plots
        analyse_data(df_sample,sample_name,fontsize,qpcr_df,combined_output_dir,normal,sample_order)





    #analyse_data(df_plant, output_dir)
    
    #save dataframe to csv file
    # df_rsa.to_csv(f'{output_dir}/all_smartroot_data.csv', index=False)

In [1344]:
if __name__ == '__main__':
    main()

Directory  ../../data/CRISPR_library/characteristics_plots  already exists


  result = getattr(ufunc, method)(*inputs, **kwargs)






some p values are less than 0.05, data is not normal, using welchs t-test
number of subplots for 125-4= 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

number of subplots for 130-4= 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


number of subplots for 134-3= 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


number of subplots for 139-9= 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


number of subplots for 69-9= 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
