Alleles_frequency_table.zip can be unzipped to a tab-separated text file that shows all reads and alignments to references. The first column shows the aligned sequence of the sequenced read. The second column shows the aligned sequence of the reference sequence. Gaps in each of these columns represent insertions and deletions. The third column, 'Reference_Name', shows the name of the reference that the read aligned to. The fourth column, 'Read_Status', shows whether the read was modified or unmodified. The fifth through seventh columns ('n_deleted', 'n_inserted', 'n_substituted') show the number of bases deleted, inserted, and substituted as compared to the reference sequence. The eighth column shows the number of reads having that sequence, and the ninth column shows the percentage of all reads having that sequence.

In [12]:
# #features to add:
# Distance from TSS - get relative position of mutation in the guide site - done. Add distance from cut site metric. - done Then calculate distance from Araport TSS - done
# for this: first create a bed file for all of the mutations (relative to whole Arabidopsis genome). Then do bedtools merge or intersect (or bedtools coverage (../data_sorting/./TFBS_coverage.sh)) with the mapped motif bed file (all TFBSs for all genes). Record each TFBS that overlaps the mutation
# Overlapping TFBSs - subnetwork and all TFs

# Include secondary mutations in case both deletion and substitution for example - done
# Plant ID
# How many biallelic or homozygous? How many wildtype?
# More than 2 alleles for a gene - record alleles until 80% of reads accounted for
# Prioritise homozygous or biallelic
# How many plants had mutations? How many guides produced mutations in each gene?
#check window around cut site - at the moment I am including mutations 20bp either side, maybe cut the alignments down to 7bp either side before comparing them with find_indels_substitutions()


In [25]:
#use env pybedtools
import pandas as pd
import numpy as np
import io
from pybedtools import BedTool
from collections import defaultdict
from more_itertools import sliced

In [14]:
def read_in_files(mutations_file,mapped_motifs,guide_pairs):
    """Read the mutations table, the mapped motifs bed file and the guide pairs file.

    Args:
        mutations_file: path (or buffer) of a tab-separated mutations table
            whose first line is a header.
        mapped_motifs: path (or buffer) of a header-less, tab-separated
            mapped motifs bed file with 13, 17 or 24 columns.
        guide_pairs: path (or buffer) of a comma-separated guide pairs file
            with a header containing at least 'guide1' and 'guide2'.

    Returns:
        Tuple ``(mutations_df, mapped_motifs_df, guide_pairs_df)``. For a
        24-column motifs file only the 13 motif columns are kept (with
        'promoter_AGI2' renamed to 'promoter_AGI'). A file with any other
        unrecognised column count is returned with pandas' default integer
        column labels.
    """
    # the 13 motif columns shared by every supported layout
    motif_cols = [
        "motif_chr",
        "motif_start",
        "motif_stop",
        "name_rep",
        "score",
        "motif_strand",
        "promoter_AGI",
        "p-value",
        "q-value",
        "matched_sequence",
        "TF_name",
        "TF_family",
        "TF_AGI",
    ]
    # leading columns of the 24-column layout
    leading_cols = [
        "chr",
        "start",
        "stop",
        "promoter_AGI",
        "dot1",
        "strand",
        "source",
        "type",
        "dot2",
        "attributes",
    ]
    # trailing columns of the 17-column layout
    openchrom_cols = [
        "chr_openchrom",
        "start_openchrom",
        "stop_openchrom",
        "bp_overlap",
    ]

    # read in mapped motifs bed file (use the parameter that was passed in;
    # previously an undefined name was read here)
    motifs_df = pd.read_table(mapped_motifs, sep="\t", header=None)
    n_cols = len(motifs_df.columns)

    if n_cols == 24:
        # in this layout the motif's promoter column is called promoter_AGI2
        # because 'promoter_AGI' is already used by the leading columns
        raw_motif_cols = [
            "promoter_AGI2" if col == "promoter_AGI" else col for col in motif_cols
        ]
        motifs_df.columns = leading_cols + raw_motif_cols + ["bp_overlap"]
        # filter columns (copy so that renaming does not act on a slice)
        motifs_df = motifs_df[raw_motif_cols].copy()
        # rename columns
        motifs_df.columns = motif_cols
    elif n_cols == 13:
        motifs_df.columns = motif_cols
    elif n_cols == 17:
        motifs_df.columns = motif_cols + openchrom_cols

    mutations_df = pd.read_table(mutations_file,sep='\t',header=0)
    guide_pairs_df = pd.read_csv(guide_pairs,header=0)
    #only keep first 2 columns
    guide_pairs_df = guide_pairs_df[['guide1','guide2']]
    return mutations_df,motifs_df,guide_pairs_df


In [15]:
def merge_bedfiles(bedfile, mapped_motifs_bed, output_buffer):
    """Intersect a mutations bed with the mapped motifs bed using bedtools.

    Args:
        bedfile: bed input for the "A" side of the intersect (the caller in
            this file passes an io.StringIO buffer).
        mapped_motifs_bed: bed input for the "B" side; it must support
            ``seek`` because it is rewound before returning.
        output_buffer: writable, seekable buffer that receives the result.

    Returns:
        ``output_buffer``, rewound to its start.
    """
    mutation_intervals = BedTool(bedfile)
    motif_intervals = BedTool(mapped_motifs_bed)
    # wao=True: report each A entry with the B entry it overlaps plus the
    # number of overlapping bp; A entries without any overlap are still
    # reported, with a NULL B feature and overlap = 0
    overlaps = mutation_intervals.intersect(motif_intervals, wao=True)
    # one line per (A entry, B entry) pair is written to the output buffer
    output_buffer.write(str(overlaps))
    # rewind both buffers so that they can be read again by the caller
    for stream in (output_buffer, mapped_motifs_bed):
        stream.seek(0)
    return output_buffer


In [28]:
def _mutation_bed_rows(mutation_row, mutation_kinds):
    """Return one [chr, start, stop, mutation_type, mutation_count] list per mutated position."""
    bed_rows = []
    for kind in mutation_kinds:
        positions = mutation_row[f"{kind}_genomic_positions"]
        #positions are stored as the string form of a list; 'nan' means none
        if positions == 'nan':
            continue
        #mutation_count numbers the positions of each mutation type from 1
        for count, gen_pos in enumerate(positions.strip('][').split(', '), start=1):
            start = int(gen_pos)
            bed_rows.append([mutation_row.chr, start, start + 1, kind, count])
    return bed_rows


def _unique_overlaps_as_str(overlaps):
    """Return 'nan' when there are no overlaps, else str() of {mutation key: sorted unique values}."""
    if not overlaps:
        return 'nan'
    return str({key: np.unique(values).tolist() for key, values in overlaps.items()})


def find_overlapping_TFBSs(mutations_df_chunk,mapped_motifs_bed,mapped_motifs_bed_columns):
    """Record, for every mutation in the chunk, the TFBSs from the mapped motif file that overlap it.

    For each row whose ``mutation_type`` is not the string 'None', every
    insertion, deletion and substitution genomic position is written as a
    1 bp bed interval, intersected with the mapped motifs (via
    ``merge_bedfiles``), and the overlapping TF families and TF AGIs are
    stored as strings in the ``<kind>_overlapping_TFBS_family`` and
    ``<kind>_overlapping_TFBS_AGI`` columns of that row.

    Args:
        mutations_df_chunk: DataFrame of mutations. The
            ``*_genomic_positions`` columns must hold strings, either 'nan'
            or the string form of a list of integers (e.g. '[12, 15]').
        mapped_motifs_bed: seekable buffer holding the mapped motifs in bed
            format; it is handed to ``merge_bedfiles``.
        mapped_motifs_bed_columns: column names of the mapped motifs bed;
            must include 'TF_family' and 'TF_AGI'.

    Returns:
        ``mutations_df_chunk``, modified in place. Each filled cell is either
        'nan' or the string form of a dict such as
        ``{'deletion1': ['AT1G01010', ...]}``.
    """
    mutation_kinds = ("insertion", "deletion", "substitution")
    bed_cols = ["chr", "start", "stop", "mutation_type", 'mutation_count']
    output_df_cols = bed_cols + list(mapped_motifs_bed_columns) + ['bp_overlap']

    for mutations_df_index, mutations_df_row in mutations_df_chunk.iterrows():
        if mutations_df_row.mutation_type == 'None':
            continue

        #create temporary df in bed format, one row per mutated position
        temp_df = pd.DataFrame(
            _mutation_bed_rows(mutations_df_row, mutation_kinds), columns=bed_cols
        )

        #for each mutation type collect the overlapping TFBS families and AGIs,
        #keyed by e.g. 'insertion1', 'insertion2'
        families = {kind: defaultdict(list) for kind in mutation_kinds}
        agis = {kind: defaultdict(list) for kind in mutation_kinds}

        #no positions recorded means nothing to intersect; fall through so
        #that the row gets 'nan' instead of failing on an empty bed file
        if not temp_df.empty:
            #sort by chr then start
            temp_df = temp_df.sort_values(["chr", "start"]).reset_index(drop=True)
            # write to buffer
            temp_df_buffer = io.StringIO()
            temp_df.to_csv(temp_df_buffer, sep="\t", index=False, header=None)
            temp_df_buffer.seek(0)

            #now do bedtools intersect to find which TFBSs overlap with which mutations
            output_buffer = merge_bedfiles(temp_df_buffer, mapped_motifs_bed, io.StringIO())

            #the intersect output has no header line: header=None stops pandas
            #from consuming (and so dropping) the first record as a header
            output_df = pd.read_table(output_buffer, sep='\t', header=None)
            output_df.columns = output_df_cols

            for _, hit in output_df.iterrows():
                #if no TF_AGI then there was no overlapping TFBS
                if hit.TF_AGI == '.':
                    continue
                #only keep the three known mutation types with at least 1bp overlap
                if hit.mutation_type in families and hit.bp_overlap > 0:
                    key = f'{hit.mutation_type}{hit.mutation_count}'
                    families[hit.mutation_type][key].append(hit.TF_family)
                    agis[hit.mutation_type][key].append(hit.TF_AGI)

        #add values to the mutations_df_chunk row, keeping only unique
        #families/AGIs per mutation and using 'nan' where nothing overlapped
        for kind in mutation_kinds:
            mutations_df_chunk.loc[mutations_df_index, f"{kind}_overlapping_TFBS_family"] = _unique_overlaps_as_str(families[kind])
            mutations_df_chunk.loc[mutations_df_index, f"{kind}_overlapping_TFBS_AGI"] = _unique_overlaps_as_str(agis[kind])
    return mutations_df_chunk

In [16]:
def chunkify(mutations_df,mapped_motifs_df,output_folder,gene):
    """Prepare the dfs and run find_overlapping_TFBSs() on mutations_df in small chunks.

    Args:
        mutations_df: DataFrame of mutations; must contain the position and
            distance columns listed in ``to_string`` below plus whatever
            columns find_overlapping_TFBSs() reads.
        mapped_motifs_df: DataFrame of mapped motifs in bed column order.
        output_folder: prefix of the output path; it is concatenated directly
            with the file name, so it needs its own trailing separator.
        gene: gene name used in the output file name.

    Returns:
        mutations_df with the six ``*_overlapping_TFBS_*`` columns added. The
        same table is written to
        ``{output_folder}{gene}_TFBSoverlapping.tsv``.
    """
    #get column names from mapped_motifs_df
    mapped_motifs_bed_columns = list(mapped_motifs_df.columns)

    #turn mapped_motifs_df into a buffer
    mapped_motifs_bed = io.StringIO()
    mapped_motifs_df.to_csv(mapped_motifs_bed, sep="\t", index=False, header=None)
    #go back to start of buffer
    mapped_motifs_bed.seek(0)

    #add columns to mutations_df
    new_columns = ['insertion_overlapping_TFBS_family',
    'insertion_overlapping_TFBS_AGI',
    'deletion_overlapping_TFBS_family',
    'deletion_overlapping_TFBS_AGI',
    'substitution_overlapping_TFBS_family',
    'substitution_overlapping_TFBS_AGI',
                  ]
    #first make a new (empty) df that will merge into mutations_df
    temp_new_df = pd.DataFrame(columns=new_columns)
    mutations_df = pd.concat([mutations_df,temp_new_df], axis=1)

    #make columns containing lists string so that find_overlapping_TFBSs()
    #can compare them with 'nan' and parse them
    to_string = ['insertion_positions','deletion_positions','substitution_positions','insertion_cut_site_distance','deletion_cut_site_distance','substitution_cut_site_distance','insertion_positions_relative_to_TSS','insertion_genomic_positions','deletion_positions_relative_to_TSS','deletion_genomic_positions','substitution_positions_relative_to_TSS','substitution_genomic_positions']
    mutations_df[to_string] = mutations_df[to_string].astype(str)

    #process mutations_df in chunks of rows
    CHUNK_SIZE = 5

    chunks = []
    for chunk_start in range(0, len(mutations_df), CHUNK_SIZE):
        #go back to start of buffer
        mapped_motifs_bed.seek(0)
        #copy so that the chunk can be written to without touching a view of mutations_df
        chunk = mutations_df.iloc[chunk_start:chunk_start + CHUNK_SIZE].copy()
        new_chunk = find_overlapping_TFBSs(chunk,mapped_motifs_bed,mapped_motifs_bed_columns)
        chunks.append(new_chunk)

    #chunks are row slices, so stack them back together row-wise;
    #with no rows there is nothing to concatenate and mutations_df is kept as is
    if chunks:
        mutations_df = pd.concat(chunks, axis=0)
    #write out mutations_df
    mutations_df.to_csv(f'{output_folder}{gene}_TFBSoverlapping.tsv', sep="\t", index=False, header=True)

    return mutations_df



In [None]:
# def chunkify(mutations_df,mapped_motifs_df,output_folder,gene):
#     """function to prepare dfs and slice mutations_df into chunks to reduce memory before running find_overlapping_TFBSs() """
   

#     #for each guide containing mutations, create a temporary bed file containing each mutation and then do bedtools intersect to find which overlap TFBs
#     #then add the TFBS names into a new column for that row

#     #get column names from mapped_motifs_df
#     mapped_motifs_bed_columns = list(mapped_motifs_df.columns)


#     #turn mapped_motifs_df into a buffer
#     mapped_motifs_bed = io.StringIO()
#     mapped_motifs_df.to_csv(mapped_motifs_bed, sep="\t", index=False, header=None)
#     #go back to start of buffer
#     mapped_motifs_bed.seek(0)

#     #add columns to mutations_df
#     new_columns = ['insertion_overlapping_TFBS_family',
#     'insertion_overlapping_TFBS_AGI',
#     'deletion_overlapping_TFBS_family',
#     'deletion_overlapping_TFBS_AGI',
#     'substitution_overlapping_TFBS_family',
#     'substitution_overlapping_TFBS_AGI',
#                   ]
#     #first make a new df that will merge into mutations_df
#     temp_new_df = pd.DataFrame(columns=new_columns)
#     mutations_df = pd.concat([mutations_df,temp_new_df], axis=1)
#     #print(mutations_df)

#     #first make certain columns string
#     #make columns containing lists string for now so can use groupby
#     to_string = ['insertion_positions','deletion_positions','substitution_positions','insertion_cut_site_distance','deletion_cut_site_distance','substitution_cut_site_distance','insertion_positions_relative_to_TSS','insertion_genomic_positions','deletion_positions_relative_to_TSS','deletion_genomic_positions','substitution_positions_relative_to_TSS','substitution_genomic_positions']
#     mutations_df[to_string] = mutations_df[to_string].astype(str)

#     #convert mutations_df into chunks to reduce memory load
#     CHUNK_SIZE = 5

#     index_slices = sliced(range(len(mutations_df)), CHUNK_SIZE)

#     for index_slice in index_slices:
#         chunk = mutations_df.iloc[index_slice] # your dataframe chunk ready for use

#     #find mutations within
#     for mutations_df_index,mutations_df_row in mutations_df.iterrows():
            
#         if mutations_df_row.mutation_type == 'None':
#             pass
#         else:
#             #create temporary df in bed format
            
#             cols = ["chr",
#             "start",
#             "stop",
#             "mutation_type",'mutation_count']
#             temp_df = pd.DataFrame(columns=cols)
#             chr = mutations_df_row.chr
#             #if not NaN
#             if mutations_df_row.insertion_genomic_positions != 'nan':
#                 #print("Index:", index)
#                 #print(row.insertion_genomic_positions)
#                 #convert genomic positions from string to list
#                 # Convert string to list if more than one
#                 insertion_genomic_positions = mutations_df_row.insertion_genomic_positions.strip('][').split(', ')
#                 #count which mutation number currently on to be added to the temporary bed file
#                 count = 0
#                 for gen_pos in insertion_genomic_positions:
#                     count += 1
#                     #get index
#                     # if len(insertion_genomic_positions) > 1:
#                     #     index = insertion_genomic_positions.index(gen_pos)
#                     # else:
#                     #     index = 'nan'
#                     start = int(gen_pos)
#                     stop = start + 1
#                     mutation_type = "insertion"
#                     #add to temp_df
#                     temp_list = [chr,start,stop,mutation_type,count]
#                     temp_df.loc[len(temp_df)] = temp_list
#             if mutations_df_row.deletion_genomic_positions != "nan":
#                 # Convert string to list
#                 deletion_genomic_positions = mutations_df_row.deletion_genomic_positions.strip('][').split(', ')
#                 #count which mutation number currently on to be added to the temporary bed file
#                 count = 0
#                 for gen_pos in deletion_genomic_positions:
#                     count += 1
#                     #get index
#                     # if len(deletion_genomic_positions) > 1:
#                     #     index = deletion_genomic_positions.index(gen_pos)
#                     # else:
#                     #     index = 'nan'
#                     start = int(gen_pos)
#                     stop = start + 1
#                     mutation_type = "deletion"
#                     #add to temp_df
#                     temp_list = [chr,start,stop,mutation_type,count]
#                     temp_df.loc[len(temp_df)] = temp_list
#             if mutations_df_row.substitution_genomic_positions != "nan":
#                 # Convert string to list
#                 substitution_genomic_positions = mutations_df_row.substitution_genomic_positions.strip('][').split(', ')
#                 #count which mutation number currently on to be added to the temporary bed file
#                 count = 0
#                 for gen_pos in substitution_genomic_positions:
#                     count += 1
#                     #get index
#                     # if len(substitution_genomic_positions) > 1:
#                     #     index = substitution_genomic_positions.index(gen_pos)
#                     # else:
#                     #     index = 'nan'
#                     start = int(gen_pos)
#                     stop = start + 1
#                     mutation_type = "substitution"
#                     #add to temp_df
#                     temp_list = [chr,start,stop,mutation_type,count]
#                     temp_df.loc[len(temp_df)] = temp_list
#             #now do bedtools intersect to find which TFBSs overlap with which mutations
#             #sort by chr then start
#             temp_df = temp_df.sort_values(["chr", "start"]).reset_index(drop=True)
#             # write to buffer
#             temp_df_buffer = io.StringIO()
#             temp_df.to_csv(temp_df_buffer, sep="\t", index=False, header=None)
#             temp_df_buffer.seek(0)
            
#             output_buffer = io.StringIO()
        
#             output_buffer = merge_bedfiles(temp_df_buffer, mapped_motifs_bed, output_buffer)
            
#             #remove temp_df_buffer stream


#             #read in output buffer as df
#             output_df = pd.read_table(output_buffer, sep='\t')
            
#             #get column names and rename columns
#             output_df_cols = cols+mapped_motifs_bed_columns+['bp_overlap']
#             output_df.columns = output_df_cols
#             #for each mutation type get list of overlapping TFBSs. Add these to a dictionary column in the mutations_df
#             #create defaultdicts with lists as values so that non-existing keys can be added to in one go
#             insertion_overlapping_TFBS_family = defaultdict(list)
#             insertion_overlapping_TFBS_AGI = defaultdict(list)
#             #insertion_overlapping_TFBS_total = defaultdict(list)
#             deletion_overlapping_TFBS_family = defaultdict(list)
#             deletion_overlapping_TFBS_AGI = defaultdict(list)
#             #deletion_overlapping_TFBS_total = defaultdict(list)
#             substitution_overlapping_TFBS_family = defaultdict(list)
#             substitution_overlapping_TFBS_AGI = defaultdict(list)
#             #substitution_overlapping_TFBS_total = defaultdict(list)
            

#             for index,row in output_df.iterrows():
#                 # empty number that will increase for each insertion
#                # insertion_overlapping_TFBS_count = int()
#                #if mutation and has a 1bp overlap
#                #if no TF_AGI then pass
#                 if row.TF_AGI == '.':
#                     pass
#                 elif row.mutation_type == 'insertion' and row.bp_overlap > 0:
#                     #add insertion TFBS family information to dictionary for the correct mutation number
#                     #print(row)
#                     insertion_overlapping_TFBS_family[f'insertion{row.mutation_count}'] += [row.TF_family]
#                     #add insertion TFBS AGI information to dictionary for the correct mutation number
#                     insertion_overlapping_TFBS_AGI[f'insertion{row.mutation_count}'] += [row.TF_AGI]
#                     #add total number of TFBSs overlapping each insertion
#                  #   insertion_overlapping_TFBS_total[f'insertion{row.mutation_count}'] += 1
#                 elif row.mutation_type == 'deletion' and row.bp_overlap > 0:
#                     #add deletion TFBS family information to dictionary for the correct mutation number
#                     deletion_overlapping_TFBS_family[f'deletion{row.mutation_count}'] += [row.TF_family]
#                     #add deletion TFBS AGI information to dictionary for the correct mutation number
#                     deletion_overlapping_TFBS_AGI[f'deletion{row.mutation_count}'] += [row.TF_AGI]
#                     #add total number of TFBSs overlapping each insertion
#                    # deletion_overlapping_TFBS_total[f'deletion{row.mutation_count}'] += 1
#                 elif row.mutation_type == 'substitution' and row.bp_overlap > 0:
#                     #add substitution TFBS family information to dictionary for the correct mutation number
#                     substitution_overlapping_TFBS_family[f'substitution{row.mutation_count}'] += [row.TF_family]
#                     #add substitution TFBS AGI information to dictionary for the correct mutation number
#                     substitution_overlapping_TFBS_AGI[f'substitution{row.mutation_count}'] += [row.TF_AGI]
#                     #add total number of TFBSs overlapping each insertion
#                  #   substitution_overlapping_TFBS_total[f'substitution{row.mutation_count}'] += 1
#             # #calculate total unique TFBS for each insertion, deletion and subsitution
#             # insertion_overlapping_TFBS_total_unique = []
#             # for insertion,AGI in insertion_overlapping_TFBS_AGI.items():
#             #     insertion_overlapping_TFBS_total_unique += np.unique(AGI).astype(list)
#             # deletion_overlapping_TFBS_total_unique = []
#             # for deletion,AGI in deletion_overlapping_TFBS_AGI.items():
#             #     print(f'AGIunique={np.unique(AGI).astype(list)}')
#             #     print(f'total={deletion_overlapping_TFBS_total_unique}')
#             #     print(f'AGI={AGI}')
#             #     deletion_overlapping_TFBS_total_unique += np.unique(AGI).astype(list)
#             #     print(f'newtotal={deletion_overlapping_TFBS_total_unique}')
#             # substitution_overlapping_TFBS_total_unique = []
#             # for substitution,AGI in substitution_overlapping_TFBS_AGI.items():
#             #     substitution_overlapping_TFBS_total_unique += np.unique(AGI).astype(list)
                
#             #add values to mutations_df row
#             #print(f'insertion_overlapping_TFBS_family={dict(insertion_overlapping_TFBS_family)}')
#             #print(mutations_df_row)
#             #first make overlapping TFBS families and AGIs unique
#             insertion_overlapping_TFBS_family = dict(insertion_overlapping_TFBS_family)
#             insertion_overlapping_TFBS_AGI = dict(insertion_overlapping_TFBS_AGI)
#             deletion_overlapping_TFBS_family = dict(deletion_overlapping_TFBS_family)
#             deletion_overlapping_TFBS_AGI = dict(deletion_overlapping_TFBS_AGI)
#             substitution_overlapping_TFBS_family = dict(substitution_overlapping_TFBS_family)
#             substitution_overlapping_TFBS_AGI = dict(substitution_overlapping_TFBS_AGI)
#             #if empty dictionary, change to nan
#             if insertion_overlapping_TFBS_family == {}:
#                 insertion_overlapping_TFBS_family = 'nan'
#             else:
#                 #keep only unique TFBS families
#                 for k,v in insertion_overlapping_TFBS_family.items():
#                     #print(np.unique(v).astype(list))
#                     insertion_overlapping_TFBS_family[k] = np.unique(v).tolist()

#             if insertion_overlapping_TFBS_AGI == {}:
#                 insertion_overlapping_TFBS_AGI = 'nan'
#             else:
#                 #keep only unique TFBS AGIs
#                 for k,v in insertion_overlapping_TFBS_AGI.items():
#                     #print(np.unique(v).astype(list))
#                     insertion_overlapping_TFBS_AGI[k] = np.unique(v).tolist()

#             if deletion_overlapping_TFBS_family == {}:
#                 deletion_overlapping_TFBS_family = 'nan'
#             else:
#                 #keep only unique TFBS families
#                 for k,v in deletion_overlapping_TFBS_family.items():
#                     #print(np.unique(v).astype(list))
#                     deletion_overlapping_TFBS_family[k] = np.unique(v).tolist()

#             if deletion_overlapping_TFBS_AGI == {}:
#                 deletion_overlapping_TFBS_AGI = 'nan'
#             else:
#                 #keep only unique TFBS AGIs
#                 for k,v in deletion_overlapping_TFBS_AGI.items():
#                     #print(np.unique(v).astype(list))
#                     deletion_overlapping_TFBS_AGI[k] = np.unique(v).tolist()
            
#             if substitution_overlapping_TFBS_family == {}:
#                 substitution_overlapping_TFBS_family = 'nan'
#             else:
#                 #keep only unique TFBS families
#                 for k,v in substitution_overlapping_TFBS_family.items():
#                     #print(np.unique(v).astype(list))
#                     substitution_overlapping_TFBS_family[k] = np.unique(v).tolist()
                    
#             if substitution_overlapping_TFBS_AGI == {}:
#                 substitution_overlapping_TFBS_AGI = 'nan'
#             else:
#                 #keep only unique TFBS AGIs
#                 for k,v in substitution_overlapping_TFBS_AGI.items():
#                     #print(np.unique(v).astype(list))
#                     substitution_overlapping_TFBS_AGI[k] = np.unique(v).tolist()





#             mutations_df.loc[mutations_df_index, "insertion_overlapping_TFBS_family"] = str(insertion_overlapping_TFBS_family)
#             mutations_df.loc[mutations_df_index, "insertion_overlapping_TFBS_AGI"]= str(insertion_overlapping_TFBS_AGI)
#             mutations_df.loc[mutations_df_index, "deletion_overlapping_TFBS_family"]= str(deletion_overlapping_TFBS_family)
#             mutations_df.loc[mutations_df_index, "deletion_overlapping_TFBS_AGI"]= str(deletion_overlapping_TFBS_AGI)
#             mutations_df.loc[mutations_df_index, "substitution_overlapping_TFBS_family"]= str(substitution_overlapping_TFBS_family)
#             mutations_df.loc[mutations_df_index, "substitution_overlapping_TFBS_AGI"]= str(substitution_overlapping_TFBS_AGI)
#             # row.insertion_overlapping_TFBS_total_unique= insertion_overlapping_TFBS_total_unique
#             # row.deletion_overlapping_TFBS_total_unique= deletion_overlapping_TFBS_total_unique
#             # row.substitution_overlapping_TFBS_total_unique= substitution_overlapping_TFBS_total_unique
#             #remove the buffer stream


            

#     #concatenate chunks into mutations_df
#     # pd.concat(chunks, axis=1)            
#     #write out mutations_df
#     mutations_df.to_csv(f'{output_folder}{gene}_TFBSoverlapping.tsv', sep="\t", index=False, header=1)
                     
#     return mutations_df      



In [17]:
def genotype_plant_lines(mutations_df,output_folder,gene):
    """Assign a genotype to every row of mutations_df and save the result.

    Rows are grouped by (plant_ID, guide); each row in a group is treated as
    one allele seen at that guide site in that plant. The rules below are
    applied in order, so a later rule overrides an earlier one:

    1. rows whose (plant_ID, guide) combination occurs only once -> 'homozygous'
    2. read_percentage >= 70 -> 'homozygous'
    3. read_percentage <= 10 -> 'nan' (these rows are dropped from the output)
    4. 10 < read_percentage < 70 -> 'heterozygous'
    5. a heterozygous row that is the only heterozygous row of its group
       -> 'homozygous'
    6. a non-wild-type heterozygous row whose group holds exactly two
       non-wild-type heterozygous rows -> 'biallelic'
    7. a row whose group holds more than two heterozygous rows -> 'chimeric'
       (more than two alleles at one small region, e.g. the plant probably
       still carries the tDNA)

    Wild-type rows are those whose mutation_type is the string 'None'.

    Parameters
    ----------
    mutations_df : pandas.DataFrame
        Must contain every column named in ``column_order`` below except
        'genotype', 'number_of_different_alleles' and
        'number_of_different_non_wt_alleles', which are created here.
        The dataframe passed in is not modified.
    output_folder : str
        Prefix of the output path; it is concatenated directly with the file
        name, so it has to end with a path separator.
    gene : str
        Gene name used in the output file name.

    Returns
    -------
    pandas.DataFrame
        Genotyped copy of mutations_df with reordered columns and without the
        rows labelled 'nan'. The same table is written to
        ``{output_folder}{gene}_TFBSoverlapping_categorised.tsv``.
    """
    # work on a copy so the caller's dataframe is not changed as a side effect
    mutations_df = mutations_df.copy()
    site_columns = ['plant_ID', 'guide']

    # homozygous lines are those with no duplicated guides in each plant line.
    # keep=False flags every member of a duplicated group; the default
    # keep='first' would wrongly treat the first row of such a group as unique
    mutations_df.loc[~mutations_df.duplicated(site_columns, keep=False), 'genotype'] = 'homozygous'

    # genotype from the share of reads supporting the allele
    read_percentage = mutations_df.read_percentage
    # if 70% of reads or more are the same then mark as homozygous
    mutations_df.loc[read_percentage >= 70, 'genotype'] = 'homozygous'
    # if 10% of reads or less are that mutation, mark genotype as 'nan'
    mutations_df.loc[read_percentage <= 10, 'genotype'] = 'nan'
    # if between 10 and 70% of reads, mark genotype as heterozygous
    mutations_df.loc[(read_percentage > 10) & (read_percentage < 70), 'genotype'] = 'heterozygous'

    # allele counters start at 0 for every row
    mutations_df['number_of_different_alleles'] = 0
    mutations_df['number_of_different_non_wt_alleles'] = 0

    is_wild_type = mutations_df.mutation_type == 'None'
    is_heterozygous = mutations_df.genotype == 'heterozygous'
    is_heterozygous_mutant = is_heterozygous & ~is_wild_type

    # number of heterozygous rows sharing the same plant and guide
    mutations_df.loc[is_heterozygous, 'number_of_different_alleles'] = (
        mutations_df[is_heterozygous]
        .groupby(site_columns)['number_of_different_alleles']
        .transform('count')
    )
    # the same count, ignoring wild-type rows
    mutations_df.loc[is_heterozygous_mutant, 'number_of_different_non_wt_alleles'] = (
        mutations_df[is_heterozygous_mutant]
        .groupby(site_columns)['number_of_different_non_wt_alleles']
        .transform('count')
    )

    # a heterozygous call with no second allele in its group is homozygous
    mutations_df.loc[is_heterozygous & (mutations_df.number_of_different_alleles == 1), 'genotype'] = 'homozygous'
    # homozygous rows always count as a single allele
    mutations_df.loc[mutations_df.genotype == 'homozygous', 'number_of_different_alleles'] = 1

    # exactly two mutant alleles -> biallelic (never applied to the wild-type row itself)
    mutations_df.loc[(mutations_df['number_of_different_non_wt_alleles'] == 2) & ~is_wild_type, 'genotype'] = 'biallelic'

    # more than two alleles at the same site -> chimeric (overrides biallelic)
    mutations_df.loc[mutations_df['number_of_different_alleles'] > 2, 'genotype'] = 'chimeric'

    # change column order
    column_order = [
        'chr', 'plant_ID', 'platename', 'library', 'first_reaction_primers',
        'second_reaction_primers', 'guide', 'guide_number', 'aligned_sequence',
        'reference_sequence', 'mutation_type', 'genotype', 'read_number', 'read_percentage',
        'insertion_positions', 'deletion_positions', 'substitution_positions',
        'insertion_cut_site_distance', 'deletion_cut_site_distance',
        'substitution_cut_site_distance', 'cut_site_promoter_position',
        'insertion_positions_relative_to_TSS', 'insertion_genomic_positions',
        'deletion_positions_relative_to_TSS', 'deletion_genomic_positions',
        'substitution_positions_relative_to_TSS',
        'substitution_genomic_positions', 'insertion_overlapping_TFBS_family',
        'insertion_overlapping_TFBS_AGI', 'deletion_overlapping_TFBS_family',
        'deletion_overlapping_TFBS_AGI', 'substitution_overlapping_TFBS_family',
        'substitution_overlapping_TFBS_AGI', 'number_of_different_alleles',
        'number_of_different_non_wt_alleles',
    ]
    mutations_df = mutations_df[column_order]
    # remove genotype 'nan' (alleles supported by 10% of reads or fewer)
    mutations_df = mutations_df[~(mutations_df.genotype == 'nan')]

    # save df
    mutations_df.to_csv(f'{output_folder}{gene}_TFBSoverlapping_categorised.tsv', sep="\t", index=False, header=True)

    return mutations_df
    



In [1]:
def categorise_guide_pairs(mutations_df_genotyped, guide_pairs_df):
    """Build a lookup of which guides each guide was paired with.

    This is the first step towards checking which guide pairs were delivered
    to each plant line and whether mutations at more than one guide site fall
    within the partner guide of a pair. Only the lookup is implemented so far.

    Parameters
    ----------
    mutations_df_genotyped : pandas.DataFrame
        Genotyped mutation table. Currently unused.
    guide_pairs_df : pandas.DataFrame
        One row per guide pair, with the two guides in the columns 'guide1'
        and 'guide2'.

    Returns
    -------
    dict
        Maps every guide found in either column to the sorted list of unique
        guides it is paired with, regardless of which column it appeared in.
    """
    # first get unique guides based on the 2 columns in guide_pairs_df
    unique_guides = pd.concat([guide_pairs_df['guide1'], guide_pairs_df['guide2']]).unique()

    guide_dict = {}
    for guide in unique_guides:
        # partners listed in guide2 when this guide is in the first column
        partners_from_guide2 = guide_pairs_df.loc[guide_pairs_df.guide1 == guide, 'guide2'].to_list()
        # then do the same for the second column
        partners_from_guide1 = guide_pairs_df.loc[guide_pairs_df.guide2 == guide, 'guide1'].to_list()
        # .tolist() gives a plain python list; .astype(list) would only
        # produce an object-dtype numpy array
        guide_dict[guide] = np.unique(partners_from_guide2 + partners_from_guide1).tolist()

    return guide_dict
    

In [19]:
# folder = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call'
# ARF9_root = f'{folder}/ARF9_sgRNAs/test'
# ARF18_root = f'{folder}/ARF18_sgRNAs/7bp_window_noplots'
# DREB26_root = f'{folder}/DREB26_sgRNAs/7bp_window_noplots'
# NLP7_root = f'{folder}/NLP7_sgRNAs/7bp_window_noplots'
# output = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/'
# #dictionary of reference fasta file locations
# reference_folder = '../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/References'
# reference_gene_dict = {'ARF9':f'{reference_folder}/ARF9_promoter.fa','ARF18':f'{reference_folder}/ARF18_promoter.fa','DREB26':f'{reference_folder}/DREB26_promoter.fa','NLP7':f'{reference_folder}/NLP7_promoter.fa'}
# reference_fasta = f'{reference_folder}/genes_longest_region.fa'
# reference_promoter_bed = f'{reference_folder}/genes_longest_region.bed'
# #promoters bed file when 3' end is the TSS (used the bed file from promoter architecture non-overlapping_includingbidirectional_all_genes_newannotation project FIMO folder)
# all_promoters_bed = f'{reference_folder}/promoters.bed'
# #mapped motif bed file of TFBSs scanned with FIMO
# mapped_motifs_bed = '../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation_3KB/FIMO/promoters_5UTR_motifs_mapped_q0_05.bed'
# plant_IDs = f'{reference_folder}/plant_IDs.tsv'

In [20]:
# mapped motif bed file (FIMO output folder)
mapped_motifs_bed = (
    '../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation_3KB/'
    'FIMO/promoters_5UTR_motifs_mapped_q0_05.bed'
)

# folder of the demultiplexed PacBio barcode analysis that all other paths share
_analysis_root = (
    '../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/'
    'Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis'
)

# results are written next to the merged per-gene mutation tables
output_folder = _analysis_root + '/Variant_call/'

# one merged mutation table per gene, all inside output_folder
mutations_ARF9, mutations_ARF18, mutations_DREB26, mutations_NLP7 = (
    output_folder + gene_name + '_merged.tsv'
    for gene_name in ('ARF9', 'ARF18', 'DREB26', 'NLP7')
)

# table of all guide pairs
guide_pairs = _analysis_root + '/References/all_guide_pairs.csv'


In [21]:
#read in files: the merged ARF9 mutation table, the mapped motif bed file and the guide pairs table
#NOTE(review): read_in_files is defined earlier in the notebook; presumably it returns one dataframe per input path, in the same order - confirm
mutations_ARF9_df,mapped_motifs_df,guide_pairs_df =  read_in_files(mutations_ARF9, mapped_motifs_bed, guide_pairs)

In [22]:
#annotate the ARF9 mutations using the mapped motifs table
#NOTE(review): find_overlapping_TFBSs is defined earlier in the notebook; presumably it adds the *_overlapping_TFBS_family / *_overlapping_TFBS_AGI columns that genotype_plant_lines selects later - confirm
mutations_df = find_overlapping_TFBSs(mutations_ARF9_df,mapped_motifs_df,output_folder,'ARF9')

In [23]:
#call a genotype for every ARF9 row; this also writes ARF9_TFBSoverlapping_categorised.tsv into output_folder
mutations_df_genotyped = genotype_plant_lines(mutations_df,output_folder,'ARF9')

  new_ix = Index(new_ix)
  new_ix = Index(new_ix)


In [24]:
#map every guide in guide_pairs_df to the guides it is paired with; mutations_df_genotyped is passed in but not yet used by the function
categorise_guide_pairs(mutations_df_genotyped,guide_pairs_df)

AttributeError: 'list' object has no attribute 'unique'