Alleles_frequency_table.zip can be unzipped to a tab-separated text file that shows all reads and alignments to references. The first column shows the aligned sequence of the sequenced read. The second column shows the aligned sequence of the reference sequence. Gaps in each of these columns represent insertions and deletions. The next column 'Reference_Name' shows the name of the reference that the read aligned to. The fourth column, 'Read_Status' shows whether the read was modified or unmodified. The fifth through seventh columns ('n_deleted', 'n_inserted', 'n_substituted') show the number of bases deleted, inserted, and substituted as compared to the reference sequence. The eighth column shows the number of reads having that sequence, and the ninth column shows the percentage of all reads having that sequence.

In [1]:
# #features to add:
# Distance from TSS - get relative position of mutation in the guide site - done. Add distance from cut site metric. - done Then calculate distance from Araport TSS - done
# for this: first create a bed file for all of the mutations (relative to whole Arabidopsis genome). Then do bedtools merge or intersect (or bedtools coverage (../data_sorting/./TFBS_coverage.sh)) with the mapped motif bed file (all TFBSs for all genes). Record each TFBS that overlaps the mutation
# Overlapping TFBSs - subnetwork and all TFs

# Include secondary mutations in case both deletion and substitution for example - done
# Plant ID
# How many biallelic or homozygous? How many wildtype?
# More than 2 alleles for a gene - record alleles until 80% of reads accounted for
# Prioiritse homozygous or biallelic
# How many plants had mutations? How many guides produced mutations in each gene?
#check window around cut site - at the moment I am including mutations 20bp either side, maybe cut the alignments down to 7bp either side before comparing them with find_indels_substitutions()


In [1]:
#use env pybedtools
import pandas as pd
import numpy as np
import io
from pybedtools import BedTool

In [2]:
def read_in_files(mutations_file,mapped_motifs):
    """read in the files"""
    #read in mapped motifs bed file
    mapped_motifs = pd.read_table(mapped_motifs_bed, sep="\t", header=None)
    if len(mapped_motifs.columns) == 24:
        cols = [
            "chr",
            "start",
            "stop",
            "promoter_AGI",
            "dot1",
            "strand",
            "source",
            "type",
            "dot2",
            "attributes",
            "motif_chr",
            "motif_start",
            "motif_stop",
            "name_rep",
            "score",
            "motif_strand",
            "promoter_AGI2",
            "p-value",
            "q-value",
            "matched_sequence",
            "TF_name",
            "TF_family",
            "TF_AGI",
            "bp_overlap",
        ]
        mapped_motifs.columns = cols
        # filter columns
        mapped_motifs = mapped_motifs[
            [
                "motif_chr",
                "motif_start",
                "motif_stop",
                "name_rep",
                "score",
                "motif_strand",
                "promoter_AGI2",
                "p-value",
                "q-value",
                "matched_sequence",
                "TF_name",
                "TF_family",
                "TF_AGI",
            ]
        ]
        #rename columns
        cols = [
            "motif_chr",
            "motif_start",
            "motif_stop",
            "name_rep",
            "score",
            "motif_strand",
            "promoter_AGI",
            "p-value",
            "q-value",
            "matched_sequence",
            "TF_name",
            "TF_family",
            "TF_AGI",
        ]
        mapped_motifs.columns = cols

    elif len(mapped_motifs.columns) == 13:
        cols = [
            "motif_chr",
            "motif_start",
            "motif_stop",
            "name_rep",
            "score",
            "motif_strand",
            "promoter_AGI",
            "p-value",
            "q-value",
            "matched_sequence",
            "TF_name",
            "TF_family",
            "TF_AGI",
        ]
        mapped_motifs.columns = cols

    elif len(mapped_motifs.columns) == 17:
        cols = [
            "motif_chr",
            "motif_start",
            "motif_stop",
            "name_rep",
            "score",
            "motif_strand",
            "promoter_AGI",
            "p-value",
            "q-value",
            "matched_sequence",
            "TF_name",
            "TF_family",
            "TF_AGI",
            "chr_openchrom",
            "start_openchrom",
            "stop_openchrom",
            "bp_overlap",
        ]
        mapped_motifs.columns = cols

    mutations_df = pd.read_table(mutations_file,sep='\t',header=0)
    return mutations_df,mapped_motifs


In [14]:
def merge_bedfiles(bedfile, mapped_motifs_bed, output_buffer):
    """perform bedtools intersect on the two dfs"""
    df = BedTool(bedfile)
    motifs = BedTool(mapped_motifs_bed)
    # -wao =Write the original A and B entries plus the number of base pairs of overlap between the two features.
    # However, A features w/o overlap are also reported with a NULL B feature and overlap = 0
    intersect = df.intersect(motifs, wao=True)
    # Write to output_file
    # Each line in the file contains bed entry a and bed entry b that it overlaps plus the number of bp in the overlap so 19 columns
    output_buffer.write(str(intersect))
    #go back to beginning of buffer
    output_buffer.seek(0)
    mapped_motifs_bed.seek(0)
    return output_buffer


In [20]:
def find_overlapping_TFBSs(mutations_df,mapped_motifs_df):
    """function to find any overlapping TFBSs from FIMO mapped motif file"""
   

    #for each guide containing mutations, create a temporary bed file containing each mutation and then do bedtools intersect to find which overlap TFBs
    #then add the TFBS names into a new column for that row

    #get column names from mapped_motifs_df
    mapped_motifs_bed_columns = list(mapped_motifs_df.columns)


    #turn mapped_motifs_df into a buffer
    mapped_motifs_bed = io.StringIO()
    mapped_motifs_df.to_csv(mapped_motifs_bed, sep="\t", index=False, header=None)
    #go back to start of buffer
    mapped_motifs_bed.seek(0)

    


    #first make certain columns string
    #make columns containing lists string for now so can use groupby
    to_string = ['insertion_positions','deletion_positions','substitution_positions','insertion_cut_site_distance','deletion_cut_site_distance','substitution_cut_site_distance','insertion_positions_relative_to_TSS','insertion_genomic_positions','deletion_positions_relative_to_TSS','deletion_genomic_positions','substitution_positions_relative_to_TSS','substitution_genomic_positions']
    mutations_df[to_string] = mutations_df[to_string].astype(str)
    for index,row in mutations_df.iterrows():
            
        if row.mutation_type == 'None':
            pass
        else:
            #create temporary df in bed format
            
            cols = ["chr",
            "start",
            "stop",
            "mutation_type",'mutation_count']
            temp_df = pd.DataFrame(columns=cols)
            chr = row.chr
            #if not NaN
            if row.insertion_genomic_positions != 'nan':
                #print("Index:", index)
                #print(row.insertion_genomic_positions)
                #convert genomic positions from string to list
                # Convert string to list if more than one
                insertion_genomic_positions = row.insertion_genomic_positions.strip('][').split(', ')
                #count which mutation number currently on to be added to the temporary bed file
                count = 0
                for gen_pos in insertion_genomic_positions:
                    count += 1
                    #get index
                    # if len(insertion_genomic_positions) > 1:
                    #     index = insertion_genomic_positions.index(gen_pos)
                    # else:
                    #     index = 'nan'
                    start = int(gen_pos)
                    stop = start + 1
                    mutation_type = "insertion"
                    #add to temp_df
                    temp_list = [chr,start,stop,mutation_type,count]
                    temp_df.loc[len(temp_df)] = temp_list
            if row.deletion_genomic_positions != "nan":
                # Convert string to list
                deletion_genomic_positions = row.deletion_genomic_positions.strip('][').split(', ')
                #count which mutation number currently on to be added to the temporary bed file
                count = 0
                for gen_pos in deletion_genomic_positions:
                    count += 1
                    #get index
                    # if len(deletion_genomic_positions) > 1:
                    #     index = deletion_genomic_positions.index(gen_pos)
                    # else:
                    #     index = 'nan'
                    start = int(gen_pos)
                    stop = start + 1
                    mutation_type = "deletion"
                    #add to temp_df
                    temp_list = [chr,start,stop,mutation_type,count]
                    temp_df.loc[len(temp_df)] = temp_list
            if row.substitution_genomic_positions != "nan":
                # Convert string to list
                substitution_genomic_positions = row.substitution_genomic_positions.strip('][').split(', ')
                #count which mutation number currently on to be added to the temporary bed file
                count = 0
                for gen_pos in substitution_genomic_positions:
                    count += 1
                    #get index
                    # if len(substitution_genomic_positions) > 1:
                    #     index = substitution_genomic_positions.index(gen_pos)
                    # else:
                    #     index = 'nan'
                    start = int(gen_pos)
                    stop = start + 1
                    mutation_type = "substitution"
                    #add to temp_df
                    temp_list = [chr,start,stop,mutation_type,count]
                    temp_df.loc[len(temp_df)] = temp_list
            #now do bedtools intersect to find which TFBSs overlap with which mutations
            #sort by chr then start
            temp_df = temp_df.sort_values(["chr", "start"]).reset_index(drop=True)
            # write to buffer
            temp_df_buffer = io.StringIO()
            temp_df.to_csv(temp_df_buffer, sep="\t", index=False, header=None)
            temp_df_buffer.seek(0)
            
            output_buffer = io.StringIO()
        
            output_buffer = merge_bedfiles(temp_df_buffer, mapped_motifs_bed, output_buffer)
            #read in output buffer as df
            output_df = pd.read_table(output_buffer, sep='\t')
            #get column names and rename columns
            output_df_cols = cols+mapped_motifs_bed_columns+['bp_overlap']
            output_df.columns = output_df_cols
            #for each mutation type get list of overlapping TFBSs. Add theses to a dictionary column in the mutations_df            
            print(output_df)      



In [16]:
# folder = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call'
# ARF9_root = f'{folder}/ARF9_sgRNAs/test'
# ARF18_root = f'{folder}/ARF18_sgRNAs/7bp_window_noplots'
# DREB26_root = f'{folder}/DREB26_sgRNAs/7bp_window_noplots'
# NLP7_root = f'{folder}/NLP7_sgRNAs/7bp_window_noplots'
# output = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/'
# #dictionary of reference fasta file locations
# reference_folder = '../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/References'
# reference_gene_dict = {'ARF9':f'{reference_folder}/ARF9_promoter.fa','ARF18':f'{reference_folder}/ARF18_promoter.fa','DREB26':f'{reference_folder}/DREB26_promoter.fa','NLP7':f'{reference_folder}/NLP7_promoter.fa'}
# reference_fasta = f'{reference_folder}/genes_longest_region.fa'
# reference_promoter_bed = f'{reference_folder}/genes_longest_region.bed'
# #promoters bed file when 3' end is the TSS (used the bed file from promoter architecture non-overlapping_includingbidirectional_all_genes_newannotation project FIMO folder)
# all_promoters_bed = f'{reference_folder}/promoters.bed'
# #mapped motif bed file of TFBSs scanned with FIMO
# mapped_motifs_bed = '../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation_3KB/FIMO/promoters_5UTR_motifs_mapped_q0_05.bed'
# plant_IDs = f'{reference_folder}/plant_IDs.tsv'

In [17]:
mapped_motifs_bed = '../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation_3KB/FIMO/promoters_5UTR_motifs_mapped_q0_05.bed'
mutations_ARF9 = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/ARF9_merged.tsv'
mutations_ARF18 = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/ARF18_merged.tsv'
mutations_DREB26 = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/DREB26_merged.tsv'
mutations_NLP7 = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/NLP7_merged.tsv'


In [18]:
#read in files
mutations_ARF9_df,mapped_motifs_df =  read_in_files(mutations_ARF9, mapped_motifs_bed)

In [21]:
find_overlapping_TFBSs(mutations_ARF9_df,mapped_motifs_df)

    chr     start      stop mutation_type  mutation_count  motif_chr  \
0     4  12451173  12451174     insertion               1          4   
1     4  12451173  12451174     insertion               1          4   
2     4  12451173  12451174     insertion               1          4   
3     4  12451174  12451175     insertion               2          4   
4     4  12451174  12451175     insertion               2          4   
5     4  12451174  12451175     insertion               2          4   
6     4  12451174  12451175     insertion               2          4   
7     4  12451187  12451188      deletion               1          4   
8     4  12451188  12451189      deletion               2          4   
9     4  12451189  12451190      deletion               3          4   
10    4  12451196  12451197  substitution               1          4   

    motif_start  motif_stop                  name_rep     score motif_strand  \
0      12451149    12451179     BBRBPC_tnt.BPC5_col_a -