Alleles_frequency_table.zip can be unzipped to a tab-separated text file that shows all reads and alignments to references. The first column shows the aligned sequence of the sequenced read. The second column shows the aligned sequence of the reference sequence. Gaps in each of these columns represent insertions and deletions. The next column 'Reference_Name' shows the name of the reference that the read aligned to. The fourth column, 'Read_Status' shows whether the read was modified or unmodified. The fifth through seventh columns ('n_deleted', 'n_inserted', 'n_substituted') show the number of bases deleted, inserted, and substituted as compared to the reference sequence. The eighth column shows the number of reads having that sequence, and the ninth column shows the percentage of all reads having that sequence.

In [1]:
# #features to add:
# Distance from TSS - get relative position of mutation in the guide site - done. Add distance from cut site metric. - done Then calculate distance from Araport TSS - done
# for this: first create a bed file for all of the mutations (relative to whole Arabidopsis genome). Then do bedtools merge or intersect (or bedtools coverage (../data_sorting/./TFBS_coverage.sh)) with the mapped motif bed file (all TFBSs for all genes). Record each TFBS that overlaps the mutation
# Overlapping TFBSs - subnetwork and all TFs

# Include secondary mutations in case both deletion and substitution for example - done
# Plant ID
# How many biallelic or homozygous? How many wildtype?
# More than 2 alleles for a gene - record alleles until 80% of reads accounted for
# Prioritise homozygous or biallelic
# How many plants had mutations? How many guides produced mutations in each gene?
#check window around cut site - at the moment I am including mutations 20bp either side, maybe cut the alignments down to 7bp either side before comparing them with find_indels_substitutions()


In [9]:
#use env pacbio_post_analysis
import pandas as pd
import numpy as np
import fnmatch
import os
import re 
from pyfaidx import Fasta

In [3]:
# #code from https://github.com/pinellolab/CRISPResso2/blob/master/CRISPResso2/CRISPRessoCOREResources.pyx
# #import cython

# # cimport numpy as np
# import re

# # cdef extern from "stdlib.h":
# #     ctypedef unsigned int size_t
# #     size_t strlen(char* s)


# # cdef extern from "Python.h":
# #     ctypedef void PyObject
# #     int _PyBytes_Resize(PyObject **, size_t)
# #     char * PyBytes_AS_STRING(PyObject *)





In [108]:
#code from https://github.com/pinellolab/CRISPResso2/blob/master/CRISPResso2/CRISPRessoCOREResources.pyx
#I converted it to pure python
def find_indels_substitutions(read_seq_al, ref_seq_al, _include_indx):
    """Locate insertions, deletions and substitutions in a pairwise alignment.

    Pure-Python port of the CRISPResso2 function of the same name. Both
    sequences are walked column by column; every reported position is a
    0-based index into the *ungapped* reference.

    This port differs from the Cython original in three deletion edge cases:
      * a deletion that starts in alignment column 1 now starts at its real
        reference position instead of being forced to 0;
      * a deletion that runs to the end of the alignment now includes its
        final base (the end coordinate is exclusive);
      * a deletion immediately followed by an insertion is no longer dropped.

    Args:
        read_seq_al: aligned read sequence; '-' marks a base missing from the
            read (deletion).
        ref_seq_al: aligned reference sequence; '-' marks a base missing from
            the reference (insertion). Indexed in lockstep with read_seq_al.
        _include_indx: iterable of reference positions. The keys without the
            'all_' prefix only report events touching these positions.

    Returns:
        dict with the keys
        'all_insertion_positions', 'all_insertion_left_positions',
        'insertion_positions', 'insertion_coordinates', 'insertion_sizes',
        'insertion_n', 'all_deletion_positions', 'deletion_positions',
        'deletion_coordinates', 'deletion_sizes', 'deletion_n',
        'all_substitution_positions', 'substitution_positions',
        'all_substitution_values', 'substitution_values', 'substitution_n',
        'ref_positions'.
        The three '*_n' counts are the lengths of the matching 'all_*' lists
        (not restricted to _include_indx). Each insertion contributes its two
        flanking reference positions, so 'insertion_n' is twice the number of
        insertion events rather than the number of inserted bases.
    """
    #build the window once; a set gives O(1) lookups and also works when
    #_include_indx is a one-shot iterator
    include_indx_set = set(_include_indx)
    seq_len = len(ref_seq_al)

    #ref_positions holds the indices for which positions map back to the original reference
    # for example,
    #     1 2 3 4 5 6 7 8
    # ref A A T T G G C C
    #
    # and for a given alignment
    # ref A A T T - G G C C
    # aln A - T T T G G C C
    #     1 2 3 4-4 5 6 7 8 <ref positions. Note that the negative values/indices represent places that don't map back to the original reference
    ref_positions = []

    all_substitution_positions = []
    substitution_positions = []
    all_substitution_values = []
    substitution_values = []

    all_deletion_positions = []
    deletion_positions = []
    deletion_coordinates = []
    deletion_sizes = []
    start_deletion = -1  # the -1 value indicates that there currently isn't a deletion

    all_insertion_positions = []
    all_insertion_left_positions = []
    insertion_positions = []
    insertion_coordinates = []
    insertion_sizes = []
    start_insertion = -1  # the -1 value indicates that there currently isn't an insertion

    idx = 0  # next position in the ungapped reference
    current_insertion_size = 0
    for idx_c, c in enumerate(ref_seq_al):
        read_base = read_seq_al[idx_c]
        if c != '-':
            ref_positions.append(idx)
            #gaps and N in the read are never counted as substitutions
            if c != read_base and read_base != '-' and read_base != 'N':
                all_substitution_positions.append(idx)
                all_substitution_values.append(read_base)
                if idx in include_indx_set:
                    substitution_positions.append(idx)
                    substitution_values.append(read_base)
            if start_insertion != -1:  # this is the end of an insertion
                all_insertion_left_positions.append(start_insertion)
                all_insertion_positions.append(start_insertion)
                all_insertion_positions.append(idx)
                #an insertion is only windowed when both flanking bases are in the window
                if start_insertion in include_indx_set and idx in include_indx_set:
                    insertion_coordinates.append((start_insertion, idx))
                    insertion_positions.append(start_insertion)
                    insertion_positions.append(idx)
                    insertion_sizes.append(current_insertion_size)
                start_insertion = -1
            current_insertion_size = 0
            idx += 1
        else:  # the current ref position is -
            if idx == 0:
                ref_positions.append(-1)
            else:
                ref_positions.append(-idx)
            if idx > 0 and start_insertion == -1:  # this is the first index of an insertion
                start_insertion = idx - 1
            current_insertion_size += 1

        if read_base == '-' and start_deletion == -1:  # this is the first part of a deletion
            #column 0 always maps to reference position 0; any later column
            #maps to the reference position recorded for it
            start_deletion = ref_positions[idx_c] if idx_c > 0 else 0
        elif read_base != '-' and start_deletion != -1:  # this is the end of a deletion
            #abs(): an insertion column stores -idx, whose magnitude is exactly
            #the (exclusive) end of the deletion that precedes it
            end_deletion = abs(ref_positions[idx_c])
            all_deletion_positions.extend(range(start_deletion, end_deletion))
            if include_indx_set.intersection(range(start_deletion, end_deletion)):
                deletion_positions.extend(range(start_deletion, end_deletion))
                deletion_coordinates.append((start_deletion, end_deletion))
                deletion_sizes.append(end_deletion - start_deletion)
            start_deletion = -1

    if start_deletion != -1:
        #the deletion runs to the end of the alignment: the last column is itself
        #deleted, so the exclusive end is one past its reference position
        end_deletion = ref_positions[seq_len - 1] + 1
        all_deletion_positions.extend(range(start_deletion, end_deletion))
        if include_indx_set.intersection(range(start_deletion, end_deletion)):
            deletion_positions.extend(range(start_deletion, end_deletion))
            deletion_coordinates.append((start_deletion, end_deletion))
            deletion_sizes.append(end_deletion - start_deletion)

    #counts cover the whole alignment, not just the include window
    substitution_n = len(all_substitution_positions)
    deletion_n = len(all_deletion_positions)
    insertion_n = len(all_insertion_positions)

    return {
        'all_insertion_positions': all_insertion_positions,
        'all_insertion_left_positions': all_insertion_left_positions,
        'insertion_positions': insertion_positions,
        'insertion_coordinates': insertion_coordinates,
        'insertion_sizes': insertion_sizes,
        'insertion_n': insertion_n,

        'all_deletion_positions': all_deletion_positions,
        'deletion_positions': deletion_positions,
        'deletion_coordinates': deletion_coordinates,
        'deletion_sizes': deletion_sizes,
        'deletion_n': deletion_n,

        'all_substitution_positions': all_substitution_positions,
        'substitution_positions': substitution_positions,
        'all_substitution_values': np.array(all_substitution_values),
        'substitution_values': np.array(substitution_values),
        'substitution_n': substitution_n,

        'ref_positions': ref_positions,
    }

In [137]:
def find_guide_position_in_gene(gene,cut_site_reference_seq, guide_position):
    """Translate an offset inside a guide-site window into a promoter offset.

    The FASTA file registered for `gene` in the module-level
    `reference_gene_dict` is opened, the record named '<gene>_promoter' is
    converted to a string, and the first occurrence of
    `cut_site_reference_seq` inside it is located. `guide_position` is then
    added to that start offset.

    Raises ValueError (from str.index) when the window sequence does not
    occur in the promoter sequence.
    """
    #look up the promoter record for this gene in its reference fasta
    promoter_record = Fasta(reference_gene_dict[gene])[f'{gene}_promoter']
    #offset of the window within the promoter sequence
    window_offset = str(promoter_record).index(cut_site_reference_seq)
    return window_offset + guide_position

In [110]:
def get_reference_promoter_genomic_positions(reference_promoter_bed,all_promoters_bed):
    """Collect the genomic coordinates and the TSS position of each reference promoter.

    Args:
        reference_promoter_bed: path (or file-like object) of a headerless,
            tab-separated 6-column file: chr, start, stop, promoter_name,
            score, strand. Rows are matched on promoter_name == '<gene>_promoter'.
        all_promoters_bed: path (or file-like object) of a headerless,
            tab-separated 10-column file: chr, start, stop, AGI, dot, strand,
            source, type, dot2, attributes. The 'stop' value of the row whose
            AGI matches is taken as the TSS position.

    Returns:
        dict mapping gene name (DREB26, NLP7, ARF18, ARF9) to the list
        [AGI, promoter chromosome, promoter start, promoter stop, TSS position],
        with the four numeric entries converted to int.

    Raises:
        ValueError: if a gene does not match exactly one row in either file.
    """
    #read in reference_promoter_bed
    reference_promoter_df = pd.read_table(
        reference_promoter_bed, sep="\t", header=None
    )
    reference_promoter_df.columns = [
        "chr",
        "start",
        "stop",
        "promoter_name",
        "score",
        "strand",
    ]

    #read in all_promoters_bed
    all_promoters_df = pd.read_table(
        all_promoters_bed, sep="\t", header=None
    )
    all_promoters_df.columns = [
        "chr",
        "start",
        "stop",
        "AGI",
        "dot",
        "strand",
        "source",
        "type",
        "dot2",
        "attributes",
    ]

    def _single_row(df, column, value):
        """Return the only row of df where df[column] == value."""
        matches = df[df[column] == value]
        if len(matches) != 1:
            raise ValueError(
                f"expected exactly one row with {column} == {value!r}, found {len(matches)}"
            )
        #.iloc[0] avoids the deprecated int(<one-element Series>) conversion
        return matches.iloc[0]

    #AGIs dictionary of the four genes
    AGI_dict = {'DREB26':'AT1G21910','NLP7':'AT4G24020','ARF18':'AT3G61830','ARF9':'AT4G23980'}
    reference_genomic_positions = {}
    for gene, AGI in AGI_dict.items():
        #filter each table once per gene instead of once per column
        promoter_row = _single_row(reference_promoter_df, "promoter_name", f'{gene}_promoter')
        TSS_row = _single_row(all_promoters_df, "AGI", AGI)
        reference_genomic_positions[gene] = [
            AGI,
            int(promoter_row["chr"]),
            int(promoter_row["start"]),
            int(promoter_row["stop"]),
            int(TSS_row["stop"]),  # TSS position
        ]
    return reference_genomic_positions


In [139]:
def find_genomic_position(reference_genomic_positions,gene,promoter_position):
    """Convert a promoter-relative position into genomic and TSS-relative coordinates.

    `reference_genomic_positions[gene]` is expected in the layout
    [AGI, chromosome, promoter start, promoter stop, TSS position].

    Returns a tuple (position relative to the TSS, genomic position, chromosome).
    """
    gene_record = reference_genomic_positions[gene]
    chromosome = gene_record[1]
    #shift the promoter-relative offset by the promoter's genomic start
    genomic_position = promoter_position + gene_record[2]
    #negative values lie before the TSS position, positive values after it
    offset_from_TSS = genomic_position - gene_record[4]
    return offset_from_TSS, genomic_position, chromosome

In [None]:
def find_overlapping_TFBSs(mapped_motifs_bed):
    """function to find any overlapping TFBSs from FIMO mapped motif file

    Not implemented yet: the body consists solely of this docstring, so every
    call returns None and `mapped_motifs_bed` is never read.
    """

In [141]:
#sequencing library numbers (parsed from the directory name) -> plate number used in the plate name
_LIBRARY_TO_PLATE_NUMBER = {1017: 1, 1018: 2, 1019: 3, 1020: 4, 1021: 5, 1022: 6}


def _parse_sample_metadata(filename, subdir, gene):
    """Extract plate name, library, primer pairs and guide name from a table's path."""
    #split filename on the first dot: the part before it names the second PCR reaction primers
    partitioned_string = filename.partition('.')
    second_reaction_primers = partitioned_string[0]
    #guide name: from the gene name up to the next dot
    guide = re.search(f"[_]+({gene}(.*))", partitioned_string[2].partition('.')[0])[1]
    #first reaction PCR primers and library number come from the directory name
    library_primers = re.search(r"_bc(.*)", subdir)[0]
    both_primers = re.findall(r"SW(\d*)", library_primers)
    #only build the primer label when exactly two primers were found
    if len(both_primers) == 2:
        first_reaction_primers = f'SW{both_primers[0]}_SW{both_primers[1]}'
    else:
        first_reaction_primers = 'NA'
    library = re.search(r"[^_bc]+(\d*)", library_primers)[0]
    #convert library number to plate number (from 1017 to 1, 1018 to 2 etc);
    #fail loudly on an unknown library instead of reusing a stale plate number
    try:
        new_library = _LIBRARY_TO_PLATE_NUMBER[int(library)]
    except KeyError:
        raise ValueError(
            f"unknown library number {library!r} parsed from {subdir!r}"
        ) from None
    platename = f'p{new_library}{gene}'
    return platename, library, first_reaction_primers, second_reaction_primers, guide


def _summarise_mutations(indels):
    """Return (mutation_type, insertion, deletion, substitution positions) from an indels dict."""
    present = []
    positions = []
    for label in ('insertion', 'deletion', 'substitution'):
        if indels[f'{label}_n'] != 0:
            present.append(label)
            positions.append(indels[f'all_{label}_positions'])
        else:
            positions.append('NA')
    #e.g. 'insertion+deletion'; 'None' when the alignment differs without any counted event
    mutation_type = '+'.join(present) if present else 'None'
    return mutation_type, positions[0], positions[1], positions[2]


def _locate_mutations(positions, cut_site_index, window_start, reference_genomic_positions, gene):
    """Return (cut site distances, TSS-relative positions, genomic positions) for a position list."""
    if isinstance(positions, str):  # 'NA' placeholder: nothing to locate
        return 'NA', 'NA', 'NA'
    cut_site_distance = [i - cut_site_index for i in positions]
    positions_relative_to_TSS = []
    genomic_positions = []
    for i in positions:
        position_relative_to_TSS, genomic_position, _ = find_genomic_position(
            reference_genomic_positions, gene, window_start + i
        )
        positions_relative_to_TSS.append(position_relative_to_TSS)
        genomic_positions.append(genomic_position)
    return cut_site_distance, positions_relative_to_TSS, genomic_positions


def check_guide(root_dir,output,gene,reference_fasta,mapped_motifs_bed,reference_genomic_positions):
    """Summarise the mutations found in every Alleles_frequency_table for one gene.

    Walks `root_dir` for files matching
    '*Alleles_frequency_table_around_<gene>_*.txt', keeps the rows with at
    least 30 reads, classifies each aligned read against its reference with
    find_indels_substitutions() and writes one row per read to
    '<output><gene>.tsv' (tab-separated, with header).

    Args:
        root_dir: directory searched recursively for allele tables.
        output: prefix of the output path; '<gene>.tsv' is appended directly.
        gene: gene name used in the file pattern, guide name, plate name and
            for the promoter lookups.
        reference_fasta: accepted for interface compatibility; not used.
        mapped_motifs_bed: accepted for interface compatibility; not used.
        reference_genomic_positions: dict passed on to find_genomic_position().

    Raises:
        TypeError: if a file or directory name does not match the expected
            naming pattern (the regex search returns None).
        ValueError: if the library number is not one of 1017-1022, or if a
            reference window is not found in the promoter sequence.
    """
    cols = [
        'platename', 'library', 'first_reaction_primers', 'second_reaction_primers', 'guide',
        'aligned_sequence', 'reference_sequence', 'mutation_type', 'read_number', 'read_percentage',
        'insertion_positions', 'deletion_positions', 'substitution_positions',
        'insertion_cut_site_distance', 'deletion_cut_site_distance', 'substitution_cut_site_distance',
        'cut_site_promoter_position',
        'insertion_positions_relative_to_TSS', 'insertion_genomic_positions',
        'deletion_positions_relative_to_TSS', 'deletion_genomic_positions',
        'substitution_positions_relative_to_TSS', 'substitution_genomic_positions',
    ]
    #rows are collected in a list and turned into a DataFrame once at the end
    rows = []
    #promoter offset of each distinct reference window, so the fasta is read once per window
    window_starts = {}
    for subdir, _dirs, files in os.walk(root_dir):
        for filename in fnmatch.filter(files, f"*Alleles_frequency_table_around_{gene}_*.txt"):
            #read in as df
            df = pd.read_table(os.path.join(subdir, filename), sep="\t", header=0)
            #filter out rows with reads less than 30
            filtered_reads = df[df['#Reads'] >= 30]
            if filtered_reads.empty:
                continue
            #the path-derived labels are the same for every row of this file
            (platename, library, first_reaction_primers,
             second_reaction_primers, guide) = _parse_sample_metadata(filename, subdir, gene)

            for _, row in filtered_reads.iterrows():
                Aligned_Sequence = row.Aligned_Sequence
                Reference_Sequence = row.Reference_Sequence
                read_number = row['#Reads']
                read_percentage = row['%Reads']

                if Aligned_Sequence == Reference_Sequence:
                    mutation_type = 'None'
                    insertion_positions = 'NA'
                    deletion_positions = 'NA'
                    substitution_positions = 'NA'
                else:
                    #every reference position is part of the include window
                    indels = find_indels_substitutions(
                        Aligned_Sequence, Reference_Sequence, range(len(Reference_Sequence))
                    )
                    (mutation_type, insertion_positions,
                     deletion_positions, substitution_positions) = _summarise_mutations(indels)

                #remove dashes to get the reference window as it appears in the promoter
                Reference_Sequence_no_dashes = Reference_Sequence.replace('-', '')
                #the cut site is taken to be the middle of the reference window
                cut_site_index = len(Reference_Sequence_no_dashes) // 2
                if Reference_Sequence_no_dashes not in window_starts:
                    window_starts[Reference_Sequence_no_dashes] = find_guide_position_in_gene(
                        gene, Reference_Sequence_no_dashes, 0
                    )
                window_start = window_starts[Reference_Sequence_no_dashes]
                #get cut site position in whole promoter
                cut_site_promoter_position = window_start + cut_site_index

                (insertion_cut_site_distance, insertion_positions_relative_to_TSS,
                 insertion_genomic_positions) = _locate_mutations(
                    insertion_positions, cut_site_index, window_start, reference_genomic_positions, gene)
                (deletion_cut_site_distance, deletion_positions_relative_to_TSS,
                 deletion_genomic_positions) = _locate_mutations(
                    deletion_positions, cut_site_index, window_start, reference_genomic_positions, gene)
                (substitution_cut_site_distance, substitution_positions_relative_to_TSS,
                 substitution_genomic_positions) = _locate_mutations(
                    substitution_positions, cut_site_index, window_start, reference_genomic_positions, gene)

                #same order as cols
                rows.append([
                    platename, library, first_reaction_primers, second_reaction_primers, guide,
                    Aligned_Sequence, Reference_Sequence, mutation_type, read_number, read_percentage,
                    insertion_positions, deletion_positions, substitution_positions,
                    insertion_cut_site_distance, deletion_cut_site_distance, substitution_cut_site_distance,
                    cut_site_promoter_position,
                    insertion_positions_relative_to_TSS, insertion_genomic_positions,
                    deletion_positions_relative_to_TSS, deletion_genomic_positions,
                    substitution_positions_relative_to_TSS, substitution_genomic_positions,
                ])

    #write out the output_df
    output_df = pd.DataFrame(rows, columns=cols)
    output_df.to_csv(f'{output}{gene}.tsv', sep="\t", index=False, header=True)
    


                    
                #print(filtered_reads)
                

In [128]:
#root of the demultiplexed PacBio data package; every input/output path below is derived from it
data_package = '../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis'
folder = f'{data_package}/Variant_call'
#NOTE(review): ARF9 points at a 'test' directory while the other genes use '7bp_window_noplots' - confirm this is intended
ARF9_root = f'{folder}/ARF9_sgRNAs/test'
ARF18_root = f'{folder}/ARF18_sgRNAs/7bp_window_noplots'
DREB26_root = f'{folder}/DREB26_sgRNAs/7bp_window_noplots'
NLP7_root = f'{folder}/NLP7_sgRNAs/7bp_window_noplots'
#output prefix: check_guide() appends '<gene>.tsv' directly, hence the trailing slash
output = f'{folder}/'
#dictionary of reference fasta file locations
reference_folder = f'{data_package}/References'
reference_gene_dict = {
    'ARF9': f'{reference_folder}/ARF9_promoter.fa',
    'ARF18': f'{reference_folder}/ARF18_promoter.fa',
    'DREB26': f'{reference_folder}/DREB26_promoter.fa',
    'NLP7': f'{reference_folder}/NLP7_promoter.fa',
}
#defined because the check_guide() call below passes reference_fasta; leaving it
#commented out raises NameError on a fresh kernel
reference_fasta = f'{reference_folder}/genes_longest_region.fa'
reference_promoter_bed = f'{reference_folder}/genes_longest_region.bed'
#promoters bed file when 3' end is the TSS (used the bed file from promoter architecture non-overlapping_includingbidirectional_all_genes_newannotation project FIMO folder)
all_promoters_bed = f'{reference_folder}/promoters.bed'
#mapped motif bed file of TFBSs scanned with FIMO
mapped_motifs_bed = '../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation_3KB/FIMO/promoters_5UTR_motifs_mapped_q0_05.bed'

In [129]:
#get reference promoter genomic positions along with TSS position
#result is a dict keyed by gene name (DREB26, NLP7, ARF18, ARF9), each value
#in format [AGI,promoter_genomic_pos_chromosome,promoter_genomic_pos_start,promoter_genomic_pos_stop,TSS_pos]
reference_genomic_positions = get_reference_promoter_genomic_positions(reference_promoter_bed,all_promoters_bed)

In [140]:
#summarise the ARF9 allele tables found under ARF9_root; the result is written to f'{output}ARF9.tsv'
#reference_fasta and mapped_motifs_bed are passed through but check_guide() does not read them
check_guide(ARF9_root,output,'ARF9',reference_fasta,mapped_motifs_bed,reference_genomic_positions)

{'all_insertion_positions': [19, 20], 'all_insertion_left_positions': [19], 'insertion_positions': [], 'insertion_coordinates': [], 'insertion_sizes': [], 'insertion_n': 2, 'all_deletion_positions': [], 'deletion_positions': [], 'deletion_coordinates': [], 'deletion_sizes': [], 'deletion_n': 0, 'all_substitution_positions': [], 'substitution_positions': [], 'all_substitution_values': array([], dtype=float64), 'substitution_values': array([], dtype=float64), 'substitution_n': 0, 'ref_positions': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, -20, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]}
{'all_insertion_positions': [19, 20], 'all_insertion_left_positions': [19], 'insertion_positions': [], 'insertion_coordinates': [], 'insertion_sizes': [], 'insertion_n': 2, 'all_deletion_positions': [], 'deletion_positions': [], 'deletion_coordinates': [], 'deletion_sizes': [], 'deletion_n': 0, 'all_substitution_positions': [], 'substitution_pos