## PEGG 2.0

- Updates to PEGG:
    - more flexible data input (following PRIDICT/PrimeDesign format)
    - more flexible PAM sequence searching
    - inclusion of base editing sensor module
    - fixing INS/DEL error
    - G+19 instead of G+20
    - Better fetching of MIT specificity and Rule Set 2/3 information...
    - Automated generation of WT/edited sequence generation
        - Use this to determine errors in pegRNAs...
    - Improved distance to nick and homology overhang sizing
    - Also automate finding of silent variants for MMR-evasion...



In [1]:
import numpy as np
import regex as re
import pandas as pd
import matplotlib.pyplot as plt
import Bio.Seq
import warnings
warnings.filterwarnings('ignore')

## improved PAM finder for various PAM sequences

In [64]:
substitution_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GGC/TTG)AGAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'
ins_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGGGA(/GTAA)GAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'
del_eaxmple = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGGGAG(AGAC/)CCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'

In [76]:

#introduce error messages for each of these

start = substitution_example.find("(")

end = substitution_example.find(")")

replace_seq = substitution_example[start:end+1]
#if '/' in replace_seq:
    #if '+' in replace_seq:
        #throw an error

loc_replace = replace_seq.find('/')
wt_replace = replace_seq[1:loc_replace]
mut_replace = replace_seq[loc_replace+1:-1]

wt_seq = substitution_example[:start] + wt_replace + substitution_example[end+1:]

wt_start = start
wt_end = start+len(wt_replace)


In [109]:
replace_seq = '(/AGGG)'
loc_replace = replace_seq.find('/')
wt_replace = replace_seq[1:loc_replace]
mut_replace = replace_seq[loc_replace+1:-1]
wt_replace

''

In [85]:

context_size = 50

wt_w_context = wt_seq[wt_start-context_size:wt_start] + wt_seq[wt_start:wt_end] + wt_seq[wt_end:wt_end+context_size]



103

In [113]:
def mut_formatter(pridict_format, context_size = 60):
    """ 
    Takes in mutations in format of AAA(AA/GC)ATAGC
    and converts it into a format that allows pegRNAs to be generated

    Parameters
    _____
    pridict_format = list of sequences in e.g. AAA(AA/GC)ATAGC format
    context_size = amount of nt on either side of mutation to select
    """

    original_start = []
    original_end = []

    wt = []
    mut = []
    
    wt_context = []
    mut_context = []

    left_context_length = []
    right_context_length = []

    replace_start = []
    replace_end = []

    for k in pridict_format:
        #find mutation
        start = k.find("(")
        end = k.find(")")

        #get out the mutant and WT allele
        replace_seq = k[start:end+1]
        loc_replace = replace_seq.find('/')
        wt_replace = replace_seq[1:loc_replace]
        mut_replace = replace_seq[loc_replace+1:-1]

        wt.append(wt_replace)
        mut.append(mut_replace)

        #generate full WT sequence
        wt_seq = substitution_example[:start] + wt_replace + substitution_example[end+1:]
        wt_start = start
        wt_end = start+len(wt_replace)
        original_start.append(wt_start)
        original_end.append(wt_end)

        #and just the subset
        left_context = wt_seq[wt_start-context_size:wt_start]
        print(left_context)
        right_context = wt_seq[wt_end:wt_end+context_size]

        wt_w_context = left_context + wt_replace + right_context
        mut_w_context = left_context + mut_replace + right_context

        wt_context.append(wt_w_context)
        mut_context.append(mut_w_context)

        left_context_length.append(len(left_context))
        right_context_length.append(len(right_context))

        replace_start.append(len(left_context))
        replace_end.append(len(left_context)+len(wt_replace))

    col_labels = ['Original_start', 'Original_end', 'WT', 'Mutant', 'WT_context', 'Mutant_context', 'Replace_start', 'Replace_end', 'Left_context_length', 'Right_context_length']
    cols = [original_start, original_end, wt, mut, wt_context, mut_context, replace_start, replace_end, left_context_length, right_context_length]

    df = pd.DataFrame(dict(zip(col_labels, cols)))

    return df

        





In [114]:
substitution_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GGC/TTG)AGAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'
ins_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGGGA(/GTAA)GAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'
del_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGGGAG(AGAC/)CCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'

pridict_format = [substitution_example, ins_example, del_example]

df = mut_formatter(pridict_format)

GCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG
GTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(G
TTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GG


In [112]:
print(df.iloc[2]['WT_context'])
print(df.iloc[2]['Mutant_context'])

TTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GGAGACGAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAAC
TTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GGGAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAAC


In [None]:
def PAM_finder(WT_seq, mut_start, mut_end,  PAM, RTT_length_max):
    """Identifies the location of PAM sequences on the + and - strand.
    Returns a 2-d array containing marked locations of PAM sequence start locations on + and - strand.
    
    Parameters
    ----------
    WT_seq: WT sequence...

    PAM_seq: PAM sequence to search for...

    RTT_length_max: maximum size of RTT length for searching...
    
    """

    distance_PAM_to_nick = 3
    PAM_size = len(PAM)
    mut_size = mut_start-mut_end

    search_size = RTT_length_max - distance_PAM_to_nick

    #deal with extra PAM sequences later on...
    #and other potential issues
    plus_search = WT_seq[mut_end-search_size-5 : mut_start+distance_PAM_to_nick + PAM_size+5]


    #---------------Loading in sequences for PAM Searching------------------#
    
    search_size = RTT_length - len(PAM)-1 #need to modify this for insertion/deletions...
    #size mut doesn't capture the size of insertions, only the size of deletions

    plus_search = seq1[seq_start-1-search_size : seq_end+search_size].upper()
    minus_search = plus_search.complement().upper()
    
    mut_start_idx = 1+search_size
    mut_end_idx = 1+search_size+size_mut #not accurate for insertions; does work for indexing though...

    plus_search1 = plus_search[:mut_start_idx+3+len(PAM)-1]
    minus_search1 = minus_search[mut_start_idx-3-len(PAM):]

    #---------------PAM Searching------------------#

    #replacing N with regex symbol
    PAM_regex = PAM.replace('N', '/*.')

    PAM_search_plus = re.compile('(?=(' + PAM_regex + '))', re.IGNORECASE)

    iterator_plus = PAM_search_plus.finditer(str(plus_search1))
    PAM_starts_plus = [match.start() for match in iterator_plus]


    PAM_minus = PAM[::-1]#reversing it
    PAM_regex_minus = PAM_minus.replace('N', '/*.')

    PAM_search_minus = re.compile('(?=(' + PAM_regex_minus + '))', re.IGNORECASE)

    iterator_minus = PAM_search_minus.finditer(str(minus_search1))
    PAM_starts_minus = [match.start() for match in iterator_minus]
    #since things are flipped on minus strand, adding len(PAM) to get the true "start" to the PAM

    PAM_starts_minus = np.asarray(PAM_starts_minus) + len(PAM) + (mut_start_idx-3-len(PAM))#and correct for indexing

    return np.asarray([np.asarray(PAM_starts_plus), PAM_starts_minus], dtype='object')-mut_start_idx

In [47]:
def PAM_finder(WT_seq, mut_start, mut_end, PAM, RTT_length_max):
    """Identifies the location of PAM sequences on the + and - strand.
    Returns a 2-d array containing marked locations of PAM sequence start locations on + and - strand.
    
    Parameters
    ----------
    WT_seq: WT sequence...

    PAM_seq: PAM sequence to search for...

    RTT_length_max: maximum size of RTT length for searching...
    
    """

    #---------------Loading in sequences for PAM Searching------------------#
    
    search_size = RTT_length - len(PAM)-1 #need to modify this for insertion/deletions...
    #size mut doesn't capture the size of insertions, only the size of deletions

    plus_search = seq1[seq_start-1-search_size : seq_end+search_size].upper()
    minus_search = plus_search.complement().upper()
    
    mut_start_idx = 1+search_size
    mut_end_idx = 1+search_size+size_mut #not accurate for insertions; does work for indexing though...

    plus_search1 = plus_search[:mut_start_idx+3+len(PAM)-1]
    minus_search1 = minus_search[mut_start_idx-3-len(PAM):]

    #---------------PAM Searching------------------#

    #replacing N with regex symbol
    PAM_regex = PAM.replace('N', '/*.')

    PAM_search_plus = re.compile('(?=(' + PAM_regex + '))', re.IGNORECASE)

    iterator_plus = PAM_search_plus.finditer(str(plus_search1))
    PAM_starts_plus = [match.start() for match in iterator_plus]


    PAM_minus = PAM[::-1]#reversing it
    PAM_regex_minus = PAM_minus.replace('N', '/*.')

    PAM_search_minus = re.compile('(?=(' + PAM_regex_minus + '))', re.IGNORECASE)

    iterator_minus = PAM_search_minus.finditer(str(minus_search1))
    PAM_starts_minus = [match.start() for match in iterator_minus]
    #since things are flipped on minus strand, adding len(PAM) to get the true "start" to the PAM

    PAM_starts_minus = np.asarray(PAM_starts_minus) + len(PAM) + (mut_start_idx-3-len(PAM))#and correct for indexing

    return np.asarray([np.asarray(PAM_starts_plus), PAM_starts_minus], dtype='object')-mut_start_idx

In [None]:
PAM = 'NGG'

PAM_finder(WT_seq, mut_start, mut_end, PAM, RTT_length_max)

# Generating list of variants for alvin

- Starting with diego library
    - includes SNPs in IDR regions
    - Also intronic SNPs between exons that are an IDR
        - These wont be amenable to MMR-evasive silent edits

- Need WT sequnece
- Mutant sequence
- Frame (i.e. codon frame = 0,1,2)

In [2]:
import pegg
import gffutils

In [3]:
filepath = '/Users/samgould/Desktop/FSR Lab/reference files/GRCh37/ncbi-genomes-2022-03-17/GCF_000001405.25_GRCh37.p13_genomic.fna.gz'

records, index_list = pegg.genome_loader(filepath)

In [4]:
file = '/Users/samgould/Desktop/FSR Lab/reference files/gencode_v19.db'
db = gffutils.FeatureDB(file)

In [5]:
#diego variants
idr = pd.read_csv('filtered_idr_mutations_5count_6nt_indels.csv')
idr.keys()

Index(['COUNT', 'Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build',
       'Chromosome', 'Start_Position', 'End_Position', 'Strand', 'Consequence',
       'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS',
       'dbSNP_Val_Status', 'Tumor_Sample_Barcode',
       'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1',
       'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1',
       'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1',
       'Match_Norm_Validation_Allele2', 'Verification_Status',
       'Validation_Status', 'Mutation_Status', 'Sequencing_Phase',
       'Sequence_Source', 'Validation_Method', 'Score', 'BAM_File',
       'Sequencer', 't_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count',
       'HGVSc', 'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'RefSeq',
       'Protein_position', 'Codons', 'Exon_Number', 'gnomAD_AF',
       'gnomAD_AFR_AF', 'gnomAD_AMR_AF', 'gnomAD_ASJ_AF', 'gnom

In [262]:
dels1 = idr[idr['Variant_Type']=='DEL']
dels1[dels1['Tumor_Seq_Allele2']!='-']

Unnamed: 0,COUNT,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,...,Transcript_ID,RefSeq,Protein_position,Codons,Exon_Number,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,FILTER,Polyphen_Prediction,Polyphen_Score,SIFT_Prediction,SIFT_Score,SWISSPROT,n_depth,t_depth,Annotation_Status,mutationInCis_Flag,transcript_id_TRUE
1468,12.0,APC,324.0,JHU,GRCh37,5,112175760,112175761,+,frameshift_variant,Frame_Shift_Del,DEL,AT,,T,,,GENIE-JHU-01993-02421,,,,,,,,...,ENST00000257430,NM_000038.5,1490.0,cAT/cT,16/16,,,,,,,,,,PASS,,,,,,,2449.0,SUCCESS,False,ENST00000257430.4
8082,5.0,ANKRD26,22852.0,PROV,GRCh37,10,27355404,27355409,+,"splice_region_variant,intron_variant",Splice_Region,DEL,CATAGA,,T,rs386742210,,GENIE-PROV-163a364f5a-bb28ef10d7,,,,,,,,...,ENST00000376087,NM_014915.2,423.0,,,,,,,,,,,,PASS,,,,,,,384.0,SUCCESS,False,ENST00000376087.4
8571,5.0,MYB,4602.0,DFCI,GRCh37,6,135539013,135539016,+,frameshift_variant,Frame_Shift_Del,DEL,TACC,TACC,GA,,,GENIE-DFCI-234520-3238909,,,,,,,,...,ENST00000367814,NM_001161659.1,606.0,agTACC/agGA,15/15,,,,,,,,,,PASS,,,,,,,477.0,SUCCESS,False,ENST00000367814.4
8803,5.0,APC,324.0,MSK,GRCh37,5,112177842,112177846,+,protein_altering_variant,In_Frame_Del,DEL,AAAGT,AAAGT,TA,,,GENIE-MSK-P-0067095-T05-IH3,,,,,,,,...,ENST00000257430,NM_000038.5,2184.0,gAAAGT/gTA,16/16,,,,,,,,,,PASS,,,,,,649.0,882.0,SUCCESS,False,ENST00000257430.4


In [264]:
ins1 = idr[idr['Variant_Type']=='INS']
ins1[ins1['Reference_Allele']!='-']

Unnamed: 0,COUNT,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,...,Transcript_ID,RefSeq,Protein_position,Codons,Exon_Number,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,FILTER,Polyphen_Prediction,Polyphen_Score,SIFT_Prediction,SIFT_Score,SWISSPROT,n_depth,t_depth,Annotation_Status,mutationInCis_Flag,transcript_id_TRUE
7729,5.0,XBP1,7494.0,MSK,GRCh37,22,29192182,29192183,+,"splice_acceptor_variant,intron_variant",Splice_Site,INS,TG,TG,GTGGGG,,,GENIE-MSK-P-0015800-T11-IH3,,,,,,,,...,ENST00000216037,NM_005080.3,152.0,,,,,,,,,,,,PASS,,,,,,0.0,834.0,SUCCESS,False,ENST00000216037.6


In [6]:
np.unique(idr['Variant_Type'], return_counts=True)

(array(['DEL', 'DNP', 'INS', 'ONP', 'SNP'], dtype=object),
 array([ 729,   42,  361,   13, 7955]))

In [7]:
tx_new = []
for i, val in idr.iterrows():
    h = val['HGVSc']
    tx_new.append(h.split(':')[0])

In [8]:
idr['transcript_id_TRUE'] = tx_new

In [9]:
#check all transcripts are annotated

t_ids = np.unique(idr['transcript_id_TRUE'])

not_found = []

for tx in t_ids:

    cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))
    start_end_cds = [[i.start, i.end] for i in cds]
    if len(start_end_cds)==0:
        not_found.append(tx)

In [12]:
tx = 'ENST00000379607.5'

cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))
start_end_cds = [[i.start, i.end] for i in cds]
start_end_cds

[[20146427, 20146429],
 [20148634, 20148725],
 [20150300, 20150381],
 [20152075, 20152125],
 [20153856, 20153959],
 [20156657, 20156740],
 [20159743, 20159758]]

In [13]:
ins = idr[idr['Variant_Type']=='INS']
dels = idr[idr['Variant_Type']=='DEL']
subs = idr[idr['Variant_Type'].isin(['SNP', 'DNP', 'ONP'])]

In [14]:
def df_formatter(df, context_size = 60):

    wt_w_context = []
    alt_w_context = []

    seq_start = []
    seq_end = []

    for i, val in df.iterrows():
        vt = val['Variant_Type']
        s = val['Start_Position']
        e = val['End_Position']
        ref = val['Reference_Allele']
        alt = val['Tumor_Seq_Allele2']
        chrom = val['Chromosome']

        if chrom == 'X':
            chrom = 22
        elif chrom=='Y':
            chrom = 23
        else:
            chrom = int(chrom)-1

        chr_seq = records[index_list[chrom]].seq.upper()

        if vt in ['SNP', 'ONP', 'DNP']:
            ref = ref
            alt = alt
            #assert ref == chr_seq[s-1:e], print(ref, chr_seq[s-1:e])
            left_context = chr_seq[s-1-context_size:s-1]
            right_context = chr_seq[e:e+context_size]

        elif vt =='INS':
            ref = ''
            alt = alt
            left_context = chr_seq[s-1-context_size:s+1] #need to do this since INS reference alleles are blank
            right_context = chr_seq[e:e+context_size]

        elif vt=='DEL':
            ref = ref
            alt = ''

            left_context = chr_seq[s-1-context_size:s-1]
            right_context = chr_seq[e:e+context_size]

        wt_seq = left_context + ref + right_context
        alt_seq = left_context + alt + right_context

        wt_w_context.append(str(wt_seq))
        alt_w_context.append(str(alt_seq))

        start = s-context_size
        end = e+context_size

        seq_start.append(start)
        seq_end.append(end)

        assert str(chr_seq[start-1:end])==str(wt_seq)

    cols_to_save = ['COUNT', 'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Consequence', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode', 'RefSeq', 'Protein_position', 'Exon_Number', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'transcript_id_TRUE']
    df_new = df[cols_to_save]

    df_new['seq_start'] = seq_start
    df_new['seq_end'] = seq_end
    df_new['wt_w_context'] = wt_w_context
    df_new['alt_w_context'] = alt_w_context
    df_new = df_new.reset_index()
    
    return df_new




In [255]:
def frame_determiner(exon_subs):
    
    frame_list = []
    coding_start_list = []
    coding_end_list = []
    CDS_WT_correct_orientation = []

    for index1, val in exon_subs.iterrows():
        hg = val['HGVSp_Short']

        tx = val['transcript_id_TRUE']
        cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))
        start_end_cds = [[i.start, i.end] for i in cds]
        strand = db[tx].strand
        #print(strand)
        chrom = val['Chromosome']

        if chrom == 'X':
            chrom = 22
        elif chrom=='Y':
            chrom = 23
        else:
            chrom = int(chrom)-1

        chr_seq = records[index_list[chrom]].seq.upper()


        wt_dna = ''
        codon_locs = []
        for i in start_end_cds:
            wt_dna += chr_seq[i[0]-1:i[1]]

            for j in range(i[0], i[1]+1):
                codon_locs.append(j)

        #and add in the last/stop codon
        #if strand == '+':

        # wt_dna += chr_seq[start_end_cds[-1][1]:start_end_cds[-1][1]+3]
        #elif strand == '-':
        #   wt_dna = chr_seq[start_end_cds[0][0]-3:start_end_cds[0][0]] +  wt_dna

        #check location in cds
        #get information about the wt/alt sequence
        start = val['seq_start']
        end = val['seq_end']
        wt_seq = val['wt_w_context']
        alt_seq = val['alt_w_context']

        inc = []
        dna_dict = dict(zip(list(range(start, end+1)), list(range(len(wt_seq)))))
        for i in range(start, end+1):
            if i in codon_locs:
                inc.append(i)

        if len(inc)==0:
            frame = 'None'
            frame_list.append(frame)
            coding_start_list.append('None')
            coding_end_list.append('None')
            CDS_WT_correct_orientation.append('None')
            #means it's not in a coding region (could be splice or intron)
        
        else:
            inc_start = inc[0]
            inc_end = inc[-1]

            #------and then double check that the frame is correct--------------
            #by checking if the subsequence is located in the WT sequence

            wt_start = dna_dict[inc_start]
            wt_end = dna_dict[inc_end] + 1

            #record what part of the subsequence is part of a CDS
            coding_start_list.append(wt_start)
            coding_end_list.append(wt_end)

            full_prot = str(Bio.Seq.Seq(wt_dna).transcribe().translate())

            subseq = Bio.Seq.Seq(wt_seq[wt_start:wt_end])

            if strand =='-':
                subseq = subseq.reverse_complement()
                inc_start = inc[-1]
                inc_end = inc[0]
                full_prot = str(Bio.Seq.Seq(wt_dna).reverse_complement().transcribe().translate())
                #have to reverse this list as well if the transcript is in the - orientation
                codon_locs = codon_locs[::-1]

            CDS_WT_correct_orientation.append(str(subseq))

            #determine the frame
            start_idx = codon_locs.index(inc_start)

            frame = (3 - (start_idx % 3)) % 3
            frame_list.append(frame)
            
            #and then confirm whether the subsequence is in the protein
            sub1 = str(Bio.Seq.Seq(subseq[frame:]).transcribe().translate())
            
            #assert sub1 in full_prot, print(sub1)
            if sub1 not in full_prot:
                print('error')
                print(strand)
                print(frame)
            
    
    exon_subs['Frame'] = frame_list
    exon_subs['CDS_start'] = coding_start_list
    exon_subs['CDS_end'] = coding_end_list
    exon_subs['CDS_WT_correct_orientation'] = CDS_WT_correct_orientation

    return exon_subs, codon_locs, start_idx, wt_seq, subseq



In [252]:
(3 - (11%3)) %3

1

In [232]:
exon_subs

Unnamed: 0,level_0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
0,0,1,1512.0,TP53,17,7577094,7577094,missense_variant,Missense_Mutation,SNP,G,A,GENIE-JHU-00113-00335,NM_001126112.2,282.0,8/11,ENST00000269305.4:c.844C>T,p.Arg282Trp,p.R282W,ENST00000269305.4,7577034,7577154,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...
1,1,3,777.0,APC,5,112175639,112175639,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00223-00468,NM_000038.5,1450.0,16/16,ENST00000257430.4:c.4348C>T,p.Arg1450Ter,p.R1450*,ENST00000257430.4,112175579,112175699,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...
2,2,6,614.0,APC,5,112173917,112173917,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00123-00365,NM_000038.5,876.0,16/16,ENST00000257430.4:c.2626C>T,p.Arg876Ter,p.R876*,ENST00000257430.4,112173857,112173977,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...
3,3,7,538.0,TP53,17,7577085,7577085,missense_variant,Missense_Mutation,SNP,C,T,GENIE-JHU-00694-00783,NM_001126112.2,285.0,8/11,ENST00000269305.4:c.853G>A,p.Glu285Lys,p.E285K,ENST00000269305.4,7577025,7577145,TAGTGCTCCCTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCG...,TAGTGCTCCCTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCG...
4,4,8,527.0,SRSF2,17,74732959,74732959,missense_variant,Missense_Mutation,SNP,G,T,GENIE-DFCI-001379-11338,NM_001195427.1,95.0,1/3,ENST00000359995.5:c.284C>A,p.Pro95His,p.P95H,ENST00000359995.5,74732899,74733019,TAGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGT...,TAGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,394,602,19.0,SETD2,3,47164832,47164832,missense_variant,Missense_Mutation,SNP,G,A,GENIE-DFCI-008656-6895,NM_014159.6,432.0,3/21,ENST00000409792.3:c.1294C>T,p.Arg432Cys,p.R432C,ENST00000409792.3,47164772,47164892,CTGTGTATGGCCGAGAATAGCGCGTCCTCTCTCGATAAGGGGAGCT...,CTGTGTATGGCCGAGAATAGCGCGTCCTCTCTCGATAAGGGGAGCT...
352,395,603,19.0,ETV5,3,185797801,185797801,missense_variant,Missense_Mutation,SNP,G,T,GENIE-DFCI-025211-61496,NM_004454.2,152.0,7/13,ENST00000306376.5:c.455C>A,p.Pro152Gln,p.P152Q,ENST00000306376.5,185797741,185797861,ACACCTTGAACTGGGCCAGCTGCAGGGGCATGCCCTGAGGTGGGCA...,ACACCTTGAACTGGGCCAGCTGCAGGGGCATGCCCTGAGGTGGGCA...
353,396,604,19.0,MTOR,1,11190747,11190747,missense_variant,Missense_Mutation,SNP,G,A,GENIE-DFCI-007654-6083,NM_004958.3,1818.0,39/58,ENST00000361445.4:c.5452C>T,p.Arg1818Cys,p.R1818C,ENST00000361445.4,11190687,11190807,CAGTGGCGGCCGTGGTGGCGGCAGTGGTGGCGTTGGTGATGTTGGC...,CAGTGGCGGCCGTGGTGGCGGCAGTGGTGGCGTTGGTGATGTTGGC...
354,398,607,19.0,EIF1AX,X,20156720,20156720,missense_variant,Missense_Mutation,SNP,G,A,GENIE-MSK-P-0044987-T01-IM6,NM_001412.3,13.0,2/7,ENST00000379607.5:c.37C>T,p.Arg13Cys,p.R13C,ENST00000379607.5,20156660,20156780,GACCATCCTCTTTGAATACCAGTTCTCTTTTTTCAGATTCATTCTC...,GACCATCCTCTTTGAATACCAGTTCTCTTTTTTCAGATTCATTCTC...


In [256]:
e, codon_locs, start_idx, wt_seq, subseq = frame_determiner(exon_subs[:])


In [257]:
e

Unnamed: 0,level_0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context,Frame,CDS_start,CDS_end,CDS_WT_correct_orientation
0,0,1,1512.0,TP53,17,7577094,7577094,missense_variant,Missense_Mutation,SNP,G,A,GENIE-JHU-00113-00335,NM_001126112.2,282.0,8/11,ENST00000269305.4:c.844C>T,p.Arg282Trp,p.R282W,ENST00000269305.4,7577034,7577154,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...,0,0,121,GGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCT...
1,1,3,777.0,APC,5,112175639,112175639,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00223-00468,NM_000038.5,1450.0,16/16,ENST00000257430.4:c.4348C>T,p.Arg1450Ter,p.R1450*,ENST00000257430.4,112175579,112175699,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,0,0,121,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...
2,2,6,614.0,APC,5,112173917,112173917,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00123-00365,NM_000038.5,876.0,16/16,ENST00000257430.4:c.2626C>T,p.Arg876Ter,p.R876*,ENST00000257430.4,112173857,112173977,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...,0,0,121,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...
3,3,7,538.0,TP53,17,7577085,7577085,missense_variant,Missense_Mutation,SNP,C,T,GENIE-JHU-00694-00783,NM_001126112.2,285.0,8/11,ENST00000269305.4:c.853G>A,p.Glu285Lys,p.E285K,ENST00000269305.4,7577025,7577145,TAGTGCTCCCTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCG...,TAGTGCTCCCTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCG...,0,0,121,CTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGA...
4,4,8,527.0,SRSF2,17,74732959,74732959,missense_variant,Missense_Mutation,SNP,G,T,GENIE-DFCI-001379-11338,NM_001195427.1,95.0,1/3,ENST00000359995.5:c.284C>A,p.Pro95His,p.P95H,ENST00000359995.5,74732899,74733019,TAGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGT...,TAGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGT...,2,0,121,TGGACGGGGCCGTGCTGGACGGCCGCGAGCTGCGGGTGCAAATGGC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,394,602,19.0,SETD2,3,47164832,47164832,missense_variant,Missense_Mutation,SNP,G,A,GENIE-DFCI-008656-6895,NM_014159.6,432.0,3/21,ENST00000409792.3:c.1294C>T,p.Arg432Cys,p.R432C,ENST00000409792.3,47164772,47164892,CTGTGTATGGCCGAGAATAGCGCGTCCTCTCTCGATAAGGGGAGCT...,CTGTGTATGGCCGAGAATAGCGCGTCCTCTCTCGATAAGGGGAGCT...,0,0,121,GGCTCTAGAACTAATTTATCCTATTCCAGGTCAGAACGATCTCATT...
352,395,603,19.0,ETV5,3,185797801,185797801,missense_variant,Missense_Mutation,SNP,G,T,GENIE-DFCI-025211-61496,NM_004454.2,152.0,7/13,ENST00000306376.5:c.455C>A,p.Pro152Gln,p.P152Q,ENST00000306376.5,185797741,185797861,ACACCTTGAACTGGGCCAGCTGCAGGGGCATGCCCTGAGGTGGGCA...,ACACCTTGAACTGGGCCAGCTGCAGGGGCATGCCCTGAGGTGGGCA...,2,0,121,AGCCATTAACCCCTCCTACAACCCCCCTCTCACCCACCCATCAGAA...
353,396,604,19.0,MTOR,1,11190747,11190747,missense_variant,Missense_Mutation,SNP,G,A,GENIE-DFCI-007654-6083,NM_004958.3,1818.0,39/58,ENST00000361445.4:c.5452C>T,p.Arg1818Cys,p.R1818C,ENST00000361445.4,11190687,11190807,CAGTGGCGGCCGTGGTGGCGGCAGTGGTGGCGTTGGTGATGTTGGC...,CAGTGGCGGCCGTGGTGGCGGCAGTGGTGGCGTTGGTGATGTTGGC...,0,0,121,TTCGAAGCTGTGCTACACTACAAACATCAGAACCAAGCCCGCGATG...
354,398,607,19.0,EIF1AX,X,20156720,20156720,missense_variant,Missense_Mutation,SNP,G,A,GENIE-MSK-P-0044987-T01-IM6,NM_001412.3,13.0,2/7,ENST00000379607.5:c.37C>T,p.Arg13Cys,p.R13C,ENST00000379607.5,20156660,20156780,GACCATCCTCTTTGAATACCAGTTCTCTTTTTTCAGATTCATTCTC...,GACCATCCTCTTTGAATACCAGTTCTCTTTTTTCAGATTCATTCTC...,2,0,81,GTAAAGGAGGTAAAAACAGACGCAGGGGTAAGAATGAGAATGAATC...


In [223]:
fp = 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD'

fp.find('WDGTALRCVFVPVLGETGAQRKRISARKGSLTTSCP')

-1

In [210]:
wt_seq

'TGGAACCAGACAGAAAAGCGGCTGTTAGTCACTGGCAGCAACAGTCTTACCTGGACTCTGGAATCCATTCTGGTGCCACTACCACAGCTCCTTCTCTGAGTGGTAAAGGCAATCCTGAGGA'

In [176]:
wt_seq

'ATGGCCATGGAACCAGACAGAAAAGCGGCTGTTAGTCACTGGCAGCAACAGTCTTACCTGGACTCTGGAATCCATTCTGGTGCCACTACCACAGCTCCTTCTCTGAGTGGTAAAGGCAATC'

In [204]:
Bio.Seq.Seq(wt_seq[1:]).transcribe().translate()

Seq('NPADCRVLVYLQNQPGTKLLNFLQERNLPPKVVLRHPKVH')

In [154]:
codon_locs

[41265560,
 41265561,
 41265562,
 41265563,
 41265564,
 41265565,
 41265566,
 41265567,
 41265568,
 41265569,
 41265570,
 41265571,
 41265572,
 41266017,
 41266018,
 41266019,
 41266020,
 41266021,
 41266022,
 41266023,
 41266024,
 41266025,
 41266026,
 41266027,
 41266028,
 41266029,
 41266030,
 41266031,
 41266032,
 41266033,
 41266034,
 41266035,
 41266036,
 41266037,
 41266038,
 41266039,
 41266040,
 41266041,
 41266042,
 41266043,
 41266044,
 41266045,
 41266046,
 41266047,
 41266048,
 41266049,
 41266050,
 41266051,
 41266052,
 41266053,
 41266054,
 41266055,
 41266056,
 41266057,
 41266058,
 41266059,
 41266060,
 41266061,
 41266062,
 41266063,
 41266064,
 41266065,
 41266066,
 41266067,
 41266068,
 41266069,
 41266070,
 41266071,
 41266072,
 41266073,
 41266074,
 41266075,
 41266076,
 41266077,
 41266078,
 41266079,
 41266080,
 41266081,
 41266082,
 41266083,
 41266084,
 41266085,
 41266086,
 41266087,
 41266088,
 41266089,
 41266090,
 41266091,
 41266092,
 41266093,
 41266094,

In [15]:
subs_new = df_formatter(subs[:400])

In [28]:
np.unique(subs_new['Variant_Classification'])

array(['Intron', 'Missense_Mutation', 'Nonsense_Mutation', 'Silent',
       'Splice_Region', 'Splice_Site'], dtype=object)

In [29]:
coding_muts = ['Missense_Mutation', 'Nonsense_Mutation', 'Silent']
exon_subs = subs_new[subs_new['Variant_Classification'].isin(coding_muts)].reset_index()
exon_subs

Unnamed: 0,level_0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
0,0,1,1512.0,TP53,17,7577094,7577094,missense_variant,Missense_Mutation,SNP,G,A,GENIE-JHU-00113-00335,NM_001126112.2,282.0,8/11,ENST00000269305.4:c.844C>T,p.Arg282Trp,p.R282W,ENST00000269305.4,7577034,7577154,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...
1,1,3,777.0,APC,5,112175639,112175639,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00223-00468,NM_000038.5,1450.0,16/16,ENST00000257430.4:c.4348C>T,p.Arg1450Ter,p.R1450*,ENST00000257430.4,112175579,112175699,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...
2,2,6,614.0,APC,5,112173917,112173917,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00123-00365,NM_000038.5,876.0,16/16,ENST00000257430.4:c.2626C>T,p.Arg876Ter,p.R876*,ENST00000257430.4,112173857,112173977,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...
3,3,7,538.0,TP53,17,7577085,7577085,missense_variant,Missense_Mutation,SNP,C,T,GENIE-JHU-00694-00783,NM_001126112.2,285.0,8/11,ENST00000269305.4:c.853G>A,p.Glu285Lys,p.E285K,ENST00000269305.4,7577025,7577145,TAGTGCTCCCTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCG...,TAGTGCTCCCTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCG...
4,4,8,527.0,SRSF2,17,74732959,74732959,missense_variant,Missense_Mutation,SNP,G,T,GENIE-DFCI-001379-11338,NM_001195427.1,95.0,1/3,ENST00000359995.5:c.284C>A,p.Pro95His,p.P95H,ENST00000359995.5,74732899,74733019,TAGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGT...,TAGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,394,602,19.0,SETD2,3,47164832,47164832,missense_variant,Missense_Mutation,SNP,G,A,GENIE-DFCI-008656-6895,NM_014159.6,432.0,3/21,ENST00000409792.3:c.1294C>T,p.Arg432Cys,p.R432C,ENST00000409792.3,47164772,47164892,CTGTGTATGGCCGAGAATAGCGCGTCCTCTCTCGATAAGGGGAGCT...,CTGTGTATGGCCGAGAATAGCGCGTCCTCTCTCGATAAGGGGAGCT...
352,395,603,19.0,ETV5,3,185797801,185797801,missense_variant,Missense_Mutation,SNP,G,T,GENIE-DFCI-025211-61496,NM_004454.2,152.0,7/13,ENST00000306376.5:c.455C>A,p.Pro152Gln,p.P152Q,ENST00000306376.5,185797741,185797861,ACACCTTGAACTGGGCCAGCTGCAGGGGCATGCCCTGAGGTGGGCA...,ACACCTTGAACTGGGCCAGCTGCAGGGGCATGCCCTGAGGTGGGCA...
353,396,604,19.0,MTOR,1,11190747,11190747,missense_variant,Missense_Mutation,SNP,G,A,GENIE-DFCI-007654-6083,NM_004958.3,1818.0,39/58,ENST00000361445.4:c.5452C>T,p.Arg1818Cys,p.R1818C,ENST00000361445.4,11190687,11190807,CAGTGGCGGCCGTGGTGGCGGCAGTGGTGGCGTTGGTGATGTTGGC...,CAGTGGCGGCCGTGGTGGCGGCAGTGGTGGCGTTGGTGATGTTGGC...
354,398,607,19.0,EIF1AX,X,20156720,20156720,missense_variant,Missense_Mutation,SNP,G,A,GENIE-MSK-P-0044987-T01-IM6,NM_001412.3,13.0,2/7,ENST00000379607.5:c.37C>T,p.Arg13Cys,p.R13C,ENST00000379607.5,20156660,20156780,GACCATCCTCTTTGAATACCAGTTCTCTTTTTTCAGATTCATTCTC...,GACCATCCTCTTTGAATACCAGTTCTCTTTTTTCAGATTCATTCTC...


In [336]:
exon_ins = ins_new[ins_new['Variant_Classification']!='Intron']
exon_ins

Unnamed: 0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
0,4,762.0,ASXL1,20,31022441,31022442,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-003409-1958,NM_015338.5,642.0,13/13,ENST00000375687.4:c.1934dup,p.Gly646TrpfsTer12,p.G646Wfs*12,ENST00000375687.4,31022381,31022502,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...
1,5,645.0,APC,5,112175952,112175953,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-JHU-00198-00378,NM_000038.5,1554.0,16/16,ENST00000257430.4:c.4666dup,p.Thr1556AsnfsTer3,p.T1556Nfs*3,ENST00000257430.4,112175892,112176013,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...
2,21,280.0,ARID1A,1,27105930,27105931,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-001750-9089,NM_006015.4,1847.0,20/20,ENST00000324856.7:c.5548dup,p.Asp1850GlyfsTer4,p.D1850Gfs*4,ENST00000324856.7,27105870,27105991,CTCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTG...,CTCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTG...
4,55,108.0,TCF7L2,10,114925316,114925317,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-DFCI-151674-1525793,,482.0,15/15,ENST00000355995.4:c.1454dup,p.Cys486ValfsTer8,p.C486Vfs*8,ENST00000355995.4,114925256,114925377,CTCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTC...,CTCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTC...
5,60,99.0,GATA3,10,8115874,8115875,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-002328-1774,,407.0,6/6,ENST00000346208.3:c.1221dup,p.Pro408AlafsTer99,p.P408Afs*99,ENST00000346208.3,8115814,8115935,AGAACAGCTCGTTTAACCCGGCCGCCCTCTCCAGACACATGTCCTC...,AGAACAGCTCGTTTAACCCGGCCGCCCTCTCCAGACACATGTCCTC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,8994,5.0,APC,5,112175621,112175622,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-MSK-P-0048781-T01-IM6,NM_000038.5,1444.0,16/16,ENST00000257430.4:c.4333dup,p.Thr1445AsnfsTer10,p.T1445Nfs*10,ENST00000257430.4,112175561,112175682,CCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACAC...,CCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACAC...
355,9010,5.0,SRSF2,17,74732960,74732961,inframe_insertion,In_Frame_Ins,INS,-,GGC,GENIE-UHN-AGI147550-BM1,NM_001195427.1,94.0,1/3,ENST00000359995.5:c.282_283insGCC,p.Arg94_Pro95insAla,p.R94_P95insA,ENST00000359995.5,74732900,74733021,AGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGTG...,AGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGTG...
356,9011,5.0,ZRSR2,X,15841255,15841256,protein_altering_variant,In_Frame_Ins,INS,-,AGCCGC,GENIE-UHN-AGI974552-BM1,NM_005089.3,447.0,11/11,ENST00000307771.7:c.1339_1340insAGCCGC,p.Ser447delinsLysProArg,p.S447delinsKPR,ENST00000307771.7,15841195,15841316,AGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCAGCCGGAGCC...,AGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCAGCCGGAGCC...
357,9013,5.0,TSC1,9,135771988,135771989,protein_altering_variant,In_Frame_Ins,INS,-,GCT,GENIE-UHN-DIVA273992-ARC1,NM_001162426.1,1043.0,23/23,ENST00000298552.3:c.3128_3129insAGC,p.Ser1043delinsArgAla,p.S1043delinsRA,ENST00000298552.3,135771928,135772049,CCACCGACTGCTGAATGGGCCTGCCCTCTGGTGTGGGGGTTTCTCT...,CCACCGACTGCTGAATGGGCCTGCCCTCTGGTGTGGGGGTTTCTCT...


In [90]:
#try it with one example
#val = ins_new.iloc[0]d

def variant_checker(exon_subs):
    
    for index1, val in exon_subs.iterrows():
        hg = val['HGVSp_Short']

        tx = val['transcript_id_TRUE']
        cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))
        start_end_cds = [[i.start, i.end] for i in cds]
        strand = db[tx].strand
        chrom = val['Chromosome']

        if chrom == 'X':
            chrom = 22
        elif chrom=='Y':
            chrom = 23
        else:
            chrom = int(chrom)-1

        chr_seq = records[index_list[chrom]].seq.upper()


        wt_dna = ''
        codon_locs = []
        for i in start_end_cds:
            wt_dna += chr_seq[i[0]-1:i[1]]

            for j in range(i[0], i[1]+1):
                codon_locs.append(j)

        #and add in the last/stop codon
        #if strand == '+':

        # wt_dna += chr_seq[start_end_cds[-1][1]:start_end_cds[-1][1]+3]
        #elif strand == '-':
        #   wt_dna = chr_seq[start_end_cds[0][0]-3:start_end_cds[0][0]] +  wt_dna

        #check location in cds
        #get information about the wt/alt sequence
        start = val['seq_start']
        end = val['seq_end']
        wt_seq = val['wt_w_context']
        alt_seq = val['alt_w_context']

        inc = []
        dna_dict = dict(zip(list(range(start, end+1)), list(range(len(wt_seq)))))
        for i in range(start, end+1):
            if i in codon_locs:
                inc.append(i)

        if len(inc)==0:
            print(index1)
            
        inc_start = inc[0]
        inc_end = inc[-1]

        wt_start = dna_dict[inc_start]
        wt_end = dna_dict[inc_end] + 1

        if wt_end == len(wt_seq):
            alt_end = len(alt_seq)
        else:
            alt_end = wt_end - len(wt_seq)

        first_half = codon_locs.index(inc_start)
        second_half = codon_locs.index(inc_end)


        wt_full_dna = wt_dna[:first_half] + wt_seq[wt_start:wt_end] + wt_dna[second_half+1:]
        alt_full_dna = wt_dna[:first_half] + alt_seq[wt_start:alt_end] + wt_dna[second_half+1:]

        assert wt_full_dna == wt_dna

        if strand == '-':
            wt_full_dna = wt_full_dna.reverse_complement()
            alt_full_dna = alt_full_dna.reverse_complement()

        
        wt_dna = str(wt_full_dna)
        mut_dna = str(alt_full_dna)

        print(len(wt_dna), len(mut_dna))
        wt_seq = Bio.Seq.Seq(wt_dna).transcribe().translate()
        mut_seq = Bio.Seq.Seq(alt_full_dna).transcribe().translate()

        try:
            hgvsp = get_hgvsp_classification(wt_dna, mut_dna, wt_seq, mut_seq)
        except:
            hgvsp = 'undefined'

        #if hgvsp != hg[2:]:
        print(index1)
        print(hgvsp, hg[2:])

        return wt_dna, mut_dna



In [52]:
dels_new = df_formatter(dels[:400])
#coding_muts = ['Frame_Shift_Del'] #'In_Frame_Del',
coding_muts = ['In_Frame_Del'] #'In_Frame_Del',

exon_dels = dels_new[dels_new['Variant_Classification'].isin(coding_muts)].reset_index()


In [82]:
ins_new = df_formatter(ins[:400])
#coding_muts = ['Frame_Shift_Del'] #'In_Frame_Del',
coding_muts = ['In_Frame_Ins', 'Frame_Shift_Ins'] #'In_Frame_Del',

exon_ins = ins_new[ins_new['Variant_Classification'].isin(coding_muts)].reset_index()


In [79]:
np.unique(ins_new['Variant_Classification'])

array(['Frame_Shift_Ins', 'In_Frame_Ins', 'Intron', 'Nonsense_Mutation',
       'Splice_Region', 'Splice_Site'], dtype=object)

In [81]:
'Frame_Shift_Ins', 'In_Frame_Ins'

('Frame_Shift_Ins', 'In_Frame_Ins')

In [89]:
variant_checker(exon_ins.iloc[0:10])

4623 4624
0
G646Wfs*12 G646Wfs*12
8529 8530
1
T1556Nfs*3 T1556Nfs*3
6855 6856
2
D1850Gfs*4 D1850Gfs*4
1857 1858
3
C486Vfs*8 C486Vfs*8
1329 1330
4
P408Afs*443 P408Afs*99
1329 1330
5
P408Sfs*443 P408Afs*99
6855 6858
6
E38_A39insG A45dup
7242 7243
7
H2324Pfs*55 H2324Pfs*55
6708 6714
8
Q121_Q122insPA Q130_Q131dup
4014 4017
9
N27_N28insK N28dup


In [94]:
w, m = variant_checker(exon_ins.iloc[9:10])

wt_dna = w
mut_dna = m
wt_seq = str(Bio.Seq.Seq(w).transcribe().translate())
mut_seq = str(Bio.Seq.Seq(m).transcribe().translate())
get_hgvsp_classification(wt_dna, mut_dna, wt_seq, mut_seq)

4014 4017
9
N27_N28insK N28dup


'N27_N28insK'

In [95]:
print(str(Bio.Seq.Seq(w).transcribe().translate()))
print(str(Bio.Seq.Seq(m).transcribe().translate()))

MASPPRHGPPGPASGDGPNLNNNNNNNNHSVRKCGYLRKQKHGHKRFFVLRGPGAGGDEATAGGGSAPQPPRLEYYESEKKWRSKAGAPKRVIALDCCLNINKRADAKHKYLIALYTKDEYFAVAAENEQEQEGWYRALTDLVSEGRAAAGDAPPAAAPAASCSASLPGALGGSAGAAGAEDSYGLVAPATAAYREVWQVNLKPKGLGQSKNLTGVYRLCLSARTIGFVKLNCEQPSVTLQLMNIRRCGHSDSFFFIEVGRSAVTGPGELWMQADDSVVAQNIHETILEAMKALKELFEFRPRSKSQSSGSSATHPISVPGARRHHHLVNLPPSQTGLVRRSRTDSLAATPPAAKCSSCRVRTASEGDGGAAAGAAAAGARPVSVAGSPLSPGPVRAPLSRSHTLSGGCGGRGSKVALLPAGGALQHSRSMSMPVAHSPPAATSPGSLSSSSGHGSGSYPPPPGPHPPLPHPLHHGPGQRPSSGSASASGSPSDPGFMSLDEYGSSPGDLRAFCSHRSNTPESIAETPPARDGGGGGEFYGYMTMDRPLSHCGRSYRRVSGDAAQDLDRGLRKRTYSLTTPARQRPVPQPSSASLDEYTLMRATFSGSAGRLCPSCPASSPKVAYHPYPEDYGDIEIGSHRSSSSNLGADDGYMPMTPGAALAGSGSGSCRSDDYMPMSPASVSAPKQILQPRAAAAAAAAVPSAGPAGPAPTSAAGRTFPASGGGYKASSPAESSPEDSGYMRMWCGSKLSMEHADGKLLPNGDYLNVSPSDAVTTGTPPDFFSAALHPGGEPLRGVPGCCYSSLPRSYKAPYTCGGDSDQYVLMSSPVGRILEEERLEPQATPGPSQAASAFGAGPTQPPHPVVPSPVRPSGGRPEGFLGQRGRAVRPTRLSLEGLPSLPSMHEYPLPPEPKSPGEYINIDFGEPGARLSPPAPPLLASAASSSSLLSASSPASSLGSGTPGTSSDSRQRSPLSDYMNLDFSSPKSPKPGAPSGHPVG

In [68]:
print(str(Bio.Seq.Seq(m).transcribe().translate()))

MQPPSLLLLLLLLLLCVSVVRPRGLLCGSFPEPCANGGTCLSLSLGQGTCQCAPGFLGETCQFPDPCQNAQLCQNGGSCQALLPAPLGLPSSPSPLTPSFLCTCLPGFTGERCQAKLEDPCPPSFCSKRGRCHIQASGRPQCSCMPGWTGEQCQLRDFCSANPCVNGGVCLATYPQIQCHCPPGFEGHACERDVNECFQDPGPCPKGTSCHNTLGSFQCLCPVGQEGPRCELRAGPCPPRGCSNGGTCQLMPEKDSTFHLCLCPPGFIGPDCEVNPDNCVSHQCQNGGTCQDGLDTYTCLCPETWTGWDCSEDVDECETQGPPHCRNGGTCQNSAGSFHCVCVSGWGGTSCEENLDDCIAATCAPGSTCIDRVGSFSCLCPPGRTGLLCHLEDMCLSQPCHGDAQCSTNPLTGSTLCLCQPGYSGPTCHQDLDECLMAQQGPSPCEHGGSCLNTPGSFNCLCPPGYTGSRCEADHNECLSQPCHPGSTCLDLLATFHCLCPPGLEGQLCEVETNECASAPCLNHADCHDLLNGFQCICLPGFSGTRCEEDIDECRSSPCANGGQCQDQPGAFHCKCLPGFEGPRCQTEVDECLSDPCPVGASCLDLPGAFFCLCPSGFTGQLCEVPLCAPNLCQPKQICKDQKDKANCLCPDGSPGCAPPEDNCTCHHGHCQRSSCVCDVGWTGPECEAELGGCISAPCAHGGTCYPQPSGYNCTCPTGYTGPTCSEEMTACHSGPCLNGGSCNPSPGGYYCTCPPSHTGPQCQTSTDYCVSAPCFNGGTCVNRPGTFSCLCAMGFQGPRCEGKLRPSCADSPCRNRATCQDSPQGPRCLCPTGYTGGSCQTLMDLCAQKPCPRNSHCLQTGPSFHCLCLQGWTGPLCNLPLSSCQKAALSQGIDVSSLCHNGGLCVDSGPSYFCHCPPGFQGSLCQDHVNPCESRPCQNGATCMAQPSGYLCQCAPGYDGQNCSKELDACQSQPCHNHGTCTPKPGGFHCACPPGFVGL

In [63]:
m

'ATGCAGCCCCCTTCACTGCTGCTGCTGCTGCTGCTGCTGCTGCTATGTGTCTCAGTGGTCAGACCCAGAGGGCTGCTGTGTGGGAGTTTCCCAGAACCCTGTGCCAATGGAGGCACCTGCCTGAGCCTGTCTCTGGGACAAGGGACCTGCCAGTGTGCCCCTGGCTTCCTGGGTGAGACGTGCCAGTTTCCTGACCCCTGCCAGAACGCCCAGCTCTGCCAAAATGGAGGCAGCTGCCAAGCCCTGCTTCCCGCTCCCCTAGGGCTCCCCAGCTCTCCCTCTCCATTGACACCCAGCTTCTTGTGCACTTGCCTCCCTGGCTTCACTGGTGAGAGATGCCAGGCCAAGCTTGAAGACCCTTGTCCTCCCTCCTTCTGTTCCAAAAGGGGCCGCTGCCACATCCAGGCCTCGGGCCGCCCACAGTGCTCCTGCATGCCTGGATGGACAGGTGAGCAGTGCCAGCTTCGGGACTTCTGTTCAGCCAACCCATGTGTTAATGGAGGGGTGTGTCTGGCCACATACCCCCAGATCCAGTGCCACTGCCCACCGGGCTTCGAGGGCCATGCCTGTGAACGTGATGTCAACGAGTGCTTCCAGGACCCAGGACCCTGCCCCAAAGGCACCTCCTGCCATAACACCCTGGGCTCCTTCCAGTGCCTCTGCCCTGTGGGGCAGGAGGGTCCACGTTGTGAGCTGCGGGCAGGACCCTGCCCTCCTAGGGGCTGTTCGAATGGGGGCACCTGCCAGCTGATGCCAGAGAAAGACTCCACCTTTCACCTCTGCCTCTGTCCCCCAGGTTTCATAGGCCCAGACTGTGAGGTGAATCCAGACAACTGTGTCAGCCACCAGTGTCAGAATGGGGGCACTTGCCAGGATGGGCTGGACACCTACACCTGCCTCTGCCCAGAAACCTGGACAGGCTGGGACTGCTCCGAAGATGTGGATGAGTGTGAGACCCAGGGTCCCCCTCACTGCAGAAACGGGGGCACCTGCCAGAAC

In [46]:
exon_dels.iloc[[356]]

Unnamed: 0,level_0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
356,396,3349,8.0,SLC34A2,4,25678148,25678150,inframe_deletion,In_Frame_Del,DEL,GCT,-,GENIE-DFCI-271759-3854265,NM_006424.2,617.0,13/13,ENST00000382051.3:c.1864_1866del,p.Cys622del,p.C622del,ENST00000382051.3,25678088,25678210,TGCGCTCGCTGAAGCCCTGGGATGCCGTCGTCTCCAAGTTCACCGG...,TGCGCTCGCTGAAGCCCTGGGATGCCGTCGTCTCCAAGTTCACCGG...


In [38]:
np.unique(dels_new['Variant_Classification'])

array(['Frame_Shift_Del', 'In_Frame_Del', 'Intron', 'Splice_Region',
       'Splice_Site', 'Translation_Start_Site'], dtype=object)

In [341]:
len(wt_full_dna)

1915

In [342]:
len(wt_dna)

1857

In [343]:
exon_ins

Unnamed: 0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
0,4,762.0,ASXL1,20,31022441,31022442,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-003409-1958,NM_015338.5,642.0,13/13,ENST00000375687.4:c.1934dup,p.Gly646TrpfsTer12,p.G646Wfs*12,ENST00000375687.4,31022381,31022502,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...
1,5,645.0,APC,5,112175952,112175953,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-JHU-00198-00378,NM_000038.5,1554.0,16/16,ENST00000257430.4:c.4666dup,p.Thr1556AsnfsTer3,p.T1556Nfs*3,ENST00000257430.4,112175892,112176013,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...
2,21,280.0,ARID1A,1,27105930,27105931,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-001750-9089,NM_006015.4,1847.0,20/20,ENST00000324856.7:c.5548dup,p.Asp1850GlyfsTer4,p.D1850Gfs*4,ENST00000324856.7,27105870,27105991,CTCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTG...,CTCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTG...
4,55,108.0,TCF7L2,10,114925316,114925317,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-DFCI-151674-1525793,,482.0,15/15,ENST00000355995.4:c.1454dup,p.Cys486ValfsTer8,p.C486Vfs*8,ENST00000355995.4,114925256,114925377,CTCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTC...,CTCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTC...
5,60,99.0,GATA3,10,8115874,8115875,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-002328-1774,,407.0,6/6,ENST00000346208.3:c.1221dup,p.Pro408AlafsTer99,p.P408Afs*99,ENST00000346208.3,8115814,8115935,AGAACAGCTCGTTTAACCCGGCCGCCCTCTCCAGACACATGTCCTC...,AGAACAGCTCGTTTAACCCGGCCGCCCTCTCCAGACACATGTCCTC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,8994,5.0,APC,5,112175621,112175622,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-MSK-P-0048781-T01-IM6,NM_000038.5,1444.0,16/16,ENST00000257430.4:c.4333dup,p.Thr1445AsnfsTer10,p.T1445Nfs*10,ENST00000257430.4,112175561,112175682,CCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACAC...,CCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACAC...
355,9010,5.0,SRSF2,17,74732960,74732961,inframe_insertion,In_Frame_Ins,INS,-,GGC,GENIE-UHN-AGI147550-BM1,NM_001195427.1,94.0,1/3,ENST00000359995.5:c.282_283insGCC,p.Arg94_Pro95insAla,p.R94_P95insA,ENST00000359995.5,74732900,74733021,AGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGTG...,AGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGTG...
356,9011,5.0,ZRSR2,X,15841255,15841256,protein_altering_variant,In_Frame_Ins,INS,-,AGCCGC,GENIE-UHN-AGI974552-BM1,NM_005089.3,447.0,11/11,ENST00000307771.7:c.1339_1340insAGCCGC,p.Ser447delinsLysProArg,p.S447delinsKPR,ENST00000307771.7,15841195,15841316,AGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCAGCCGGAGCC...,AGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCAGCCGGAGCC...
357,9013,5.0,TSC1,9,135771988,135771989,protein_altering_variant,In_Frame_Ins,INS,-,GCT,GENIE-UHN-DIVA273992-ARC1,NM_001162426.1,1043.0,23/23,ENST00000298552.3:c.3128_3129insAGC,p.Ser1043delinsArgAla,p.S1043delinsRA,ENST00000298552.3,135771928,135772049,CCACCGACTGCTGAATGGGCCTGCCCTCTGGTGTGGGGGTTTCTCT...,CCACCGACTGCTGAATGGGCCTGCCCTCTGGTGTGGGGGTTTCTCT...


In [316]:
wt_full_dna.transcribe().translate()

Seq('MKDKQKKKKERTWAEAARLVLENYSDAPMTPKQILQVIEAEGLKEMRSGTSPLA...VVR')

In [330]:
wt_dna = str(wt_dna)
mut_dna = str(alt_full_dna)
wt_seq = Bio.Seq.Seq(wt_dna).transcribe().translate()
mut_seq = Bio.Seq.Seq(alt_full_dna).transcribe().translate()

hgvsp = get_hgvsp_classification(wt_dna, mut_dna, wt_seq, mut_seq)
hgvsc = get_hgvsc(wt_dna, mut_dna)

print(hgvsp)

G646Wfs*12


In [333]:
len(mut_dna)

4624

In [326]:
mut_seq = Bio.Seq.Seq(alt_full_dna).transcribe().translate()



'CGAAGAAGCGCTTGTGGCCATGCTTCTGCTTGCGCAGGTAGCCGCACTTGCGCACGCTGTGGTTGTTGTTGTTGTTGTTGTTGTTGAGGTTGGGGCCGTCTCCGCTCGCCGGCCCGGGCG'

In [262]:
print(str(chr_seq[start-1:end]))
print(val['wt_w_context'])

CGAAGAAGCGCTTGTGGCCATGCTTCTGCTTGCGCAGGTAGCCGCACTTGCGCACGCTGTGGTTGTTGTTGTTGTTGTTGTTGTTGAGGTTGGGGCCGTCTCCGCTCGCCGGCCCGGGCGGCC
CGAAGAAGCGCTTGTGGCCATGCTTCTGCTTGCGCAGGTAGCCGCACTTGCGCACGCTGTTTGTTGTTGTTGTTGTTGTTGTTGAGGTTGGGGCCGTCTCCGCTCGCCGGCCCGGGCGGC


In [264]:
len('GTTGTTGTTGTTGTTGTTGTTGAGGTTGGGGCCGTCTCCGCTCGCCGGCCCGGGCGGC')

58

In [241]:
for i in range(3,10):
    print(i)

3
4
5
6
7
8
9


In [240]:
Bio.Seq.Seq(wt_dna.reverse_complement()).transcribe().translate()

Seq('MASPPRHGPPGPASGDGPNLNNNNNNNNHSVRKCGYLRKQKHGHKRFFVLRGPG...VKE')

In [227]:
val = ins_new.iloc[10]



tx = val['transcript_id_TRUE']
cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))

db[tx].strand

'-'

In [228]:
cds

[<Feature CDS (chr13:110408654-110408655[-]) at 0x7fe9d0f4e0d0>,
 <Feature CDS (chr13:110434389-110438400[-]) at 0x7fe9679b29d0>]

In [209]:
chr_seq[31025138:]

Seq('TAATAAATTATGGCCATGGGAAACATTGTATATTTAGTGTGTGTATTTTGATAA...NNN')

In [211]:
Bio.Seq.Seq(wt_dna).transcribe().translate()

Seq('MKDKQKKKKERTWAEAARLVLENYSDAPMTPKQILQVIEAEGLKEMRSGTSPLA...VR*')

In [200]:
val['seq_start']
val['seq_end']

31022503

In [18]:
def get_hgvsc(mutant_seq, ref_seq):
    """
    Returns the HGVSc notation for a mutant cDNA sequence compared to a reference sequence.

    Note: doesn't work for composite mutations. It will only classify the first mutant in the sequence.

    Need to make a matching function that converts from hgvsc to mutant_seq given ref_seq and hgvsc
    """
    if mutant_seq == ref_seq:
        return 'WT'  # no change, no HGVSc notation
    
    # find the position(s) of the variant(s) and the type of variation
    pos = []
    var_type = ""
    if len(mutant_seq) > len(ref_seq):  # insertion
        for i in range(len(ref_seq)):
            if mutant_seq[i] != ref_seq[i]:
                pos.append(i+1)

        var_type = "ins" + mutant_seq[pos[0]-1:pos[0] -1 + len(mutant_seq)-len(ref_seq)]
        #print(var_type)

    elif len(mutant_seq) < len(ref_seq):  # deletion
        for i in range(len(mutant_seq)):
            if mutant_seq[i] != ref_seq[i]:
                pos.append(i+1)

        #print(pos)
        #print(len(ref_seq)-len(mutant_seq))
        var_type = "del" + ref_seq[pos[0]-1: pos[0] -1 + len(ref_seq)-len(mutant_seq)]

    else:  # substitution; doesn't take into account multiple substitutions...
        for i in range(len(mutant_seq)):
            if mutant_seq[i] != ref_seq[i]:
                pos.append(i+1)
        
        if len(pos)==1:
            var_type = ref_seq[pos[0]-1] + ">" + mutant_seq[pos[0]-1]
        else:
            var_type = ref_seq[pos[0]-1:pos[-1]] + ">" + mutant_seq[pos[0]-1:pos[-1]]
    
    if var_type[0:3]=='ins':
        pos = [i-1 for i in pos]

    # generate the HGVSc notation
    if len(pos) == 1:
        hgvsc = "c." + str(pos[0]) + var_type
    else:
        if var_type[0:3]=='ins':
            hgvsc = "c." + str(pos[0]) + "_" + str(pos[0]+1) + var_type
        elif var_type[0:3]=='del':
            if len(ref_seq)-len(mutant_seq) ==1:
                hgvsc = "c." + str(pos[0]) + var_type
            else:
                hgvsc = "c." + str(pos[0]) + "_" + str(pos[0]+len(ref_seq)-len(mutant_seq)-1) + var_type
        else:
            hgvsc = "c." + str(pos[0]) + "_" + str(pos[-1]) + var_type
    
    return hgvsc

def get_hgvsp_classification(wt_dna, mut_dna, wt_seq, mut_seq):
    """
    Given a wildtype protein sequence and a mutant protein sequence, returns the HGVSp classification for the mutant sequence.

    Args:
        wt_seq (str): The wildtype protein sequence.
        mut_seq (str): The mutant protein sequence.

    Returns:
        str: The HGVSp classification for the mutant protein sequence.


    There are a bunch of cases where this fails miserably. The chain of logical statements doesn't necessarily make sense.
    """
    # Find the position of the first amino acid that differs between the wildtype and mutant sequences
    pos = None
    for i in range(len(wt_seq)):
        if i >= len(mut_seq) or wt_seq[i] != mut_seq[i]:
            pos = i + 1
            break

    #WT sequence (no change)
    if pos==None:
        #check for and identify silent substitution variants

        pp = None
        for i in range(0,len(mut_dna)-3,3):
            mut_cod = mut_dna[i:i+3]
            wt_cod = wt_dna[i:i+3]
            if mut_cod != wt_cod:
                pp = i+1            
        
        if pp!=None:
            pg = pp//3 
            return f'{mut_seq[pg]}{pg+1}{mut_seq[pg]}'
        else:
            return 'WT'

    # Check if the mutant sequence contains a stop codon
    if '*' in mut_seq and mut_seq.index('*') < pos:
        # The mutation is a termination variant
        pos = mut_seq.index('*') + 1
        return f'{wt_seq[pos-1]}{pos}*'


    # Check the difference in sequence length between the wildtype and mutant sequences
    diff_len1 = len(mut_dna) - len(wt_dna)

    diff_len = len(mut_seq) - len(wt_seq)
    #print(diff_len)

    if diff_len1 == 0:
        # The mutation is a substitution
        return f'{wt_seq[pos-1]}{pos}{mut_seq[pos-1]}' 
        #doesn't take into account potential differences 

    elif diff_len1 > 0:
        # The mutation is an insertion
        ins_pos = pos + 1
        
        #check if it's a frameshift based on mRNA sequence
        #not a frameshift if divisible by 3
        if diff_len1 % 3 ==0:
            
            return f'{wt_seq[pos-2]}{pos-1}_{wt_seq[pos-1]}{ins_pos-1}ins{mut_seq[pos-1: pos-1+diff_len]}'
        
        #otherwise it will be a frameshift
        else:
            stop_pos = len(mut_seq)
            for i in range(pos, len(mut_seq)):
                if mut_seq[i]=="*":
                    stop_pos = i+2-pos
                    break

            return f'{wt_seq[pos-1]}{pos}{mut_seq[pos-1]}fs*{stop_pos}'

    
    else:
        # The mutation is a deletion or a more complex variant
        # Check if the mutation is a pure deletion or a more complex variant
        deleted = True
        for i in range(pos, pos - diff_len):
            if i >= len(mut_seq) or mut_seq[i] != wt_seq[i-1]:
                deleted = False
                break

        if deleted:
            # The mutation is a deletion
            del_len = -diff_len
            return f'{wt_seq[pos-1]}{pos}_{wt_seq[pos+del_len-1]}{pos+del_len-1}del'
        
        else:
            # The mutation is a complex variant
            # Find the first stop codon downstream of the mutation
            stop_pos = len(mut_seq)
            for i in range(pos, len(mut_seq)):
                if mut_seq[i]=="*":
                    stop_pos = i+2-pos
                    break

            return f'{wt_seq[pos-1]}{pos}{mut_seq[pos-1]}fs*{stop_pos}'