## PEGG 2.0

- Updates to PEGG:
    - more flexible data input (following PRIDICT/PrimeDesign format)
    - more flexible PAM sequence searching
    - inclusion of base editing sensor module
    - fixing INS/DEL error
    - G+19 instead of G+20
    - Better fetching of MIT specificity and Rule Set 2/3 information...
    - Automated generation of WT/edited sequence generation
        - Use this to determine errors in pegRNAs...
    - Improved distance to nick and homology overhang sizing
    - Also automate finding of silent variants for MMR-evasion...



In [327]:
import numpy as np
import regex as re
import pandas as pd
import matplotlib.pyplot as plt
import Bio.Seq
import warnings
warnings.filterwarnings('ignore')

## improved PAM finder for various PAM sequences

In [64]:
substitution_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GGC/TTG)AGAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'
ins_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGGGA(/GTAA)GAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'
del_eaxmple = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGGGAG(AGAC/)CCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'

In [76]:

#introduce error messages for each of these

start = substitution_example.find("(")

end = substitution_example.find(")")

replace_seq = substitution_example[start:end+1]
#if '/' in replace_seq:
    #if '+' in replace_seq:
        #throw an error

loc_replace = replace_seq.find('/')
wt_replace = replace_seq[1:loc_replace]
mut_replace = replace_seq[loc_replace+1:-1]

wt_seq = substitution_example[:start] + wt_replace + substitution_example[end+1:]

wt_start = start
wt_end = start+len(wt_replace)


In [109]:
replace_seq = '(/AGGG)'
loc_replace = replace_seq.find('/')
wt_replace = replace_seq[1:loc_replace]
mut_replace = replace_seq[loc_replace+1:-1]
wt_replace

''

In [85]:

context_size = 50

wt_w_context = wt_seq[wt_start-context_size:wt_start] + wt_seq[wt_start:wt_end] + wt_seq[wt_end:wt_end+context_size]



103

In [113]:
def mut_formatter(pridict_format, context_size = 60):
    """ 
    Takes in mutations in format of AAA(AA/GC)ATAGC
    and converts it into a format that allows pegRNAs to be generated

    Parameters
    _____
    pridict_format = list of sequences in e.g. AAA(AA/GC)ATAGC format
    context_size = amount of nt on either side of mutation to select
    """

    original_start = []
    original_end = []

    wt = []
    mut = []
    
    wt_context = []
    mut_context = []

    left_context_length = []
    right_context_length = []

    replace_start = []
    replace_end = []

    for k in pridict_format:
        #find mutation
        start = k.find("(")
        end = k.find(")")

        #get out the mutant and WT allele
        replace_seq = k[start:end+1]
        loc_replace = replace_seq.find('/')
        wt_replace = replace_seq[1:loc_replace]
        mut_replace = replace_seq[loc_replace+1:-1]

        wt.append(wt_replace)
        mut.append(mut_replace)

        #generate full WT sequence
        wt_seq = substitution_example[:start] + wt_replace + substitution_example[end+1:]
        wt_start = start
        wt_end = start+len(wt_replace)
        original_start.append(wt_start)
        original_end.append(wt_end)

        #and just the subset
        left_context = wt_seq[wt_start-context_size:wt_start]
        print(left_context)
        right_context = wt_seq[wt_end:wt_end+context_size]

        wt_w_context = left_context + wt_replace + right_context
        mut_w_context = left_context + mut_replace + right_context

        wt_context.append(wt_w_context)
        mut_context.append(mut_w_context)

        left_context_length.append(len(left_context))
        right_context_length.append(len(right_context))

        replace_start.append(len(left_context))
        replace_end.append(len(left_context)+len(wt_replace))

    col_labels = ['Original_start', 'Original_end', 'WT', 'Mutant', 'WT_context', 'Mutant_context', 'Replace_start', 'Replace_end', 'Left_context_length', 'Right_context_length']
    cols = [original_start, original_end, wt, mut, wt_context, mut_context, replace_start, replace_end, left_context_length, right_context_length]

    df = pd.DataFrame(dict(zip(col_labels, cols)))

    return df

        





In [114]:
substitution_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GGC/TTG)AGAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'
ins_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGGGA(/GTAA)GAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'
del_example = 'CACACCTACACTGCTCGAAGTAAATATGCGAAGCGCGCGGCCTGGCCGGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGCAATCGTAGTCCGTCAAATTCAGCTCTGTTATCCCGGGCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGGGAG(AGAC/)CCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAACAAGTCGATGCAGGCTCCCGTCTTTGAAAAGGGGTAAACATACAAGTGGATAGATGATGGGTAGGGGCCTCCAATACATCCAACACTCTACGCCCTCTCCAAGAGCTAGAAGGGCACCCTGCAGTTGGAAAGGG'

pridict_format = [substitution_example, ins_example, del_example]

df = mut_formatter(pridict_format)

GCGTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG
GTTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(G
TTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GG


In [112]:
print(df.iloc[2]['WT_context'])
print(df.iloc[2]['Mutant_context'])

TTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GGAGACGAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAAC
TTATGTGTCAAATGGCGTAGAACGGGATTGACTGTTTGACGGTAGCTGCTGAGGCGG(GGGAGACCCTCCGTCGGGCTATGTCACTAATACTTTCCAAACGCCCCGTACCGATGCTGAAC


In [None]:
def PAM_finder(WT_seq, mut_start, mut_end,  PAM, RTT_length_max):
    """Identifies the location of PAM sequences on the + and - strand.
    Returns a 2-d array containing marked locations of PAM sequence start locations on + and - strand.
    
    Parameters
    ----------
    WT_seq: WT sequence...

    PAM_seq: PAM sequence to search for...

    RTT_length_max: maximum size of RTT length for searching...
    
    """

    distance_PAM_to_nick = 3
    PAM_size = len(PAM)
    mut_size = mut_start-mut_end

    search_size = RTT_length_max - distance_PAM_to_nick

    #deal with extra PAM sequences later on...
    #and other potential issues
    plus_search = WT_seq[mut_end-search_size-5 : mut_start+distance_PAM_to_nick + PAM_size+5]


    #---------------Loading in sequences for PAM Searching------------------#
    
    search_size = RTT_length - len(PAM)-1 #need to modify this for insertion/deletions...
    #size mut doesn't capture the size of insertions, only the size of deletions

    plus_search = seq1[seq_start-1-search_size : seq_end+search_size].upper()
    minus_search = plus_search.complement().upper()
    
    mut_start_idx = 1+search_size
    mut_end_idx = 1+search_size+size_mut #not accurate for insertions; does work for indexing though...

    plus_search1 = plus_search[:mut_start_idx+3+len(PAM)-1]
    minus_search1 = minus_search[mut_start_idx-3-len(PAM):]

    #---------------PAM Searching------------------#

    #replacing N with regex symbol
    PAM_regex = PAM.replace('N', '/*.')

    PAM_search_plus = re.compile('(?=(' + PAM_regex + '))', re.IGNORECASE)

    iterator_plus = PAM_search_plus.finditer(str(plus_search1))
    PAM_starts_plus = [match.start() for match in iterator_plus]


    PAM_minus = PAM[::-1]#reversing it
    PAM_regex_minus = PAM_minus.replace('N', '/*.')

    PAM_search_minus = re.compile('(?=(' + PAM_regex_minus + '))', re.IGNORECASE)

    iterator_minus = PAM_search_minus.finditer(str(minus_search1))
    PAM_starts_minus = [match.start() for match in iterator_minus]
    #since things are flipped on minus strand, adding len(PAM) to get the true "start" to the PAM

    PAM_starts_minus = np.asarray(PAM_starts_minus) + len(PAM) + (mut_start_idx-3-len(PAM))#and correct for indexing

    return np.asarray([np.asarray(PAM_starts_plus), PAM_starts_minus], dtype='object')-mut_start_idx

In [47]:
def PAM_finder(WT_seq, mut_start, mut_end, PAM, RTT_length_max):
    """Identifies the location of PAM sequences on the + and - strand.
    Returns a 2-d array containing marked locations of PAM sequence start locations on + and - strand.
    
    Parameters
    ----------
    WT_seq: WT sequence...

    PAM_seq: PAM sequence to search for...

    RTT_length_max: maximum size of RTT length for searching...
    
    """

    #---------------Loading in sequences for PAM Searching------------------#
    
    search_size = RTT_length - len(PAM)-1 #need to modify this for insertion/deletions...
    #size mut doesn't capture the size of insertions, only the size of deletions

    plus_search = seq1[seq_start-1-search_size : seq_end+search_size].upper()
    minus_search = plus_search.complement().upper()
    
    mut_start_idx = 1+search_size
    mut_end_idx = 1+search_size+size_mut #not accurate for insertions; does work for indexing though...

    plus_search1 = plus_search[:mut_start_idx+3+len(PAM)-1]
    minus_search1 = minus_search[mut_start_idx-3-len(PAM):]

    #---------------PAM Searching------------------#

    #replacing N with regex symbol
    PAM_regex = PAM.replace('N', '/*.')

    PAM_search_plus = re.compile('(?=(' + PAM_regex + '))', re.IGNORECASE)

    iterator_plus = PAM_search_plus.finditer(str(plus_search1))
    PAM_starts_plus = [match.start() for match in iterator_plus]


    PAM_minus = PAM[::-1]#reversing it
    PAM_regex_minus = PAM_minus.replace('N', '/*.')

    PAM_search_minus = re.compile('(?=(' + PAM_regex_minus + '))', re.IGNORECASE)

    iterator_minus = PAM_search_minus.finditer(str(minus_search1))
    PAM_starts_minus = [match.start() for match in iterator_minus]
    #since things are flipped on minus strand, adding len(PAM) to get the true "start" to the PAM

    PAM_starts_minus = np.asarray(PAM_starts_minus) + len(PAM) + (mut_start_idx-3-len(PAM))#and correct for indexing

    return np.asarray([np.asarray(PAM_starts_plus), PAM_starts_minus], dtype='object')-mut_start_idx

In [None]:
PAM = 'NGG'

PAM_finder(WT_seq, mut_start, mut_end, PAM, RTT_length_max)

# Generating list of variants for alvin

- Starting with diego library
    - includes SNPs in IDR regions
    - Also intronic SNPs between exons that are an IDR
        - These wont be amenable to MMR-evasive silent edits

- Need WT sequnece
- Mutant sequence
- Frame (i.e. codon frame = 0,1,2)

In [122]:
import pegg
import gffutils

In [120]:
filepath = '/Users/samgould/Desktop/FSR Lab/reference files/GRCh37/ncbi-genomes-2022-03-17/GCF_000001405.25_GRCh37.p13_genomic.fna.gz'

records, index_list = pegg.genome_loader(filepath)

In [123]:
file = '/Users/samgould/Desktop/FSR Lab/reference files/gencode_v19.db'
db = gffutils.FeatureDB(file)

In [116]:
#diego variants
idr = pd.read_csv('filtered_idr_mutations_5count_6nt_indels.csv')
idr.keys()

Index(['COUNT', 'Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build',
       'Chromosome', 'Start_Position', 'End_Position', 'Strand', 'Consequence',
       'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS',
       'dbSNP_Val_Status', 'Tumor_Sample_Barcode',
       'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1',
       'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1',
       'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1',
       'Match_Norm_Validation_Allele2', 'Verification_Status',
       'Validation_Status', 'Mutation_Status', 'Sequencing_Phase',
       'Sequence_Source', 'Validation_Method', 'Score', 'BAM_File',
       'Sequencer', 't_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count',
       'HGVSc', 'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'RefSeq',
       'Protein_position', 'Codons', 'Exon_Number', 'gnomAD_AF',
       'gnomAD_AFR_AF', 'gnomAD_AMR_AF', 'gnomAD_ASJ_AF', 'gnom

In [118]:
np.unique(idr['Variant_Type'], return_counts=True)

(array(['DEL', 'DNP', 'INS', 'ONP', 'SNP'], dtype=object),
 array([ 729,   42,  361,   13, 7955]))

In [137]:
tx_new = []
for i, val in idr.iterrows():
    h = val['HGVSc']
    tx_new.append(h.split(':')[0])

In [139]:
idr['transcript_id_TRUE'] = tx_new

In [141]:
#check all transcripts are annotated

t_ids = np.unique(idr['transcript_id_TRUE'])

not_found = []

for tx in t_ids:

    cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))
    start_end_cds = [[i.start, i.end] for i in cds]
    if len(start_end_cds)==0:
        not_found.append(tx)

In [136]:
tx = 'ENST00000379607.5'

cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))
start_end_cds = [[i.start, i.end] for i in cds]


[[20146427, 20146429],
 [20148634, 20148725],
 [20150300, 20150381],
 [20152075, 20152125],
 [20153856, 20153959],
 [20156657, 20156740],
 [20159743, 20159758]]

In [163]:
records[index_list[21]]

SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN'), id='NC_000022.10', name='NC_000022.10', description='NC_000022.10 Homo sapiens chromosome 22, GRCh37.p13 Primary Assembly', dbxrefs=[])

In [375]:
ins = idr[idr['Variant_Type']=='INS']
dels = idr[idr['Variant_Type']=='DEL']
subs = idr[idr['Variant_Type'].isin(['SNP', 'DNP', 'ONP'])]

In [374]:
def df_formatter(df, context_size = 60):

    wt_w_context = []
    alt_w_context = []

    seq_start = []
    seq_end = []

    for i, val in df.iterrows():
        vt = val['Variant_Type']
        s = val['Start_Position']
        e = val['End_Position']
        ref = val['Reference_Allele']
        alt = val['Tumor_Seq_Allele2']
        chrom = val['Chromosome']

        if chrom == 'X':
            chrom = 22
        elif chrom=='Y':
            chrom = 23
        else:
            chrom = int(chrom)-1

        chr_seq = records[index_list[chrom]].seq.upper()

        if vt in ['SNP', 'ONP', 'DNP']:
            ref = ref
            alt = alt
            #assert ref == chr_seq[s-1:e], print(ref, chr_seq[s-1:e])
            left_context = chr_seq[s-1-context_size:s-1]
            right_context = chr_seq[e:e+context_size]

        elif vt =='INS':
            ref = ''
            alt = alt
            left_context = chr_seq[s-1-context_size:s+1] #need to do this since INS reference alleles are blank
            right_context = chr_seq[e:e+context_size]

        elif vt=='DEL':
            ref = ref
            alt = ''

            left_context = chr_seq[s-1-context_size:s-1]
            right_context = chr_seq[e:e+context_size]

        wt_seq = left_context + ref + right_context
        alt_seq = left_context + alt + right_context

        wt_w_context.append(str(wt_seq))
        alt_w_context.append(str(alt_seq))

        start = s-context_size
        end = e+context_size

        seq_start.append(start)
        seq_end.append(end)

        assert str(chr_seq[start-1:end])==str(wt_seq)

    cols_to_save = ['COUNT', 'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Consequence', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode', 'RefSeq', 'Protein_position', 'Exon_Number', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'transcript_id_TRUE']
    df_new = df[cols_to_save]

    df_new['seq_start'] = seq_start
    df_new['seq_end'] = seq_end
    df_new['wt_w_context'] = wt_w_context
    df_new['alt_w_context'] = alt_w_context
    df_new = df_new.reset_index()
    
    return df_new




In [377]:
subs_new = df_formatter(subs[:400])

In [379]:
exon_subs = subs_new[subs_new['Variant_Classification']!='Intron']
exon_subs

Unnamed: 0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
0,1,1512.0,TP53,17,7577094,7577094,missense_variant,Missense_Mutation,SNP,G,A,GENIE-JHU-00113-00335,NM_001126112.2,282.0,8/11,ENST00000269305.4:c.844C>T,p.Arg282Trp,p.R282W,ENST00000269305.4,7577034,7577154,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...
1,3,777.0,APC,5,112175639,112175639,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00223-00468,NM_000038.5,1450.0,16/16,ENST00000257430.4:c.4348C>T,p.Arg1450Ter,p.R1450*,ENST00000257430.4,112175579,112175699,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...
2,6,614.0,APC,5,112173917,112173917,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00123-00365,NM_000038.5,876.0,16/16,ENST00000257430.4:c.2626C>T,p.Arg876Ter,p.R876*,ENST00000257430.4,112173857,112173977,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...,CGCGGAATTGGTCTAGGCAACTACCATCCAGCAACAGAAAATCCAG...
3,7,538.0,TP53,17,7577085,7577085,missense_variant,Missense_Mutation,SNP,C,T,GENIE-JHU-00694-00783,NM_001126112.2,285.0,8/11,ENST00000269305.4:c.853G>A,p.Glu285Lys,p.E285K,ENST00000269305.4,7577025,7577145,TAGTGCTCCCTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCG...,TAGTGCTCCCTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCG...
4,8,527.0,SRSF2,17,74732959,74732959,missense_variant,Missense_Mutation,SNP,G,T,GENIE-DFCI-001379-11338,NM_001195427.1,95.0,1/3,ENST00000359995.5:c.284C>A,p.Pro95His,p.P95H,ENST00000359995.5,74732899,74733019,TAGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGT...,TAGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,603,19.0,ETV5,3,185797801,185797801,missense_variant,Missense_Mutation,SNP,G,T,GENIE-DFCI-025211-61496,NM_004454.2,152.0,7/13,ENST00000306376.5:c.455C>A,p.Pro152Gln,p.P152Q,ENST00000306376.5,185797741,185797861,ACACCTTGAACTGGGCCAGCTGCAGGGGCATGCCCTGAGGTGGGCA...,ACACCTTGAACTGGGCCAGCTGCAGGGGCATGCCCTGAGGTGGGCA...
396,604,19.0,MTOR,1,11190747,11190747,missense_variant,Missense_Mutation,SNP,G,A,GENIE-DFCI-007654-6083,NM_004958.3,1818.0,39/58,ENST00000361445.4:c.5452C>T,p.Arg1818Cys,p.R1818C,ENST00000361445.4,11190687,11190807,CAGTGGCGGCCGTGGTGGCGGCAGTGGTGGCGTTGGTGATGTTGGC...,CAGTGGCGGCCGTGGTGGCGGCAGTGGTGGCGTTGGTGATGTTGGC...
397,605,19.0,TP53,17,7579591,7579591,splice_acceptor_variant,Splice_Site,SNP,C,A,GENIE-DFCI-171465-2394090,NM_001126112.2,33.0,,ENST00000269305.4:c.97-1G>T,,p.X33_splice,ENST00000269305.4,7579531,7579651,TTGTTCAATATCGTCCGGGGACAGCATCAAATCATCCATTGCTTGG...,TTGTTCAATATCGTCCGGGGACAGCATCAAATCATCCATTGCTTGG...
398,607,19.0,EIF1AX,X,20156720,20156720,missense_variant,Missense_Mutation,SNP,G,A,GENIE-MSK-P-0044987-T01-IM6,NM_001412.3,13.0,2/7,ENST00000379607.5:c.37C>T,p.Arg13Cys,p.R13C,ENST00000379607.5,20156660,20156780,GACCATCCTCTTTGAATACCAGTTCTCTTTTTTCAGATTCATTCTC...,GACCATCCTCTTTGAATACCAGTTCTCTTTTTTCAGATTCATTCTC...


In [369]:
exon_dels = dels_new[dels_new['Variant_Classification']!='Intron']
exon_dels

Unnamed: 0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
1,2,983.0,RNF43,17,56435161,56435161,frameshift_variant,Frame_Shift_Del,DEL,C,-,GENIE-DFCI-449465-4787362,,659.0,9/10,ENST00000407977.2:c.1976del,p.Gly659ValfsTer41,p.G659Vfs*41,ENST00000407977.2,56435101,56435221,TGGCAAGCTGGGTGCACAGTTGCATCCTGGGGCCGAGAGCCAGGGG...,TGGCAAGCTGGGTGCACAGTTGCATCCTGGGGCCGAGAGCCAGGGG...
2,9,461.0,APC,5,112175212,112175216,frameshift_variant,Frame_Shift_Del,DEL,AAAAG,-,GENIE-JHU-00184-00370,NM_000038.5,1307.0,16/16,ENST00000257430.4:c.3927_3931del,p.Glu1309AspfsTer4,p.E1309Dfs*4,ENST00000257430.4,112175152,112175276,AGGATGTAATCAGACGACACAGGAAGCAGATTCTGCTAATACCCTG...,AGGATGTAATCAGACGACACAGGAAGCAGATTCTGCTAATACCCTG...
3,11,417.0,TCF7L2,10,114925317,114925317,frameshift_variant,Frame_Shift_Del,DEL,A,-,GENIE-DFCI-449465-4787362,,482.0,15/15,ENST00000355995.4:c.1454del,p.Lys485SerfsTer23,p.K485Sfs*23,ENST00000355995.4,114925257,114925377,TCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTCC...,TCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTCC...
4,12,357.0,ARID1A,1,27105931,27105931,frameshift_variant,Frame_Shift_Del,DEL,G,-,GENIE-DFCI-026352-61610,NM_006015.4,1848.0,20/20,ENST00000324856.7:c.5548del,p.Asp1850ThrfsTer33,p.D1850Tfs*33,ENST00000324856.7,27105871,27105991,TCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTGC...,TCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTGC...
5,19,293.0,APC,5,112175678,112175679,frameshift_variant,Frame_Shift_Del,DEL,AG,-,GENIE-JHU-00090-00282,NM_000038.5,1463.0,16/16,ENST00000257430.4:c.4393_4394del,p.Ser1465TrpfsTer3,p.S1465Wfs*3,ENST00000257430.4,112175618,112175739,CCTCAAACAGCTCAAACCAAGCGAGAAGTACCTAAAAATAAAGCAC...,CCTCAAACAGCTCAAACCAAGCGAGAAGTACCTAAAAATAAAGCAC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,9040,5.0,NOTCH1,9,139390945,139390947,inframe_deletion,In_Frame_Del,DEL,GTG,-,GENIE-VICC-927057-unk-3,NM_017617.3,2415.0,34/34,ENST00000277541.6:c.7244_7246del,p.Pro2415del,p.P2415del,ENST00000277541.6,139390885,139391007,CACTCAGGAAGCTCCGGCCCAGGTGGCCGCTGGCTGCTGAGCTCAC...,CACTCAGGAAGCTCCGGCCCAGGTGGCCGCTGGCTGCTGAGCTCAC...
725,9080,5.0,NUTM1,15,34648285,34648286,frameshift_variant,Frame_Shift_Del,DEL,AG,-,GENIE-UCSF-13974-8069T,NM_175741.1,664.0,7/7,ENST00000333756.4:c.1992_1993del,p.Gly666SerfsTer26,p.G666Sfs*26,ENST00000333756.4,34648225,34648346,TCCTCTGCAAGGACAAGGGTTAGAAAAGCAAGTCCTGGGATTGCAG...,TCCTCTGCAAGGACAAGGGTTAGAAAAGCAAGTCCTGGGATTGCAG...
726,9087,5.0,SIK2,11,111594604,111594606,inframe_deletion,In_Frame_Del,DEL,CTC,-,GENIE-CRUK-MTST0169-primary,NM_015191.1,844.0,15/15,ENST00000304987.3:c.2534_2536del,p.Ser845del,p.S845del,ENST00000304987.3,111594544,111594666,ACCGCCACCACCCCCTCCACCACCACGACAGCCAGGAGCTGCCCCA...,ACCGCCACCACCCCCTCCACCACCACGACAGCCAGGAGCTGCCCCA...
727,9089,5.0,CSF1R,5,149433716,149433718,inframe_deletion,In_Frame_Del,DEL,CTC,-,GENIE-SCI-0541906662-U3294645R2,NM_005211.3,945.0,22/22,ENST00000286301.3:c.2833_2835del,p.Glu945del,p.E945del,ENST00000286301.3,149433656,149433778,CTGCAGCAAGGGCTGGGCGATATCCCCTTGCTCGCAGCAGGTCAGG...,CTGCAGCAAGGGCTGGGCGATATCCCCTTGCTCGCAGCAGGTCAGG...


In [336]:
exon_ins = ins_new[ins_new['Variant_Classification']!='Intron']
exon_ins

Unnamed: 0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
0,4,762.0,ASXL1,20,31022441,31022442,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-003409-1958,NM_015338.5,642.0,13/13,ENST00000375687.4:c.1934dup,p.Gly646TrpfsTer12,p.G646Wfs*12,ENST00000375687.4,31022381,31022502,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...
1,5,645.0,APC,5,112175952,112175953,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-JHU-00198-00378,NM_000038.5,1554.0,16/16,ENST00000257430.4:c.4666dup,p.Thr1556AsnfsTer3,p.T1556Nfs*3,ENST00000257430.4,112175892,112176013,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...
2,21,280.0,ARID1A,1,27105930,27105931,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-001750-9089,NM_006015.4,1847.0,20/20,ENST00000324856.7:c.5548dup,p.Asp1850GlyfsTer4,p.D1850Gfs*4,ENST00000324856.7,27105870,27105991,CTCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTG...,CTCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTG...
4,55,108.0,TCF7L2,10,114925316,114925317,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-DFCI-151674-1525793,,482.0,15/15,ENST00000355995.4:c.1454dup,p.Cys486ValfsTer8,p.C486Vfs*8,ENST00000355995.4,114925256,114925377,CTCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTC...,CTCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTC...
5,60,99.0,GATA3,10,8115874,8115875,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-002328-1774,,407.0,6/6,ENST00000346208.3:c.1221dup,p.Pro408AlafsTer99,p.P408Afs*99,ENST00000346208.3,8115814,8115935,AGAACAGCTCGTTTAACCCGGCCGCCCTCTCCAGACACATGTCCTC...,AGAACAGCTCGTTTAACCCGGCCGCCCTCTCCAGACACATGTCCTC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,8994,5.0,APC,5,112175621,112175622,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-MSK-P-0048781-T01-IM6,NM_000038.5,1444.0,16/16,ENST00000257430.4:c.4333dup,p.Thr1445AsnfsTer10,p.T1445Nfs*10,ENST00000257430.4,112175561,112175682,CCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACAC...,CCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACAC...
355,9010,5.0,SRSF2,17,74732960,74732961,inframe_insertion,In_Frame_Ins,INS,-,GGC,GENIE-UHN-AGI147550-BM1,NM_001195427.1,94.0,1/3,ENST00000359995.5:c.282_283insGCC,p.Arg94_Pro95insAla,p.R94_P95insA,ENST00000359995.5,74732900,74733021,AGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGTG...,AGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGTG...
356,9011,5.0,ZRSR2,X,15841255,15841256,protein_altering_variant,In_Frame_Ins,INS,-,AGCCGC,GENIE-UHN-AGI974552-BM1,NM_005089.3,447.0,11/11,ENST00000307771.7:c.1339_1340insAGCCGC,p.Ser447delinsLysProArg,p.S447delinsKPR,ENST00000307771.7,15841195,15841316,AGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCAGCCGGAGCC...,AGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCAGCCGGAGCC...
357,9013,5.0,TSC1,9,135771988,135771989,protein_altering_variant,In_Frame_Ins,INS,-,GCT,GENIE-UHN-DIVA273992-ARC1,NM_001162426.1,1043.0,23/23,ENST00000298552.3:c.3128_3129insAGC,p.Ser1043delinsArgAla,p.S1043delinsRA,ENST00000298552.3,135771928,135772049,CCACCGACTGCTGAATGGGCCTGCCCTCTGGTGTGGGGGTTTCTCT...,CCACCGACTGCTGAATGGGCCTGCCCTCTGGTGTGGGGGTTTCTCT...


In [381]:
#try it with one example
#val = ins_new.iloc[0]

for index1, val in exon_subs.iterrows():
    hg = val['HGVSp_Short']

    tx = val['transcript_id_TRUE']
    cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))
    start_end_cds = [[i.start, i.end] for i in cds]
    strand = db[tx].strand
    chrom = val['Chromosome']

    if chrom == 'X':
        chrom = 22
    elif chrom=='Y':
        chrom = 23
    else:
        chrom = int(chrom)-1

    chr_seq = records[index_list[chrom]].seq.upper()


    wt_dna = ''
    codon_locs = []
    for i in start_end_cds:
        wt_dna += chr_seq[i[0]-1:i[1]]

        for j in range(i[0], i[1]+1):
            codon_locs.append(j)

    #and add in the last/stop codon
    #if strand == '+':

    # wt_dna += chr_seq[start_end_cds[-1][1]:start_end_cds[-1][1]+3]
    #elif strand == '-':
    #   wt_dna = chr_seq[start_end_cds[0][0]-3:start_end_cds[0][0]] +  wt_dna

    #check location in cds
    #get information about the wt/alt sequence
    start = val['seq_start']
    end = val['seq_end']
    wt_seq = val['wt_w_context']
    alt_seq = val['alt_w_context']

    inc = []
    dna_dict = dict(zip(list(range(start, end+1)), list(range(len(wt_seq)))))
    for i in range(start, end+1):
        if i in codon_locs:
            inc.append(i)


    inc_start = inc[0]
    inc_end = inc[-1]

    wt_start = dna_dict[inc_start]
    wt_end = dna_dict[inc_end] + 1

    if wt_end == len(wt_seq):
        alt_end = len(alt_seq)
    else:
        alt_end = wt_end - len(wt_seq)

    first_half = codon_locs.index(inc_start)
    second_half = codon_locs.index(inc_end)


    wt_full_dna = wt_dna[:first_half] + wt_seq[wt_start:wt_end] + wt_dna[second_half+1:]
    alt_full_dna = wt_dna[:first_half] + alt_seq[wt_start:alt_end] + wt_dna[second_half+1:]

    assert wt_full_dna == wt_dna

    if strand == '-':
        wt_full_dna = wt_full_dna.reverse_complement()
        alt_full_dna = alt_full_dna.reverse_complement()

    
    wt_dna = str(wt_full_dna)
    mut_dna = str(alt_full_dna)
    wt_seq = Bio.Seq.Seq(wt_dna).transcribe().translate()
    mut_seq = Bio.Seq.Seq(alt_full_dna).transcribe().translate()

    hgvsp = get_hgvsp_classification(wt_dna, mut_dna, wt_seq, mut_seq)

    if hgvsp != hg[2:]:
        print(index1)
    print(hgvsp, hg[2:])



R282W R282W
R1450* R1450*
R876* R876*
E285K E285K
P95H P95H
S37F S37F
6
N1018S N1018_K1019delinsSE
S45F S45F
P95L P95L
R1114* R1114*
T41A T41A
E286K E286K
S37C S37C
Q1367* Q1367*
S33C S33C
S45P S45P
P95R P95R
20
WT X279_splice
Q1429* Q1429*
R805* R805*
S33F S33F
Q1378* Q1378*
26
WT X423_splice
D32Y D32Y
T41I T41I
E1306* E1306*
Q1406* Q1406*
S257L S257L
R693* R693*
D32N D32N
E1309* E1309*
E1353* E1353*
G34V G34V
Q58L Q58L
Q1338* Q1338*
R283P R283P
P44L P44L
E1379* E1379*
E79Q E79Q
D281H D281H
G34E G34E
46
WT X307_splice
Q1291* Q1291*
E1322* E1322*
E1397* E1397*
50
WT X307_splice
D281N D281N
R282G R282G
G34R G34R
L78P L78P
55
WT X307_splice
Q1303* Q1303*
Q1294* Q1294*
Y935* Y935*
S24F S24F
D1010H D1010H
61
R299R R299=
R488C R488C
R787* R787*
D1010N D1010N
E1286* E1286*
S33P S33P
D281E D281E
D281Y D281Y
70
E914D E914_P915delinsDS
E79K E79K
D32G D32G
73
WT X307_splice
R2204* R2204*
E1408* E1408*
D281G D281G
78
WT X307_splice
79
WT X33_splice
R177Q R177Q
E941* E941*
R2237* R2237*
S1400* S14

IndexError: list index out of range

In [355]:
a = 'ABCAADEFGH'
a[2:]

'CAADEFGH'

In [348]:
4-len(a)-4

4

In [341]:
len(wt_full_dna)

1915

In [342]:
len(wt_dna)

1857

In [343]:
exon_ins

Unnamed: 0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
0,4,762.0,ASXL1,20,31022441,31022442,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-003409-1958,NM_015338.5,642.0,13/13,ENST00000375687.4:c.1934dup,p.Gly646TrpfsTer12,p.G646Wfs*12,ENST00000375687.4,31022381,31022502,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...
1,5,645.0,APC,5,112175952,112175953,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-JHU-00198-00378,NM_000038.5,1554.0,16/16,ENST00000257430.4:c.4666dup,p.Thr1556AsnfsTer3,p.T1556Nfs*3,ENST00000257430.4,112175892,112176013,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...
2,21,280.0,ARID1A,1,27105930,27105931,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-001750-9089,NM_006015.4,1847.0,20/20,ENST00000324856.7:c.5548dup,p.Asp1850GlyfsTer4,p.D1850Gfs*4,ENST00000324856.7,27105870,27105991,CTCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTG...,CTCAGATAAGCTTGGGCGTGTGCAGGAGTTTGACAGTGGCCTGCTG...
4,55,108.0,TCF7L2,10,114925316,114925317,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-DFCI-151674-1525793,,482.0,15/15,ENST00000355995.4:c.1454dup,p.Cys486ValfsTer8,p.C486Vfs*8,ENST00000355995.4,114925256,114925377,CTCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTC...,CTCTCTCCCTTGGCATCTGTGCCCTCTATTCACAGATAACTCTCTC...
5,60,99.0,GATA3,10,8115874,8115875,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-002328-1774,,407.0,6/6,ENST00000346208.3:c.1221dup,p.Pro408AlafsTer99,p.P408Afs*99,ENST00000346208.3,8115814,8115935,AGAACAGCTCGTTTAACCCGGCCGCCCTCTCCAGACACATGTCCTC...,AGAACAGCTCGTTTAACCCGGCCGCCCTCTCCAGACACATGTCCTC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,8994,5.0,APC,5,112175621,112175622,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-MSK-P-0048781-T01-IM6,NM_000038.5,1444.0,16/16,ENST00000257430.4:c.4333dup,p.Thr1445AsnfsTer10,p.T1445Nfs*10,ENST00000257430.4,112175561,112175682,CCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACAC...,CCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACAC...
355,9010,5.0,SRSF2,17,74732960,74732961,inframe_insertion,In_Frame_Ins,INS,-,GGC,GENIE-UHN-AGI147550-BM1,NM_001195427.1,94.0,1/3,ENST00000359995.5:c.282_283insGCC,p.Arg94_Pro95insAla,p.R94_P95insA,ENST00000359995.5,74732900,74733021,AGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGTG...,AGCCACCGCCCCCGTACCTGCGGGGTGGCGGTCCCCGGCGGCTGTG...
356,9011,5.0,ZRSR2,X,15841255,15841256,protein_altering_variant,In_Frame_Ins,INS,-,AGCCGC,GENIE-UHN-AGI974552-BM1,NM_005089.3,447.0,11/11,ENST00000307771.7:c.1339_1340insAGCCGC,p.Ser447delinsLysProArg,p.S447delinsKPR,ENST00000307771.7,15841195,15841316,AGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCAGCCGGAGCC...,AGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCAGCCGGAGCC...
357,9013,5.0,TSC1,9,135771988,135771989,protein_altering_variant,In_Frame_Ins,INS,-,GCT,GENIE-UHN-DIVA273992-ARC1,NM_001162426.1,1043.0,23/23,ENST00000298552.3:c.3128_3129insAGC,p.Ser1043delinsArgAla,p.S1043delinsRA,ENST00000298552.3,135771928,135772049,CCACCGACTGCTGAATGGGCCTGCCCTCTGGTGTGGGGGTTTCTCT...,CCACCGACTGCTGAATGGGCCTGCCCTCTGGTGTGGGGGTTTCTCT...


In [316]:
wt_full_dna.transcribe().translate()

Seq('MKDKQKKKKERTWAEAARLVLENYSDAPMTPKQILQVIEAEGLKEMRSGTSPLA...VVR')

In [330]:
wt_dna = str(wt_dna)
mut_dna = str(alt_full_dna)
wt_seq = Bio.Seq.Seq(wt_dna).transcribe().translate()
mut_seq = Bio.Seq.Seq(alt_full_dna).transcribe().translate()

hgvsp = get_hgvsp_classification(wt_dna, mut_dna, wt_seq, mut_seq)
hgvsc = get_hgvsc(wt_dna, mut_dna)

print(hgvsp)

G646Wfs*12


In [333]:
len(mut_dna)

4624

In [326]:
mut_seq = Bio.Seq.Seq(alt_full_dna).transcribe().translate()



'CGAAGAAGCGCTTGTGGCCATGCTTCTGCTTGCGCAGGTAGCCGCACTTGCGCACGCTGTGGTTGTTGTTGTTGTTGTTGTTGTTGAGGTTGGGGCCGTCTCCGCTCGCCGGCCCGGGCG'

In [262]:
print(str(chr_seq[start-1:end]))
print(val['wt_w_context'])

CGAAGAAGCGCTTGTGGCCATGCTTCTGCTTGCGCAGGTAGCCGCACTTGCGCACGCTGTGGTTGTTGTTGTTGTTGTTGTTGTTGAGGTTGGGGCCGTCTCCGCTCGCCGGCCCGGGCGGCC
CGAAGAAGCGCTTGTGGCCATGCTTCTGCTTGCGCAGGTAGCCGCACTTGCGCACGCTGTTTGTTGTTGTTGTTGTTGTTGTTGAGGTTGGGGCCGTCTCCGCTCGCCGGCCCGGGCGGC


In [264]:
len('GTTGTTGTTGTTGTTGTTGTTGAGGTTGGGGCCGTCTCCGCTCGCCGGCCCGGGCGGC')

58

In [241]:
for i in range(3,10):
    print(i)

3
4
5
6
7
8
9


In [240]:
Bio.Seq.Seq(wt_dna.reverse_complement()).transcribe().translate()

Seq('MASPPRHGPPGPASGDGPNLNNNNNNNNHSVRKCGYLRKQKHGHKRFFVLRGPG...VKE')

In [227]:
val = ins_new.iloc[10]



tx = val['transcript_id_TRUE']
cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))

db[tx].strand

'-'

In [228]:
cds

[<Feature CDS (chr13:110408654-110408655[-]) at 0x7fe9d0f4e0d0>,
 <Feature CDS (chr13:110434389-110438400[-]) at 0x7fe9679b29d0>]

In [209]:
chr_seq[31025138:]

Seq('TAATAAATTATGGCCATGGGAAACATTGTATATTTAGTGTGTGTATTTTGATAA...NNN')

In [211]:
Bio.Seq.Seq(wt_dna).transcribe().translate()

Seq('MKDKQKKKKERTWAEAARLVLENYSDAPMTPKQILQVIEAEGLKEMRSGTSPLA...VR*')

In [200]:
val['seq_start']
val['seq_end']

31022503

In [325]:
def get_hgvsc(mutant_seq, ref_seq):
    """
    Returns the HGVSc notation for a mutant cDNA sequence compared to a reference sequence.

    Note: doesn't work for composite mutations. It will only classify the first mutant in the sequence.

    Need to make a matching function that converts from hgvsc to mutant_seq given ref_seq and hgvsc
    """
    if mutant_seq == ref_seq:
        return 'WT'  # no change, no HGVSc notation
    
    # find the position(s) of the variant(s) and the type of variation
    pos = []
    var_type = ""
    if len(mutant_seq) > len(ref_seq):  # insertion
        for i in range(len(ref_seq)):
            if mutant_seq[i] != ref_seq[i]:
                pos.append(i+1)

        var_type = "ins" + mutant_seq[pos[0]-1:pos[0] -1 + len(mutant_seq)-len(ref_seq)]
        #print(var_type)

    elif len(mutant_seq) < len(ref_seq):  # deletion
        for i in range(len(mutant_seq)):
            if mutant_seq[i] != ref_seq[i]:
                pos.append(i+1)

        #print(pos)
        #print(len(ref_seq)-len(mutant_seq))
        var_type = "del" + ref_seq[pos[0]-1: pos[0] -1 + len(ref_seq)-len(mutant_seq)]

    else:  # substitution; doesn't take into account multiple substitutions...
        for i in range(len(mutant_seq)):
            if mutant_seq[i] != ref_seq[i]:
                pos.append(i+1)
        
        if len(pos)==1:
            var_type = ref_seq[pos[0]-1] + ">" + mutant_seq[pos[0]-1]
        else:
            var_type = ref_seq[pos[0]-1:pos[-1]] + ">" + mutant_seq[pos[0]-1:pos[-1]]
    
    if var_type[0:3]=='ins':
        pos = [i-1 for i in pos]

    # generate the HGVSc notation
    if len(pos) == 1:
        hgvsc = "c." + str(pos[0]) + var_type
    else:
        if var_type[0:3]=='ins':
            hgvsc = "c." + str(pos[0]) + "_" + str(pos[0]+1) + var_type
        elif var_type[0:3]=='del':
            if len(ref_seq)-len(mutant_seq) ==1:
                hgvsc = "c." + str(pos[0]) + var_type
            else:
                hgvsc = "c." + str(pos[0]) + "_" + str(pos[0]+len(ref_seq)-len(mutant_seq)-1) + var_type
        else:
            hgvsc = "c." + str(pos[0]) + "_" + str(pos[-1]) + var_type
    
    return hgvsc

def get_hgvsp_classification(wt_dna, mut_dna, wt_seq, mut_seq):
    """
    Given a wildtype protein sequence and a mutant protein sequence, returns the HGVSp classification for the mutant sequence.

    Args:
        wt_seq (str): The wildtype protein sequence.
        mut_seq (str): The mutant protein sequence.

    Returns:
        str: The HGVSp classification for the mutant protein sequence.


    There are a bunch of cases where this fails miserably. The chain of logical statements doesn't necessarily make sense.
    """
    # Find the position of the first amino acid that differs between the wildtype and mutant sequences
    pos = None
    for i in range(len(wt_seq)):
        if i >= len(mut_seq) or wt_seq[i] != mut_seq[i]:
            pos = i + 1
            break

    #WT sequence (no change)
    if pos==None:
        #check for and identify silent substitution variants

        pp = None
        for i in range(0,len(mut_dna)-3,3):
            mut_cod = mut_dna[i:i+3]
            wt_cod = wt_dna[i:i+3]
            if mut_cod != wt_cod:
                pp = i+1            
        
        if pp!=None:
            pg = pp//3 
            return f'{mut_seq[pg]}{pg+1}{mut_seq[pg]}'
        else:
            return 'WT'

    # Check if the mutant sequence contains a stop codon
    if '*' in mut_seq and mut_seq.index('*') < pos:
        # The mutation is a termination variant
        pos = mut_seq.index('*') + 1
        return f'{wt_seq[pos-1]}{pos}*'


    # Check the difference in sequence length between the wildtype and mutant sequences
    diff_len1 = len(mut_dna) - len(wt_dna)

    diff_len = len(mut_seq) - len(wt_seq)
    #print(diff_len)

    if diff_len1 == 0:
        # The mutation is a substitution
        return f'{wt_seq[pos-1]}{pos}{mut_seq[pos-1]}' 
        #doesn't take into account potential differences 

    elif diff_len1 > 0:
        # The mutation is an insertion
        ins_pos = pos + 1
        
        #check if it's a frameshift based on mRNA sequence
        #not a frameshift if divisible by 3
        if diff_len1 % 3 ==0:
            
            return f'{wt_seq[pos-2]}{pos-1}_{wt_seq[pos-1]}{ins_pos-1}ins{mut_seq[pos-1: pos-1+diff_len]}'
        
        #otherwise it will be a frameshift
        else:
            stop_pos = len(mut_seq)
            for i in range(pos, len(mut_seq)):
                if mut_seq[i]=="*":
                    stop_pos = i+2-pos
                    break

            return f'{wt_seq[pos-1]}{pos}{mut_seq[pos-1]}fs*{stop_pos}'

    
    else:
        # The mutation is a deletion or a more complex variant
        # Check if the mutation is a pure deletion or a more complex variant
        deleted = True
        for i in range(pos, pos - diff_len):
            if i >= len(mut_seq) or mut_seq[i] != wt_seq[i-1]:
                deleted = False
                break

        if deleted:
            # The mutation is a deletion
            del_len = -diff_len
            return f'{wt_seq[pos-1]}{pos}_{wt_seq[pos+del_len-1]}{pos+del_len-1}del'
        
        else:
            # The mutation is a complex variant
            # Find the first stop codon downstream of the mutation
            stop_pos = len(mut_seq)
            for i in range(pos, len(mut_seq)):
                if mut_seq[i]=="*":
                    stop_pos = i+2-pos
                    break

            return f'{wt_seq[pos-1]}{pos}{mut_seq[pos-1]}fs*{stop_pos}'