# Alvin Oligo Generator

Formats mutations into wild-type (WT) and alternate (ALT) sequences, each with flanking genomic context, for ML-based pegRNA generation.

- Going to do this with: 
    - Diego's IDR library
        - Need to filter out a few INS/DEL that aren't pure INS/DEL (they're complex indels)
    - p53 follow-up library
        - Not going to worry about the random indels
        - I can include the efficient pegRNAs that I have for a subset of these (e.g. I254_I255insH)

In [2]:
import numpy as np
import regex as re
import pandas as pd
import matplotlib.pyplot as plt
import Bio.Seq
import warnings
import pegg
import gffutils
warnings.filterwarnings('ignore')

In [3]:
# Reference inputs: the GRCh37 genome FASTA (read through pegg) and the
# gencode v19 annotation database (read through gffutils).
filepath = '/Users/samgould/Desktop/FSR Lab/reference files/GRCh37/ncbi-genomes-2022-03-17/GCF_000001405.25_GRCh37.p13_genomic.fna.gz'
file = '/Users/samgould/Desktop/FSR Lab/reference files/gencode_v19.db'

# The functions below look a chromosome up as records[index_list[i]].
records, index_list = pegg.genome_loader(filepath)

# Queried per transcript ID (CDS children and strand) by frame_determiner.
db = gffutils.FeatureDB(file)

In [116]:
def df_formatter(df, context_size = 60):

    """
    Build the WT and ALT oligo (variant plus flanking genomic context) for each variant.

    Parameters
    ----------
    df : pandas.DataFrame
        Variants in cBioPortal format. Must contain 'Variant_Type',
        'Start_Position', 'End_Position', 'Reference_Allele',
        'Tumor_Seq_Allele2' and 'Chromosome', plus every annotation column
        listed in cols_to_save below.
    context_size : int
        The amount of nt on either side of the variant,
        e.g. AAA(A/G)AAA = context_size of 3.

    Returns
    -------
    pandas.DataFrame
        The saved columns plus:
        - seq_start / seq_end: 1-based, inclusive genomic coordinates of the WT window
        - wt_w_context / alt_w_context: the WT and ALT sequences
        The index is reset and the previous index is kept in an 'index' column.

    Raises
    ------
    ValueError
        If a row has a Variant_Type other than SNP, ONP, DNP, INS or DEL.
    AssertionError
        If the reconstructed WT sequence does not match the reference genome.

    Notes
    -----
    Reads the module-level `records` and `index_list` (the loaded genome).
    For INS rows the window contains both flanking bases (Start_Position and
    End_Position), so it is one nt longer than a SNP window.
    """

    wt_w_context = []
    alt_w_context = []

    seq_start = []
    seq_end = []

    for row_label, val in df.iterrows():
        vt = val['Variant_Type']
        s = val['Start_Position']
        e = val['End_Position']
        chrom = val['Chromosome']

        #chromosome label -> position in index_list (1..22 -> 0..21, X -> 22, Y -> 23)
        if chrom == 'X':
            chrom = 22
        elif chrom == 'Y':
            chrom = 23
        else:
            chrom = int(chrom) - 1

        #only the window is upper-cased below, never the whole chromosome
        chr_seq = records[index_list[chrom]].seq

        if vt in ('SNP', 'ONP', 'DNP', 'DEL'):
            ref = val['Reference_Allele']
            alt = '' if vt == 'DEL' else val['Tumor_Seq_Allele2']
            left_end = s - 1
            right_start = e
        elif vt == 'INS':
            #INS reference alleles are blank, so both flanking bases are part of the context
            ref = ''
            alt = val['Tumor_Seq_Allele2']
            left_end = s
            right_start = e - 1
        else:
            #previously an unknown type silently reused the contexts of the previous row
            raise ValueError(f"row {row_label}: unsupported Variant_Type {vt!r}")

        start = s - context_size
        end = e + context_size

        left_context = chr_seq[start-1:left_end].upper()
        right_context = chr_seq[right_start:end].upper()

        wt_seq = str(left_context + ref + right_context)
        alt_seq = str(left_context + alt + right_context)

        #the WT oligo has to be exactly the reference window; raised explicitly so
        #that the check survives python -O and carries a readable message
        expected = str(chr_seq[start-1:end].upper())
        if expected != wt_seq:
            raise AssertionError(
                f"row {row_label}: WT oligo does not match the reference genome\n{expected}\n{wt_seq}"
            )

        wt_w_context.append(wt_seq)
        alt_w_context.append(alt_seq)
        seq_start.append(start)
        seq_end.append(end)

    cols_to_save = ['COUNT', 'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Consequence', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode', 'RefSeq', 'Protein_position', 'Exon_Number', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'transcript_id_TRUE']
    #explicit copy so that the new columns are not written onto a slice of df
    df_new = df[cols_to_save].copy()

    df_new['seq_start'] = seq_start
    df_new['seq_end'] = seq_end
    df_new['wt_w_context'] = wt_w_context
    df_new['alt_w_context'] = alt_w_context
    df_new = df_new.reset_index()

    return df_new

In [234]:
# Display the IDR oligo table (idr_oligos is assigned from df_formatter in a cell further down this export).
idr_oligos

Unnamed: 0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context
0,0,1809.0,EIF1AX,X,20156758,20156758,intron_variant,Intron,DEL,A,-,GENIE-COLU-951-1,NM_001412.3,,,ENST00000379607.5:c.17-11del,,p.*6*,ENST00000379607.5,20156698,20156818,TCATTCTCATTCTTACCCCTGCGTCTGTTTTTACCTCCTTTACCTG...,TCATTCTCATTCTTACCCCTGCGTCTGTTTTTACCTCCTTTACCTG...
1,1,1512.0,TP53,17,7577094,7577094,missense_variant,Missense_Mutation,SNP,G,A,GENIE-JHU-00113-00335,NM_001126112.2,282.0,8/11,ENST00000269305.4:c.844C>T,p.Arg282Trp,p.R282W,ENST00000269305.4,7577034,7577154,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...
2,2,983.0,RNF43,17,56435161,56435161,frameshift_variant,Frame_Shift_Del,DEL,C,-,GENIE-DFCI-449465-4787362,,659.0,9/10,ENST00000407977.2:c.1976del,p.Gly659ValfsTer41,p.G659Vfs*41,ENST00000407977.2,56435101,56435221,TGGCAAGCTGGGTGCACAGTTGCATCCTGGGGCCGAGAGCCAGGGG...,TGGCAAGCTGGGTGCACAGTTGCATCCTGGGGCCGAGAGCCAGGGG...
3,3,777.0,APC,5,112175639,112175639,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00223-00468,NM_000038.5,1450.0,16/16,ENST00000257430.4:c.4348C>T,p.Arg1450Ter,p.R1450*,ENST00000257430.4,112175579,112175699,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...
4,4,762.0,ASXL1,20,31022441,31022442,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-003409-1958,NM_015338.5,642.0,13/13,ENST00000375687.4:c.1934dup,p.Gly646TrpfsTer12,p.G646Wfs*12,ENST00000375687.4,31022381,31022502,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9090,9090,5.0,ERCC5,13,103527930,103527931,missense_variant,Missense_Mutation,DNP,GG,CA,GENIE-PROV-9b34c1d29d-5652092fc3,NM_000123.3,1080.0,15/15,ENST00000355739.4:c.3238_3239delinsCA,p.Gly1080Gln,p.G1080Q,ENST00000355739.4,103527870,103527991,ATAACAAATACCTTAGAAGAGTCATCAAGCCTGAAAAGAAAGAGGC...,ATAACAAATACCTTAGAAGAGTCATCAAGCCTGAAAAGAAAGAGGC...
9091,9091,5.0,ARID5B,10,63850742,63850742,missense_variant,Missense_Mutation,SNP,C,T,GENIE-PROV-1a9954b812-1203eb4b3a,NM_032199.2,507.0,10/10,ENST00000279873.7:c.1520C>T,p.Pro507Leu,p.P507L,ENST00000279873.7,63850682,63850802,CTCTCCCAGCAGCAGACATGAAGAAAAAAATAGAAGGGTATCAGGA...,CTCTCCCAGCAGCAGACATGAAGAAAAAAATAGAAGGGTATCAGGA...
9092,9092,5.0,ABL2,1,179078007,179078007,missense_variant,Missense_Mutation,SNP,T,C,GENIE-PROV-6a280d78a3-155ebf48a8,NM_001168237.1,799.0,12/12,ENST00000502732.1:c.2395A>G,p.Met799Val,p.M799V,ENST00000502732.1,179077947,179078067,AGGTGGACACTGTCCTTTCCAGCTGGAGTTTGGACCTCTGGCAGTT...,AGGTGGACACTGTCCTTTCCAGCTGGAGTTTGGACCTCTGGCAGTT...
9093,9093,5.0,SOX10,22,38369885,38369885,missense_variant,Missense_Mutation,SNP,C,T,GENIE-PROV-238a11a4e2-b283f19879,,340.0,5/5,ENST00000360880.2:c.1018G>A,p.Val340Met,p.V340M,ENST00000360880.2,38369825,38369945,TCTCTGTCTTCACCTGGGCTTTGGCATCCACACCAGGTGGTGAGAC...,TCTCTGTCTTCACCTGGGCTTTGGCATCCACACCAGGTGGTGAGAC...


In [236]:
def frame_determiner(exon_subs):
    """
    Determines the frame of the WT/ALT sequence (0, 1, or 2)
    and selects the CDS subset + puts it in the correct orientation.

    Parameters
    ----------
    exon_subs : pandas.DataFrame
        Must contain 'transcript_id_TRUE', 'Chromosome', 'seq_start',
        'seq_end', 'wt_w_context' and 'alt_w_context'. 'Variant_Classification'
        and 'HGVSp_Short' are read only when a row fails the translation check.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object, modified in place, with these added columns:
        - Strand: strand of the transcript in the annotation database
        - Frame: number of leading nt to skip in the oriented CDS subsequence
          so that translation starts on a codon boundary
        - CDS_wt_start / CDS_wt_end: slice bounds (0-based, end exclusive) into
          wt_w_context, running from the first to the last CDS base in the window
        - CDS_alt_start / CDS_alt_end: the matching bounds into alt_w_context
        - CDS_wt_correct_orientation / CDS_alt_correct_orientation: those
          slices, reverse-complemented for '-' strand transcripts
        Rows whose window contains no CDS base get the string 'None' in every
        column except Strand.

    Notes
    -----
    Reads the module-level `db`, `records` and `index_list`.
    Prints 'error' plus the row label, Variant_Classification and HGVSp_Short
    whenever the translated WT subsequence is not found in the translated CDS.
    The slice runs from the first to the last CDS base in the window, so it
    also contains any non-CDS bases lying between two CDS segments.
    """

    frame_list = []
    coding_start_list = []
    coding_end_list = []
    coding_alt_start_list = []
    coding_alt_end_list = []
    CDS_WT_correct_orientation = []
    CDS_alt_correct_orientation = []
    strand_list = []

    for index1, val in exon_subs.iterrows():
        tx = val['transcript_id_TRUE']
        # NOTE(review): the CDS concatenation below assumes the features come back
        # in ascending genomic order -- confirm that is what order_by='+end' yields
        cds = list(db.children(tx, order_by='+end', featuretype=['CDS']))
        start_end_cds = [[i.start, i.end] for i in cds]
        strand = db[tx].strand
        strand_list.append(strand)

        chrom = val['Chromosome']

        if chrom == 'X':
            chrom = 22
        elif chrom == 'Y':
            chrom = 23
        else:
            chrom = int(chrom) - 1

        #only the CDS slices are upper-cased below, never the whole chromosome
        chr_seq = records[index_list[chrom]].seq

        #genomic coordinate of every CDS base, in the order the CDS features were returned
        codon_locs = [pos for cds_start, cds_end in start_end_cds for pos in range(cds_start, cds_end + 1)]
        #set copy for O(1) membership tests (the list is kept because its order gives the frame)
        coding_positions = set(codon_locs)

        #get information about the wt/alt sequence
        start = val['seq_start']
        end = val['seq_end']
        wt_seq = val['wt_w_context']
        alt_seq = val['alt_w_context']

        #window positions that fall inside a CDS
        inc = [pos for pos in range(start, end + 1) if pos in coding_positions]

        if len(inc) == 0:
            #means it's not in a coding region (could be splice or intron)
            frame_list.append('None')
            coding_start_list.append('None')
            coding_end_list.append('None')
            CDS_WT_correct_orientation.append('None')
            CDS_alt_correct_orientation.append('None')
            coding_alt_start_list.append('None')
            coding_alt_end_list.append('None')
            continue

        #wt_w_context[k] is genomic position seq_start + k
        wt_start = int(inc[0] - start)
        wt_end = int(inc[-1] - start) + 1

        #record what part of the subsequence is part of a CDS
        coding_start_list.append(wt_start)
        coding_end_list.append(wt_end)

        subseq = Bio.Seq.Seq(wt_seq[wt_start:wt_end])

        #also generate the alternate sequence
        # NOTE(review): moving only the end by the length difference assumes the
        # variant lies between wt_start and wt_end; for an indel in the flanking
        # non-CDS part of the window these bounds look wrong -- confirm
        diff = len(alt_seq) - len(wt_seq)
        alt_start = wt_start
        alt_end = wt_end + diff

        alt_subseq = Bio.Seq.Seq(alt_seq[alt_start:alt_end])
        coding_alt_start_list.append(alt_start)
        coding_alt_end_list.append(alt_end)

        wt_dna = ''.join(str(chr_seq[cds_start-1:cds_end]).upper() for cds_start, cds_end in start_end_cds)

        if strand == '-':
            subseq = subseq.reverse_complement()
            alt_subseq = alt_subseq.reverse_complement()
            #on the - strand the first transcribed base in the window is the last genomic one
            first_coding_pos = inc[-1]
            #have to reverse this list as well if the transcript is in the - orientation
            transcript_order = codon_locs[::-1]
            full_prot = str(Bio.Seq.Seq(wt_dna).reverse_complement().transcribe().translate())
        else:
            first_coding_pos = inc[0]
            transcript_order = codon_locs
            full_prot = str(Bio.Seq.Seq(wt_dna).transcribe().translate())

        CDS_WT_correct_orientation.append(str(subseq))
        CDS_alt_correct_orientation.append(str(alt_subseq))

        #determine the frame: offset of the first coding base within the CDS, then
        #the number of bases needed to reach the next codon boundary
        start_idx = transcript_order.index(first_coding_pos)

        frame = (3 - (start_idx % 3)) % 3
        frame_list.append(frame)

        #and then confirm whether the subsequence is in the protein
        sub1 = str(Bio.Seq.Seq(subseq[frame:]).transcribe().translate())

        if sub1 not in full_prot:
            print('error')
            print(index1)
            print(val['Variant_Classification'])
            print(val['HGVSp_Short'])

    exon_subs['Strand'] = strand_list
    exon_subs['Frame'] = frame_list
    exon_subs['CDS_wt_start'] = coding_start_list
    exon_subs['CDS_wt_end'] = coding_end_list
    exon_subs['CDS_wt_correct_orientation'] = CDS_WT_correct_orientation
    exon_subs['CDS_alt_start'] = coding_alt_start_list
    exon_subs['CDS_alt_end'] = coding_alt_end_list
    exon_subs['CDS_alt_correct_orientation'] = CDS_alt_correct_orientation

    return exon_subs

# Diego's library

In [16]:
idr = pd.read_csv('filtered_idr_mutations_5count_6nt_indels.csv')

# A pure deletion has '-' as its alternate allele and a pure insertion has '-'
# as its reference allele; any other DEL/INS row is a complex indel and is dropped.
dels1 = idr.loc[idr['Variant_Type'] == 'DEL']
ins1 = idr.loc[idr['Variant_Type'] == 'INS']

bad_del_idx = dels1.loc[dels1['Tumor_Seq_Allele2'] != '-'].index.tolist()
bad_ins_idx = ins1.loc[ins1['Reference_Allele'] != '-'].index.tolist()
bad_idxs = bad_del_idx + bad_ins_idx

# renumber the remaining rows from 0 without keeping the old index
idr_filtered = idr.drop(index=bad_idxs).reset_index(drop=True)

idr_filtered


Unnamed: 0,COUNT,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,...,HGVSp_Short,Transcript_ID,RefSeq,Protein_position,Codons,Exon_Number,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,FILTER,Polyphen_Prediction,Polyphen_Score,SIFT_Prediction,SIFT_Score,SWISSPROT,n_depth,t_depth,Annotation_Status,mutationInCis_Flag
0,1809.0,EIF1AX,1964.0,COLU,GRCh37,X,20156758,20156758,+,intron_variant,Intron,DEL,A,A,-,novel,,GENIE-COLU-951-1,NORMAL,,,,,,,...,p.*6*,ENST00000379607,NM_001412.3,,,,,,,,,,,,,PASS,,,,,,,,SUCCESS,False
1,1512.0,TP53,7157.0,JHU,GRCh37,17,7577094,7577094,+,missense_variant,Missense_Mutation,SNP,G,,A,rs28934574,,GENIE-JHU-00113-00335,,,,,,,,...,p.R282W,ENST00000269305,NM_001126112.2,282.0,Cgg/Tgg,8/11,0.000004,0.0,0.000000,0.0,0.000000,0.000000,0.000009,0.000000,0.000000,PASS,probably_damaging,1.000,deleterious,0.00,,,728.0,SUCCESS,False
2,983.0,RNF43,54894.0,DFCI,GRCh37,17,56435161,56435161,+,frameshift_variant,Frame_Shift_Del,DEL,C,C,-,rs781215815,,GENIE-DFCI-449465-4787362,,,,,,,,...,p.G659Vfs*41,ENST00000407977,,659.0,gGt/gt,9/10,,,,,,,,,,PASS,,,,,,,263.0,SUCCESS,False
3,777.0,APC,324.0,JHU,GRCh37,5,112175639,112175639,+,stop_gained,Nonsense_Mutation,SNP,C,,T,rs121913332,,GENIE-JHU-00223-00468,,,,,,,,...,p.R1450*,ENST00000257430,NM_000038.5,1450.0,Cga/Tga,16/16,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,PASS,,,,,,,1391.0,SUCCESS,False
4,762.0,ASXL1,171023.0,DFCI,GRCh37,20,31022441,31022442,+,frameshift_variant,Frame_Shift_Ins,INS,-,-,G,rs756958159,,GENIE-DFCI-003409-1958,,,,,,,,...,p.G646Wfs*12,ENST00000375687,NM_015338.5,642.0,-/G,13/13,,,,,,,,,,PASS,,,,,,,83.0,SUCCESS,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9090,5.0,ERCC5,2073.0,PROV,GRCh37,13,103527930,103527931,+,missense_variant,Missense_Mutation,DNP,GG,,CA,rs587778291,,GENIE-PROV-9b34c1d29d-5652092fc3,,,,,,,,...,p.G1080Q,ENST00000355739,NM_000123.3,1080.0,GGa/CAa,15/15,,,,,,,,,,PASS,benign,0.000,tolerated,0.83,,,789.0,SUCCESS,False
9091,5.0,ARID5B,84159.0,PROV,GRCh37,10,63850742,63850742,+,missense_variant,Missense_Mutation,SNP,C,,T,,,GENIE-PROV-1a9954b812-1203eb4b3a,,,,,,,,...,p.P507L,ENST00000279873,NM_032199.2,507.0,cCc/cTc,10/10,,,,,,,,,,PASS,benign,0.003,tolerated,0.25,,,1204.0,SUCCESS,False
9092,5.0,ABL2,27.0,PROV,GRCh37,1,179078007,179078007,+,missense_variant,Missense_Mutation,SNP,T,,C,rs781504947,,GENIE-PROV-6a280d78a3-155ebf48a8,,,,,,,,...,p.M799V,ENST00000502732,NM_001168237.1,799.0,Atg/Gtg,12/12,0.000044,0.0,0.000029,0.0,0.000000,0.000092,0.000053,0.000163,0.000033,PASS,benign,0.003,tolerated_low_confidence,1.00,,,1230.0,SUCCESS,False
9093,5.0,SOX10,6663.0,PROV,GRCh37,22,38369885,38369885,+,missense_variant,Missense_Mutation,SNP,C,,T,rs549034055,,GENIE-PROV-238a11a4e2-b283f19879,,,,,,,,...,p.V340M,ENST00000360880,,340.0,Gtg/Atg,5/5,0.000020,0.0,0.000029,0.0,0.000055,0.000000,0.000009,0.000000,0.000065,PASS,possibly_damaging,0.622,tolerated,0.26,,,641.0,SUCCESS,False


In [39]:
# Distinct Variant_Classification values present in the filtered IDR variants.
np.unique(idr_filtered['Variant_Classification'])

array(["3'UTR", "5'UTR", 'Frame_Shift_Del', 'Frame_Shift_Ins',
       'In_Frame_Del', 'In_Frame_Ins', 'Intron', 'Missense_Mutation',
       'Nonsense_Mutation', 'Silent', 'Splice_Region', 'Splice_Site',
       'Translation_Start_Site'], dtype=object)

In [18]:
# Also add in the transcript information: the part of each HGVSc string
# before the first ':' (e.g. 'ENST00000269305.4' in 'ENST00000269305.4:c.844C>T').
tx_new = [hgvsc.split(':')[0] for hgvsc in idr_filtered['HGVSc']]

idr_filtered['transcript_id_TRUE'] = tx_new

In [230]:
# Build the WT/ALT oligos for the filtered IDR variants, using df_formatter's default context_size.
idr_oligos = df_formatter(idr_filtered)

In [237]:
# Add the Strand, Frame and CDS_* columns. frame_determiner prints 'error' followed by the
# row label, Variant_Classification and HGVSp_Short for each row that fails its translation check.
idr_oligos = frame_determiner(idr_oligos)

error
3266
Intron
p.*1354*


In [254]:
#and spot checking
# Prints the variant type, strand, frame and HGVSp of one oligo, then the
# translation of its WT and ALT coding subsequences read in that frame.
i = 2609

#i = list(idr_oligos[idr_oligos['Variant_Type']=='DEL'][0:50].index)[10]

# Every field is read from the same positional row (the original mixed .loc and .iloc).
row = idr_oligos.iloc[i]
wt1 = row['CDS_wt_correct_orientation']
alt1 = row['CDS_alt_correct_orientation']
frame = row['Frame']
hgvsp = row['HGVSp_Short']
s = row['Strand']
v = row['Variant_Type']
print(v)
print(s)
print(frame)
print(hgvsp)
if frame == 'None':
    # frame_determiner stores the string 'None' for windows without any CDS base
    print('not in a coding region - nothing to translate')
else:
    print(Bio.Seq.Seq(wt1[frame:]).transcribe().translate())
    print(Bio.Seq.Seq(alt1[frame:]).transcribe().translate())

SNP
-
0
p.M1?
MAEAPQVVEIDPDFEPLPRP
VAEAPQVVEIDPDFEPLPRP


In [241]:
# List the Variant_Classification values present in the oligo table; the coding ones are
# picked out of this list when the MMR_evasive_amenable flag is built.
np.unique(idr_oligos['Variant_Classification'])

array(["3'UTR", "5'UTR", 'Frame_Shift_Del', 'Frame_Shift_Ins',
       'In_Frame_Del', 'In_Frame_Ins', 'Intron', 'Missense_Mutation',
       'Nonsense_Mutation', 'Silent', 'Splice_Region', 'Splice_Site',
       'Translation_Start_Site'], dtype=object)

In [257]:
# Last step: flag the oligos whose Variant_Classification is one of the coding
# classes below; those rows are marked as MMR_evasive_amenable.
cols = [
    'Frame_Shift_Del',
    'Frame_Shift_Ins',
    'In_Frame_Del',
    'In_Frame_Ins',
    'Missense_Mutation',
    'Nonsense_Mutation',
    'Silent',
    'Translation_Start_Site',
]
#cols2 = ['Translation_Start_Site']
#idr_oligos[idr_oligos['Variant_Classification'].isin(cols)]

# one True/False per row, in row order
mmr_evasive_amenable = idr_oligos['Variant_Classification'].isin(cols).tolist()

idr_oligos['MMR_evasive_amenable'] = mmr_evasive_amenable

In [259]:
# Show only the oligos flagged as MMR_evasive_amenable (the column is boolean).
idr_oligos[idr_oligos['MMR_evasive_amenable']]

Unnamed: 0,index,COUNT,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,RefSeq,Protein_position,Exon_Number,HGVSc,HGVSp,HGVSp_Short,transcript_id_TRUE,seq_start,seq_end,wt_w_context,alt_w_context,Strand,Frame,CDS_wt_start,CDS_wt_end,CDS_wt_correct_orientation,CDS_alt_start,CDS_alt_end,CDS_alt_correct_orientation,MMR_evasive_amenable
1,1,1512.0,TP53,17,7577094,7577094,missense_variant,Missense_Mutation,SNP,G,A,GENIE-JHU-00113-00335,NM_001126112.2,282.0,8/11,ENST00000269305.4:c.844C>T,p.Arg282Trp,p.R282W,ENST00000269305.4,7577034,7577154,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...,CTGGGGGCAGCTCGTGGTGAGGCTCCCCTTTCTTGCGGAGATTCTC...,-,0,0,121,GGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCT...,0,121,GGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCT...,True
2,2,983.0,RNF43,17,56435161,56435161,frameshift_variant,Frame_Shift_Del,DEL,C,-,GENIE-DFCI-449465-4787362,,659.0,9/10,ENST00000407977.2:c.1976del,p.Gly659ValfsTer41,p.G659Vfs*41,ENST00000407977.2,56435101,56435221,TGGCAAGCTGGGTGCACAGTTGCATCCTGGGGCCGAGAGCCAGGGG...,TGGCAAGCTGGGTGCACAGTTGCATCCTGGGGCCGAGAGCCAGGGG...,-,2,0,121,TGTTCAACTTGCAAAAATCCAGCCTCTCTGCCCGACACCCACAGAG...,0,120,TGTTCAACTTGCAAAAATCCAGCCTCTCTGCCCGACACCCACAGAG...,True
3,3,777.0,APC,5,112175639,112175639,stop_gained,Nonsense_Mutation,SNP,C,T,GENIE-JHU-00223-00468,NM_000038.5,1450.0,16/16,ENST00000257430.4:c.4348C>T,p.Arg1450Ter,p.R1450*,ENST00000257430.4,112175579,112175699,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,+,0,0,121,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,0,121,ACCATGCCACCAAGCAGAAGTAAAACACCTCCACCACCTCCTCAAA...,True
4,4,762.0,ASXL1,20,31022441,31022442,frameshift_variant,Frame_Shift_Ins,INS,-,G,GENIE-DFCI-003409-1958,NM_015338.5,642.0,13/13,ENST00000375687.4:c.1934dup,p.Gly646TrpfsTer12,p.G646Wfs*12,ENST00000375687.4,31022381,31022502,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,+,1,0,122,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,0,123,GCAGGTCCGAGGGGCGAGAGGTCACCACTGCCATAGAGAGGCGGCC...,True
5,5,645.0,APC,5,112175952,112175953,frameshift_variant,Frame_Shift_Ins,INS,-,A,GENIE-JHU-00198-00378,NM_000038.5,1554.0,16/16,ENST00000257430.4:c.4666dup,p.Thr1556AsnfsTer3,p.T1556Nfs*3,ENST00000257430.4,112175892,112176013,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...,+,2,0,122,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...,0,123,GGAATGAAACAGAATCAGAGCAGCCTAAAGAATCAAATGAAAACCA...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9090,9090,5.0,ERCC5,13,103527930,103527931,missense_variant,Missense_Mutation,DNP,GG,CA,GENIE-PROV-9b34c1d29d-5652092fc3,NM_000123.3,1080.0,15/15,ENST00000355739.4:c.3238_3239delinsCA,p.Gly1080Gln,p.G1080Q,ENST00000355739.4,103527870,103527991,ATAACAAATACCTTAGAAGAGTCATCAAGCCTGAAAAGAAAGAGGC...,ATAACAAATACCTTAGAAGAGTCATCAAGCCTGAAAAGAAAGAGGC...,+,0,0,122,ATAACAAATACCTTAGAAGAGTCATCAAGCCTGAAAAGAAAGAGGC...,0,122,ATAACAAATACCTTAGAAGAGTCATCAAGCCTGAAAAGAAAGAGGC...,True
9091,9091,5.0,ARID5B,10,63850742,63850742,missense_variant,Missense_Mutation,SNP,C,T,GENIE-PROV-1a9954b812-1203eb4b3a,NM_032199.2,507.0,10/10,ENST00000279873.7:c.1520C>T,p.Pro507Leu,p.P507L,ENST00000279873.7,63850682,63850802,CTCTCCCAGCAGCAGACATGAAGAAAAAAATAGAAGGGTATCAGGA...,CTCTCCCAGCAGCAGACATGAAGAAAAAAATAGAAGGGTATCAGGA...,+,2,0,121,CTCTCCCAGCAGCAGACATGAAGAAAAAAATAGAAGGGTATCAGGA...,0,121,CTCTCCCAGCAGCAGACATGAAGAAAAAAATAGAAGGGTATCAGGA...,True
9092,9092,5.0,ABL2,1,179078007,179078007,missense_variant,Missense_Mutation,SNP,T,C,GENIE-PROV-6a280d78a3-155ebf48a8,NM_001168237.1,799.0,12/12,ENST00000502732.1:c.2395A>G,p.Met799Val,p.M799V,ENST00000502732.1,179077947,179078067,AGGTGGACACTGTCCTTTCCAGCTGGAGTTTGGACCTCTGGCAGTT...,AGGTGGACACTGTCCTTTCCAGCTGGAGTTTGGACCTCTGGCAGTT...,-,0,0,121,CCAAGGTCAAACTCTACATCTTCCATGTCCTCAGGGCTTCCAGAGC...,0,121,CCAAGGTCAAACTCTACATCTTCCATGTCCTCAGGGCTTCCAGAGC...,True
9093,9093,5.0,SOX10,22,38369885,38369885,missense_variant,Missense_Mutation,SNP,C,T,GENIE-PROV-238a11a4e2-b283f19879,,340.0,5/5,ENST00000360880.2:c.1018G>A,p.Val340Met,p.V340M,ENST00000360880.2,38369825,38369945,TCTCTGTCTTCACCTGGGCTTTGGCATCCACACCAGGTGGTGAGAC...,TCTCTGTCTTCACCTGGGCTTTGGCATCCACACCAGGTGGTGAGAC...,-,0,0,121,CTGGGCAGTGCCCTGGCCGTGGCCAGTGGACACTCCGCCTGGATCT...,0,121,CTGGGCAGTGCCCTGGCCGTGGCCAGTGGACACTCCGCCTGGATCT...,True


In [261]:
#idr_oligos.to_csv('Diego_IDR_library_oligos.csv', index=False)

# p53 library generation

- Working off of the initial/original p53 input library
- Going to get all of the CORRECT insertion/deletions this time...

In [285]:
# step 1 = filter out large INS/DEL (over 10 nt)
# step 2 = filter out complex indels (non-pure INS/DEL)

p53 = pd.read_csv('p53_combined_FINAL2.csv')

cutoff_size = 10

# step 1 filtration: an insertion is sized by its alternate allele and a
# deletion by its reference allele; other variant types are never excluded here
allele_for_size = {'INS': 'Tumor_Seq_Allele2', 'DEL': 'Reference_Allele'}
exclude_list = [
    row_label
    for row_label, variant in p53.iterrows()
    if variant['Variant_Type'] in allele_for_size
    and len(variant[allele_for_size[variant['Variant_Type']]]) > cutoff_size
]

p53 = p53.drop(index=exclude_list).reset_index(drop=True)

# step 2 filtration: a pure DEL has '-' as its alternate allele and a pure INS
# has '-' as its reference allele
dels1 = p53.loc[p53['Variant_Type'] == 'DEL']
ins1 = p53.loc[p53['Variant_Type'] == 'INS']

bad_del_idx = dels1.loc[dels1['Tumor_Seq_Allele2'] != '-'].index.tolist()
bad_ins_idx = ins1.loc[ins1['Reference_Allele'] != '-'].index.tolist()
bad_idxs = bad_del_idx + bad_ins_idx

# renumber the rows and drop the 'Unnamed: 0' column read from the csv
p53_filtered = p53.drop(index=bad_idxs).reset_index(drop=True).drop(columns='Unnamed: 0')



In [294]:
# every p53 variant is assigned the same transcript ID
p53_filtered['transcript_id_TRUE'] = 'ENST00000269305.4'

# The p53 table does not have the columns the first df_formatter saves. Rather than
# renaming columns to match that function, a second version of it takes the
# list of columns to keep as an argument; here that list is every column.
cols_to_save = p53_filtered.columns.tolist()
cols_to_save

['Hugo_Symbol',
 'Chromosome',
 'Start_Position',
 'End_Position',
 'Variant_Type',
 'Reference_Allele',
 'Tumor_Seq_Allele2',
 'Strand',
 'HGVSp',
 'num_occurences',
 'codon',
 'ref_aa',
 'mut_aa',
 'Most frequent HGVSc',
 'classification',
 'transcript_id_TRUE']

In [293]:
def df_formatter(df, cols_to_save, context_size = 60):

    """
    Build the WT and ALT oligo (variant plus flanking genomic context) for each variant.

    Parameters
    ----------
    df : pandas.DataFrame
        Variants in cBioPortal format. Must contain 'Variant_Type',
        'Start_Position', 'End_Position', 'Reference_Allele',
        'Tumor_Seq_Allele2' and 'Chromosome', plus every column named in
        cols_to_save.
    cols_to_save : list of str
        Columns of df to carry over into the output.
    context_size : int
        The amount of nt on either side of the variant,
        e.g. AAA(A/G)AAA = context_size of 3.

    Returns
    -------
    pandas.DataFrame
        The saved columns plus:
        - seq_start / seq_end: 1-based, inclusive genomic coordinates of the WT window
        - wt_w_context / alt_w_context: the WT and ALT sequences
        The index is reset and the previous index is kept in an 'index' column.

    Raises
    ------
    ValueError
        If a row has a Variant_Type other than SNP, ONP, DNP, INS or DEL.
    AssertionError
        If the reconstructed WT sequence does not match the reference genome.

    Notes
    -----
    Reads the module-level `records` and `index_list` (the loaded genome).
    For INS rows the window contains both flanking bases (Start_Position and
    End_Position), so it is one nt longer than a SNP window.
    """

    wt_w_context = []
    alt_w_context = []

    seq_start = []
    seq_end = []

    for row_label, val in df.iterrows():
        vt = val['Variant_Type']
        s = val['Start_Position']
        e = val['End_Position']
        chrom = val['Chromosome']

        #chromosome label -> position in index_list (1..22 -> 0..21, X -> 22, Y -> 23)
        if chrom == 'X':
            chrom = 22
        elif chrom == 'Y':
            chrom = 23
        else:
            chrom = int(chrom) - 1

        #only the window is upper-cased below, never the whole chromosome
        chr_seq = records[index_list[chrom]].seq

        if vt in ('SNP', 'ONP', 'DNP', 'DEL'):
            ref = val['Reference_Allele']
            alt = '' if vt == 'DEL' else val['Tumor_Seq_Allele2']
            left_end = s - 1
            right_start = e
        elif vt == 'INS':
            #INS reference alleles are blank, so both flanking bases are part of the context
            ref = ''
            alt = val['Tumor_Seq_Allele2']
            left_end = s
            right_start = e - 1
        else:
            #previously an unknown type silently reused the contexts of the previous row
            raise ValueError(f"row {row_label}: unsupported Variant_Type {vt!r}")

        start = s - context_size
        end = e + context_size

        left_context = chr_seq[start-1:left_end].upper()
        right_context = chr_seq[right_start:end].upper()

        wt_seq = str(left_context + ref + right_context)
        alt_seq = str(left_context + alt + right_context)

        #the WT oligo has to be exactly the reference window; raised explicitly so
        #that the check survives python -O and carries a readable message
        expected = str(chr_seq[start-1:end].upper())
        if expected != wt_seq:
            raise AssertionError(
                f"row {row_label}: WT oligo does not match the reference genome\n{expected}\n{wt_seq}"
            )

        wt_w_context.append(wt_seq)
        alt_w_context.append(alt_seq)
        seq_start.append(start)
        seq_end.append(end)

    #explicit copy so that the new columns are not written onto a slice of df
    df_new = df[cols_to_save].copy()

    df_new['seq_start'] = seq_start
    df_new['seq_end'] = seq_end
    df_new['wt_w_context'] = wt_w_context
    df_new['alt_w_context'] = alt_w_context
    df_new = df_new.reset_index()

    return df_new

In [295]:
# Build the WT/ALT oligos for the filtered p53 variants, keeping every column named in cols_to_save.
p53_oligos = df_formatter(p53_filtered, cols_to_save)

In [297]:
# frame_determiner reads an 'HGVSp_Short' column and writes its own 'Strand' column,
# so rename HGVSp accordingly and drop the Strand column that came with the input.
p53_oligos = p53_oligos.rename(columns = {'HGVSp':'HGVSp_Short'}).drop(columns = 'Strand')

In [299]:
# Add the Strand, Frame and CDS_* columns to the p53 oligos.
p53_oligos = frame_determiner(p53_oligos)

In [363]:
#and spot checking
# Prints the variant type, strand, frame and HGVSp of one oligo, then the
# translation of its WT and ALT coding subsequences read in that frame.
i = 1106

# Every field is read from the same positional row (the original mixed .loc and .iloc).
row = p53_oligos.iloc[i]
wt1 = row['CDS_wt_correct_orientation']
alt1 = row['CDS_alt_correct_orientation']
frame = row['Frame']
hgvsp = row['HGVSp_Short']
s = row['Strand']
v = row['Variant_Type']
print(v)
print(s)
print(frame)
print(hgvsp)
if frame == 'None':
    # frame_determiner stores the string 'None' for windows without any CDS base
    print('not in a coding region - nothing to translate')
else:
    print(Bio.Seq.Seq(wt1[frame:]).transcribe().translate())
    print(Bio.Seq.Seq(alt1[frame:]).transcribe().translate())

DEL
-
1
p.Asn263IlefsTer82
GNLLGRNSFEVRVCACPGRDR
GIYWDGTALRCVFVPVLGETG


In [342]:
# Every p53 oligo is flagged as MMR_evasive_amenable; unlike the IDR library,
# the flag is not derived from a variant classification here.
p53_oligos['MMR_evasive_amenable']=True

In [344]:
#p53_oligos.to_csv('p53_library_oligos.csv', index=False)