In [1]:
import os
import glob
import copy
import numpy as np
import Bio
import scipy.spatial
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from rnai_scripts import *
import bokeh.io
import bokeh.plotting

# Enable viewing Bokeh plots in the notebook
bokeh.io.output_notebook()

# RNAi recoding 

## Reading in the Smed transcriptome
We read in the Smed_v6 transcriptome orfs that were extracted using orfipy. We then join them all into one string and obtain the codon frequencies. 

In [2]:
fname = 'data/dd_Smed_v6_transcripts_orfs_large3.fa' # makes smallest proteins be around 30 amino acids
descriptors, seqs = read_many_fasta(fname)
# join all ORFS into one large transcriptome 
transcriptome = ''.join(seqs)
# get aminoacidweights and codon weights 

codon_frequencies_dic = get_codon_frequencies(transcriptome) 



Now we find the GC count in the transcriptome of Sophie Walton

In [3]:
print((transcriptome.count('C') + transcriptome.count('G'))/len(transcriptome))

0.3607041500977551


I also found a published version of amino acid frequencies:

In [4]:
df = pd.read_csv('data/codon_usage_smed.csv')


AAs = df['codon'].values
freqs = df['frequency'].values/1000.

codon_frequencies_dic_published = {}
for i in range(len(AAs)):
    codon_frequencies_dic_published[AAs[i]] = freqs[i]
print(sum(freqs))

1.00000000001


Now we get frequencies of doublets

In [5]:
doubletscode = get_codon_frequencies_doublets(transcriptome)

Let's calculate the average discrepency between the doublets vs. codon frequencies. 

In [6]:
diff_published_vs_me = {}
for a in AAs:
    
    diff_published_vs_me[a] = codon_frequencies_dic_published[a] - codon_frequencies_dic[a]
values = np.array(list(diff_published_vs_me.values()))
print(np.mean(values))
print(np.mean(np.abs(values))) # values usually on order 
print(np.sum(np.abs(values)))

1.5624986385038425e-13
0.0016427577738011736
0.10513649752327511


Here we find the discrepencies between the frequencies of each doublet vs. the product frequency of the separate codons. 

In [7]:

diff_dic = {}
diff_dic_norm = {}
doublets_high = {}
for pair in doubletscode.keys():
    
    # we ignore stop codons being first or conditions where ATG is at the end of a codon for now 

    if 'TAG' == pair[:3]:
        continue
    if 'TGA' == pair[:3]:
        continue

    
    freq1 = codon_frequencies_dic[pair[:3]]
    freq2 = codon_frequencies_dic[pair[3:]]
    
    if doubletscode[pair] == 0.0:
        continue
    diff_dic_norm[pair] = (doubletscode[pair] - freq1*freq2)/(doubletscode[pair])
    diff_dic[pair] = (doubletscode[pair] - freq1*freq2)
  #  if 'TAA' == pair[:3]:
   #     print(doubletscode[pair], diff_dic_norm[pair])

In [8]:
# Make figure
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='Doublets - Codon1xCodon2',
    y_axis_label='ECDF',
  #  x_axis_type = 'log'
    
)
diffs, ecdf_diffs = ecdf_vals(np.array(list(diff_dic.values())))
print(np.sum(np.array(list(doubletscode.values()))))
p.circle(diffs*1e4, ecdf_diffs)

#diffs, ecdf_diffs = ecdf_vals(np.array(list(doublets.values())))
#p.circle(diffs, ecdf_diffs, color = 'orange')
bokeh.io.show(p)

1.0


In [9]:
# Make figure
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='(Doublets - Codon1xCodon2)/(doubletscode[pair])',
    y_axis_label='ECDF',
  #  x_axis_type = 'log'
    
)
diffs, ecdf_diffs = ecdf_vals(np.array(list(diff_dic_norm.values())))
print(np.sum(np.array(list(doubletscode.values()))))
p.circle(diffs, ecdf_diffs)

#diffs, ecdf_diffs = ecdf_vals(np.array(list(doublets.values())))
#p.circle(diffs, ecdf_diffs, color = 'orange')
bokeh.io.show(p)

1.0


Here we look at the doublets whose normalized difference between the doublet vs the actualy codon frequency is quite small indicating that the doublet occurs much less frequent than what would be expected if the codons were independent of each other. 

In [10]:
values = np.array(list(diff_dic_norm.values()))
inds_sort = np.argsort(values)
keys = np.array(list(diff_dic_norm.keys()))
keys[inds_sort][:20]

array(['CTTAAG', 'GGTATG', 'CTTATG', 'AGTATG', 'ACTATG', 'ATCTTA',
       'CGTATG', 'CCTATG', 'GTCTTA', 'CTTAGG', 'CCGCGA', 'CCTAAG',
       'CCCTAA', 'TGTATG', 'CCGCGG', 'ATTATG', 'TTTAAG', 'GCCTAA',
       'TGCGAA', 'GCTTAA'], dtype='<U6')

In [11]:
diff_dic_norm['TTTAAG']

-1.186117909072891

In [12]:
diff_dic_norm['TTCAAG']

0.17786418531119655

We do this for the non normalized ones to just make sure everything is not weird because of normalization.

In [13]:
values = np.array(list(diff_dic.values()))*1e4
inds_sort = np.argsort(values)
keys = np.array(list(diff_dic.keys()))
keys[inds_sort][:20]


array(['AAAAAA', 'ATTATG', 'AATATG', 'TTTAAA', 'ATTAAA', 'ATTAAT',
       'TTTAAT', 'TTTATG', 'GATATG', 'GTTAAT', 'CTTAAA', 'ATTAAG',
       'TATAAT', 'TATATG', 'GTTAAA', 'TTTAAG', 'TTTTTT', 'GTTATG',
       'CTTAAT', 'ATTAAC'], dtype='<U6')

We see that lots of As or Ts in a row seems to not be a good thing.... We should avoid this

We use our codon frequencies dictionary to compute CAI weights (based on the weight definition for the CAI) for all codons 

$$w_i = \frac{f_i}{\max (f_j)} i,j \in [ \text{synonymouse codons for amino acid} ]$$

Where $f_i$ is the frequency of codon $i$. 

We obtain two dictionaries: 


aminoacidweights: keys are amino acids, values are arrays of $w_i$ for all synonymous codons. The order of the codons is the as those used in aminoacidcode. 
    
gencodeweights: keys are codons, values are $w_i$ for each codon

In [17]:
aminoacidweights, gencodeweights = get_codon_weights(codon_frequencies_dic)

We pickle dump everything so we do not have to repeat the above line later. 

In [18]:
pickle.dump( aminoacidweights,
            open( "data/Smed_transcriptome_aminoacidweights.p", "wb" ) )
pickle.dump( gencodeweights, 
            open( "data/Smed_transcriptome_gencodeweights.p", "wb" ) )
pickle.dump( aminoacidcode,
            open( "data/aminoacidcode.p", "wb" ))
pickle.dump( doubletscode,
            open( "data/doubletscode.p", "wb" ))

We reload everything with pickle because why not. 

In [19]:
aminoacidweights = pickle.load( open( "data/Smed_transcriptome_aminoacidweights.p",
                                     "rb" ) )
gencodeweights = pickle.load( open( "data/Smed_transcriptome_gencodeweights.p", 
                                   "rb" ) )
aminoacidcode = pickle.load(open("data/aminoacidcode.p", 'rb'))
doubletscode = pickle.load(
            open( "data/doubletscode.p", "rb" ))

## We recode the p53 ORF 

We try to only RNAi 500 bp 

In [150]:
p53_ORF = 'ATGGCCCAGCAATATATTACTTCGGCTTTCGATCCAAACTTCACAACGTTACAGCATCAGACTTCAATACATTATAAATCCTCTCCAATTGAAATGATTGTTCCGATGCAATGCAACCAAAACCAAGCCACCTTATCTATAACCCCTGTTCATATTTCTCCATTTGTTAATATATCAGATTCTAATAATCACAATTTGACGAATATCGTCGAGCCTGTTTCATCAAATACTATGTCTCCTGCTTTTAAATCCGACGACATGCCAACATTGTCTTCTTATCCAGGTCCGTATAATTTTCAAATCTTTATTCCCAACGGAGAATTTGATGAATCCAAAAGAAAAGGACAGACATGTGTGTTTCAAACCGATAAAATGGGAAATCACCAATTATTTACCAAACCTCATCCTCATTATTGGAGGTTAAATTATTCAGCTGATCCTTCTATGTCAACGGAAAACATGTATATTCGGATGGTTCCAGTTTTTGGGGATCCAGAAAAAGCTCAATGCATTTTGGAAAGATGTGCAAAACACAAAGAAGTAACAACCGATGAAAATCACTGGAAATATCGTAGCATGCTCATTGTAGAAAAAACCTGTGCACATTACTTTCAGGATTCGGCAACGAAAAGAGTTTGCATTTTATTACCGTTTGAAAAGCATGCGGAAGGAGAGATTTATTCTTCCGTCAACTGTCAATTTGCATGCTACAACAGTTGCTTTAATCAAGATTCAGGTGGTCGGAAAACACTTTATTTAATCATCACTCTAGAATTTCTCGATAAAAAAACAAATAAATTCGATGTATGGGGTCGACAGTGTTTGCAATTTCGCAGTTGTGCTTGTCCAAGTAGAGACTGGAGAGATAAAAAGATTAAAGGCGATCCAGAAATGTTACTGAAATTCAAAGAAAAACGAATCAAAACCGAAGAAAAATTAAATAATTTGGTGATTTCTAAAAGCGTCCCTATTAATATGGGTGGAAAGGATGCTATCATAAGAGTTCTTCCCTCGTTGCCAGGACTCGATGACGCTATTAACGCATTAGTTTGCGGATACTTACTGAATCGAACAACCAACATAAGCGCAATAATAGCAGCATTTAATCAGATGAAAGACGTAGAACATTTAATTATCGATCAATTCACATCAAATTTAGATCAAAATACATGTGACAGTAAAAGTCCTTCACAAACTCCAGAGTCTCAGATTTCTCCGAATACATCAAACCTTCAATTCAACGATTACGGTTCACTTTACGGTGAACCATGTCAACCCTATAGACCGATGCACCAGCAAGTTGTTAATAATTTTTCCTCTCCAGGAATTTTCAGTAAAATACCTTTTGAAACTTATCCGGTTAGTTATGACATTAAACTTTCACATGAAATGCCGCAGCACTTTGATGAGCTGCCATCAGACAACTATAATAGACACTGA'

In [151]:
p53_protein = 'MAQQYITSAFDPNFTTLQHQTSIHYKSSPIEMIVPMQCNQNQATLSITPVHISPFVNISDSNNHNLTNIVEPVSSNTMSPAFKSDDMPTLSSYPGPYNFQIFIPNGEFDESKRKGQTCVFQTDKMGNHQLFTKPHPHYWRLNYSADPSMSTENMYIRMVPVFGDPEKAQCILERCAKHKEVTTDENHWKYRSMLIVEKTCAHYFQDSATKRVCILLPFEKHAEGEIYSSVNCQFACYNSCFNQDSGGRKTLYLIITLEFLDKKTNKFDVWGRQCLQFRSCACPSRDWRDKKIKGDPEMLLKFKEKRIKTEEKLNNLVISKSVPINMGGKDAIIRVLPSLPGLDDAINALVCGYLLNRTTNISAIIAAFNQMKDVEHLIIDQFTSNLDQNTCDSKSPSQTPESQISPNTSNLQFNDYGSLYGEPCQPYRPMHQQVVNNFSSPGIFSKIPFETYPVSYDIKLSHEMPQHFDELPSDNYNRH*'

In [152]:
translate(p53_ORF) == p53_protein

True

In [153]:
seqs[228]

'TACCAGTATCACGTATCATTCGCTCCACCAATTGACTCTAGATTAATGCGAATAAAGATTGTACAGGGACTTTCTGAGTCTGACCTTGGAGTAGTAAAGGAGGCTCGAGCATTCGATGGAATGAACCTTTATATACCACAGTTACTTAAGAACAAGGAAACTATTATAAAGGTTAATAAGCCAACAGACAAAACAGTAGTTGACGTAAAGGTTGTATTCACAAATAACGTAAACTTCTCAGAGTGCCCAATGGTATACAACGTATTATTCAAGCGAATAGAGAACTCTTTACGAATGGTAAAGATAGGAAGAGACTACTTTTATCCAGAGAAGAAAATTGTTTTAGATAGACGAAGAATGGAGATTTGGCCAGGTTACGTTACTTCAATACAGAACTTCGATGGAGGACTTTTACTTCAGTGTGACGTTTCTCATAAGGTAATAAGAAACGACTCAGTTTACGATATTATGATGGAGATAAACAAGACAGTAAATAACAAGGGACAGATG'

In [154]:


recode = int(534)
seqs, seqs_small, cais, dists, cais_full, dists_full = sliding_window_RNAi_recoding(recode, p53_ORF, p53_protein, aminoacidweights, gencodeweights, 
                                                 random = False, no_wobble = True, enforce_different_codons = True, wiggle = False)



In [114]:
translate(seqs[228]) == translate('TCCGTCAACTGTCAATTTGCATGCTACAACAGTTGCTTTAATCAAGATTCAGGTGGTCGGAAAACACTTTATTTAATCATCACTCTAGAATTTCTCGATAAAAAAACAAATAAATTCGATGTATGGGGTCGACAGTGTTTGCAATTTCGCAGTTGTGCTTGTCCAAGTAGAGACTGGAGAGATAAAAAGATTAAAGGCGATCCAGAAATGTTACTGAAATTCAAAGAAAAACGAATCAAAACCGAAGAAAAATTAAATAATTTGGTGATTTCTAAAAGCGTCCCTATTAATATGGGTGGAAAGGATGCTATCATAAGAGTTCTTCCCTCGTTGCCAGGACTCGATGACGCTATTAACGCATTAGTTTGCGGATACTTACTGAATCGAACAACCAACATAAGCGCAATAATAGCAGCATTTAATCAGATGAAAGACGTAGAACATTTAATTATCGATCAATTCACATCAAATTTAGATCAAAATACATGTGACAGTAAAAGTCCTTCACAAACTCCAGAGTCTCAGATTTCTCCG')

True

In [161]:
dists[271:293*3]

array([0.37453184, 0.37453184, 0.37453184, 0.37453184, 0.37453184,
       0.37453184, 0.37827715, 0.37640449, 0.37265918, 0.37265918,
       0.37265918, 0.37453184, 0.37453184, 0.37078652, 0.37078652,
       0.36891386, 0.37078652, 0.37078652, 0.37078652, 0.37078652,
       0.37078652, 0.37078652, 0.37265918, 0.37265918, 0.37265918,
       0.37265918, 0.37265918, 0.37453184, 0.37265918, 0.37078652,
       0.37078652])

In [164]:
dists[1]

0.3258426966292135

In [165]:
dists[-2]

0.3707865168539326

In [71]:
dists[228] - np.min(dists)

0.05882352941176466

In [64]:
seqs[228]

'TCAGTAAATTGCCAGTTCGCTTGTTATAATTCATGTTTCAACCAGGACTCTGGAGGAAGAAAGACTTTATACCTTATAATAACATTAGAGTTCTTAGACAAGAAGACTAACAAGTTTGACGTTTGGGGAAGACAATGCTTACAGTTCAGATCATGCGCATGCCCTTCACGAGATTGGCGAGACAAGAAAATAAAGGGAGACCCTGAGATGCTTTTAAAGTTTAAGGAGAAGAGAATAAAGACAGAGGAGAAGCTTAACAACTTAGTTATATCAAAGTCAGTACCAATAAACATGGGAGGTAAAGACGCAATAATTCGAGTATTACCATCATTACCTGGTTTAGACGATGCAATAAATGCTCTTGTATGTGGTTATCTTTTAAACAGAACTACAAATATTTCAGCTATTATTGCTGCTTTCAACCAAATGAAGGATGTTGAGCACCTTATAATAGACCAGTTTACTTCTAACCTTGACCAGAACACTTGCGATTCAAAGTCACCATCTCAG'

In [65]:
translate(seqs[228])

'SVNCQFACYNSCFNQDSGGRKTLYLIITLEFLDKKTNKFDVWGRQCLQFRSCACPSRDWRDKKIKGDPEMLLKFKEKRIKTEEKLNNLVISKSVPINMGGKDAIIRVLPSLPGLDDAINALVCGYLLNRTTNISAIIAAFNQMKDVEHLIIDQFTSNLDQNTCDSKSPSQ'

In [61]:
translate(p53_ORF[684:]

'TCCGTCAACTGTCAATTTGCATGCTACAACAGTTGCTTTAATCAAGATTCAGGTGGTCGGAAAACACTTTATTTAATCATCACTCTAGAATTTCTCGATAAAAAAACAAATAAATTCGATGTATGGGGTCGACAGTGTTTGCAATTTCGCAGTTGTGCTTGTCCAAGTAGAGACTGGAGAGATAAAAAGATTAAAGGCGATCCAGAAATGTTACTGAAATTCAAAGAAAAACGAATCAAAACCGAAGAAAAATTAAATAATTTGGTGATTTCTAAAAGCGTCCCTATTAATATGGGTGGAAAGGATGCTATCATAAGAGTTCTTCCCTCGTTGCCAGGACTCGATGACGCTATTAACGCATTAGTTTGCGGATACTTACTGAATCGAACAACCAACATAAGCGCAATAATAGCAGCATTTAATCAGATGAAAGACGTAGAACATTTAATTATCGATCAATTCACATCAAATTTAGATCAAAATACATGTGACAGTAAAAGTCCTTCACAAACTCCAGAGTCTCAGATTTCTCCGAATACATCAAACCTTCAATTCAACGATTACGGTTCACTTTACGGTGAACCATGTCAACCCTATAGACCGATGCACCAGCAAGTTGTTAATAATTTTTCCTCTCCAGGAATTTTCAGTAAAATACCTTTTGAAACTTATCCGGTTAGTTATGACATTAAACTTTCACATGAAATGCCGCAGCACTTTGATGAGCTGCCATCAGACAACTATAATAGACACTGA'

We RNAi recode the subsection given to us by the prime3 thing

In [100]:
dd = 'ATGGCCCAGCAATATATTACTTCGGCTTTCGATCCAAACTTCACAACGTTACAGCATCAGACTTCAATACATTATAAATCCTCTCCAATTGAAATGATTGTTCCGATGCAATGCAACCAAAACCAAGCCACCTTATCTATAACCCCTGTTCATATTTCTCCATTTGTTAATATATCAGATTCTAATAATCACAATTTGACGAATATCGTCGAGCCTGTTTCATCAAATACTATGTCTCCTGCTTTTAAATCCGACGACATGCCAACATTGTCTTCTTATCCAGGTCCGTATAATTTTCAAATCTTTATTCCCAACGGAGAATTTGATGAATCCAAAAGAAAAGGACAGACATGTGTGTTTCAAACCGATAAAATGGGAAATCACCAATTATTTACCAAACCTCATCCTCATTATTGGAGGTTAAATTATTCAGCTGATCCTTCTATGTCAACGGAAAACATGTATATTCGGATGGTTCCAGTTTTTGGGGATCCAGAAAAAGCTCAATGCATTTTGGAAAGATGTGCAAAACACAAAGAAGTAACAACCGATGAAAATCACTGGAAATATCGTAGCATGCTCATTGTAGAAAAAACCTGTGCACATTACTTTCAGGATTCGGCAACGAAAAGAGTTTGCATTTTATTACCGTTTGAAAAGCATGCGGAAGGAGAGATTTATTCTTCCGTCAACTGTCAATTTGCATGCTACAACAGTTGCTTTAATCAAGATTCAGGTGGTCGGAAAACACTTTATTTAATCATCACTCTAGAATTTCTCGATAAAAAAACAAATAAATTCGATGTATGGGGTCGACAGTGTTTGCAATTTCGCAGTTGTGCTTGTCCAAGTAGAGACTGGAGAGATAAAAAGATTAAAGGCGATCCAGAAATGTTACTGAAATTCAAAGAAAAACGAATCAAAACCGAAGAAAAATTAAATAATTTGGTGATTTCTAAAAGCGTCCCTATTAATATGGGTGGAAAGGATGCTATCATAAGAGTTCTTCCCTCGTTGCCAGGACTCGATGACGCTATTAACGCATTAGTTTGCGGATACTTACTGAATCGAACAACCAACATAAGCGCAATAATAGCAGCATTTAATCAGATGAAAGACGTAGAACATTTAATTATCGATCAATTCACATCAAATTTAGATCAAAATACATGTGACAGTAAAAGTCCTTCACAAACTCCAGAGTCTCAGATTTCTCCGAATACATCAAACCTTCAATTCAACGATTACGGTTCACTTTACGGTGAACCATGTCAACCCTATAGACCGATGCACCAGCAAGTTGTTAATAATTTTTCCTCTCCAGGAATTTTCAGTAAAATACCTTTTGAAACTTATCCGGTTAGTTATGACATTAAACTTTCACATGAAATGCCGCAGCACTTTGATGAGCTGCCATCAGACAACTATAATAGACACTGA'

In [137]:
dd = 'TCCGTCAACTGTCAATTTGCATGCTACAACAGTTGCTTTAATCAAGATTCAGGTGGTCGGAAAACACTTTATTTAATCATCACTCTAGAATTTCTCGATAAAAAAACAAATAAATTCGATGTATGGGGTCGACAGTGTTTGCAATTTCGCAGTTGTGCTTGTCCAAGTAGAGACTGGAGAGATAAAAAGATTAAAGGCGATCCAGAAATGTTACTGAAATTCAAAGAAAAACGAATCAAAACCGAAGAAAAATTAAATAATTTGGTGATTTCTAAAAGCGTCCCTATTAATATGGGTGGAAAGGATGCTATCATAAGAGTTCTTCCCTCGTTGCCAGGACTCGATGACGCTATTAACGCATTAGTTTGCGGATACTTACTGAATCGAACAACCAACATAAGCGCAATAATAGCAGCATTTAATCAGATGAAAGACGTAGAACATTTAATTATCGATCAATTCACATCAAATTTAGATCAAAATACATGTGACAGTAAAAGTCCTTCACAA'

In [138]:
len(seqs[228])


504

In [139]:
translate(seqs[228]) == translate(dd)

False

In [99]:
p53_rnai = 'TCCGTCAACTGTCAATTTGCATGCTACAACAGTTGCTTTAATCAAGATTCAGGTGGTCGGAAAACACTTTATTTAATCATCACTCTAGAATTTCTCGATAAAAAAACAAATAAATTCGATGTATGGGGTCGACAGTGTTTGCAATTTCGCAGTTGTGCTTGTCCAAGTAGAGACTGGAGAGATAAAAAGATTAAAGGCGATCCAGAAATGTTACTGAAATTCAAAGAAAAACGAATCAAAACCGAAGAAAAATTAAATAATTTGGTGATTTCTAAAAGCGTCCCTATTAATATGGGTGGAAAGGATGCTATCATAAGAGTTCTTCCCTCGTTGCCAGGACTCGATGACGCTATTAACGCATTAGTTTGCGGATACTTACTGAATCGAACAACCAACATAAGCGCAATAATAGCAGCATTTAATCAGATGAAAGACGTAGAACATTTAATTATCGATCAATTCACATCAAATTTAGATCAAAATACATGTGACAGTAAAAGTCCTTCACAAACTCCAGAGTCTCAGATTTCTCCG'

## recoding vasa-1

In [72]:
vasa1_ORF = 'ATGTCGGTTAATGATGGATTAAGTGATGAAGAATGGGGAGCATGGGGACAAACTTCTGTAGTCACAGAAAACATAAGTACTCAAAAAAATAATTCAAAACCTCAAAATGGCTTTGGCAGTGATGATGAAGTAATTATCAATACTGACACCTCAAATGTTCAAGCAATCTCAAAGTTTAGTGTAAATAGTTATAACAAAGAAAATAATTTAAATGGCGGATCTTATGAAAAAAGAGGAGGTTTTAATGGAAATAGGGGAACTAACAGATTTGCTACCAGAAAAGTAGAATCAGAGGACAAGAATGAAACTAATGGTGATAGAAATTATAACAGGAATGGTTACTCTAATGATAGAGATACTAAACCGAACTATCAGAATAACAGAAACTCAGAATTTAAAAGGAACGGAAATGAACAAAATAATTATCCAAATGATAGAAATTTTAATAAACGAGATAATAGTCCCGGTGATAAAAATCAGTATGCGAATAAAAGGAATGATTCTAGAGAACGCTCGCGTAATAATGGATTGTTTGATGATTCAAGAGAACGTAAAACTCCTCGTGGTGACAAACGTGATGATTCAAGAGATCGTAAAATAAACCGAGACGATTCGAGGGATCGTAGAAAGTATCGTGATGATTCCAGAGATCGTAGAAAGTATCGTGACGATTCCAGAGATCGTAAAAAATATCGAGACGATTCAAGGGATCGCAAGAAGTATCGTGATGATTCAAATCGTAAAAAGTATCGTGATAGCTCGATGGATCGTAGAAAACCTCGTGATGATTCCAGGGATCATAAAAAGTATCGTGACGATTCTAGAGATAAACGAAACAATTTAAAGCGCCGTGATGATTCCTATGATAATAATGATAGAAATCGAAGACCAGATAGAAGAAATCGAGATGATTCTCGCGAAAATCGAAAGAAAAGGGACGATTCGCAAGAAAATCGAACACGAAATAGAAAAGAGGAATTTAAACGTGATGAATCCTGGGAAGACCGTAGATCCGAATTCCAACGAGATCAGAGAATAAATAATGATAGTTTTGCTAGAGCAACGAAAGAAAATGGAGCTAACAATTTTGAATCGAAAAAGTTCCGTGGAAATAACGATGCAAATAATGGATTTAGAAACGACGAATTTGATGGCAATTTCCAAGGAAAACGTAATGGAAATTCCAACGATTTTTCCACTGAATTCGATAGAACTGTGACTTTGGAAGAAAATCCTGCCTATTCATCGAAAACATTTGTTCGTGGTCAAAAGCAGCCGGAACAAACCAATCAAAATGATGATGCTATTCCAATCGTTAAAAGAGCGACATTTATCCCCGATGATAATCAAGAAGATTACGAACTCCATGTAAATTCAGGAATTAATTTCGATAATTACGACAAAATCCCGGTTGAGGTTACAGGCGATGATGTTCCGCCGGCCTTGAACACGTTTTCCAGTCTTCATCTCCCGGAATTTCTCACATCGAATGTTGAAAATCTCAAATATACAAAATTGACCCCAGTGCAAAAATATGCAATTCCGATTATCGATTCCAAACGCGACCTCATGGCATGCGCACAAACCGGTTCTGGCAAAACTGCGGCTTTTCTAATCCCGATAATCAAATCTCTCAGTGAGAATGGGACGGAATCTCCAGCATCGGCAGTCGCGTTTCCGAAAGCTTTAATAATGGCTCCAACTAGGGAGTTGTGTCGTCAGATATTCACTGCTGCGCGTCATCTCTGTCGCGGTTCCAATATCAAATGTGCCTATATTTACGGTGGAATTGAAATGAACAAATCCCGACGTAATATACAAGCAACCGGTTGCGATATCCTCGTGGCGACTCCAGGCCGATTGATTCATTTCCTGGAATTAGTTTGGCTCTCTTTGAGATATTTGCAGTTTTTTGTGCTCGATGAGGCCGATCGAATGCTCGATTCCGATGGATTTTATGAAAGCGTTACGAAAATCTACAACGAGGCCAATTTTAGTGGAGACGATCGATCGATCCAAATTTCGATGTTTAGTGCAACTTTCCCCAATGAAATTCAAACTTTAGCGAGAAATCTCCTGAAAAATTATCTATTTCTAGCTGTTGGAGTTGTTGGCTCAGCAAATAGTGACGTAAAGCAGGAGATTATCCAATCGGATCAAAGGGAGAAAGTCAATACTGCAATTGAATATATAAAAACTATTCCCGATGAGAAAACTCTGATTTTCGTTGAGAGCAAAAGAATGGCCGATTTCATGGGAATAAAGCTCGGATATTTGGGATTTAAAGCGACGACAATTCACGGTGATCGGGAACAGGAACAGAGAGAAATCGCTCTTAATGATTTCAAAAGTGGCCGAGTTAATTTCATGGTTGCCACTAATGTTGCCGCTCGAGGTCTCGATATTCCGAAAGTCGATAATGTAATCAATATCGACATGCCTGACACAATCGATACGTATGTTCATCGAATCGGTAGAACCGGTCGTTGTGGAAATGTTGGACGTGCAATTAGCTTTTTCGATGAAATGAAGGATATTGGATTGGCACAAGGGCTTGTTAGTAAGCTGCAAGAAGCCAATCAGGAGTGTCCTGATTGGTTGAGAGCACTATGTGACGGATCCGGTTCCAGGATGGCCAATTACTCGAGAGACACTAGAAAAAATGTTAAAAGCTCTAAATATATTGACAACCCTACCGATGACGGATTTATGAAGGGTACAAATATTGATTATGATGACGTGAAGCCGACTTCTGAATGGCTCGAAGATTAG'

In [73]:
vasa1_protein = 'MSVNDGLSDEEWGAWGQTSVVTENISTQKNNSKPQNGFGSDDEVIINTDTSNVQAISKFSVNSYNKENNLNGGSYEKRGGFNGNRGTNRFATRKVESEDKNETNGDRNYNRNGYSNDRDTKPNYQNNRNSEFKRNGNEQNNYPNDRNFNKRDNSPGDKNQYANKRNDSRERSRNNGLFDDSRERKTPRGDKRDDSRDRKINRDDSRDRRKYRDDSRDRRKYRDDSRDRKKYRDDSRDRKKYRDDSNRKKYRDSSMDRRKPRDDSRDHKKYRDDSRDKRNNLKRRDDSYDNNDRNRRPDRRNRDDSRENRKKRDDSQENRTRNRKEEFKRDESWEDRRSEFQRDQRINNDSFARATKENGANNFESKKFRGNNDANNGFRNDEFDGNFQGKRNGNSNDFSTEFDRTVTLEENPAYSSKTFVRGQKQPEQTNQNDDAIPIVKRATFIPDDNQEDYELHVNSGINFDNYDKIPVEVTGDDVPPALNTFSSLHLPEFLTSNVENLKYTKLTPVQKYAIPIIDSKRDLMACAQTGSGKTAAFLIPIIKSLSENGTESPASAVAFPKALIMAPTRELCRQIFTAARHLCRGSNIKCAYIYGGIEMNKSRRNIQATGCDILVATPGRLIHFLELVWLSLRYLQFFVLDEADRMLDSDGFYESVTKIYNEANFSGDDRSIQISMFSATFPNEIQTLARNLLKNYLFLAVGVVGSANSDVKQEIIQSDQREKVNTAIEYIKTIPDEKTLIFVESKRMADFMGIKLGYLGFKATTIHGDREQEQREIALNDFKSGRVNFMVATNVAARGLDIPKVDNVINIDMPDTIDTYVHRIGRTGRCGNVGRAISFFDEMKDIGLAQGLVSKLQEANQECPDWLRALCDGSGSRMANYSRDTRKNVKSSKYIDNPTDDGFMKGTNIDYDDVKPTSEWLED*'

In [74]:
translate(vasa1_ORF) == vasa1_protein

True

In [130]:
recode = int(504)
seqs, seqs_small, cais, dists, cais_full, dists_full = sliding_window_RNAi_recoding(recode, vasa1_ORF, vasa1_protein, aminoacidweights, gencodeweights, 
                                                 random = False, no_wobble = True, enforce_different_codons = True, wiggle = False)



In [133]:
rnai = 'GAACGCTCGCGTAATAATGGATTGTTTGATGATTCAAGAGAACGTAAAACTCCTCGTGGTGACAAACGTGATGATTCAAGAGATCGTAAAATAAACCGAGACGATTCGAGGGATCGTAGAAAGTATCGTGATGATTCCAGAGATCGTAGAAAGTATCGTGACGATTCCAGAGATCGTAAAAAATATCGAGACGATTCAAGGGATCGCAAGAAGTATCGTGATGATTCAAATCGTAAAAAGTATCGTGATAGCTCGATGGATCGTAGAAAACCTCGTGATGATTCCAGGGATCATAAAAAGTATCGTGACGATTCTAGAGATAAACGAAACAATTTAAAGCGCCGTGATGATTCCTATGATAATAATGATAGAAATCGAAGACCAGATAGAAGAAATCGAGATGATTCTCGCGAAAATCGAAAGAAAAGGGACGATTCGCAAGAAAATCGAACACGAAATAGAAAAGAGGAATTTAAACGTGATGAATCCTGGGAAGACCGTAGA'

In [135]:
translate(seqs[507//3]) == translate(rnai)

True

In [136]:
seqs[507//3]

'GAGAGATCAAGAAACAACGGTTTATTCGACGACTCTCGAGAGAGAAAGACACCAAGAGGAGATAAGAGAGACGACTCTCGAGACAGAAAGATTAATAGAGATGACTCAAGAGACAGACGAAAATACAGAGACGACTCACGAGACAGACGAAAATACAGAGATGACTCACGAGACAGAAAGAAGTACAGAGATGACTCTAGAGACAGAAAAAAATACAGAGACGACTCTAACAGAAAGAAATACAGAGACTCATCAATGGACAGACGAAAGCCAAGAGACGACTCAAGAGACCACAAGAAATACAGAGATGACTCACGAGACAAGAGAAATAACCTTAAAAGAAGAGACGACTCATACGACAACAACGACCGAAACAGACGACCTGACCGACGAAACAGAGACGACTCAAGAGAGAACAGAAAAAAGAGAGATGACTCACAGGAGAACAGAACTAGAAACCGAAAGGAAGAGTTCAAGAGAGACGAGTCATGGGAGGATAGACGA'

In [122]:
translate(seqs[int(507/3)])

'ERSRNNGLFDDSRERKTPRGDKRDDSRDRKINRDDSRDRRKYRDDSRDRRKYRDDSRDRKKYRDDSRDRKKYRDDSNRKKYRDSSMDRRKPRDDSRDHKKYRDDSRDKRNNLKRRDDSYDNNDRNRRPDRRNRDDSRENRKKRDDSQENRTRNRKEEFKRDESWEDRR'

In [79]:
np.argsort(dists)[::-1]

array([150, 138, 153, 152, 151, 149, 148, 147, 146, 145, 144, 143, 142,
       141, 140, 139, 137, 136, 168, 117, 116, 182, 183, 170, 115, 169,
       167, 119, 166, 162, 161, 160, 159, 114, 118, 123, 120, 121, 124,
       125, 126, 127, 128, 129, 130, 181, 131, 132, 133, 134, 135, 180,
       122, 113, 185, 184, 186, 187, 164, 165, 172, 156, 155, 154, 171,
       163, 157, 158, 177, 111, 190, 112, 189, 188, 191, 176, 174, 175,
       178, 173, 179, 205, 103, 207, 544, 543, 542, 105, 104, 106, 110,
       109, 107, 108, 206, 197, 204, 195, 541, 540, 539, 192, 193, 203,
       194, 196, 102, 199, 200, 201, 202, 101, 575, 576, 577, 545, 486,
       100,  92, 485,  91,  99, 198, 208, 209, 210, 211,  94,  95,  96,
        97,  98,  93, 579, 578, 525, 524, 221, 538, 537, 526, 527, 529,
       572, 571, 570, 528, 530, 536, 535, 534, 533, 531, 532, 212, 614,
        88, 215, 214, 213, 619,  90,  89,  87, 217, 618, 617,  84,  83,
       616, 615, 216, 481, 227,  86,  85,  59, 546, 548, 226, 48

## recoding smedwi-3

In [84]:
smedwi3_ORF = 'ATGTCAGGAAGTAGTGGAATAGGTAGAGGCCGCAGTCGTGGGCTGTTGATGCAAAAGTTTCTGAATAAAGATGTTCTTGTTCCTTCTGTTGAATCTTTAGAAGACAAAGCTCTTAATAAGCTAGGAATTCCACCTCCTGGGTCTACCGTAGAAAAAAATACAGAGTCAAGTTCTATATCAAGTGGAGATTCAAGAAGTAATCCCAGTGGAGATTCAAGAAATAACATAAAACTACAAGATAGTGATATTGAGAATCGCAATATTACCATTGTTACGCGACCATTATCCTGCATAGGCAGGGGCCGGGGTTTAAGCAATCCCTCAAGTTTAACTACGTCATCGGGTAAATCGGATAAAATTACTGAAAACGAAGAACCTGGTCAAATTAAAAAGTTTGTAGGTCGCGGTAGAGGATTGTTGAATTCTCAGAAAGAATGTTCAAACTCTACTCCATCTGAAGTTTCAAATGAATTGAAACAAATGAAAATTTCAAATGATGATAAAATGACGGTTTCTTCAGAAGCAAAGTCACAATTTGAGAACATTGAAAAACCTATTAGTAAATTTCGTCGACGTGAATATCCAACTCAAATAAAAGAACCATGTAATACAAGAAATGATTCATCGCCATCTTTAACTTTAAGTGCTAATTACGTTAAAGTTAGGACTACACAACCCCATATATATCAATACCATGTTTCCTTTGCACCTCCGATAGATTCAAGGTTGATGAGAATTAAAATAGTTCAAGGGTTATCAGAATCAGATTTAGGGGTTGTCAAAGAAGCAAGAGCTTTTGACGGTATGAATTTATACATTCCTCAACTTTTAAAAAATAAAGAGACAATAATTAAAGTAAACAAACCGACTGATAAGACTGTCGTGGATGTTAAAGTAGTTTTTACTAACAATGTTAATTTTAGTGAATGTCCTATGGTTTATAATGTTCTTTTTAAAAGAATTGAAAATTCACTCAGAATGGTTAAAATTGGTAGGGATTATTTCTACCCTGAAAAAAAGATAGTACTTGACCGTAGAAGGATGGAAATATGGCCGGGATATGTAACAAGTATCCAAAATTTTGACGGTGGTTTACTGTTACAATGCGATGTGTCACACAAAGTTATTCGAAATGATAGTGTGTATGACATAATGATGGAAATTAATAAAACTGTTAACAATAAAGGTCAAATGCAAACTGCTGCGATTAATCAACTATTGGGTCAAATTGTGTTGACTCCTCACAATAACCGAAATTATAGAATTACTGATATAGATTGGGCTAAAAATTGTTTAAGTGAATTCGATAAAGGAGGCGAAAAAATTAGCTACCGGGATTATTTTAGGAACACGTATGGGCTACAAATTCGTGATCTAGAGCAGCCTATGATAGTTAGTAAATCTAATAGCAGATCTGGTAAAAACCGAGGTCCCAAAGGATCAAAAGAAGTGGATGGTGGATTGGTCTATTTAATTCCAGAATTGTGTATGCTAACTGGTTTGACAGATGACATGATTAAAGATTTTCGTTTGATGAGAGAATTACACGAGCATTGTCGAGTTACTCCCAAGAAAAGACACGAAGCCTTACTGGAATTCGTGGATAACATATATAGCTGTGAGGAAGCTAAGAAACTTTTAGGATATTGGGGTATAACGATTGAAAAGGACACTGTCAACATAAATGCTTGTAAAATGAATCCAGAAATGATATATTTTGGAAATGAAGCTTCTGTTAGTGCTGGGGAACAAGCTGAATTTAAACAAGCCTTGGCACATAATAAAGTTATAGGTGGTATTCGTATTGAAAATTGGATATTAATTTCTCCAAAAAGTTTACTGACAAAAGCAAATGGTCTGTTACAGGCTTTAATGAGCAAATCTCCTAGAGTTGGAGTTATGTTTGGAAAACCCAAAATAGTTGAAATGAACAATGATCGAACAGAAGAGTATTTAAAAGAATTAAAGAGAAATGTGGCTCCTGGTGTGCAGTTAGTAGTTACAATTTTATCTGCTGTTAGAGAAGATCGATACAATGCAATAAAAAAATTTTGTTATGTGGATTGTCCTGTTCCAAGTCAAGTGGTATTAGCCCAAACATTGAAAGAAGGGCCTAAATTAAATAGTGTGGCAGTTAATATAGCCCTTCAAATAAACGCAAAATTAGGTGGAGAGCTGTGGGCTGTCAAAATACCTATTAAGAAGTTTATGGTTGTTGGACTTGATGTTTGGCATGATACTAAAGGGAGAAGTAGATCAGTTGGAGCCGTAGTTGGTTCAACTAATGCGCTATGCACAAGGTGGTTTTCGAAATCGCATTTGCAAGAACAAGATAAAGAAATTATGTACGTATTACAGTCGTGTATGTTAAGCCTTTTAAAGGCTTATTTTGAAGAAAATAATTTTTTGCCTGAGACTATCTTTATGTATAGGGATGGTGTTAGTGATGGTCAGTTAGGATATGTTCAAAAAACTGAAATTGAACAATTCTTTAAAGTTTTTGAATCGTTTAGTGCTGATTATAAACCTAATATGGTATATAATGTTGTTCAAAAGAGAATTAATACTAGGCTCTATGTAAGTGATCCGAAAAATAAAGGACAAATAAATAACCCCAATCCTGGTACAATTGTCGACCATACTGTTACGAGGGCTAACCTTTATGATTTTTTTCTTGTTTCTCAATCGGTTAGGCAGGGAACTGTAACTCCGACGCATTACGTTGTTTTATGTGACAATTCTAAATACACTCCGCATCAGGTTCAGTTGATGGCTTATAAAACATGTCATATATATTACAATTGGCCAGGAACGGTTCGAGTACCAGCACCTTGTATGTATGCTCATAAATTGGCATATATGGTTGGTCAGAATTTGAAAGCTGAACCTAGTAATCTTCTATGTGACAGACTTTTTTATTTGTAA'

In [87]:
smedwi3_protein = 'MSGSSGIGRGRSRGLLMQKFLNKDVLVPSVESLEDKALNKLGIPPPGSTVEKNTESSSISSGDSRSNPSGDSRNNIKLQDSDIENRNITIVTRPLSCIGRGRGLSNPSSLTTSSGKSDKITENEEPGQIKKFVGRGRGLLNSQKECSNSTPSEVSNELKQMKISNDDKMTVSSEAKSQFENIEKPISKFRRREYPTQIKEPCNTRNDSSPSLTLSANYVKVRTTQPHIYQYHVSFAPPIDSRLMRIKIVQGLSESDLGVVKEARAFDGMNLYIPQLLKNKETIIKVNKPTDKTVVDVKVVFTNNVNFSECPMVYNVLFKRIENSLRMVKIGRDYFYPEKKIVLDRRRMEIWPGYVTSIQNFDGGLLLQCDVSHKVIRNDSVYDIMMEINKTVNNKGQMQTAAINQLLGQIVLTPHNNRNYRITDIDWAKNCLSEFDKGGEKISYRDYFRNTYGLQIRDLEQPMIVSKSNSRSGKNRGPKGSKEVDGGLVYLIPELCMLTGLTDDMIKDFRLMRELHEHCRVTPKKRHEALLEFVDNIYSCEEAKKLLGYWGITIEKDTVNINACKMNPEMIYFGNEASVSAGEQAEFKQALAHNKVIGGIRIENWILISPKSLLTKANGLLQALMSKSPRVGVMFGKPKIVEMNNDRTEEYLKELKRNVAPGVQLVVTILSAVREDRYNAIKKFCYVDCPVPSQVVLAQTLKEGPKLNSVAVNIALQINAKLGGELWAVKIPIKKFMVVGLDVWHDTKGRSRSVGAVVGSTNALCTRWFSKSHLQEQDKEIMYVLQSCMLSLLKAYFEENNFLPETIFMYRDGVSDGQLGYVQKTEIEQFFKVFESFSADYKPNMVYNVVQKRINTRLYVSDPKNKGQINNPNPGTIVDHTVTRANLYDFFLVSQSVRQGTVTPTHYVVLCDNSKYTPHQVQLMAYKTCHIYYNWPGTVRVPAPCMYAHKLAYMVGQNLKAEPSNLLCDRLFYL*'

In [89]:
translate(smedwi3_ORF) == smedwi3_protein

True

In [145]:
recode = int(510)
seqs, seqs_small, cais, dists, cais_full, dists_full = sliding_window_RNAi_recoding(recode, smedwi3_ORF, smedwi3_protein, aminoacidweights, gencodeweights, 
                                                 random = False, no_wobble = True, enforce_different_codons = True, wiggle = False)



In [140]:
rnaiwi3 = 'GGAATAGGTAGAGGCCGCAGTCGTGGGCTGTTGATGCAAAAGTTTCTGAATAAAGATGTTCTTGTTCCTTCTGTTGAATCTTTAGAAGACAAAGCTCTTAATAAGCTAGGAATTCCACCTCCTGGGTCTACCGTAGAAAAAAATACAGAGTCAAGTTCTATATCAAGTGGAGATTCAAGAAGTAATCCCAGTGGAGATTCAAGAAATAACATAAAACTACAAGATAGTGATATTGAGAATCGCAATATTACCATTGTTACGCGACCATTATCCTGCATAGGCAGGGGCCGGGGTTTAAGCAATCCCTCAAGTTTAACTACGTCATCGGGTAAATCGGATAAAATTACTGAAAACGAAGAACCTGGTCAAATTAAAAAGTTTGTAGGTCGCGGTAGAGGATTGTTGAATTCTCAGAAAGAATGTTCAAACTCTACTCCATCTGAAGTTTCAAATGAATTGAAACAAATGAAAATTTCAAATGATGATAAAATGACGGTTTCTTCAGAAGCA'

In [146]:
translate(rnaiwi3)

'GIGRGRSRGLLMQKFLNKDVLVPSVESLEDKALNKLGIPPPGSTVEKNTESSSISSGDSRSNPSGDSRNNIKLQDSDIENRNITIVTRPLSCIGRGRGLSNPSSLTTSSGKSDKITENEEPGQIKKFVGRGRGLLNSQKECSNSTPSEVSNELKQMKISNDDKMTVSSEA'

In [147]:
translate(seqs[5]) == translate(rnaiwi3)

True

In [148]:
seqs[5]

'GGTATTGGACGAGGAAGATCAAGAGGATTATTAATGCAGAAATTCTTAAACAAGGACGTATTAGTACCATCAGTAGAGTCACTTGAGGATAAGGCATTAAACAAATTAGGTATACCTCCACCAGGATCAACAGTTGAGAAGAACACTGAATCTTCATCAATTTCTTCAGGTGACTCTCGATCAAACCCATCAGGTGACTCTCGAAACAATATTAAGTTACAGGACTCAGACATAGAAAACAGAAACATAACAATAGTAACAAGACCTCTTTCATGTATTGGAAGAGGAAGAGGACTTTCAAACCCATCTTCACTTACAACATCTTCAGGAAAGTCAGACAAGATAACAGAGAATGAGGAGCCAGGACAGATAAAGAAATTCGTTGGAAGAGGACGAGGTTTATTAAACTCACAAAAGGAGTGCTCTAATTCAACACCTTCAGAGGTATCTAACGAGTTAAAGCAGATGAAGATATCTAACGACGACAAGATGACAGTATCATCTGAGGCT'

## Recode p53 subdomain

In [166]:
p53_domain = 'TCTTCTTATCCAGGTCCGTATAATTTTCAAATCTTTATTCCCAACGGAGAATTTGATGAATCCAAAAGAAAAGGACAGACATGTGTGTTTCAAACCGATAAAATGGGAAATCACCAATTATTTACCAAACCTCATCCTCATTATTGGAGGTTAAATTATTCAGCTGATCCTTCTATGTCAACGGAAAACATGTATATTCGGATGGTTCCAGTTTTTGGGGATCCAGAAAAAGCTCAATGCATTTTGGAAAGATGTGCAAAACACAAAGAAGTAACAACCGATGAAAATCACTGGAAATATCGTAGCATGCTCATTGTAGAAAAAACCTGTGCACATTACTTTCAGGATTCGGCAACGAAAAGAGTTTGCATTTTATTACCGTTTGAAAAGCATGCGGAAGGAGAGATTTATTCTTCCGTCAACTGTCAATTTGCATGCTACAACAGTTGCTTTAATCAAGATTCAGGTGGTCGGAAAACACTTTATTTAATCATCACTCTAGAATTTCTCGATAAAAAAACAAATAAATTCGATGTATGGGGTCGACAGTGTTTGCAATTTCGCAGTTGTGCTTGTCCAAGTAGAGACTGGAGAGAT'

In [167]:
p53_domainprotein = translate(p53_domain)

In [168]:
p53_domainprotein

'SSYPGPYNFQIFIPNGEFDESKRKGQTCVFQTDKMGNHQLFTKPHPHYWRLNYSADPSMSTENMYIRMVPVFGDPEKAQCILERCAKHKEVTTDENHWKYRSMLIVEKTCAHYFQDSATKRVCILLPFEKHAEGEIYSSVNCQFACYNSCFNQDSGGRKTLYLIITLEFLDKKTNKFDVWGRQCLQFRSCACPSRDWRD'

In [182]:
recode = 501
seqs, seqs_small, cais, dists, cais_full, dists_full = sliding_window_RNAi_recoding(recode, p53_domain, p53_domainprotein,
                                                                                    aminoacidweights, gencodeweights, 
                                                 random = False, no_wobble = True, enforce_different_codons = True, wiggle = False)



In [183]:
dists

array([0.34730539, 0.34730539, 0.34730539, 0.3493014 , 0.3493014 ,
       0.3493014 , 0.3493014 , 0.3493014 , 0.3493014 , 0.3493014 ,
       0.3493014 , 0.3493014 , 0.3493014 , 0.34730539, 0.34730539,
       0.34730539, 0.34730539, 0.34730539, 0.34730539, 0.34730539,
       0.34730539, 0.3493014 , 0.35329341, 0.35329341, 0.35329341,
       0.35329341, 0.35329341, 0.35728543, 0.35728543, 0.35728543,
       0.35528942, 0.35528942])

In [172]:
dists*len(p53_domain)

array([207.94382022, 207.94382022, 206.8258427 , 206.8258427 ,
       206.8258427 , 206.8258427 , 206.8258427 , 206.8258427 ,
       206.8258427 , 206.8258427 , 207.94382022, 210.17977528,
       210.17977528, 210.17977528, 210.17977528, 210.17977528,
       212.41573034, 212.41573034, 212.41573034, 211.29775281,
       211.29775281])

In [184]:
np.argmax(dists)

27

In [186]:
translate(seqs[27])

'CVFQTDKMGNHQLFTKPHPHYWRLNYSADPSMSTENMYIRMVPVFGDPEKAQCILERCAKHKEVTTDENHWKYRSMLIVEKTCAHYFQDSATKRVCILLPFEKHAEGEIYSSVNCQFACYNSCFNQDSGGRKTLYLIITLEFLDKKTNKFDVWGRQCLQFRSCACPS'

In [190]:
seqs[27]

'TGCGTTTTCCAGACAGACAAGATGGGTAACCATCAGCTTTTCACAAAGCCACACCCACACTACTGGAGACTTAACTACTCTGCAGACCCATCAATGTCTACAGAGAATATGTACATAAGAATGGTACCTGTATTCGGAGACCCTGAGAAGGCACAGTGTATATTAGAGCGATGCGCTAAGCATAAGGAGGTTACTACAGACGAGAACCATTGGAAGTACAGATCAATGTTAATAGTTGAGAAGACATGCGCTCACTATTTCCAAGACTCAGCTACAAAGCGAGTATGTATACTTCTTCCATTCGAGAAACACGCAGAGGGTGAAATATACTCATCAGTAAATTGCCAGTTCGCTTGTTATAATTCATGTTTCAACCAGGACTCTGGAGGAAGAAAGACTTTATACCTTATAATAACATTAGAGTTCTTAGACAAGAAGACTAACAAGTTTGACGTTTGGGGAAGACAATGCTTACAGTTCAGATCATGCGCATGCCCTTCA'

In [176]:
dists[1]

0.34831460674157305

In [177]:
dists[16]

0.35580524344569286

In [178]:
seqs[1]

'TCATACCCTGGACCATACAACTTCCAGATATTCATACCAAATGGTGAGTTCGACGAGTCAAAGCGAAAGGGTCAAACTTGCGTTTTCCAGACAGACAAGATGGGTAACCATCAGCTTTTCACAAAGCCACACCCACACTACTGGAGACTTAACTACTCTGCAGACCCATCAATGTCTACAGAGAATATGTACATAAGAATGGTACCTGTATTCGGAGACCCTGAGAAGGCACAGTGTATATTAGAGCGATGCGCTAAGCATAAGGAGGTTACTACAGACGAGAACCATTGGAAGTACAGATCAATGTTAATAGTTGAGAAGACATGCGCTCACTATTTCCAAGACTCAGCTACAAAGCGAGTATGTATACTTCTTCCATTCGAGAAACACGCAGAGGGTGAAATATACTCATCAGTAAATTGCCAGTTCGCTTGTTATAATTCATGTTTCAACCAGGACTCTGGAGGAAGAAAGACTTTATACCTTATAATAACATTAGAGTTCTTAGACAAGAAGACTAACAAGTTTGACGTT'

In [179]:
dists[16]

0.35580524344569286

In [181]:
translate(seqs[16])

'EFDESKRKGQTCVFQTDKMGNHQLFTKPHPHYWRLNYSADPSMSTENMYIRMVPVFGDPEKAQCILERCAKHKEVTTDENHWKYRSMLIVEKTCAHYFQDSATKRVCILLPFEKHAEGEIYSSVNCQFACYNSCFNQDSGGRKTLYLIITLEFLDKKTNKFDVWGRQCLQFRSCACPS'