In [1]:
import os
import glob
import copy
import numpy as np
import Bio
import scipy.spatial
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from rnai_scripts import *
import bokeh.io
import bokeh.plotting

# Enable viewing Bokeh plots in the notebook
bokeh.io.output_notebook()

In [2]:
def ecdf_vals(data):
    """Return x and y values for an ECDF."""
    return np.sort(data), np.arange(1, len(data)+1) / len(data)


# RNAi recoding 

## Reading in the Smed transcriptome
We read in the Smed_v6 transcriptome orfs that were extracted using orfipy. We then join them all into one string and obtain the codon frequencies. 

In [6]:
fname = 'data/dd_Smed_v6_transcripts_orfs_large3.fa' # makes smallest proteins be around 30 amino acids
descriptors, seqs = read_many_fasta(fname)
# join all ORFS into one large transcriptome 
transcriptome = ''.join(seqs)
# get aminoacidweights and codon weights 

codon_frequencies_dic = get_codon_frequencies(transcriptome) 

Now we get frequencies of doublets

In [7]:
doubletscode = get_codon_frequencies_doublets(transcriptome)

I also found a published version of amino acid frequencies:

In [8]:
df = pd.read_csv('data/codon_usage_smed.csv')


AAs = df['codon'].values
freqs = df['frequency'].values/1000.

codon_frequencies_dic_published = {}
for i in range(len(AAs)):
    codon_frequencies_dic_published[AAs[i]] = freqs[i]
print(sum(freqs))

1.00000000001


Here we find the discrepencies between the frequencies of each doublet vs. the product frequency of the separate codons. 

In [9]:
diff_dic = {}
for pair in doubletscode.keys():
    if 'TAA' == pair[:3]:
        continue
    if 'TAG' == pair[:3]:
        continue
    if 'TGA' == pair[:3]:
        continue
    
    diff_dic[pair] = (doubletscode[pair] - codon_frequencies_dic[pair[:3]]*codon_frequencies_dic[pair[3:]])

In [10]:
# Make figure
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='diff',
    y_axis_label='Dist',
    x_axis_type = 'log'
    
)
diffs, ecdf_diffs = ecdf_vals(np.array(list(doubletscode.values())))
print(np.sum(np.array(list(doubletscode.values()))))
p.circle(diffs, ecdf_diffs)

#diffs, ecdf_diffs = ecdf_vals(np.array(list(doublets.values())))
#p.circle(diffs, ecdf_diffs, color = 'orange')
bokeh.io.show(p)

1.0


We use our codon frequencies dictionary to compute CAI weights (based on the weight definition for the CAI) for all codons 

$$w_i = \frac{f_i}{\max (f_j)} i,j \in [ \text{synonymouse codons for amino acid} ]$$

Where $f_i$ is the frequency of codon $i$. 

We obtain two dictionaries: 


aminoacidweights: keys are amino acids, values are arrays of $w_i$ for all synonymous codons. The order of the codons is the as those used in aminoacidcode. 
    
gencodeweights: keys are codons, values are $w_i$ for each codon

In [12]:
aminoacidweights, gencodeweights = get_codon_weights(codon_frequencies_dic)

We pickle dump everything so we do not have to repeat the above line later. 

In [13]:
pickle.dump( aminoacidweights,
            open( "data/Smed_transcriptome_aminoacidweights.p", "wb" ) )
pickle.dump( gencodeweights, 
            open( "data/Smed_transcriptome_gencodeweights.p", "wb" ) )
pickle.dump( aminoacidcode,
            open( "data/aminoacidcode.p", "wb" ))
pickle.dump( doubletscode,
            open( "data/doubletscode.p", "wb" ))

We reload everything with pickle because why not. 

In [14]:
aminoacidweights = pickle.load( open( "data/Smed_transcriptome_aminoacidweights.p",
                                     "rb" ) )
gencodeweights = pickle.load( open( "data/Smed_transcriptome_gencodeweights.p", 
                                   "rb" ) )
aminoacidcode = pickle.load(open("data/aminoacidcode.p", 'rb'))
doubletscode = pickle.load(
            open( "data/doubletscode.p", "rb" ))

## Recoding the H2B ORF for RNAi

Here we define the ORF for the histone 2B gene.

In [15]:
H2B_ORF = 'atggcaattaaaggtaagatcgctgctaagtctgttaagaagatttcaaaggaagttgctcctaaaacagacaaaaagaaaaggatacataaacgcaaagaaagttatggtatttacatctacaaagtgttgagacaagttcatccagatactggaatttctggcaaagcaatgtctattatgaatagctttgtcaacgatgttttcgaaagaatcgcttcggaggctagcaagttggcaacttataataagaaatcaaccataaccagcagagagattcaaactgcagtcaggttaattttaccaggagaattggctaaacacgcagttagtgaaggaaccaaagctgttacaaaatacacaggatccaaa'

We add on the stop codon. 

In [16]:
H2B_ORF = H2B_ORF.upper() + 'TAA'

Here we define the protein for the histone 2B gene.

In [17]:
H2B_protein = 'MAIKGKIAAKSVKKISKEVAPKTDKKKRIHKRKESYGIYIYKVLRQVHPDTGISGKAMSIMNSFVNDVFERIASEASKLATYNKKSTITSREIQTAVRLILPGELAKHAVSEGTKAVTKYTGSK*'


Now we can use the function get_RNAi_seq to randomly sample different recoded H2B proteins. 

The function get_RNAi_seq requires the ORF, protein sequence, an aminoacidweights and gencodeweights dictionary. We run 1000 random samples and do not enforce that every codon be different. It returns the list of tested sequences (seqs), scores ($CAI + D$/2) for each sequence, codon adaptation indices (CAIs), and Hamming distances (dists = $D$). 

In [18]:
seqs, scores, cais, dists = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1000,  enforce_different_codons = False, random = True)

best_seq, best_score, best_cai, best_dist = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1,  enforce_different_codons = False, random = False)
print(best_cai, best_dist)

[1.0] [0.184]


We redo the process but enforce that every codon must be different. 

In [19]:
seqs_diff, scores_diff, cais_diff, dists_diff = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1000,  enforce_different_codons = True, random = True)

best_seq_diff, best_score_diff, best_cai_diff, best_dist_diff = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1,  enforce_different_codons = True, random = False)
print(best_cai_diff, best_dist_diff)

[0.715260684985255] [0.3546666666666667]


We find the best sequences of our random simulation

In [20]:
print(np.max(cais_diff), np.max(dists_diff))

0.6295778040767388 0.39466666666666667


We repeat with wiggle. 

In [21]:
seqs_diff, scores_diff, cais_wiggle, dists_wiggle = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights,
                            gencodeweights, trials = 1000,  enforce_different_codons = True, random = True, wiggle = True,)

best_seq_diff, best_score_diff, best_cai_diff_wiggle, best_dist_diff_wiggle = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1,  enforce_different_codons = True, random = False,  wiggle = True
                                                                            )
print(best_cai_diff_wiggle, best_dist_diff_wiggle)

[0.715260684985255] [0.3546666666666667]


In [23]:
print(np.max(cais_wiggle), np.max(dists_wiggle))

0.6948839698560052 0.376


Doublets baby

In [24]:
seqs_doub, scores_doub, cais_doub, dists_doub = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1000,  enforce_different_codons =True, random = True,
                                                            pairs = True, doubletscode = doubletscode)

best_seq_doub, best_score_doub, best_cai_doub, best_dist_doub = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1,  enforce_different_codons = True, random = False,
                                                                            pairs = True, doubletscode = doubletscode,)
print(best_cai_doub, best_dist_doub)

[0.7035383596762999] [0.352]


In [27]:
seqs_doub, scores_doub, cais_doub_wigg, dists_doub_wigg = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1000,  enforce_different_codons =True, random = True, wiggle = True,
                                                            pairs = True, doubletscode = doubletscode)

best_seq_doub, best_score_doub, best_cai_doub_wiggle, best_dist_doub_wiggle = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights, 
                            gencodeweights, trials = 1,  enforce_different_codons = True, random = False, wiggle = True,
                                                                            pairs = True, doubletscode = doubletscode,)
print(best_cai_doub_wiggle, best_dist_doub_wiggle)

[0.7370931356988379] [0.33866666666666667]


We define a function to compute ECDFs

We plot ECDFs of the CAIs.  

In [28]:
# Make figure
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='CAI',
    y_axis_label='ECDF',
    
)
cais, ecdf_cais = ecdf_vals(cais)
p.circle(cais, ecdf_cais, legend_label = 'Not all different ')

cais_diff, ecdf_cais_diff = ecdf_vals(cais_diff)
p.circle(cais_diff, ecdf_cais_diff, legend_label = 'all different', color = 'orange')

cais_wiggle, ecdf_cais_wiggle = ecdf_vals(cais_wiggle)
p.circle(cais_wiggle, ecdf_cais_wiggle, legend_label = 'all different wiggle', color = 'green')

cais_doub, ecdf_cais_doub = ecdf_vals(cais_doub)
p.circle(cais_doub, ecdf_cais_doub, legend_label = 'doublets', color = 'red')

cais_doub_wiggle, ecdf_cais_doub_wiggle = ecdf_vals(cais_doub_wigg)
p.circle(cais_doub_wiggle, ecdf_cais_doub_wiggle,
         legend_label = 'doublets wig', color = 'pink')


p.legend.location = 'bottom_right'
bokeh.io.show(p)

We plot ECDFs of the hamming distances 

In [29]:
# Make figure
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='Hamming Distance',
    y_axis_label='ECDF',
    
)
dists, ecdf_dists = ecdf_vals(dists)
p.circle(dists, ecdf_dists, legend_label = 'Not all different ')

dists_diff, ecdf_dists_diff = ecdf_vals(dists_diff)
p.circle(dists_diff, ecdf_dists_diff, legend_label = 'all different', color = 'orange')


dists_diff_wiggle, ecdf_dists_diff_wiggle = ecdf_vals(dists_wiggle)
p.circle(dists_diff_wiggle, ecdf_dists_diff_wiggle, legend_label = 'wiggle', color = 'green')

dists_doub, ecdf_dists_doub = ecdf_vals(dists_doub)
p.circle(dists_doub, ecdf_dists_doub, legend_label = 'doublets', color = 'red')

dists_doub_wiggle, ecdf_dists_doub_wiggle = ecdf_vals(dists_doub_wigg)
p.circle(dists_doub_wiggle, ecdf_dists_doub_wiggle,
         legend_label = 'doublets wig', color = 'pink')


p.legend.location = 'bottom_right'
p.x_range = bokeh.models.Range1d(.1, .6)
bokeh.io.show(p)

## Now we recode the luc ORFS!!!! 

Since SmedNluc2 is so short we must RNAi the whole thing. 

In [32]:
SmedNluc2 = 'ATGGTGTTTACTTTGGAAGATTTTGTTGGAGATTGGAGACAAACTGCTGGTTACAATCTGGATCAGGTACTGGAACAAGGCGGTGTTAGTTCATTATTCCAAAACCTGGGTGTGAGTGTAACTCCGATTCAGCGAATAGTGTTGTCTGGAGAAAATGGGCTGAAGATTGATATACACGTCATAATTCCATACGAAGGCTTAAGCGGTGATCAAATGGGACAAATTGAAAAAATTTTTAAAGTAGTTTACCCAGTTGACGACCATCATTTTAAAGTTATCCTTCATTACGGTACACTGGTTATAGATGGTGTAACTCCAAATATGATCGATTATTTCGGAAGACCTTACGAAGGCATAGCCGTTTTTGATGGAAAAAAGATTACAGTAACAGGTACATTGTGGAACGGAAATAAGATTATTGACGAACGTTTAATTAACCCAGATGGAAGTTTGCTCTTTAGAGTTACAATTAATGGTGTGACAGGATGGAGATTATGCGAACGGATACTCGCGTAA'

In [33]:
smednluc2_protein = 'MVFTLEDFVGDWRQTAGYNLDQVLEQGGVSSLFQNLGVSVTPIQRIVLSGENGLKIDIHVIIPYEGLSGDQMGQIEKIFKVVYPVDDHHFKVILHYGTLVIDGVTPNMIDYFGRPYEGIAVFDGKKITVTGTLWNGNKIIDERLINPDGSLLFRVTINGVTGWRLCERILA*'

In [34]:
hluc = 'ATGGTCTTCACACTCGAAGATTTCGTTGGGGACTGGCGACAGACAGCCGGCTACAACCTGGACCAAGTCCTTGAACAGGGAGGTGTGTCCAGTTTGTTTCAGAATCTCGGGGTGTCCGTAACTCCGATCCAAAGGATTGTCCTGAGCGGTGAAAATGGGCTGAAGATCGACATCCATGTCATCATCCCGTATGAAGGTCTGAGCGGCGACCAAATGGGCCAGATCGAAAAAATTTTTAAGGTGGTGTACCCTGTGGATGATCATCACTTTAAGGTGATCCTGCACTATGGCACACTGGTAATCGACGGGGTTACGCCGAACATGATCGACTATTTCGGACGGCCGTATGAAGGCATCGCCGTGTTCGACGGCAAAAAGATCACTGTAACAGGGACCCTGTGGAACGGCAACAAAATTATCGACGAGCGCCTGATCAACCCCGACGGCTCCCTGCTGTTCCGAGTAACCATCAACGGAGTGACCGGCTGGCGGCTGTGCGAACGCATTCTGGCGTAA'

I wonder what the CAI for each ORF is?

In [35]:
print('CAI for SMed Nuc:', get_CAI(SmedNluc2, gencodeweights))
print('CAI for Human Nuc:', get_CAI(hluc, gencodeweights))
print('Hamming Distance vs Smed vs Human Nuc', get_hamming_dist(SmedNluc2, hluc))

CAI for SMed Nuc: 0.737701636271008
CAI for Human Nuc: 0.5470635190087074
Hamming Distance vs Smed vs Human Nuc 0.25775193798449614


Seems ok, now lets recoede the Smed ORF with doublest code but do not enforce all codons to be different 

In [36]:
seqs_doub, scores_doub, cais_doub, dists_doub = get_RNAi_seq(SmedNluc2, smednluc2_protein, aminoacidweights, 
                            gencodeweights, trials = 1000,  enforce_different_codons =False, random = True,
                                                            pairs = True, doubletscode = doubletscode)

best_seq_doub, best_score_doub, best_cai_doub, best_dist_doub = get_RNAi_seq(SmedNluc2, smednluc2_protein, aminoacidweights, 
                            gencodeweights, trials = 1,  enforce_different_codons = False, random = False,
                                                                            pairs = True, doubletscode = doubletscode,)
print(best_cai_doub, best_dist_doub)

[0.9940680734100419] [0.19186046511627908]


We redo the process but enforce that every codon must be different. 

In [37]:
seqs_diff, scores_diff, cais_diff, dists_diff = get_RNAi_seq(SmedNluc2, smednluc2_protein, aminoacidweights, 
                            gencodeweights, trials = 1000,  enforce_different_codons = True, random = True,
                                                            pairs = True, doubletscode = doubletscode)

best_seq_diff, best_score_diff, best_cai_diff, best_dist_diff = get_RNAi_seq(SmedNluc2, smednluc2_protein, aminoacidweights, 
                            gencodeweights, trials = 1,  enforce_different_codons = True, random = False,
                                                                            pairs = True, doubletscode = doubletscode,)
print(best_cai_diff, best_dist_diff)

[0.7160654283084763] [0.3546511627906977]


In [38]:
print('Max CAI for any sequence: ', max(cais_doub))
print('Max CAI for all codons different sequence: ', max(cais_diff))

Max CAI for any sequence:  0.7960430697689278
Max CAI for all codons different sequence:  0.6329315643003521


In [39]:
print('Max Hamming Distance for any sequence: ', max(dists_doub))
print('Max Hamming Distance for all codons different sequence: ', max(dists_diff))

Max Hamming Distance for any sequence:  0.26356589147286824
Max Hamming Distance for all codons different sequence:  0.37209302325581395


We plot ECDFs of the CAIs.  

In [40]:
# Make figure
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='CAI',
    y_axis_label='ECDF',
)
cais_sort, ecdf_cais = ecdf_vals(cais_doub)
p.circle(cais_sort, ecdf_cais, legend_label = 'Not all different ')

cais_diff_sort, ecdf_cais_diff = ecdf_vals(cais_diff)
p.circle(cais_diff_sort, ecdf_cais_diff, legend_label = 'all different', color = 'orange')

cais_diff_sort, ecdf_cais_diff = ecdf_vals(cais_diff_doub)
p.circle(cais_diff_sort, ecdf_cais_diff, legend_label = 'doub', color = 'red')

cais_diff_sort, ecdf_cais_diff = ecdf_vals(cais_diff_doub)
p.circle(cais_diff_sort, ecdf_cais_diff, legend_label = 'doub', color = 'red')

p.legend.location = 'bottom_right'
bokeh.io.show(p)

NameError: name 'cais_diff_doub' is not defined

We plot ECDFs of the hamming distances 

In [None]:
# Make figure
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='Hamming Distance',
    y_axis_label='ECDF',
    
)
dists_sort, ecdf_dists = ecdf_vals(dists_doub)
p.circle(dists_sort, ecdf_dists, legend_label = 'Not all different ')

dists_diff_sort, ecdf_dists_diff = ecdf_vals(dists_diff)
p.circle(dists_diff_sort, ecdf_dists_diff, legend_label = 'all different', color = 'orange')
p.legend.location = 'bottom_right'
p.x_range = bokeh.models.Range1d(.1, .6)
bokeh.io.show(p)

It looks like if they are all different the hamming distance is pretty similar between all sequences, which suggests we should mainly choose by CAI for thehse cases. 

In [None]:
sorted_inds = np.argsort(cais_diff)
seqs_sort = np.array(seqs)[sorted_inds]

For the sequences in which not every codon is enforced to be the same distance matters. 