In [10]:
import os
import glob
import copy
import numpy as np
import Bio
import scipy.spatial
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from rnai_scripts import *
import bokeh.io
import bokeh.plotting

# Enable viewing Bokeh plots in the notebook: after this call, the
# bokeh.io.show() calls in the plotting cells render inline in the cell output.
bokeh.io.output_notebook()

# RNAi recoding 

## Reading in the Smed transcriptome
We read in the Smed_v6 transcriptome ORFs that were extracted using orfipy. We then join them all into one string and obtain the weights (based on the weight definition for the codon adaptation index, CAI) for all codons:

$$w_i = \frac{f_i}{\max_j (f_j)}, \quad i,j \in [ \text{synonymous codons for amino acid} ]$$

Where $f_i$ is the frequency of codon $i$. 

We obtain two dictionaries: 


aminoacidweights: keys are amino acids, values are arrays of $w_i$ for all synonymous codons. The order of the codons is the same as that used in aminoacidcode. 
    
gencodeweights: keys are codons, values are $w_i$ for each codon

In [2]:
# FASTA file holding the ORFs of the Smed_v6 transcriptome.
fname = 'data/dd_Smed_v6_transcripts_orfs.fa'
# read_many_fasta is not defined in this notebook; presumably it comes from the
# star import of rnai_scripts and returns the FASTA headers and the sequences
# as two parallel collections -- TODO confirm.
descriptors, seqs = read_many_fasta(fname)
# Join all ORFs into one large transcriptome string.
transcriptome = ''.join(seqs)
# Get the per-amino-acid weights and the per-codon weights computed from the
# concatenated transcriptome.
aminoacidweights, gencodeweights = get_codon_weights_dic(transcriptome)

We pickle dump everything so we do not have to repeat the above line later. 

In [3]:
# Cache the weight dictionaries on disk so the transcriptome does not have to
# be re-processed later. Each file is opened in a `with` block so the handle
# is flushed and closed as soon as the dump finishes (the bare open() calls
# used previously were never closed explicitly).
with open("data/Smed_transcriptome_aminoacidweights.p", "wb") as fh:
    pickle.dump(aminoacidweights, fh)

with open("data/Smed_transcriptome_gencodeweights.p", "wb") as fh:
    pickle.dump(gencodeweights, fh)

# aminoacidcode is not defined in this notebook; presumably it is provided by
# the star import of rnai_scripts -- TODO confirm.
with open("data/aminoacidcode.p", "wb") as fh:
    pickle.dump(aminoacidcode, fh)

We reload everything from the pickle files; in later sessions this cell can be run instead of recomputing the weights. 

In [4]:
# Reload the cached dictionaries. The files are opened in `with` blocks so the
# handles are closed deterministically.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that were written by this notebook (or another trusted source).
with open("data/Smed_transcriptome_aminoacidweights.p", "rb") as fh:
    aminoacidweights = pickle.load(fh)

with open("data/Smed_transcriptome_gencodeweights.p", "rb") as fh:
    gencodeweights = pickle.load(fh)

# This rebinds the name aminoacidcode in the notebook namespace to the
# unpickled object.
with open("data/aminoacidcode.p", "rb") as fh:
    aminoacidcode = pickle.load(fh)

## Recoding the H2B ORF for RNAi

Here we define the ORF for the histone 2B gene.

In [5]:
# Nucleotide sequence of the histone 2B ORF, written in lowercase and without
# a stop codon; the next cell upper-cases it and appends 'TAA'.
H2B_ORF = 'atggcaattaaaggtaagatcgctgctaagtctgttaagaagatttcaaaggaagttgctcctaaaacagacaaaaagaaaaggatacataaacgcaaagaaagttatggtatttacatctacaaagtgttgagacaagttcatccagatactggaatttctggcaaagcaatgtctattatgaatagctttgtcaacgatgttttcgaaagaatcgcttcggaggctagcaagttggcaacttataataagaaatcaaccataaccagcagagagattcaaactgcagtcaggttaattttaccaggagaattggctaaacacgcagttagtgaaggaaccaaagctgttacaaaatacacaggatccaaa'

We add on the stop codon. 

In [6]:
# Upper-case the ORF and append the TAA stop codon.
# The assignment is self-referential, so the guard keeps the cell idempotent:
# re-running it must not append a second stop codon.
H2B_ORF = H2B_ORF.upper()
if not H2B_ORF.endswith('TAA'):
    H2B_ORF += 'TAA'

Here we define the protein for the histone 2B gene.

In [7]:
# Histone 2B protein sequence in one-letter amino-acid codes. The trailing '*'
# presumably stands for the stop codon appended to H2B_ORF -- TODO confirm
# against get_RNAi_seq.
H2B_protein = 'MAIKGKIAAKSVKKISKEVAPKTDKKKRIHKRKESYGIYIYKVLRQVHPDTGISGKAMSIMNSFVNDVFERIASEASKLATYNKKSTITSREIQTAVRLILPGELAKHAVSEGTKAVTKYTGSK*'


Now we can use the function get_RNAi_seq to randomly sample different recoded H2B proteins. 

The function get_RNAi_seq requires the ORF, the protein sequence, and the aminoacidweights and gencodeweights dictionaries. We run 1000 random samples and do not enforce that every codon be different. It returns the list of tested sequences (seqs), the scores ($CAI + D$/2) for each sequence, the codon adaptation indices (CAIs), and the Hamming distances (dists = $D$). 

In [8]:
# Sample 1000 recoded H2B sequences without forcing every codon to change.
# Note that `seqs` here replaces the list of transcriptome ORFs read earlier.
# NOTE(review): the notebook text describes this sampling as random and no
# seed is set in the visible cells, so reruns may give different results --
# confirm which RNG get_RNAi_seq uses and seed it if reproducibility matters.
seqs, scores, cais, dists = get_RNAi_seq(
    H2B_ORF,
    H2B_protein,
    aminoacidweights,
    gencodeweights,
    trials=1000,
    enforce_different_codons=False,
)


We redo the process, this time with 5000 trials, and enforce that every codon must be different. 

In [35]:
# Sample 5000 recoded H2B sequences, this time requiring every codon to differ
# from the original ORF.
# NOTE(review): no seed is set in the visible cells, so reruns may give
# different results -- confirm which RNG get_RNAi_seq uses.
seqs_diff, scores_diff, cais_diff, dists_diff = get_RNAi_seq(
    H2B_ORF,
    H2B_protein,
    aminoacidweights,
    gencodeweights,
    trials=5000,
    enforce_different_codons=True,
)


We define a function to compute empirical cumulative distribution functions (ECDFs).

In [11]:
def ecdf_vals(data):
    """Compute the points of an empirical cumulative distribution function.

    Parameters
    ----------
    data : sequence or array
        One-dimensional collection of observations; must support ``len``.

    Returns
    -------
    x : numpy.ndarray
        The observations sorted in ascending order.
    y : numpy.ndarray
        ECDF heights ``1/n, 2/n, ..., 1`` for the ``n`` sorted observations.
    """
    x_sorted = np.sort(data)
    n_points = len(data)
    # The k-th smallest observation (1-based) sits at height k / n.
    y_heights = np.arange(1, n_points + 1) / n_points
    return x_sorted, y_heights


We plot ECDFs of the CAIs.  

In [33]:
# ECDF of the CAIs for both sampling runs, drawn on one figure.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='CAI',
    y_axis_label='ECDF',
)

# Keep the sorted values under new names: `cais` and `cais_diff` stay in the
# order returned by get_RNAi_seq, so they remain index-aligned with
# seqs/scores (and seqs_diff/scores_diff) for any later use.
cais_sorted, ecdf_cais = ecdf_vals(cais)
# scatter() draws circle markers by default; figure.circle() with
# screen-sized markers is deprecated in recent Bokeh releases.
p.scatter(cais_sorted, ecdf_cais, legend_label='Not all different ')

cais_diff_sorted, ecdf_cais_diff = ecdf_vals(cais_diff)
p.scatter(cais_diff_sorted, ecdf_cais_diff, legend_label='all different',
          color='orange')

p.legend.location = 'bottom_right'
bokeh.io.show(p)

We plot ECDFs of the Hamming distances.

In [34]:
# ECDF of the Hamming distances for both sampling runs, drawn on one figure.
# The x-axis window is fixed to [0.1, 0.6] at construction time; passing the
# range to figure() avoids reaching into bokeh.models, which this notebook
# never imports explicitly.
# NOTE(review): the 0.1-0.6 window suggests the distances are normalised
# fractions rather than raw counts -- confirm against get_RNAi_seq.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='Hamming Distance',
    y_axis_label='ECDF',
    x_range=(0.1, 0.6),
)

# Keep the sorted values under new names: `dists` and `dists_diff` stay in
# the order returned by get_RNAi_seq, so they remain index-aligned with
# seqs/scores (and seqs_diff/scores_diff) for any later use.
dists_sorted, ecdf_dists = ecdf_vals(dists)
# scatter() draws circle markers by default; figure.circle() with
# screen-sized markers is deprecated in recent Bokeh releases.
p.scatter(dists_sorted, ecdf_dists, legend_label='Not all different ')

dists_diff_sorted, ecdf_dists_diff = ecdf_vals(dists_diff)
p.scatter(dists_diff_sorted, ecdf_dists_diff, legend_label='all different',
          color='orange')

p.legend.location = 'bottom_right'
bokeh.io.show(p)