In [1]:
import os
import glob
import copy
import numpy as np
import Bio
import scipy.spatial
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from rnai_scripts import *
import bokeh.io
import bokeh.plotting

# Enable viewing Bokeh plots in the notebook
bokeh.io.output_notebook()

# RNAi recoding 

## Reading in the Smed transcriptome
We read in the Smed_v6 transcriptome orfs that were extracted using orfipy. We then join them all into one string and obtain the codon frequencies. 

In [29]:
# FASTA file with the ORF set used for the "_large" frequency tables below.
fname = 'data/dd_Smed_v6_transcripts_orfs_large3.fa'
descriptors, seqs = read_many_fasta(fname)
# Join all ORFs into one long string that stands in for the transcriptome.
# NOTE(review): presumably the frequency helpers read the string in consecutive
# triplets; if so, frames only stay aligned when every ORF length is a multiple
# of 3 — confirm against rnai_scripts.
transcriptome = ''.join(seqs)

# Single-codon frequencies and codon-pair (doublet) frequencies of the joined ORFs.
codon_frequencies_dic_sw_large = get_codon_frequencies(transcriptome) 
di_codon_frequencies_dic_sw_large = get_codon_frequencies_doublets(transcriptome) 

In [16]:
# Second ORF set; its codon frequencies are kept in `codon_frequencies_dic_sw`.
# NOTE: this cell overwrites `descriptors`, `seqs` and `transcriptome` from the
# previous loading cell, so later cells see whichever of the two ran last.
fname = 'data/dd_Smed_v6_transcripts_orfs.fa'
descriptors, seqs = read_many_fasta(fname)
# Join all ORFs into one long string that stands in for the transcriptome.
transcriptome = ''.join(seqs)

# Single-codon frequencies of the joined ORFs.
codon_frequencies_dic_sw = get_codon_frequencies(transcriptome) 

In [81]:
# Codon-pair (doublet) frequencies of the current `transcriptome` string.
# NOTE(review): `transcriptome` is reassigned by both loading cells above, so the
# ORF set used here depends on execution order. Further down, `doublets` is compared
# against codon_frequencies_dic_sw_large — confirm both come from the same file.
doublets = get_codon_frequencies_doublets(transcriptome)

{}
4096


In [84]:
#doublets

In [66]:
# Sanity check on the number of ordered codon pairs: 64*63 pairs of two different
# codons plus 64 pairs of identical codons = 64**2 = 4096.
64*63 + 64

4096

I also found a published table of codon frequencies:

In [30]:
# Published codon-usage table with one row per codon.
df = pd.read_csv('data/codon_usage_smed.csv')

# NOTE: despite its name, `AAs` holds the entries of the 'codon' column (codons,
# not amino acids). The name is kept because later cells iterate over it.
AAs = df['codon'].values
# Rescale the 'frequency' column by 1/1000 so that the values sum to ~1
# (checked by the print below).
freqs = df['frequency'].values/1000.

# Map each codon to its published frequency.
codon_frequencies_dic = dict(zip(AAs, freqs))
print(sum(freqs))

1.00000000001


In [31]:
# For every codon in the published table, collect the three frequency estimates
# side by side as [published, codon_frequencies_dic_sw, codon_frequencies_dic_sw_large].
codon_frequencies_compare = {}
# Accumulated absolute deviation from the published table (summed over codons)
# for the "_large" table and for the other table respectively.
diffs_large = 0
diffs_small = 0
for AA in AAs:  # `AA` is a codon here: AAs comes from the 'codon' column
    codon_frequencies_compare[AA] = [codon_frequencies_dic[AA], codon_frequencies_dic_sw[AA],
                                     codon_frequencies_dic_sw_large[AA]]
    diffs_large += np.abs(codon_frequencies_dic[AA]-codon_frequencies_dic_sw_large[AA])
    diffs_small +=np.abs(codon_frequencies_dic[AA]-codon_frequencies_dic_sw[AA])

In [88]:
def random_reverse_translate(protein_seq, weights_dic = None, change_seq = '',
                                       doubletscode = doubletscode, pairs = True):
    """
    Return a random cDNA sequence for a given protein sequence (protein_seq).

    Each amino acid is turned into one codon drawn with get_random_codon.

    Parameters
    ----------
    protein_seq : str
        Protein sequence in single-letter code; it must round-trip through
        translate() (see the assertion at the end).
    weights_dic : dict or None
        Optional mapping amino acid -> array of codon weights. When falsy,
        np.zeros(1) is passed on as weights.
    change_seq : str
        Optional reference coding sequence. When non-empty, the codon at the same
        position (change_seq[3*i:3*(i+1)]) is passed as `dont_use` so that the
        drawn codon can differ from it.
    doubletscode : dict
        Codon-pair table forwarded to get_random_codon. The default is bound when
        this `def` executes, so a global `doubletscode` must already exist then.
    pairs : bool
        If True, the previously drawn codon is passed as `prev_codon`; if False,
        an empty `prev_codon` is passed for every position.

    Returns
    -------
    str
        The concatenated codons.

    Raises
    ------
    AssertionError
        If translating the generated sequence does not give back protein_seq.
    """
    codons = []
    weights = np.zeros(1)
    prev_codon = ''
    for i, aminoacid in enumerate(protein_seq):
        if weights_dic:
            weights = weights_dic[aminoacid]

        # Only forward `dont_use` when a reference sequence was supplied, exactly
        # as the two original (otherwise identical) branches did.
        extra_kwargs = {}
        if len(change_seq) > 0:
            extra_kwargs['dont_use'] = change_seq[3*i:3*(i+1)]

        random_codon = get_random_codon(aminoacid, doubletscode = doubletscode,
                                        weights = weights,
                                        # without pairs, no information about the previous codon
                                        prev_codon = prev_codon if pairs else '',
                                        **extra_kwargs)
        codons.append(random_codon)
        prev_codon = random_codon

    # Join once at the end instead of growing a string inside the loop.
    cDNA_seq = ''.join(codons)
    assert translate(cDNA_seq) == protein_seq
    return cDNA_seq

In [86]:
# Codon-pair frequency table used as the default `doubletscode` argument of
# random_reverse_translate and get_random_codon. Default arguments are evaluated
# when a `def` executes, so this assignment has to run before those cells.
doubletscode = doublets

In [186]:
def get_random_codon(aminoacid, aminoacidcode = aminoacidcode, doubletscode = doubletscode,
                     dont_use = '', weights = np.zeros(1),
                    prev_codon = ''):
    '''Draw one random codon for `aminoacid` (single-letter amino acid code).

    Parameters
    ----------
    aminoacid : str
        Key into `aminoacidcode`.
    aminoacidcode : dict
        Maps an amino acid to its sequence of synonymous codons.
    doubletscode : dict
        Maps a six-letter codon pair (previous codon + candidate codon) to a
        weight. Only used when `prev_codon` is given.
    dont_use : str
        Codon to exclude from the draw (case-insensitive). Ignored when the amino
        acid has fewer than two codons.
    weights : array-like
        One weight per codon, in the order of aminoacidcode[aminoacid]. If any
        entry is zero (including the np.zeros(1) default), uniform weights are
        used instead.
    prev_codon : str
        Previously drawn codon. When non-empty, the codon weights are replaced by
        the pair weights doubletscode[prev_codon + codon].

    Returns
    -------
    The codon selected by np.random.choice.

    Raises
    ------
    ValueError
        If `dont_use` is not one of the codons of `aminoacid`.
    KeyError
        If the amino acid or a needed codon pair is missing from the tables.

    Both table defaults are bound when this `def` executes, so `aminoacidcode`
    and `doubletscode` must already exist at that point.
    '''
    codon_list = aminoacidcode[aminoacid]

    weights = np.asarray(weights)
    if np.any(weights == 0):
        weights = np.ones(len(codon_list)) # uniformly distributed weights list
    if len(codon_list) < 2:
        dont_use = '' # only one option

    if dont_use != '':
        excluded = dont_use.upper()
        if excluded not in codon_list:
            raise ValueError(
                'codon %r does not encode amino acid %r' % (excluded, aminoacid))
        ind = codon_list.index(excluded)
        # Slicing already handles the last-element case, so one expression suffices.
        codon_list = codon_list[:ind] + codon_list[ind + 1:]
        weights = np.concatenate((weights[:ind], weights[ind + 1:]))

    if prev_codon != '':
        # Pair mode: weight each remaining codon by its pair weight with the
        # previous codon. This deliberately replaces the per-codon weights.
        weights = np.array([doubletscode[prev_codon + codon] for codon in codon_list],
                           dtype = float)

    total = np.sum(weights)
    if total == 0:
        # All remaining candidates have zero weight; dividing would give NaN
        # probabilities, so fall back to a uniform draw.
        weights = np.ones(len(codon_list))
        total = np.sum(weights)
    return np.random.choice(codon_list, p = weights/total)


In [188]:
codon = get_random_codon('I', dont_use = 'ATA')
codon

'ATC'

In [183]:
# Difference between the observed codon-pair frequency and the product of the two
# single-codon frequencies, for every pair whose FIRST codon is not a stop codon
# (pairs with a stop codon in second position are kept).
diff_dic = {}
for pair in doublets.keys():
    if pair[:3] in ('TAA', 'TAG', 'TGA'):
        continue

    diff_dic[pair] = (
        doublets[pair]
        - codon_frequencies_dic_sw_large[pair[:3]]*codon_frequencies_dic_sw_large[pair[3:]]
    )

In [179]:
doublets['AAAAAA']

0.001962840807268411

In [169]:
np.array(list(diff_dic.values()))

array([ 9.04514320e-05,  2.20562450e-04, -6.43583361e-05, ...,
        5.09989337e-05,  8.71327688e-05,  3.12155335e-05])

In [170]:
ind = np.argmax(np.array(list(diff_dic.values())))

In [171]:
inds = np.argsort(list(diff_dic.values()))
inds.astype(int)

array([3853,  128,  570, ..., 3475,  703, 2655])

In [174]:
# Order the codon pairs by their difference value, smallest (most negative) first.
# np.argsort already returns integer indices; the astype(int) is a no-op safeguard.
inds = np.argsort(list(diff_dic.values())).astype(int)
# Pair names and difference values in that same order.
keys_srted = np.array(list(diff_dic.keys()))[inds]
vals_srt = np.array(list(diff_dic.values()))[inds]

In [176]:
vals_srt[:90]

array([-0.00079606, -0.00069424, -0.00069096, -0.00067781, -0.00066201,
       -0.00064606, -0.00061002, -0.00053877, -0.00044862, -0.00039546,
       -0.00038905, -0.00037555, -0.00037398, -0.00036192, -0.00035976,
       -0.00035276, -0.00035109, -0.00033813, -0.00033587, -0.0003204 ,
       -0.00031854, -0.00030877, -0.00030842, -0.00030563, -0.00030317,
       -0.00030276, -0.00029689, -0.00029688, -0.00028994, -0.00028891,
       -0.00028317, -0.00027878, -0.00027858, -0.00026655, -0.0002626 ,
       -0.0002607 , -0.00026034, -0.00025912, -0.00025802, -0.00025731,
       -0.0002546 , -0.00025358, -0.00025346, -0.00025205, -0.00024849,
       -0.00024721, -0.00024543, -0.00024528, -0.00024471, -0.00023941,
       -0.00023689, -0.00023675, -0.00023461, -0.00023284, -0.00023249,
       -0.00023041, -0.0002244 , -0.00022408, -0.00022332, -0.00021427,
       -0.00021411, -0.00021376, -0.00021166, -0.00021106, -0.00020927,
       -0.00020646, -0.00020455, -0.00020227, -0.00020172, -0.00

In [175]:
keys_srted[:90]

array(['AAAAAA', 'ATTATG', 'AATATG', 'TTTAAA', 'ATTAAA', 'ATTAAT',
       'TTTAAT', 'TTTATG', 'GATATG', 'GTTAAT', 'CTTAAA', 'ATTAAG',
       'TATAAT', 'TATATG', 'GTTAAA', 'TTTAAG', 'TTTTTT', 'GTTATG',
       'CTTAAT', 'ATTAAC', 'TCTATG', 'AATAAG', 'ATTATA', 'TATAAA',
       'ATCTTA', 'TTCGAA', 'CTTATG', 'CAAAAA', 'ACTATG', 'CATAAT',
       'AGTATG', 'TTCTTA', 'TTTAAC', 'TCTAAT', 'TCTAAA', 'ACTAAA',
       'GATAAT', 'CATATG', 'GAACAA', 'ACTAAT', 'AATATA', 'ATTAGA',
       'TGTAAT', 'TGTATG', 'AATACA', 'GAACCA', 'ATGTTA', 'TTTAGA',
       'CATAAA', 'ATCATG', 'GCTAAT', 'AATAAA', 'ATTACA', 'AAAATG',
       'ATCTTG', 'ATTAGT', 'AATAGA', 'ATTATT', 'TTTATA', 'AATAAT',
       'GGTAAT', 'TATATT', 'AGTAAT', 'GAACAT', 'GGTATG', 'GATAAG',
       'GATACA', 'GCTATG', 'TGCGAA', 'TTTAGT', 'TTCATG', 'ATCTTT',
       'AACTTA', 'ATACAA', 'TTGTTC', 'TATAAG', 'AATAGT', 'CAAGAT',
       'GTTAAG', 'AAGATG', 'GTTAAC', 'TTAGAT', 'CTTAAG', 'CCTATG',
       'ATAGAA', 'CTTATT', 'TGTAAA', 'GAATCA', 'AATACT', 'GAAC

In [None]:
diff_dic

In [182]:
# ECDF of the codon-pair differences stored in diff_dic.
# NOTE: ecdf_vals is only defined in a later cell of this notebook, so that cell
# has to be executed before this one.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='diff',
    y_axis_label='Dist',
 #   x_axis_type = 'log'
    
)
# Sorted difference values and their cumulative fractions.
diffs, ecdf_diffs = ecdf_vals(np.array(list(diff_dic.values())))
p.circle(diffs, ecdf_diffs)

# Disabled overlay: ECDF of the raw pair frequencies.
#diffs, ecdf_diffs = ecdf_vals(np.array(list(doublets.values())))
#p.circle(diffs, ecdf_diffs, color = 'orange')
bokeh.io.show(p)

We use our codon frequencies dictionary to compute CAI weights (based on the weight definition for the CAI) for all codons 

$$w_i = \frac{f_i}{\max_j (f_j)}, \quad i,j \in [ \text{synonymous codons for amino acid} ]$$

Where $f_i$ is the frequency of codon $i$. 

We obtain two dictionaries: 


aminoacidweights: keys are amino acids, values are arrays of $w_i$ for all synonymous codons. The order of the codons is the same as that used in aminoacidcode. 
    
gencodeweights: keys are codons, values are $w_i$ for each codon

In [190]:
aminoacidweights, gencodeweights = get_codon_weights(codon_frequencies_dic_sw_large)

We pickle dump everything so we do not have to repeat the above line later. 

In [30]:
# Persist the weight tables and the genetic-code table. Each file is opened in a
# `with` block so the handle is flushed and closed as soon as the dump finishes.
with open("data/Smed_transcriptome_aminoacidweights.p", "wb") as f:
    pickle.dump(aminoacidweights, f)
with open("data/Smed_transcriptome_gencodeweights.p", "wb") as f:
    pickle.dump(gencodeweights, f)
with open("data/aminoacidcode.p", "wb") as f:
    pickle.dump(aminoacidcode, f)

We reload everything with pickle because why not. 

In [31]:
# Reload the tables from disk, closing each file handle after reading.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# written by this notebook (or another trusted source).
with open("data/Smed_transcriptome_aminoacidweights.p", "rb") as f:
    aminoacidweights = pickle.load(f)
with open("data/Smed_transcriptome_gencodeweights.p", "rb") as f:
    gencodeweights = pickle.load(f)
with open("data/aminoacidcode.p", "rb") as f:
    aminoacidcode = pickle.load(f)

## Recoding the H2B ORF for RNAi

Here we define the ORF for the histone 2B gene.

In [153]:
H2B_ORF = 'atggcaattaaaggtaagatcgctgctaagtctgttaagaagatttcaaaggaagttgctcctaaaacagacaaaaagaaaaggatacataaacgcaaagaaagttatggtatttacatctacaaagtgttgagacaagttcatccagatactggaatttctggcaaagcaatgtctattatgaatagctttgtcaacgatgttttcgaaagaatcgcttcggaggctagcaagttggcaacttataataagaaatcaaccataaccagcagagagattcaaactgcagtcaggttaattttaccaggagaattggctaaacacgcagttagtgaaggaaccaaagctgttacaaaatacacaggatccaaa'

We add on the stop codon. 

In [154]:
H2B_ORF = H2B_ORF.upper() + 'TAA'

Here we define the protein for the histone 2B gene.

In [155]:
H2B_protein = 'MAIKGKIAAKSVKKISKEVAPKTDKKKRIHKRKESYGIYIYKVLRQVHPDTGISGKAMSIMNSFVNDVFERIASEASKLATYNKKSTITSREIQTAVRLILPGELAKHAVSEGTKAVTKYTGSK*'


Now we can use the function get_RNAi_seq to randomly sample different recoded H2B proteins. 

The function get_RNAi_seq requires the ORF, protein sequence, an aminoacidweights and gencodeweights dictionary. We run 5000 random samples and do not enforce that every codon be different. It returns the list of tested sequences (seqs), scores ($CAI + D$/2) for each sequence, codon adaptation indices (CAIs), and Hamming distances (dists = $D$). 

In [35]:
seqs, scores, cais, dists = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights,
                            gencodeweights, trials = 5000,  enforce_different_codons = False)


In [157]:
s

'ATGGCTATCAAGGGGAAAATAGCGGCGAAATCAGTCAAAAAAATATCTAAAGAGGTAGCGCCGAAGACTGATAAGAAAAAGCGAATCCACAAGAGAAAGGAGTCTTACGGCATCTATATATATAAGGTTTTACGTCAGGTGCACCCGGACACAGGCATCAGTGGGAAGGCGATGAGCATAATGAACTCCTTCGTAAATGACGTGTTTGAGCGCATTGCGTCTGAAGCAAGTAAACTAGCGACATACAACAAAAAGTCTACAATTACTAGTCGTGAAATCCAGACAGCTGTTAGATTGATACTTCCTGGGGAGTTAGCAAAGCATGCCGTGTCTGAGGGGACTAAGGCAGTAACCAAGTATACTGGTTCTAAGTAG'

In [195]:
s1 = random_reverse_translate(H2B_protein, weights_dic = aminoacidweights, change_seq = H2B_ORF)
s1

'ATGGCTATCAAGGGGAAAATAGCCGCAAAATCAGTGAAAAAAATATCCAAAGAGGTAGCCCCAAAGACTGATAAGAAAAAGCGAATTCACAAGCGAAAGGAGTCATACGGAATATATATTTATAAGGTACTCCGACAGGTACACCCTGACACAGGCATCTCAGGAAAGGCCATGTCAATAATGAACTCTTTCGTTAATGACGTCTTTGAGCGGATTGCATCTGAAGCCTCGAAACTGGCCACATACAACAAAAAGAGTACAATTACTTCGCGCGAAATCCAGACGGCCGTTCGATTGATCCTCCCTGGTGAGTTAGCAAAGCATGCTGTATCTGAGGGTACGAAGGCAGTAACCAAGTATACTGGGTCAAAGTAG'

In [193]:
s

'ATGGCCATCAAGGGCAAAATTGCCGCCAAATCAGTAAAAAAAATATCGAAAGAGGTGGCGCCCAAGACCGATAAGAAAAAGCGAATTCACAAGAGAAAGGAGAGCTACGGAATATATATATATAAGGTCTTACGACAGGTGCACCCTGACACGGGCATCTCCGGAAAGGCTATGAGTATAATGAACTCATTCGTGAATGACGTGTTTGAGCGCATTGCCTCCGAAGCATCTAAATTAGCCACGTACAACAAAAAGAGTACTATCACATCACGCGAAATCCAGACGGCGGTGAGATTGATACTCCCTGGCGAGTTAGCAAAGCATGCGGTGTCAGAGGGTACTAAGGCAGTAACTAAGTATACCGGCAGTAAGTAG'

In [197]:
get_hamming_dist(s1, s, normalize = False)

55.0

In [192]:
s = random_reverse_translate_with_pairs(H2B_protein, weights_dic = None, change_seq = H2B_ORF)

['GCC', 'GCG', 'GCT'] ['ATGGCC', 'ATGGCG', 'ATGGCT'] [0.00023448756225906103, 0.00019900787538488677, 0.0004234445348467313]
['ATA', 'ATC'] ['GCCATA', 'GCCATC'] [0.00016915669407715375, 0.00014493401833586198]
['AAG'] ['ATCAAG'] [0.0004575170953292952]
['GGA', 'GGC', 'GGG'] ['AAGGGA', 'AAGGGC', 'AAGGGG'] [0.000144330964168029, 7.146191888820934e-05, 5.95013445595217e-05]
['AAA'] ['GGCAAA'] [0.000368466096545957]
['ATA', 'ATT'] ['AAAATA', 'AAAATT'] [0.0016208085850791332, 0.002100638684618249]
['GCA', 'GCC', 'GCG'] ['ATTGCA', 'ATTGCC', 'ATTGCG'] [0.0005943098823994118, 0.0003409266228815838, 0.0001957915864897775]
['GCA', 'GCC', 'GCG'] ['GCCGCA', 'GCCGCC', 'GCCGCG'] [9.447848629383513e-05, 7.618584320290109e-05, 3.055474450353817e-05]
['AAA'] ['GCCAAA'] [0.00042585675151806323]
['AGC', 'AGT', 'TCA', 'TCC', 'TCG'] ['AAAAGC', 'AAAAGT', 'AAATCA', 'AAATCC', 'AAATCG'] [0.00042585675151806323, 0.0008323152606374986, 0.0013977790520088992, 0.0007647731938402037, 0.0006533086818190727]
['GTA', 

We redo the process but enforce that every codon must be different. 

In [36]:
seqs_diff, scores_diff, cais_diff, dists_diff = get_RNAi_seq(H2B_ORF, H2B_protein, aminoacidweights,
                            gencodeweights, trials = 5000,  enforce_different_codons = True)


We define a function to compute ECDFs

In [106]:
def ecdf_vals(data):
    """Compute the points of an empirical CDF.

    Returns a tuple (x, y): x is `data` sorted in ascending order and y holds the
    cumulative fractions 1/n, 2/n, ..., 1 for n = len(data).
    """
    n_points = len(data)
    x_sorted = np.sort(data)
    y_fraction = np.arange(1, n_points + 1) / n_points
    return x_sorted, y_fraction


We plot ECDFs of the CAIs.  

In [38]:
# ECDFs of the CAIs for the two H2B sampling runs.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='CAI',
    y_axis_label='ECDF',
)
# Store the sorted values under new names: overwriting `cais` / `cais_diff` would
# break their element-wise correspondence with `seqs` / `seqs_diff` and make this
# cell change its own inputs on re-run.
cais_sort, ecdf_cais = ecdf_vals(cais)
p.circle(cais_sort, ecdf_cais, legend_label = 'Not all different ')

cais_diff_sort, ecdf_cais_diff = ecdf_vals(cais_diff)
p.circle(cais_diff_sort, ecdf_cais_diff, legend_label = 'all different', color = 'orange')
p.legend.location = 'bottom_right'
bokeh.io.show(p)

We plot ECDFs of the hamming distances 

In [40]:
# ECDFs of the Hamming distances for the two H2B sampling runs.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='Hamming Distance',
    y_axis_label='ECDF',
    
)
# Store the sorted values under new names: overwriting `dists` / `dists_diff`
# would break their element-wise correspondence with `seqs` / `seqs_diff` and make
# this cell change its own inputs on re-run.
dists_sort, ecdf_dists = ecdf_vals(dists)
p.circle(dists_sort, ecdf_dists, legend_label = 'Not all different ')

dists_diff_sort, ecdf_dists_diff = ecdf_vals(dists_diff)
p.circle(dists_diff_sort, ecdf_dists_diff, legend_label = 'all different', color = 'orange')
p.legend.location = 'bottom_right'
# Fixed x-range so this plot can be compared directly with the other distance plots.
p.x_range = bokeh.models.Range1d(.1, .6)
bokeh.io.show(p)

## Now we recode the luc ORFS!!!! 

In [41]:
SmedNluc2 = 'ATGGTGTTTACTTTGGAAGATTTTGTTGGAGATTGGAGACAAACTGCTGGTTACAATCTGGATCAGGTACTGGAACAAGGCGGTGTTAGTTCATTATTCCAAAACCTGGGTGTGAGTGTAACTCCGATTCAGCGAATAGTGTTGTCTGGAGAAAATGGGCTGAAGATTGATATACACGTCATAATTCCATACGAAGGCTTAAGCGGTGATCAAATGGGACAAATTGAAAAAATTTTTAAAGTAGTTTACCCAGTTGACGACCATCATTTTAAAGTTATCCTTCATTACGGTACACTGGTTATAGATGGTGTAACTCCAAATATGATCGATTATTTCGGAAGACCTTACGAAGGCATAGCCGTTTTTGATGGAAAAAAGATTACAGTAACAGGTACATTGTGGAACGGAAATAAGATTATTGACGAACGTTTAATTAACCCAGATGGAAGTTTGCTCTTTAGAGTTACAATTAATGGTGTGACAGGATGGAGATTATGCGAACGGATACTCGCGTAA'

In [54]:
smednluc2_protein = 'MVFTLEDFVGDWRQTAGYNLDQVLEQGGVSSLFQNLGVSVTPIQRIVLSGENGLKIDIHVIIPYEGLSGDQMGQIEKIFKVVYPVDDHHFKVILHYGTLVIDGVTPNMIDYFGRPYEGIAVFDGKKITVTGTLWNGNKIIDERLINPDGSLLFRVTINGVTGWRLCERILA*'

In [43]:
hluc = 'ATGGTCTTCACACTCGAAGATTTCGTTGGGGACTGGCGACAGACAGCCGGCTACAACCTGGACCAAGTCCTTGAACAGGGAGGTGTGTCCAGTTTGTTTCAGAATCTCGGGGTGTCCGTAACTCCGATCCAAAGGATTGTCCTGAGCGGTGAAAATGGGCTGAAGATCGACATCCATGTCATCATCCCGTATGAAGGTCTGAGCGGCGACCAAATGGGCCAGATCGAAAAAATTTTTAAGGTGGTGTACCCTGTGGATGATCATCACTTTAAGGTGATCCTGCACTATGGCACACTGGTAATCGACGGGGTTACGCCGAACATGATCGACTATTTCGGACGGCCGTATGAAGGCATCGCCGTGTTCGACGGCAAAAAGATCACTGTAACAGGGACCCTGTGGAACGGCAACAAAATTATCGACGAGCGCCTGATCAACCCCGACGGCTCCCTGCTGTTCCGAGTAACCATCAACGGAGTGACCGGCTGGCGGCTGTGCGAACGCATTCTGGCGTAA'

I wonder what the CAI for each ORF is?

In [47]:
print('CAI for SMed Nuc:', get_CAI(SmedNluc2, gencodeweights))
print('CAI for Human Nuc:', get_CAI(hluc, gencodeweights))
print('Hamming Distance vs Smed vs Human Nuc', get_hamming_dist(SmedNluc2, hluc))

CAI for SMed Nuc: 0.708949323659455
CAI for Human Nuc: 0.49758650479795064
Hamming Distance vs Smed vs Human Nuc 0.25775193798449614


Seems ok, now let's recode the Smed ORF 

In [78]:
seqs, scores, cais, dists = get_RNAi_seq(SmedNluc2, smednluc2_protein, aminoacidweights,
                            gencodeweights, trials = 5000,  enforce_different_codons = False)


We redo the process but enforce that every codon must be different. 

In [79]:
seqs_diff, scores_diff, cais_diff, dists_diff = get_RNAi_seq(SmedNluc2, smednluc2_protein, aminoacidweights,
                            gencodeweights, trials = 5000,  enforce_different_codons = True)


In [87]:
print('Max CAI for any sequence: ', max(cais))
print('Max CAI for all codons different sequence: ', max(cais_diff))

Max CAI for any sequence:  0.7961929459204498
Max CAI for all codons different sequence:  0.6131242531108633


In [88]:
print('Max Hamming Distance for any sequence: ', max(dists))
print('Max Hamming Distance for all codons different sequence: ', max(dists_diff))

Max Hamming Distance for any sequence:  0.2596899224806202
Max Hamming Distance for all codons different sequence:  0.374031007751938


We plot ECDFs of the CAIs.  

In [80]:
# ECDFs of the CAIs for the two luciferase sampling runs.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='CAI',
    y_axis_label='ECDF',
)
# Sorted copies go into *_sort names so `cais` / `cais_diff` keep their original
# order (later cells index them together with the sequences and distances).
cais_sort, ecdf_cais = ecdf_vals(cais)
p.circle(cais_sort, ecdf_cais, legend_label = 'Not all different ')

cais_diff_sort, ecdf_cais_diff = ecdf_vals(cais_diff)
p.circle(cais_diff_sort, ecdf_cais_diff, legend_label = 'all different', color = 'orange')
p.legend.location = 'bottom_right'
bokeh.io.show(p)

We plot ECDFs of the hamming distances 

In [81]:
# ECDFs of the Hamming distances for the two luciferase sampling runs.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='Hamming Distance',
    y_axis_label='ECDF',
    
)
# Sorted copies go into *_sort names so `dists` / `dists_diff` keep their original
# order.
dists_sort, ecdf_dists = ecdf_vals(dists)
p.circle(dists_sort, ecdf_dists, legend_label = 'Not all different ')

dists_diff_sort, ecdf_dists_diff = ecdf_vals(dists_diff)
p.circle(dists_diff_sort, ecdf_dists_diff, legend_label = 'all different', color = 'orange')
p.legend.location = 'bottom_right'
# Fixed x-range, same as in the H2B distance plot.
p.x_range = bokeh.models.Range1d(.1, .6)
bokeh.io.show(p)

It looks like if they are all different the Hamming distance is pretty similar between all sequences, which suggests we should mainly choose by CAI for these cases. 

In [82]:
# Rank the all-codons-different sequences by CAI (ascending, so the best CAI is
# last). The order is computed from `cais_diff`, so it must be applied to
# `seqs_diff`, the sequences those CAIs belong to, not to `seqs` from the other run.
sorted_inds = np.argsort(cais_diff)
seqs_sort = np.array(seqs_diff)[sorted_inds]

For the sequences in which not every codon is enforced to be different, the Hamming distance also matters. 

In [84]:
# Scatter of CAI against Hamming distance, showing every 10th sample of each run.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='CAIs',
    y_axis_label='Dist',
    
)
# Default colour: run with every codon forced to differ.
p.circle(cais_diff[::10], dists_diff[::10])

# Orange: run without that constraint. NOTE: this is the opposite colour
# assignment from the ECDF plots above, where orange marks 'all different'.
p.circle(cais[::10], dists[::10], color = 'orange')
bokeh.io.show(p)

In [None]:
ge