In [1]:
import os
import glob
import copy
import numpy as np
import Bio
import scipy.spatial
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from rnai_scripts import *
import bokeh.io
import bokeh.plotting

# Enable viewing Bokeh plots in the notebook
bokeh.io.output_notebook()

In [41]:
def ecdf_vals(data):
    """Compute the points of an empirical CDF.

    Returns a tuple (x, y): x is `data` sorted in ascending order and y rises
    in equal steps of 1/n from 1/n up to 1, where n = len(data).
    """
    n_points = len(data)
    sorted_data = np.sort(data)
    cumulative_fraction = np.arange(1, n_points + 1) / n_points
    return sorted_data, cumulative_fraction


# RNAi recoding 

## Reading in the Smed transcriptome
We read in the Smed_v6 transcriptome ORFs that were extracted using orfipy. We then join them all into one string and obtain the codon frequencies. 

In [49]:
# ORF FASTA for the Smed_v6 transcriptome; per the original note, this file's
# cutoff makes the smallest proteins around 30 amino acids long
fname = 'data/dd_Smed_v6_transcripts_orfs_large3.fa'
# read_many_fasta is presumably provided by the rnai_scripts star import; its
# result is unpacked here as (descriptors, sequences)
descriptors, seqs = read_many_fasta(fname)
# join all ORFs into one large transcriptome string
transcriptome = ''.join(seqs)
# codon frequencies computed over the concatenated ORFs

codon_frequencies_dic = get_codon_frequencies(transcriptome) 



In [50]:
# GC content: fraction of the concatenated transcriptome made up of C or G bases
print(sum(transcriptome.count(base) for base in 'CG') / len(transcriptome))

0.3607041500977551


In [39]:
# Total number of characters (nucleotides) in the concatenated ORF string
len(transcriptome)

29848068

Now we get frequencies of doublets

In [4]:
# Frequencies of codon doublets in the transcriptome. The keys are indexed below
# as 6-nucleotide strings (first codon = key[:3], second codon = key[3:]); the
# exact counting scheme lives in the helper — TODO confirm in rnai_scripts
doubletscode = get_codon_frequencies_doublets(transcriptome)

I also found a published table of codon frequencies:

In [5]:
# Published codon usage table
df = pd.read_csv('data/codon_usage_smed.csv')

# Despite its name, AAs holds the entries of the 'codon' column
AAs = df['codon'].values
# The stored frequencies are divided by 1000 so that they sum to ~1
freqs = df['frequency'].values/1000.

# Map each codon to its published frequency
codon_frequencies_dic_published = dict(zip(AAs, freqs))

# Sanity check: the rescaled frequencies should add up to one
print(sum(freqs))

1.00000000001


Let's calculate the average discrepancy between the published codon frequencies and the codon frequencies we computed from the transcriptome. 

In [6]:
# Per-codon difference: published frequency minus our transcriptome-derived frequency
diff_published_vs_me = {
    codon: codon_frequencies_dic_published[codon] - codon_frequencies_dic[codon]
    for codon in AAs
}
values = np.array(list(diff_published_vs_me.values()))

# Signed mean, mean absolute difference, and total absolute difference
abs_values = np.abs(values)
print(np.mean(values))
print(np.mean(abs_values))
print(np.sum(abs_values))

1.5624986385038425e-13
0.0016427577738011736
0.10513649752327511


Here we find the discrepancies between the frequency of each doublet and the product of the frequencies of its two separate codons. 

In [7]:

# Doublets whose first codon is one of these stop codons are left out
stop_codons = ('TAA', 'TAG', 'TGA')

diff_dic = {}
diff_dic_norm = {}
for pair in doubletscode:
    if pair[:3] in stop_codons:
        continue

    # Frequencies of the two codons that make up the doublet
    freq1 = codon_frequencies_dic[pair[:3]]
    freq2 = codon_frequencies_dic[pair[3:]]

    # Observed doublet frequency minus the product of the single-codon frequencies
    excess = doubletscode[pair] - freq1*freq2

    # Normalised version: divided by the larger of the two codon frequencies
    diff_dic_norm[pair] = excess/np.maximum(freq1, freq2)
    diff_dic[pair] = excess

In [8]:
# ECDF of the raw discrepancy between each doublet frequency and the product of
# the frequencies of its two codons.
diffs, ecdf_diffs = ecdf_vals(np.array(list(diff_dic.values())))

# The discrepancies are plotted multiplied by 1e4; the axis label states that
# scaling so the numbers on the axis are not read as raw frequency differences.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='(Doublets - Codon1xCodon2) x 1e4',
    y_axis_label='ECDF',
)
p.circle(diffs*1e4, ecdf_diffs)

# Sanity check: print the sum of all doublet frequencies
print(np.sum(np.array(list(doubletscode.values()))))
bokeh.io.show(p)

1.0


In [9]:
# ECDF of the normalised discrepancy (difference divided by the larger of the
# two single-codon frequencies)
diffs, ecdf_diffs = ecdf_vals(np.array(list(diff_dic_norm.values())))

p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='(Doublets - Codon1xCodon2)/(max(Codon1, Codon2))',
    y_axis_label='ECDF',
)
p.circle(diffs, ecdf_diffs)

# Sanity check: print the sum of all doublet frequencies
print(np.sum(np.array(list(doubletscode.values()))))
bokeh.io.show(p)

1.0


In [10]:
# Doublets ordered by normalised discrepancy, most negative first
keys = np.array(list(diff_dic_norm))
values = np.array(list(diff_dic_norm.values()))
inds_sort = np.argsort(values)

# Show the 100 doublets with the smallest (most negative) normalised discrepancy
keys[inds_sort][:100]

array(['ATTATG', 'AAAAAA', 'AATATG', 'TTTATG', 'GATATG', 'ATTAAT',
       'TTTAAA', 'TTTAAT', 'ATTAAA', 'TATATG', 'GTTATG', 'ATCTTA',
       'TCTATG', 'CTTAAG', 'TTCTTA', 'CTTATG', 'ACTATG', 'TTTAAG',
       'TTTTTT', 'AGTATG', 'ATTAAG', 'ATCTTG', 'GTTAAG', 'GTTAAC',
       'CATATG', 'GTTAAT', 'CATAAG', 'TGTATG', 'TCTAAG', 'ACTAAG',
       'ATGTTA', 'TATAAT', 'TATAAG', 'CTTAAC', 'ATCATG', 'ATTAAC',
       'TTCGAA', 'AAGGGA', 'CTTAGA', 'AGTAAG', 'ATTATA', 'CTTAAA',
       'ATACAA', 'TTTAAC', 'TTGTTC', 'AACTTA', 'CTTAAT', 'TCTAAC',
       'ACTAAC', 'ACTATA', 'GTTAAA', 'GGTATG', 'TTCGCA', 'GTTATA',
       'TATATA', 'GCTATG', 'GATAAG', 'GAACAA', 'TATAAC', 'AATAAG',
       'GATACA', 'TTTAGA', 'GTTAGT', 'TTCATG', 'TGTAAG', 'CTTATA',
       'TATAGT', 'GAACCA', 'CATAGT', 'GTTAGA', 'TCTATA', 'GCTAAG',
       'AGTATA', 'TTCTTG', 'GCTAAC', 'TTGTAT', 'CATAAT', 'ATTAGA',
       'CATACT', 'CTTAGT', 'AAGATG', 'ACACAA', 'CAAGAT', 'TTTATA',
       'GTCTTA', 'AACTTG', 'TATACT', 'GAGGGA', 'TATAGA', 'ATCT

In [11]:
# Doublets ordered by raw discrepancy (scaled by 1e4), most negative first
keys = np.array(list(diff_dic))
values = np.array(list(diff_dic.values()))*1e4
inds_sort = np.argsort(values)

# Show the 100 doublets with the smallest (most negative) raw discrepancy
keys[inds_sort][:100]

array(['AAAAAA', 'ATTATG', 'AATATG', 'TTTAAA', 'ATTAAA', 'ATTAAT',
       'TTTAAT', 'TTTATG', 'GATATG', 'GTTAAT', 'CTTAAA', 'ATTAAG',
       'TATAAT', 'TATATG', 'GTTAAA', 'TTTAAG', 'TTTTTT', 'GTTATG',
       'CTTAAT', 'ATTAAC', 'TCTATG', 'AATAAG', 'ATTATA', 'TATAAA',
       'ATCTTA', 'TTCGAA', 'CTTATG', 'CAAAAA', 'ACTATG', 'CATAAT',
       'AGTATG', 'TTCTTA', 'TTTAAC', 'TCTAAT', 'TCTAAA', 'ACTAAA',
       'GATAAT', 'CATATG', 'GAACAA', 'ACTAAT', 'AATATA', 'ATTAGA',
       'TGTAAT', 'TGTATG', 'AATACA', 'GAACCA', 'ATGTTA', 'TTTAGA',
       'CATAAA', 'ATCATG', 'GCTAAT', 'AATAAA', 'ATTACA', 'AAAATG',
       'ATCTTG', 'ATTAGT', 'AATAGA', 'ATTATT', 'TTTATA', 'AATAAT',
       'GGTAAT', 'TATATT', 'AGTAAT', 'GAACAT', 'GGTATG', 'GATAAG',
       'GATACA', 'GCTATG', 'TGCGAA', 'TTTAGT', 'TTCATG', 'ATCTTT',
       'AACTTA', 'ATACAA', 'TTGTTC', 'TATAAG', 'AATAGT', 'CAAGAT',
       'GTTAAG', 'AAGATG', 'GTTAAC', 'TTAGAT', 'CTTAAG', 'CCTATG',
       'ATAGAA', 'CTTATT', 'TGTAAA', 'GAATCA', 'AATACT', 'GAAC

In [12]:
# Raw discrepancy for the AAA-AAA doublet, scaled by 1e4 as in the ranking above
diff_dic['AAAAAA']*1e4

-7.960567382195296

In [13]:
# Observed frequency of the AAA-AAA doublet
doubletscode['AAAAAA']

0.001962840807268411

In [14]:
# Product of the two single-codon frequencies, i.e. the value the doublet
# frequency above is compared against in diff_dic
codon_frequencies_dic['AAA']*codon_frequencies_dic['AAA']

0.0027588975454879406

We use our codon frequencies dictionary to compute CAI weights (based on the weight definition for the CAI) for all codons 

$$w_i = \frac{f_i}{\max_j f_j}, \qquad i, j \in \{\text{synonymous codons for the same amino acid}\}$$

Where $f_i$ is the frequency of codon $i$. 

We obtain two dictionaries: 


aminoacidweights: keys are amino acids, values are arrays of $w_i$ for all synonymous codons. The order of the codons is the same as that used in aminoacidcode. 
    
gencodeweights: keys are codons, values are $w_i$ for each codon

In [15]:
# Build both weight dictionaries from our transcriptome-derived codon frequencies
aminoacidweights, gencodeweights = get_codon_weights(codon_frequencies_dic)

We pickle dump everything so we do not have to repeat the above line later. 

In [16]:
# Persist the lookup tables so they do not have to be recomputed later.
# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes, instead of being left open until garbage collection.
with open("data/Smed_transcriptome_aminoacidweights.p", "wb") as handle:
    pickle.dump(aminoacidweights, handle)
with open("data/Smed_transcriptome_gencodeweights.p", "wb") as handle:
    pickle.dump(gencodeweights, handle)
with open("data/aminoacidcode.p", "wb") as handle:
    pickle.dump(aminoacidcode, handle)
with open("data/doubletscode.p", "wb") as handle:
    pickle.dump(doubletscode, handle)

We reload everything with pickle because why not. 

In [17]:
# Reload the lookup tables from disk. Each file is opened in a `with` block so
# its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files written by this
# notebook (or from another trusted source).
with open("data/Smed_transcriptome_aminoacidweights.p", "rb") as handle:
    aminoacidweights = pickle.load(handle)
with open("data/Smed_transcriptome_gencodeweights.p", "rb") as handle:
    gencodeweights = pickle.load(handle)
with open("data/aminoacidcode.p", "rb") as handle:
    aminoacidcode = pickle.load(handle)
with open("data/doubletscode.p", "rb") as handle:
    doubletscode = pickle.load(handle)

## We recode the luc ORFS!!!! 

Since SmedNluc2 is so short we must RNAi the whole thing. 

In [18]:
# Nucleotide sequence of the SmedNluc2 ORF (starts with ATG, ends with the stop codon TAA)
SmedNluc2_ORF = 'ATGGTGTTTACTTTGGAAGATTTTGTTGGAGATTGGAGACAAACTGCTGGTTACAATCTGGATCAGGTACTGGAACAAGGCGGTGTTAGTTCATTATTCCAAAACCTGGGTGTGAGTGTAACTCCGATTCAGCGAATAGTGTTGTCTGGAGAAAATGGGCTGAAGATTGATATACACGTCATAATTCCATACGAAGGCTTAAGCGGTGATCAAATGGGACAAATTGAAAAAATTTTTAAAGTAGTTTACCCAGTTGACGACCATCATTTTAAAGTTATCCTTCATTACGGTACACTGGTTATAGATGGTGTAACTCCAAATATGATCGATTATTTCGGAAGACCTTACGAAGGCATAGCCGTTTTTGATGGAAAAAAGATTACAGTAACAGGTACATTGTGGAACGGAAATAAGATTATTGACGAACGTTTAATTAACCCAGATGGAAGTTTGCTCTTTAGAGTTACAATTAATGGTGTGACAGGATGGAGATTATGCGAACGGATACTCGCGTAA'

In [19]:
# Amino acid sequence passed to get_RNAi_seq together with SmedNluc2_ORF;
# the trailing '*' presumably marks the stop codon — TODO confirm
SmedNluc2_protein = 'MVFTLEDFVGDWRQTAGYNLDQVLEQGGVSSLFQNLGVSVTPIQRIVLSGENGLKIDIHVIIPYEGLSGDQMGQIEKIFKVVYPVDDHHFKVILHYGTLVIDGVTPNMIDYFGRPYEGIAVFDGKKITVTGTLWNGNKIIDERLINPDGSLLFRVTINGVTGWRLCERILA*'

In [20]:
# Second luciferase ORF, reported as 'Human Nuc' in the comparison printed below
# (starts with ATG, ends with the stop codon TAA)
Hluc_ORF = 'ATGGTCTTCACACTCGAAGATTTCGTTGGGGACTGGCGACAGACAGCCGGCTACAACCTGGACCAAGTCCTTGAACAGGGAGGTGTGTCCAGTTTGTTTCAGAATCTCGGGGTGTCCGTAACTCCGATCCAAAGGATTGTCCTGAGCGGTGAAAATGGGCTGAAGATCGACATCCATGTCATCATCCCGTATGAAGGTCTGAGCGGCGACCAAATGGGCCAGATCGAAAAAATTTTTAAGGTGGTGTACCCTGTGGATGATCATCACTTTAAGGTGATCCTGCACTATGGCACACTGGTAATCGACGGGGTTACGCCGAACATGATCGACTATTTCGGACGGCCGTATGAAGGCATCGCCGTGTTCGACGGCAAAAAGATCACTGTAACAGGGACCCTGTGGAACGGCAACAAAATTATCGACGAGCGCCTGATCAACCCCGACGGCTCCCTGCTGTTCCGAGTAACCATCAACGGAGTGACCGGCTGGCGGCTGTGCGAACGCATTCTGGCGTAA'

I wonder what the CAI for each ORF is?

In [21]:
# CAI of each luciferase ORF, scored with the Smed transcriptome codon weights
for label, orf in (('CAI for SMed Nuc:', SmedNluc2_ORF),
                   ('CAI for Human Nuc:', Hluc_ORF)):
    print(label, get_CAI(orf, gencodeweights))

# Hamming distance between the two ORFs
print('Hamming Distance vs Smed vs Human Nuc',
      get_hamming_dist(SmedNluc2_ORF, Hluc_ORF))

CAI for SMed Nuc: 0.737701636271008
CAI for Human Nuc: 0.5470635190087074
Hamming Distance vs Smed vs Human Nuc 0.25775193798449614


Now we can use the function get_RNAi_seq to randomly sample different recoded Luc proteins. 

The function get_RNAi_seq requires the ORF, protein sequence, an aminoacidweights and gencodeweights dictionary. We run 1000 random samples and do not enforce that every codon be different. It returns the list of tested sequences (seqs), scores ($CAI + D$/2) for each sequence, codon adaptation indices (CAIs), and Hamming distances (dists = $D$). 

In [22]:
def get_doublest_likelihood(dna_seq, weights_dic):
    '''
    Obtains the log-likelihood of a DNA sequence under a codon-doublet model.

    A 6-nucleotide window is moved along dna_seq one codon at a time, and the
    natural log of the weight of every doublet (pair of adjacent codons) is summed:
        score = sum_i log(weights_dic[codon_i + codon_(i+1)]),  i = 1 .. N-1
    where N is the number of codons in dna_seq.

    Inputs:
        dna_seq: ORF in form of string; each doublet is upper-cased before lookup.
        weights_dic: dictionary of doublet weights. Keys are upper-case
            6-nucleotide codon pairs and values are their weights.
    Returns:
        Sum of the log weights over all adjacent codon pairs
        (0.0 when dna_seq has fewer than two codons).
    Raises:
        ValueError: if the length of dna_seq is not divisible by 3.
        KeyError: if a doublet of dna_seq is not a key of weights_dic.
    '''
    if len(dna_seq) % 3 != 0:
        raise ValueError("Length of DNA sequence must be divisble by 3")
    score = 0.
    # Window starts at 0, 3, ..., len - 6, so the last window ends at the last codon
    for start in range(0, len(dna_seq) - 3, 3):
        codonpair = dna_seq[start:start + 6].upper()
        score = score + np.log(weights_dic[codonpair])
    return score

In [23]:
# Positional inputs shared by both get_RNAi_seq calls in this cell
rnai_inputs = (SmedNluc2_ORF, SmedNluc2_protein, aminoacidweights, gencodeweights)

# 1000 random samples; codons are not forced to differ from the original ORF
seqs, scores, cais, dists = get_RNAi_seq(
    *rnai_inputs,
    trials=1000,
    enforce_different_codons=False,
    random=True,
)

# Single run with random sampling switched off
best_seq, best_score, best_cai, best_dist = get_RNAi_seq(
    *rnai_inputs,
    trials=1,
    enforce_different_codons=False,
    random=False,
)

# Doublet log-likelihood of the non-random sequence and of every sampled sequence
best_doublet = get_doublest_likelihood(best_seq[0], doubletscode)
doublets_scores = np.array(
    [get_doublest_likelihood(seq, doubletscode) for seq in seqs]
)
print(best_cai, best_dist, best_doublet)

[1.0] [0.187984496124031] -1253.807096386507


We redo the process but enforce that every codon must be different. 

In [51]:
# Positional inputs shared by both get_RNAi_seq calls in this cell
rnai_inputs = (SmedNluc2_ORF, SmedNluc2_protein, aminoacidweights, gencodeweights)

# 10000 random samples with enforce_different_codons switched on
seqs_diff, scores_diff, cais_diff, dists_diff = get_RNAi_seq(
    *rnai_inputs,
    trials=10000,
    enforce_different_codons=True,
    random=True,
)

# Single run with random sampling switched off
best_seq_diff, best_score_diff, best_cai_diff, best_dist_diff = get_RNAi_seq(
    *rnai_inputs,
    trials=1,
    enforce_different_codons=True,
    random=False,
)

# Doublet log-likelihood of the non-random sequence and of every sampled sequence
best_doublet_diff = get_doublest_likelihood(best_seq_diff[0], doubletscode)
doublets_scores_diff = np.array(
    [get_doublest_likelihood(seq, doubletscode) for seq in seqs_diff]
)
print(best_cai_diff, best_dist_diff, best_doublet_diff)

[0.7226465538224905] [0.35658914728682173] -1367.970980554914


In [57]:
#dists_diff

We find the best sequences of our random simulation

In [55]:
# Largest CAI and largest Hamming distance among the random all-different samples
# (the two maxima are taken independently, so they may come from different sequences)
print(np.max(cais_diff), np.max(dists_diff))

0.6438353380088827 0.375968992248062


We repeat with wiggle. 

In [26]:
# Positional inputs shared by both get_RNAi_seq calls in this cell
rnai_inputs = (SmedNluc2_ORF, SmedNluc2_protein, aminoacidweights, gencodeweights)

# 1000 random samples, all codons different, with the wiggle option enabled.
# Note: seqs_diff and scores_diff from the previous run are reused (overwritten) here.
seqs_diff, scores_diff, cais_wiggle, dists_wiggle = get_RNAi_seq(
    *rnai_inputs,
    trials=1000,
    enforce_different_codons=True,
    random=True,
    wiggle=True,
)

# Single run with random sampling switched off
best_seq_diff, best_score_diff, best_cai_diff_wiggle, best_dist_diff_wiggle = get_RNAi_seq(
    *rnai_inputs,
    trials=1,
    enforce_different_codons=True,
    random=False,
    wiggle=True,
)

# Doublet log-likelihood of the non-random sequence and of every sampled sequence
best_doublet_diff_wiggle = get_doublest_likelihood(best_seq_diff[0], doubletscode)
doublets_scores_wiggle = np.array(
    [get_doublest_likelihood(seq, doubletscode) for seq in seqs_diff]
)
print(best_cai_diff_wiggle, best_dist_diff_wiggle, best_doublet_diff_wiggle)

[0.7226465538224905] [0.35658914728682173] -1367.970980554914


In [27]:
# Largest CAI and largest Hamming distance among the random wiggle samples
# (the two maxima are taken independently, so they may come from different sequences)
print(np.max(cais_wiggle), np.max(dists_wiggle))

0.7097021379560103 0.3488372093023256


Doublets baby

In [28]:
# Positional inputs shared by both get_RNAi_seq calls in this cell
rnai_inputs = (SmedNluc2_ORF, SmedNluc2_protein, aminoacidweights, gencodeweights)

# 1000 random samples in doublet mode (pairs=True); enforce_different_codons is
# left at the function's default
seqs_doub, scores_doub, cais_doub_nd, dists_doub_nd = get_RNAi_seq(
    *rnai_inputs,
    trials=1000,
    random=True,
    pairs=True,
    doubletscode=doubletscode,
)

# Single run with random sampling switched off
best_seq_doub, best_score_doub, best_cai_doub, best_dist_doub = get_RNAi_seq(
    *rnai_inputs,
    trials=1,
    random=False,
    pairs=True,
    doubletscode=doubletscode,
)

# Doublet log-likelihood of the non-random sequence and of every sampled sequence
best_doublet_doub = get_doublest_likelihood(best_seq_doub[0], doubletscode)
doublets_scores_doub_nd = np.array(
    [get_doublest_likelihood(seq, doubletscode) for seq in seqs_doub]
)
print(best_cai_doub, best_dist_doub, best_doublet_doub)

[0.9940680734100419] [0.19186046511627908] -1254.6892040809194


In [29]:
# Positional inputs shared by both get_RNAi_seq calls in this cell
rnai_inputs = (SmedNluc2_ORF, SmedNluc2_protein, aminoacidweights, gencodeweights)

# 1000 random samples in doublet mode with enforce_different_codons switched on
seqs_doub, scores_doub, cais_doub, dists_doub = get_RNAi_seq(
    *rnai_inputs,
    trials=1000,
    enforce_different_codons=True,
    random=True,
    pairs=True,
    doubletscode=doubletscode,
)

# Single run with random sampling switched off
best_seq_doub, best_score_doub, best_cai_doub, best_dist_doub = get_RNAi_seq(
    *rnai_inputs,
    trials=1,
    enforce_different_codons=True,
    random=False,
    pairs=True,
    doubletscode=doubletscode,
)

# Doublet log-likelihood of the non-random sequence and of every sampled sequence
best_doublet_doub = get_doublest_likelihood(best_seq_doub[0], doubletscode)
doublets_scores_doub = np.array(
    [get_doublest_likelihood(seq, doubletscode) for seq in seqs_doub]
)
print(best_cai_doub, best_dist_doub, best_doublet_doub)

[0.7160654283084763] [0.3546511627906977] -1368.6292373559716


In [30]:
# Positional inputs shared by both get_RNAi_seq calls in this cell
rnai_inputs = (SmedNluc2_ORF, SmedNluc2_protein, aminoacidweights, gencodeweights)

# 1000 random samples in doublet mode, all codons different, wiggle enabled
seqs_doub, scores_doub, cais_doub_wigg, dists_doub_wigg = get_RNAi_seq(
    *rnai_inputs,
    trials=1000,
    enforce_different_codons=True,
    random=True,
    wiggle=True,
    pairs=True,
    doubletscode=doubletscode,
)

# Single run with random sampling switched off
best_seq_doub, best_score_doub, best_cai_doub_wiggle, best_dist_doub_wiggle = get_RNAi_seq(
    *rnai_inputs,
    trials=1,
    enforce_different_codons=True,
    random=False,
    wiggle=True,
    pairs=True,
    doubletscode=doubletscode,
)

# Doublet log-likelihood of the non-random sequence and of every sampled sequence
best_doublet_doub = get_doublest_likelihood(best_seq_doub[0], doubletscode)
doublets_scores_doub_wigg = np.array(
    [get_doublest_likelihood(seq, doubletscode) for seq in seqs_doub]
)
print(best_cai_doub_wiggle, best_dist_doub_wiggle, best_doublet_doub)

[0.7256511060897017] [0.3507751937984496] -1363.8648200075113


We use the `ecdf_vals` function defined near the top of the notebook to compute ECDFs.

We plot ECDFs of the CAIs.  

In [31]:
# ECDFs of the CAI for every sampling scheme, drawn on one figure.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='CAI',
    y_axis_label='ECDF',
)

# (values, glyph keyword arguments) for each series. The sorted ECDF values are
# kept in loop-local names so that the per-sample arrays (cais, cais_diff, ...)
# are not overwritten with sorted copies and stay aligned with their sequences.
cai_series = [
    (cais, dict(legend_label='CAI')),
    (cais_diff, dict(legend_label='CAI, All Different', color='orange')),
    (cais_wiggle, dict(legend_label='CAI, All Different, Wiggle', color='green')),
    (cais_doub_nd, dict(legend_label='Doublets', color='purple')),
    (cais_doub, dict(legend_label='Doublets, All Different', color='red')),
    (cais_doub_wigg, dict(legend_label='Doublets, All Different, Wiggle', color='pink')),
]
for cai_values, glyph_kwargs in cai_series:
    cai_sorted, cai_ecdf = ecdf_vals(cai_values)
    p.circle(cai_sorted, cai_ecdf, **glyph_kwargs)

# The legend is deliberately hidden; the labels above document the colours.
p.legend.visible = False
bokeh.io.show(p)

We plot ECDFs of the hamming distances 

In [32]:
# ECDFs of the Hamming distance for every sampling scheme, drawn on one figure.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='Hamming Distance',
    y_axis_label='ECDF',
)

# (values, glyph keyword arguments) for each series. The sorted ECDF values are
# kept in loop-local names so that the per-sample arrays (dists, dists_diff, ...)
# are not overwritten with sorted copies and stay aligned with their sequences.
hamming_series = [
    (dists, dict(legend_label='Not all different ')),
    (dists_diff, dict(legend_label='all different', color='orange')),
    (dists_wiggle, dict(legend_label='wiggle', color='green')),
    (dists_doub_nd, dict(legend_label='doublets nd', color='purple')),
    (dists_doub, dict(legend_label='doublets', color='red')),
    (dists_doub_wigg, dict(legend_label='doublets wig', color='pink')),
]
for hamming_values, glyph_kwargs in hamming_series:
    hamming_sorted, hamming_ecdf = ecdf_vals(hamming_values)
    p.circle(hamming_sorted, hamming_ecdf, **glyph_kwargs)

# The legend is deliberately hidden; the labels above document the colours.
p.legend.visible = False
bokeh.io.show(p)

In [33]:
# ECDFs of the doublet log-likelihood for every sampling scheme, on one figure.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='Log Doublets Likelihood',
    y_axis_label='ECDF',
)

# (values, glyph keyword arguments) for each series. The sorted values go into
# loop-local names instead of dists / dists_diff / dists_doub..., so this cell no
# longer replaces the Hamming-distance arrays with log-likelihoods (which made a
# re-run of the Hamming-distance plot show the wrong data).
likelihood_series = [
    (doublets_scores, dict(legend_label='Not all different ')),
    (doublets_scores_diff, dict(legend_label='all different', color='orange')),
    (doublets_scores_wiggle, dict(legend_label='wiggle', color='green')),
    (doublets_scores_doub_nd, dict(legend_label='doublets nd', color='purple')),
    (doublets_scores_doub, dict(legend_label='doublets', color='red')),
    (doublets_scores_doub_wigg, dict(legend_label='doublets wig', color='pink')),
]
for likelihood_values, glyph_kwargs in likelihood_series:
    likelihood_sorted, likelihood_ecdf = ecdf_vals(likelihood_values)
    p.circle(likelihood_sorted, likelihood_ecdf, **glyph_kwargs)

# The legend is deliberately hidden; the labels above document the colours.
p.legend.visible = False
bokeh.io.show(p)