In [1]:
import numpy as np
import pandas as pd
import logging as log

from IPython.display import display

import plotly
from plotly.offline import init_notebook_mode, iplot
plotly.offline.init_notebook_mode() # run at the start of every ipython notebook
plotly.plotly.sign_in('spersad', 'MwbO3xbqh2Mv6sfhCma7')
import plotly.graph_objs as go

log.getLogger().setLevel(log.INFO)

import tensorflow as tf

### Simulate binary vectors

In [2]:
def simulate_clusters(nclusters, readLength, nreads=10000, probs = None, pi=None, independent = True):
    ''' Simulate a mixture of multivariable Bernoulli random variables as binary vectors
        @param: nclusters - the number of clusters to generate
        @param: readLength - the length of a multivaribable R.V.
        @param: nreads - the number of reads to sample
        @param: probs - an (optional) list of probability distributions. Must be of length nclusters, and probs[0]
                        must have length readLength
        @param: pi - the mixing proportions of each cluster. Must be of length nclusters
        @param: independent - if this is true, positions are independent of each other. 
                              Experimentally we find that if there is a mutation at a position, there can be no mutations 
                              within three bases.
        @return: bitvector -  an array of binary vectors
    '''
    
    if probs != None:
        if len(pi) != nclusters:
            log.error('There is an incorrect number of mixing proportions')
        if len(probs) != nclusters:
            log.error('There is an incorrect number of probability distributions')
            return
        for dist in probs:
            if len(dist) != readLength:
                log.error('The distribution length {0} does not match the read length {1}'.format(len(dist),readLength))
                return 
    else:
        # Randomly generate probability distributions 
        pass 
    
    # Randomly generate binary vectors based on distributions specified

    
    bitvectors = []
    labels = []
    for k in range(nclusters):
        dist = probs[k]
        for j in range(int(nreads*pi[k])):
            x = np.array([np.random.binomial(1,p) for p in dist])
            if not independent:
                for i in range(len(x)-3):
                    if x[i]==1:
                        x[i+1:i+3]=0 # zero out everyone within a distance of three
            if x.sum()>2:
                bitvectors.append(x)
                labels.append(k)

        
    # Do some final processing to reshape as needed
    bitvectors = np.array(bitvectors)
    labels = np.array(labels)
    
    p = np.random.permutation(len(labels))
    
    return bitvectors[p], labels[p], probs, pi
        
    


In [3]:
def load_bitvectors(path, sep='\t'):
    '''
    Given a pandas dataframe with headings Read_name Binary_vector N_mutations Reference_name Start_position
    load only the bitvectors as a numpy array of integers
    @param: path - path to bitvector file
    @return: bitreads - an array of bitvectors
    '''
    bitvectors=pd.read_csv(path,sep=sep)
    #Grab the bit vectors only 
    bitreads = bitvectors.as_matrix(columns=['Binary_vector'])
    log.debug("bitreads[0][0]=%s" % ( bitreads[0][0]))
    len_bits = str(len(list(bitreads[0][0])))
    log.debug("Len bits is %s" % len_bits)
    size = bitreads.size
    bitreads = np.array(bitreads,dtype='|S'+len_bits)
    bitreads = bitreads.view('S1')
    bitreads = bitreads.reshape((size, -1))
    bitreads = bitreads.astype('|S4')
    bitreads[bitreads == '?'] = 0    
    bitreads = bitreads.astype(np.float)

    
    return bitreads


def denoise_bitvectors(bitvectors, threshold=0.001):
    '''
    Given an array of bitvectors, zero out all columns where the population average is below the given threshold.
    @param: bitvectors - numpy array of bitvectors
    @param: decimal threshold below which entries are considered noise. Default to 0.001=0.1%
    
    @return: denoised_bitvectors - numpy array of bitvectors where low signal positions are zero-ed out.'''
    population_average = bitvectors.sum(axis=0)/bitvectors.shape[1]
    denoise = np.where(population_average<threshold)
    bitvectors[:,denoise] = 0
    
    return bitvectors


In [5]:
l_12=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0.2, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0.005, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0.15, 0, 0, 0, 0, 0.0, 0, 0, 0, 0, 0.00, 0, 0, 0, 0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0.15, 0, 0.5, 0.3, 0, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0.5, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0.3, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0])
l_11=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0.00, 0.45, 0, 0, 0, 0.5, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0.4, 0, 0, 0.2, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0.3, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0])

l_22=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0.2, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0.15, 0, 0.3, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0.15, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0.15, 0, 0.5, 0.3, 0, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0.5, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0.3, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0])
l_21=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0.15, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0.4, 0, 0, 0.2, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0.3, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0])

probs_1=[l_11,l_12]
pi_1 = [0.3,0.7]

probs_2 = [l_21,l_22]
pi_2 = [0.3,0.7]

probs_3 = [l_11/3, l_12/4]
pi_3 = pi_1

probs_4 = [l_21/3, l_22/4]
pi_4=pi_2

simX1, simY1, _,_, = simulate_clusters(nclusters=2, readLength=400, nreads=10000, probs=probs_1, pi=pi_1, independent=False)
simX1 = denoise_bitvectors(simX1)
log.info('Generated clusters on sample 1.')

simX2, simY2, _,_, = simulate_clusters(nclusters=2, readLength=400, nreads=10000, probs=probs_2, pi=pi_2, independent=False)
simX2 = denoise_bitvectors(simX2)
log.info('Generated clusters on sample 2.')

simX3, simY3, _,_, = simulate_clusters(nclusters=2, readLength=400, nreads=10000, probs=probs_3, pi=pi_3, independent=False)
simX3 = denoise_bitvectors(simX3)
log.info('Generated clusters on sample 3.')

simX4, simY4, _,_, = simulate_clusters(nclusters=2, readLength=400, nreads=10000, probs=probs_4, pi=pi_4, independent=False)
simX4 = denoise_bitvectors(simX4)
log.info('Generated clusters on sample 4.')

INFO:root:Generated clusters on sample 1.
INFO:root:Generated clusters on sample 2.
INFO:root:Generated clusters on sample 3.
INFO:root:Generated clusters on sample 4.


In [6]:
def plot_clusters(X, Y, sample='1'):
    ''' Plot the population average as well as individual clusters.'''
    x=[i for i in range(X.shape[1])]
    y=100*X.sum(axis=0)/X.shape[0]

    trace = go.Bar(
        x=x,
        y=y,
        name='Population Average',
        marker = dict(
        color='rgb(255, 201, 43)'),
    ) 
    
    ## Now plot clusters separately
    from plotly import tools
    clusters = (np.unique(Y))
    fig2 = tools.make_subplots(rows=len(clusters)+1, cols=1)  
    
    for cluster in clusters:
        cluster_X = X[np.where(Y == cluster)]
        
        trace1 = go.Bar(
        x=x,
        y=100*cluster_X.sum(axis=0)/cluster_X.shape[0],
        name='Cluster '+str(cluster+1),
        marker = dict(
        color='rgb(216, 67, 77)'),
        )

        fig2.append_trace(trace1, cluster+1, 1)
        
    fig2.append_trace(trace, cluster+2, 1)
    fig2['layout'].update(title='Clusters within Sample '+str(sample))
    iplot(fig2, filename="clusters")


In [8]:
plot_clusters(simX1, simY1, sample=1)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]



In [9]:
plot_clusters(simX2, simY2, sample=2)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]



In [10]:
plot_clusters(simX3, simY3, sample=3)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]



In [11]:
plot_clusters(simX4, simY4, sample=4)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]



<hr>
#### t-SNE Visualization

Now, we wish to visualize our cluster data when embedded in a lower dimensional space.



In [12]:
def performTSNE(X, Y, num_examples=2000, perp=30, metric='euclidean'):
    from sklearn.manifold import TSNE
    
    log.info('Shape of input: {0}'.format(X.shape))
    model = TSNE(n_components=2, perplexity = perp, random_state=0, metric=metric) # fit into 2D space
    log.info('Defined model')
    if metric !='precomputed':
        X = X[:num_examples]
        Y = Y[:num_examples]
        
    embeddedX = model.fit_transform(X)
    log.info('Created embedding')

    # Scatter plot to visualize embedded data
    # Create a trace
    trace = go.Scatter(
        x = embeddedX[:,0],
        y = embeddedX[:,1],
        mode = 'markers',
        marker=dict(
            size='2',
            color = Y, # color points by label they belong to
            colorscale= [[0, '#dd2c4f'], [1, '#3d6fcc']],
        )
    )

    data = [trace]
    
    layout = go.Layout(
        title='Embedding of Clusters in 2D Space',
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='t-SNE-embedding')
    
    log.info('Plotted data')


In [13]:
performTSNE(simX1, simY1)

INFO:root:Defined model


(10000, 400)


INFO:root:Created embedding


INFO:root:Plotted data


In [12]:
def mask_dependent_bases(X):
    print(X.dtype)
    X_pad = np.lib.pad(X.astype(np.float64), (3,3), 'constant', constant_values=(0.0, 0.0))[3:-3]
    X_pad = X_pad.reshape(1,-1)[0]
    ones = np.where(X_pad==1)[0]
    X_pad[ones-1] = np.nan
    X_pad[ones-2] = np.nan
    X_pad[ones-3] = np.nan
    
    X_pad[ones+1] = np.nan
    X_pad[ones+2] = np.nan
    X_pad[ones+3] = np.nan
    X_masked = X_pad.reshape(-1,X.shape[1]+6)
    
    return X_masked[:,3:-3]
log.info('Masking X1 bases')
masked_X1 = mask_dependent_bases(simX1)

INFO:root:Masking X1 bases


int32
1
2
6


In [39]:
def dist_matrix(X):
    dist = np.zeros((X.shape[0],X.shape[0]))
    print(dist.shape)
    for i in range(len(X)):
        for j in range(i, len(X)):
            row1 = X[i]
            row2 = X[j]
            d = np.nansum(np.abs(row1-row2))
            dist[i,j] = d
            dist[j,i] = d
    return dist
        
pdist_x1 = dist_matrix(masked_X1)

(10000, 10000)


In [14]:
performTSNE(simX2, simY2)

INFO:root:Defined model


(10000, 400)


INFO:root:Created embedding


INFO:root:Plotted data


In [15]:
performTSNE(simX3, simY3)

INFO:root:Defined model


(7392, 400)


INFO:root:Created embedding


INFO:root:Plotted data


In [51]:
performTSNE(simX4, simY4, num_examples=10000)

INFO:root:Defined model


(8185, 400)


INFO:root:Created embedding


INFO:root:Plotted data


In [48]:
masked_X4 = mask_dependent_bases(simX4)

int32
1
2
6


In [49]:
pdist_X4 = dist_matrix(masked_X4)

(8185, 8185)


In [50]:
performTSNE(pdist_X4, simY4, metric='precomputed')

INFO:root:Defined model


(8185, 8185)


INFO:root:Created embedding


INFO:root:Plotted data


In [34]:
X = np.array([[1,0,1],[0,0,0]])
def remove_low_mutations(X, Y):
    T = 0
    sum_muts = np.sum(X, axis=1)
    low_muts = np.where(sum_muts > T)[0]
    return X[low_muts], Y[low_muts]
    

In [35]:
simX4_nozeros, simY4_nozeros = remove_low_mutations(simX4, simY4)

In [36]:
performTSNE(simX4_nozeros, simY4_nozeros)

INFO:root:Defined model


(8150, 400)


INFO:root:Created embedding


INFO:root:Plotted data
