In [None]:
import logging as log
import os

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Show INFO-level progress messages emitted by the helpers in this notebook.
log.getLogger().setLevel(log.INFO)

# Load plotly.js so that iplot() can render figures inside the notebook.
init_notebook_mode(connected=True)

# Credentials must never be committed with the notebook: read them from the
# environment instead. Any key that was previously hardcoded here should be
# treated as leaked and regenerated.
# NOTE(review): `plotly.plotly` is the legacy online API; newer plotly releases
# ship it as the separate `chart_studio` package -- confirm the installed version.
_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    plotly.plotly.sign_in(_plotly_username, _plotly_api_key)
else:
    # The figures in this notebook are drawn with plotly.offline.iplot, which
    # does not need an online session, so missing credentials are not fatal.
    log.info('PLOTLY_USERNAME / PLOTLY_API_KEY not set; skipping plotly sign-in')

In [None]:
def performTSNE(X, Y, num_examples=2000, perp=30, metric='euclidean', random_state=0):
    """Embed ``X`` into 2D with t-SNE and display an interactive scatter plot.

    Parameters
    ----------
    X : array with a ``.shape`` attribute
        Samples to embed, one row per sample. When ``metric='precomputed'``
        this is passed to t-SNE as-is (scikit-learn then expects a square
        distance matrix).
    Y : sliceable sequence
        One value per sample; used both as marker color and as hover text.
        Presumably numeric labels, since they are mapped onto a color scale.
    num_examples : int
        Only the first ``num_examples`` rows of ``X`` and ``Y`` are used.
        Ignored when ``metric='precomputed'`` (row-slicing a distance matrix
        would leave it non-square).
    perp : float
        t-SNE perplexity.
    metric : str or callable
        Distance metric forwarded to ``sklearn.manifold.TSNE``.
    random_state : int or None
        Seed forwarded to t-SNE; defaults to the previously hard-coded 0.

    Returns
    -------
    None
        The figure is rendered with ``plotly.offline.iplot``.
    """
    from sklearn.manifold import TSNE

    tsne_kwargs = dict(
        n_components=2,  # fit into 2D space
        perplexity=perp,
        random_state=random_state,
        metric=metric,
    )
    if metric == 'precomputed':
        # scikit-learn refuses PCA initialisation for precomputed distances,
        # and PCA is the default init in recent releases; random init is
        # accepted by every release.
        tsne_kwargs['init'] = 'random'
    model = TSNE(**tsne_kwargs)
    log.info('Defined model')

    if metric != 'precomputed':
        X = X[:num_examples]
        Y = Y[:num_examples]
    log.info('Performing TSNE on data with shape {0}'.format(X.shape))
    embeddedX = model.fit_transform(X)
    log.info('Created embedding')

    # Scatter plot to visualize the embedded data.
    trace = go.Scatter(
        x=embeddedX[:, 0],
        y=embeddedX[:, 1],
        mode='markers',
        marker=dict(
            size=2,  # plotly expects a number here, not the string '2'
            color=Y,  # color points by the label they belong to
            colorscale=[[0, '#dd2c4f'], [1, '#3d6fcc']],
        ),
        text=Y,
    )

    layout = go.Layout(
        title='Embedding of Clusters in 2D Space',
    )

    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig, filename='t-SNE-embedding')

    log.info('Plotted data')
    


In [1]:
def performMDS(X, Y, num_examples=2000, metric='euclidean', random_state=0):
    """Embed ``X`` into 2D with multidimensional scaling and plot the result.

    Parameters
    ----------
    X : array with a ``.shape`` attribute
        Samples to embed, one row per sample. When ``metric='precomputed'``
        this is passed to MDS as-is (scikit-learn then expects a square
        dissimilarity matrix).
    Y : sliceable sequence
        One value per sample, used as marker color. Presumably numeric
        labels, since they are mapped onto a color scale.
    num_examples : int
        Only the first ``num_examples`` rows of ``X`` and ``Y`` are used.
        Ignored when ``metric='precomputed'``.
    metric : str
        Forwarded to MDS as its ``dissimilarity`` argument.
    random_state : int or None
        Seed forwarded to MDS so that repeated runs give the same embedding
        (matching the fixed seed used by ``performTSNE``). Pass ``None`` for
        the previous unseeded behaviour.

    Returns
    -------
    None
        The figure is rendered with ``plotly.offline.iplot``.
    """
    from sklearn.manifold import MDS

    model = MDS(
        n_components=2,
        max_iter=3000,
        dissimilarity=metric,
        random_state=random_state,
    )
    log.info('Defined model')

    if metric != 'precomputed':
        X = X[:num_examples]
        Y = Y[:num_examples]
    log.info('Performing MDS on data with shape {0}'.format(X.shape))
    embeddedX = model.fit_transform(X)
    log.info('Created embedding')

    # Scatter plot to visualize the embedded data.
    trace = go.Scatter(
        x=embeddedX[:, 0],
        y=embeddedX[:, 1],
        mode='markers',
        marker=dict(
            size=5,  # plotly expects a number here, not the string '5'
            color=Y,  # color points by the label they belong to
            colorscale=[[0, '#dd2c4f'], [1, '#3d6fcc']],
        ),
        # Hover text is the string form of each row of X.
        text=[str(x) for x in X],
    )

    layout = go.Layout(
        title='MDS Embedding of Clusters in 2D Space',
    )

    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig, filename='MDS-embedding')

    log.info('Plotted data')

In [None]:
def first_order_dist(X):
    """Return the pairwise NaN-ignoring L1 (Manhattan) distance matrix of ``X``.

    The distance between two rows is the sum of absolute coordinate
    differences; any coordinate where the difference is NaN (i.e. either row
    has a NaN there) contributes nothing to the sum.

    Parameters
    ----------
    X : array-like of shape (n_samples, ...)
        One sample per leading index. Converted to a float array, so lists
        are accepted and unsigned-integer inputs cannot wrap around when
        subtracted.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples)
        Symmetric matrix of distances.
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]
    dist = np.zeros((n_samples, n_samples))
    log.info('Computing first order distance matrix with shape {0}'.format(dist.shape))
    if n_samples == 0:
        return dist

    # Flatten every sample to one row so the per-sample sum is over axis 1.
    rows = X.reshape(n_samples, -1)
    for i in range(n_samples):
        # Distances from sample i to samples i..n-1 in one vectorised step;
        # the lower triangle is filled in by symmetry.
        d = np.nansum(np.abs(rows[i:] - rows[i]), axis=1)
        dist[i, i:] = d
        dist[i:, i] = d
    return dist

In [None]:
def second_order_distance(X, k=5, alpha=1):
    """Compute the second-order distance matrix of a set of reads.

    With ``d`` the NaN-ignoring L1 distance between two rows, the second-order
    distance between reads ``i`` and ``j`` is::

        d(i, j) + alpha * (sum_{m in N(i)} d(m, j) + sum_{m in N(j)} d(m, i))

    where ``N(i)`` is the set of the ``k`` nearest neighbours of read ``i``
    under the same distance ``d``. A read is never part of its own
    neighbourhood.

    Parameters
    ----------
    X : array-like of shape (n_samples, ...)
        One read per leading index; may contain NaN. Converted to float.
    k : int
        Number of nearest neighbours per read; must satisfy
        ``1 <= k <= n_samples - 1``.
    alpha : float
        Weight of the neighbourhood terms (previously the constant ALPHA=1).

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples)
        Symmetric second-order distance matrix.

    Raises
    ------
    ValueError
        If ``k`` is outside ``[1, n_samples - 1]``.
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]
    if k < 1 or k > n_samples - 1:
        raise ValueError(
            'k must be between 1 and n_samples - 1 = {0}, got {1}'.format(n_samples - 1, k)
        )

    # First-order distances, one vectorised row at a time.
    rows = X.reshape(n_samples, -1)
    first_dist = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        d = np.nansum(np.abs(rows[i:] - rows[i]), axis=1)
        first_dist[i, i:] = d
        first_dist[i:, i] = d
    log.info('Computed first order distance matrix with shape {0}'.format(first_dist.shape))

    # k nearest neighbours under the same distance. The diagonal is masked
    # with +inf so a read can never be picked as its own neighbour; all k
    # columns are kept (no further "skip the first column" is needed).
    ranked = first_dist.copy()
    np.fill_diagonal(ranked, np.inf)
    indices = np.argsort(ranked, axis=1, kind='stable')[:, :k]

    # neighbor_sum[i, j] = sum of d(m, j) over the neighbours m of read i.
    neighbor_sum = np.zeros((n_samples, n_samples))
    for col in range(k):
        neighbor_sum += first_dist[indices[:, col]]

    # neighbor_sum.T[i, j] is the matching term for the neighbours of j.
    second_dist = first_dist + alpha * (neighbor_sum + neighbor_sum.T)
    log.info('Computed second order distance matrix with shape {0}'.format(second_dist.shape))
    return second_dist

# Build the second-order distance matrix of X_real using k=5.
# NOTE(review): X_real is not defined in any cell visible here -- it must
# already exist in the kernel; confirm it is created earlier in the notebook
# so that Restart & Run All succeeds.
dists = second_order_distance(X_real, k=5)