In [1]:
import numpy as np
import pandas as pd
import logging as log
import plotly
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
plotly.plotly.sign_in('spersad', 'oNkuP1yzbpN734Ag8M9P')
import plotly.graph_objs as go

log.getLogger().setLevel(log.INFO)

In [2]:
def simulate_modification(structure, sequence, DMS_prob, n_molecules):
    '''
    Simulate the DMS modification to a given RNA structure, producing the resulting DMS modified bases
    @param: structure- dot/bracket notation indication an open/closed base
    @param: sequence - nucleotide sequence corresponding to the given RNA structure
    @param: DMS_prob - probability of mutating an open (A/C) nucleotide
    @param: n_molecules - number of molecules to simulate
    
    @return: molecules - an array of DMS and RT'ed molecules
    @return: reads - an array of molecules where 0 if no mutation, 1 if mutated
    '''
    
    structure=list(structure)
    sequence=list(sequence)
    assert len(structure)==len(sequence), 'Sequence and structure must have the same length'
    
    molecules = []
    reads = []
    for mol_num in range(n_molecules):
        molecule = []
        read = []
        for (i,pos) in enumerate(structure):
            base = sequence[i]
            if base in ['A','C'] and pos == '.':
                # For a candidate base, it is DMS modified with a given probability
                if np.random.random()<DMS_prob:
                    mutation = mutate_base(base)
                    molecule.append(mutation)
                    if mutation == base:  
                        # If mutated base is the same as the original base, we cannot detect it.
                        read.append(0)
                    else: read.append(1)
                else:
                    molecule.append(base)
                    read.append(0)
            else:
                molecule.append(base)
                read.append(0)
        molecules.append(molecule)
        reads.append(read)
    
    return np.array(molecules), np.array(reads)
                

def mutate_base(original):
    ''' 
    Mutate a DMS modified base according to RT enzyme
    @param: original - original base
    @return: choice - modified base
    '''
    
    # Naive mutation distribution, estimate better from data 
                # A      U     C    G
    mut_dist = [[0.25, 0.25, 0.25, 0.25], #A
                [0.25, 0.25, 0.25, 0.25]] #C
    
    bases = {'A':0, 'C':1}
    prob_dist = mut_dist[bases[original]]
    choice = np.random.choice(['A','U','C','G'], p=prob_dist)
    return choice

def illegal_reads(reads):
    '''
    Compute the proportion of reads which are illegal (have two mutations within distance 3 of each other)
    @param: reads - array of n bitvectors (1 is mut, 0 is WT)
    @return: illegal - array of length n, where illegal[i] is 1 if read i is illegal, otherwise 0
    '''
    def check_read(read):
        dist_3 = read+np.concatenate(([0],read[:-1]))+np.concatenate(([0,0],read[:-2]))
        if 2 in dist_3:
            return 1
        return 0
    legal = np.apply_along_axis(check_read,1,reads)
    
    return legal

In [63]:
struc = '((((....(((((.......)))))...))).).'
seq= 'ATCTCTTTTCTTCTCTATGCGAGGATTTGGACTG'
mols, reads = simulate_modification(struc, seq, 0.1, 200000)
illegal = illegal_reads(reads)
percent = 100*np.sum(illegal)/len(illegal)
print('Illegal reads: {0}%'.format(percent))
good_reads = reads[illegal==0]
reads_df = pd.DataFrame(good_reads).astype(int)
reads_df.to_csv('riboA_sim.txt', index=False, sep='\t')

Illegal reads: 0.5695%


In [64]:
struc = '.((((((...(((((.((((.((........)).'
seq= 'ATCTCTTTTCTTCTCTATGCGAGGATTTGGACTG'
mols, reads = simulate_modification(struc, seq ,0.1,100000)
illegal = illegal_reads(reads)
percent = 100*np.sum(illegal)/len(illegal)
print('Illegal reads: {0}%'.format(percent))
good_reads = reads[illegal==0]
reads_df = pd.DataFrame(good_reads).astype(int)
reads_df.to_csv('riboC_sim.txt', index=False, sep='\t')

Illegal reads: 0.0%


In [115]:
def mask_dependent_bases(X):
    '''
    Given a bitvector, replace all bases around '1' with np.nan 
    '''
    print(X.dtype)
    X_pad = np.lib.pad(X.astype(np.float64), (3,3), 'constant', constant_values=(0.0, 0.0))[3:-3]
    X_pad = X_pad.reshape(1,-1)[0]
    ones = np.where(X_pad==1)[0]
    X_pad[ones-1] = np.nan
    X_pad[ones-2] = np.nan
    X_pad[ones-3] = np.nan
    
    X_pad[ones+1] = np.nan
    X_pad[ones+2] = np.nan
    X_pad[ones+3] = np.nan
    X_masked = X_pad.reshape(-1,X.shape[1]+6)
    
    return X_masked

In [99]:
def load_bitvectors(paths, unique=False):
    '''
    Given a pandas dataframe with headings Read_name Binary_vector N_mutations Reference_name Start_position
    load only the bitvectors as a numpy array of integers
    @param: paths - a list of paths to bitvector file for each cluster
    @return: bitreads - an array of bitvectors, not corrected for correlation between bases
    '''
    dfs = []
    for (i,path) in enumerate(paths):
        df = pd.read_csv(path, sep='\t', na_values='?')
        df['Label'] = i
        dfs.append(df)
    df = pd.concat(dfs)
    print('Loaded clusters with shape {0} before removing excess NaN values.'.format(df.shape))
    df = df[df.isnull().sum(axis=1)<0.3*df.shape[1]]
    # Cast NaN to zero for analysis
    df.fillna(0, inplace=True)
    print('Loaded clusters with shape {0} after removing excess NaN values.'.format(df.shape))
    if unique:
        df.drop_duplicates(inplace=True)
        print('Loaded clusters with shape {0} after removing duplicates.'.format(df.shape))
    df = df.sample(frac=1)
    Y = df['Label']
    df.drop('Label', inplace=True)
    return df.as_matrix(), Y

In [80]:
X, Y = load_bitvectors(['riboA_sim.txt','riboC_sim.txt'], unique=True)

(298861, 35)
Loaded clusters with shape (298861, 35) before removing excess NaN values.
Loaded clusters with shape (298861, 35) after removing excess NaN values.
Loaded clusters with shape (40, 35) after removing duplicates.


In [111]:
def performTSNE(X, Y, num_examples=2000, perp=30, metric='euclidean'):
    from sklearn.manifold import TSNE
    model = TSNE(n_components=2, perplexity = perp, random_state=0, metric=metric) # fit into 2D space
    log.info('Defined model')
    if metric !='precomputed':
        X = X[:num_examples]
        Y = Y[:num_examples]
    log.info('Performing TSNE on data with shape {0}'.format(X.shape))        
    embeddedX = model.fit_transform(X)
    log.info('Created embedding')

    # Scatter plot to visualize embedded data
    # Create a trace
    trace = go.Scatter(
        x = embeddedX[:,0],
        y = embeddedX[:,1],
        mode = 'markers',
        marker=dict(
            size='2',
            color = Y, # color points by label they belong to
            colorscale= [[0, '#dd2c4f'], [1, '#3d6fcc']],
        ),
        text = Y#[str(x) for x in X]
    )

    data = [trace]
    
    layout = go.Layout(
        title='Embedding of Clusters in 2D Space',
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='t-SNE-embedding')
    
    log.info('Plotted data')

In [104]:
performTSNE(X,Y)

INFO:root:Defined model
INFO:root:Performing TSNE on data with shape (40, 35)
INFO:root:Created embedding


INFO:root:Plotted data


In [132]:
def performMDS(X, Y, num_examples=2000,metric='euclidean'):
    from sklearn.manifold import MDS
    model = MDS(n_components=2, max_iter=3000, dissimilarity=metric)
    log.info('Defined model')
    if metric !='precomputed':
        X = X[:num_examples]
        Y = Y[:num_examples]
    log.info('Performing MDS on data with shape {0}'.format(X.shape))         
    embeddedX = model.fit_transform(X)
    log.info('Created embedding')

    # Scatter plot to visualize embedded data
    # Create a trace
    trace = go.Scatter(
        x = embeddedX[:,0],
        y = embeddedX[:,1],
        mode = 'markers',
        marker=dict(
            size='5',
            color = Y, # color points by label they belong to
            colorscale= [[0, '#dd2c4f'], [1, '#3d6fcc']],
        ),
        text = [str(x) for x in X]
    )

    data = [trace]
    
    layout = go.Layout(
        title='MDS Embedding of Clusters in 2D Space',
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='MDS-embedding')
    
    log.info('Plotted data')

In [38]:
performMDS(X,Y)

INFO:root:Defined model
INFO:root:Performing MDS on data with shape (40, 35)
INFO:root:Created embedding


INFO:root:Plotted data


In [100]:
X_real, Y_real = load_bitvectors(['riboA_real.txt','riboC_real.txt'], unique=True)

Loaded clusters with shape (939354, 41) before removing excess NaN values.
Loaded clusters with shape (804220, 41) after removing excess NaN values.
Loaded clusters with shape (2102, 41) after removing duplicates.


In [105]:
performTSNE(X_real,Y_real)

INFO:root:Defined model
INFO:root:Performing TSNE on data with shape (2000, 41)
INFO:root:Created embedding


INFO:root:Plotted data


In [106]:
# Compute distance matrix for X_real data
def first_order_dist(X):
    dist = np.zeros((X.shape[0],X.shape[0]))
    log.info('Computing first order distance matrix with shape {0}'.format(dist.shape))
    for i in range(len(X)):
        for j in range(i, len(X)):
            row1 = X[i]
            row2 = X[j]
            d = np.nansum(np.abs(row1-row2))
            dist[i,j] = d
            dist[j,i] = d
    return dist

In [116]:
masked_X = mask_dependent_bases(X_real) 
X_dist = first_order_dist(masked_X)

float64
(2102, 2102)


In [126]:
performTSNE(X_dist, Y_real, metric='precomputed')

INFO:root:Defined model
INFO:root:Performing TSNE on data with shape (2102, 2102)
INFO:root:Created embedding


INFO:root:Plotted data


In [125]:
def second_order_distance(X, k=5):
    ''' Given a set of reads, compute the second order distance matrix,
        where the distance between reads i, j is the d(i,j) + alpha Sum(d(i,k_j)+d(k_i,j))
        and k_j is the k-NN neighborhood of j and k_i is the k-NN neighborhood of i'''
    
    ALPHA=0.5
    def mydist(x,y):
        return np.nansum(np.abs(x-y))
    
    from sklearn.neighbors import NearestNeighbors as NN
    nbrs  = NN().fit(X) #(algorithm='ball_tree', metric=mydist
    dists, indices = nbrs.kneighbors(n_neighbors=k)
    print(dists.shape)
    indices = indices[:,1:k] # Each read is not in its own nearest neighborhood
    
    second_dist = np.zeros((X.shape[0],X.shape[0]))
    print(second_dist.shape)
    for i in range(len(X)):
        for j in range(i, len(X)):
            row1 = X[i]
            row2 = X[j]
            d = np.nansum(np.abs(X[i]-X[j])) + ALPHA*(np.nansum(np.abs(X[indices[i]] - X[j])) + np.nansum(np.abs(X[indices[j]] - X[i])))
            second_dist[i,j] = d
            second_dist[j,i] = d
    return second_dist

dists = second_order_distance(X_real, k=5)

(2102, 5)
(2102, 2102)


In [127]:
performTSNE(dists, Y_real, metric='precomputed')

INFO:root:Defined model
INFO:root:Performing TSNE on data with shape (2102, 2102)
INFO:root:Created embedding


INFO:root:Plotted data


In [129]:
performMDS(dists, Y_real, metric='precomputed')

INFO:root:Defined model
INFO:root:Performing MDS on data with shape (2102, 2102)
INFO:root:Created embedding


INFO:root:Plotted data


In [130]:
dists_10 = second_order_distance(X_real, k=10)

(2102, 10)
(2102, 2102)


In [131]:
performMDS(dists_10, Y_real, metric='precomputed')

INFO:root:Defined model
INFO:root:Performing MDS on data with shape (2102, 2102)
INFO:root:Created embedding


INFO:root:Plotted data


In [133]:
performMDS(X_dist, Y_real, metric='precomputed')

INFO:root:Defined model
INFO:root:Performing MDS on data with shape (2102, 2102)
INFO:root:Created embedding


INFO:root:Plotted data


In [134]:
# TRY EM GAUSSIAN CLUSTERING 

In [146]:
df = pd.read_csv('embedded_coords.csv', sep=',', usecols=[0,1,2])
df.head()
df.columns
Y_tr = df['Y']
X_tr =df[['X1','X2']].as_matrix()

In [None]:
from matplotlib.colors import LogNorm
from sklearn import mixture
import matplotlib.pyplot as plt

# fit a Gaussian Mixture Model with two components
clf = mixture.GaussianMixture(n_components=2, covariance_type='full')
clf.fit(X_tr)

# display predicted scores by the model as a contour plot
x = np.linspace(-20., 30.)
y = np.linspace(-20., 40.)
X, Y = np.meshgrid(x, y)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = -clf.score_samples(XX)
Z = Z.reshape(X.shape)

CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0),
                 levels=np.logspace(0, 3, 10))
CB = plt.colorbar(CS, shrink=0.8, extend='both')
plt.scatter(X_tr[:, 0], X_tr[:, 1], c=Y_tr, s=.8)

plt.title('Negative log-likelihood predicted by a GMM')
plt.axis('tight')
plt.show()