In [1]:
import numpy as np
import pandas as pd
import logging as log
import plotly
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
plotly.plotly.sign_in('spersad', 'oNkuP1yzbpN734Ag8M9P')
import plotly.graph_objs as go

log.getLogger().setLevel(log.INFO)

## Simulate DMS modification on a given molecule

In [14]:
def simulate_modification(structure, sequence, DMS_prob, n_molecules):
    '''
    Simulate the DMS modification to a given RNA structure, producing the resulting DMS modified bases
    @param: structure- dot/bracket notation indication an open/closed base
    @param: sequence - nucleotide sequence corresponding to the given RNA structure
    @param: DMS_prob - probability of mutating an open (A/C) nucleotide
    @param: n_molecules - number of molecules to simulate
    
    @return: molecules - an array of DMS and RT'ed molecules
    @return: reads - an array of molecules where 0 if no mutation, 1 if mutated
    '''
    
    structure=list(structure)
    sequence=list(sequence)
    assert len(structure)==len(sequence), 'Sequence and structure must have the same length'
    
    molecules = []
    reads = []
    basevectors = []
    for mol_num in range(n_molecules):
        molecule = []
        read = []
        basevector = []
        for (i,pos) in enumerate(structure):
            base = sequence[i]
            if base in ['A','C'] and pos == '.':
                # For a candidate base, it is DMS modified with a given probability
                if np.random.random()<DMS_prob:
                    mutation = mutate_base(base)
                    molecule.append(mutation)
                    if mutation == base:  
                        # If mutated base is the same as the original base, we cannot detect it.
                        read.append(0)
                        basevector.append('0')
                    else: 
                        read.append(1)
                        basevector.append(base)
                else:
                    molecule.append(base)
                    read.append(0)
                    basevector.append('0')
            else:
                molecule.append(base)
                read.append(0)
                basevector.append('0')
        molecules.append(molecule)
        reads.append(read)
        basevectors.append(basevector)
    
    return np.array(molecules), np.array(reads), _ #, np.array(basevectors)
                

def mutate_base(original):
    ''' 
    Mutate a DMS modified base according to RT enzyme
    @param: original - original base
    @return: choice - modified base
    '''
    
    # Naive mutation distribution, estimate better from data 
                # A      U     C    G
    mut_dist = [[0.25, 0.25, 0.25, 0.25], #A
                [0.25, 0.25, 0.25, 0.25]] #C
    
    bases = {'A':0, 'C':1}
    prob_dist = mut_dist[bases[original]]
    choice = np.random.choice(['A','U','C','G'], p=prob_dist)
    return choice

def illegal_reads(reads):
    '''
    Compute the proportion of reads which are illegal (have two mutations within distance 3 of each other)
    @param: reads - array of n bitvectors (1 is mut, 0 is WT)
    @return: illegal - array of length n, where illegal[i] is 1 if read i is illegal, otherwise 0
    '''
    def check_read(read):
        dist_3 = read+np.concatenate(([0],read[:-1]))+np.concatenate(([0,0],read[:-2]))
        if 2 in dist_3:
            return 1
        return 0
    illegal = np.apply_along_axis(check_read,1,reads)
    
    return illegal

In [16]:
struc = '((((....(((((.......)))))...))).).'
seq= 'ATCTCTTTTCTTCTCTATGCGAGGATTTGGACTG'
mols, reads, basevectors = simulate_modification(struc, seq, 0.1, 200000)
illegal = illegal_reads(reads)
percent = 100*np.sum(illegal)/len(illegal)
print('{0} illegal reads: {1}%'.format(np.sum(illegal),percent))
good_reads = reads[illegal==0]
reads_df = pd.DataFrame(good_reads).astype(int)
reads_df.to_csv('riboA_sim.txt', index=False, sep='\t')

1099 illegal reads: 0.5495%


In [None]:
basevectors

In [4]:
struc = '.((((((...(((((.((((.((........)).'
seq= 'ATCTCTTTTCTTCTCTATGCGAGGATTTGGACTG'
mols, reads = simulate_modification(struc, seq ,0.1,100000)
illegal = illegal_reads(reads)
percent = 100*np.sum(illegal)/len(illegal)
print('Illegal reads: {0}%'.format(percent))
good_reads = reads[illegal==0]
reads_df = pd.DataFrame(good_reads).astype(int)
reads_df.to_csv('riboC_sim.txt', index=False, sep='\t')

Illegal reads: 0.0%


In [8]:
def mask_dependent_bases(X):
    '''
    Given a bitvector, replace all bases around '1' with np.nan 
    '''
    print(X.dtype)
    X_pad = np.lib.pad(X.astype(np.float64), (3,3), 'constant', constant_values=(0.0, 0.0))[3:-3]
    X_pad = X_pad.reshape(1,-1)[0]
    ones = np.where(X_pad==1)[0]
    X_pad[ones-1] = np.nan
    X_pad[ones-2] = np.nan
    X_pad[ones-3] = np.nan
    
    X_pad[ones+1] = np.nan
    X_pad[ones+2] = np.nan
    X_pad[ones+3] = np.nan
    X_masked = X_pad.reshape(-1,X.shape[1]+6)
    
    return X_masked

In [9]:
def load_bitvectors(paths, unique=False):
    '''
    Given a pandas dataframe with headings Read_name Binary_vector N_mutations Reference_name Start_position
    load only the bitvectors as a numpy array of integers
    @param: paths - a list of paths to bitvector file for each cluster
    @return: bitreads - an array of bitvectors, not corrected for correlation between bases
    '''
    dfs = []
    for (i,path) in enumerate(paths):
        df = pd.read_csv(path, sep='\t', na_values='?')
        df['Label'] = i
        dfs.append(df)
    df = pd.concat(dfs)
    print('Loaded clusters with shape {0} before removing excess NaN values.'.format(df.shape))
    df = df[df.isnull().sum(axis=1)<0.3*df.shape[1]]
    # Cast NaN to zero for analysis
    df.fillna(0, inplace=True)
    print('Loaded clusters with shape {0} after removing excess NaN values.'.format(df.shape))
    if unique:
        df.drop_duplicates(inplace=True)
        print('Loaded clusters with shape {0} after removing duplicates.'.format(df.shape))
    df = df.sample(frac=1)
    Y = df['Label']
    df.drop('Label', inplace=True)
    return df.as_matrix(), Y

In [10]:
X, Y = load_bitvectors(['riboA_sim.txt','riboC_sim.txt'], unique=True)

Loaded clusters with shape (298888, 35) before removing excess NaN values.
Loaded clusters with shape (298888, 35) after removing excess NaN values.
Loaded clusters with shape (40, 35) after removing duplicates.


In [19]:
print(X.shape)
A_avg = X[Y==0].mean(axis=0)
print(A_avg.shape)

data = [go.Bar(
            x=list(range(len(A_avg))),
            y=A_avg
    )]

iplot(data, filename='A_struc')


C_avg = X[Y==1].mean(axis=0)
print(C_avg)

data = [go.Bar(
            x=list(range(len(C_avg))),
            y=C_avg
    )]

iplot(data, filename='C_struc')


(40, 35)
(35,)


[ 0.5  0.   0.   0.   0.   0.   0.   0.   0.   0.5  0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.5  0.   0.   0.   0.   0.
  0.5  0.   0.   0.   1. ]


In [26]:
masked_X = mask_dependent_bases(X_real) 
X_dist = first_order_dist(masked_X)

INFO:root:Computing first order distance matrix with shape (2102, 2102)


float64


In [50]:
from matplotlib.colors import LogNorm
from sklearn.mixture import GaussianMixture as GM
import matplotlib.pyplot as plt

from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.cluster import KMeans as KM
from sklearn.cluster import SpectralClustering as sc


print('Cluster')
# fit a Gaussian Mixture Model with two components
clf = AC(n_clusters=2)#, covariance_type='full')

#clf = GM(n_components=2, covariance_type='full')
clf.fit(X_tr)
pred = clf.fit_predict(X_tr)
# Create a trace
trace = go.Scatter(
    x = X_tr[:,0],
    y = X_tr[:,1],
    mode = 'markers',
    marker=dict(
        size='5',
        color = pred, # color points by label they belong to
        colorscale= [[0, '#dd2c4f'], [1, '#3d6fcc']],
    ),
    text = [str(x) for x in X]
)

data = [trace]

layout = go.Layout(
    title='<b>Hierarchical Clustering of MDS 5NN Embedded Data</b>',
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='MDS-embedding')


Cluster


In [46]:
np.sum(pred==Y_tr)

1979

In [47]:
Y_tr.shape

(2000,)

In [48]:
1979/2000

0.9895

In [51]:
X_real[pred!=Y_tr]


boolean index did not match indexed array along dimension 0; dimension is 2102 but corresponding boolean dimension is 2000



array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
         0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 

In [52]:
from matplotlib.colors import LogNorm
from sklearn.mixture import GaussianMixture as GM
import matplotlib.pyplot as plt

from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.cluster import KMeans as KM
from sklearn.cluster import SpectralClustering as sc


print('Cluster')
# fit a Gaussian Mixture Model with two components
clf = AC(n_clusters=2)#, covariance_type='full')

#clf = GM(n_components=2, covariance_type='full')
clf.fit(X_real)
pred = clf.fit_predict(X_real)
# # Create a trace
# trace = go.Scatter(
#     x = X_tr[:,0],
#     y = X_tr[:,1],
#     mode = 'markers',
#     marker=dict(
#         size='5',
#         color = pred, # color points by label they belong to
#         colorscale= [[0, '#dd2c4f'], [1, '#3d6fcc']],
#     ),
#     text = [str(x) for x in X]
# )

# data = [trace]

# layout = go.Layout(
#     title='<b>Hierarchical Clustering of MDS 5NN Embedded Data</b>',
# )

# fig = go.Figure(data=data, layout=layout)
# iplot(fig, filename='MDS-embedding')


Cluster


In [54]:
np.sum(pred==Y_real)

1539

In [56]:
1529/2102

0.7274024738344433

In [57]:
from matplotlib.colors import LogNorm
from sklearn.mixture import GaussianMixture as GM
import matplotlib.pyplot as plt

from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.cluster import KMeans as KM
from sklearn.cluster import SpectralClustering as sc


print('Cluster')
# fit a Gaussian Mixture Model with two components
clf = KM(n_clusters=2)#, covariance_type='full')

#clf = GM(n_components=2, covariance_type='full')
clf.fit(X_real)
pred = clf.fit_predict(X_real)
# # Create a trace
# trace = go.Scatter(
#     x = X_tr[:,0],
#     y = X_tr[:,1],
#     mode = 'markers',
#     marker=dict(
#         size='5',
#         color = pred, # color points by label they belong to
#         colorscale= [[0, '#dd2c4f'], [1, '#3d6fcc']],
#     ),
#     text = [str(x) for x in X]
# )

# data = [trace]

# layout = go.Layout(
#     title='<b>Hierarchical Clustering of MDS 5NN Embedded Data</b>',
# )

# fig = go.Figure(data=data, layout=layout)
# iplot(fig, filename='MDS-embedding')


Cluster


In [59]:
np.sum(pred==Y_real)

2102