Beer3 Positive Results
============

1. LDA
-------

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
basedir = '../'
sys.path.append(basedir)

from lda_for_fragments import Ms2Lda
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import sys

from sklearn import preprocessing
from sklearn.decomposition import PCA
import networkx as nx
from networkx.readwrite import json_graph

In [2]:
# --- LDA settings ------------------------------------------------------------
# All six values below are forwarded positionally to ms2lda.run_lda().
n_topics = 100
n_samples = 100

# run_lda() is also given n_burn, n_thin, alpha and beta, but the notebook never
# defined them, so running it top-to-bottom on a fresh kernel stopped with a
# NameError.
# NOTE(review): the values used for the original run are not recorded anywhere
# in this notebook. The ones below are placeholders -- confirm before relying
# on the results.
n_burn = 0               # presumably the number of burn-in samples -- TODO confirm
n_thin = 1               # presumably the thinning interval -- TODO confirm
alpha = 50.0 / n_topics  # presumably the document-topic Dirichlet prior -- TODO confirm
beta = 0.1               # presumably the topic-word Dirichlet prior -- TODO confirm

# --- Input files (paths are relative to the notebook's working directory) ------
fragment_filename = 'input/Beer_3_T10_POS_fragments.csv'
neutral_loss_filename = 'input/Beer_3_T10_POS_losses.csv'
mzdiff_filename = None  # no mzdiff file is supplied for this data set

ms1_filename = 'input/Beer_3_T10_POS_ms1.csv'
ms2_filename = 'input/Beer_3_T10_POS_ms2.csv'

In [3]:
# Build the Ms2Lda object from the configured input files, preprocess the data
# and run LDA on the result.
# NOTE(review): the output saved with this cell is
#   "TypeError: __init__() takes at most 7 arguments (8 given)"
# so one of the calls made here does not match the installed lda_for_fragments
# code. The traceback is not preserved, so which call is at fault is unknown --
# check the Ms2Lda / run_lda signatures before re-running.
ms2lda = Ms2Lda(
    fragment_filename,
    neutral_loss_filename,
    mzdiff_filename,
    ms1_filename,
    ms2_filename,
)
df = ms2lda.preprocess()
ms2lda.run_lda(
    df,
    n_topics, n_samples, n_burn, n_thin,
    alpha, beta,
    use_own_model=True,
    use_inline=False,
)


TypeError: __init__() takes at most 7 arguments (8 given)

In [None]:
# Write the results of the run via Ms2Lda.write_results.
# presumably the argument is a prefix/label for the output files -- verify
# against lda_for_fragments.
results_prefix = 'beer3_pos'
ms2lda.write_results(results_prefix)

In [None]:
# Print the n_top_frags highest-weighted columns of ms2lda.data for every row
# of the fitted model's topic_word_ matrix, highest weight first.
n_top_frags = 20
topic_fragments = ms2lda.model.topic_word_

# The column labels do not change between topics, so build the array once.
vocab = np.array(ms2lda.data.columns.values)

for i, topic_dist in enumerate(topic_fragments):
    # argsort is ascending: reverse it, then keep the first n_top_frags.
    # The previous slice [:-n_top_frags:-1] stopped one element early and
    # returned only n_top_frags - 1 entries (19 instead of 20).
    top_idx = np.argsort(topic_dist)[::-1][:n_top_frags]
    topic_f = vocab[top_idx]
    out_string = 'Topic {}: {}'.format(i, ', '.join(topic_f.astype('str')))
    print(out_string)

In [None]:
# Show how many log-likelihood values the model recorded, then plot them in
# the order they were stored.
# print() call form: identical output on Python 2, also valid on Python 3,
# and consistent with the print(...) calls used elsewhere in this notebook.
print(len(ms2lda.model.loglikelihoods_))
plt.plot(ms2lda.model.loglikelihoods_)
plt.xlabel('index in loglikelihoods_')
plt.ylabel('log-likelihood')

2. PCA
-------

Here we use PCA to project the vector of topics for each parent peak onto a lower-dimensional space for visualisation purposes. First, ensure that the variables are scaled.

In [None]:
# Transposed ms2lda.docdf.
# NOTE(review): this line used to be annotated "topics x documents matrix",
# but the network section annotates the same expression as "documents x topic"
# and the heatmap in section 4 labels rows 'Parent peaks' and columns 'Topics'.
# Presumably rows are parent peaks (documents) and columns are topics --
# confirm against the printed shape.
df = ms2lda.docdf.transpose()
# Alternative input kept from the original (fragments_topicdf is not defined
# in this notebook):
# df = fragments_topicdf.transpose() # topics x words matrix
print(df.shape)

# normalise and scale the variables: axis=0 standardises each column
# independently (zero mean, unit variance)
scaled_mat = preprocessing.scale(df, axis=0)
# Optional sanity check -- column means should be ~0 and standard deviations ~1:
# print(scaled_mat.mean(axis=0))
# print(scaled_mat.std(axis=0))

Then run PCA

In [None]:
# Fit a PCA with all components on the scaled matrix and project the same
# matrix onto the principal axes.
pca = PCA()
X_r = pca.fit(scaled_mat).transform(scaled_mat)

# Histogram of the scores along each of the first two principal components.
# One loop replaces the two copy-pasted plotting stanzas; the titles are
# unchanged.
for component, ordinal in enumerate(['First', 'Second']):
    plt.figure()
    plt.hist(X_r[:, component], bins=30)
    plt.title('%s transformed variable' % ordinal)
    plt.show()

# Column index of the input variable with the largest absolute loading on the
# first principal component.
# print() call form works on both Python 2 and Python 3.
print(np.argmax(np.abs(pca.components_[0, :])))

However, the proportion of variance explained by the first few principal components is very low (see the ratios printed below).

In [None]:
# Report the shape of the projected data, the total of the explained-variance
# ratios, and the per-component ratios.
# print() call form: same output on Python 2, also valid on Python 3, and
# consistent with the last line of this cell.
print(X_r.shape)
print(np.sum(pca.explained_variance_ratio_))
print('explained variance by the principal components: %s' % str(pca.explained_variance_ratio_))

In [None]:
# Scatter plot of the rows of X_r: first column on the x-axis, second column
# on the y-axis.
fig, ax = plt.subplots()
ax.scatter(X_r[:, 0], X_r[:, 1])
ax.set_xlabel('1st princomp')
ax.set_ylabel('2nd princomp')
ax.set_title('Projected parent peaks')

3. Network
------------

We can try to put the parent peaks on a network too.

In [None]:
# Alternative input kept from the original (topicdf is not defined in this
# notebook):
# df = topicdf.transpose() # topic x terms matrix
df = ms2lda.docdf.transpose()  # documents x topic matrix
# print() call form works on both Python 2 and Python 3.
print(df.shape)

In [None]:
# Create the adjacency matrix A from pairwise similarities between rows of df.
from scipy.spatial.distance import cdist

# Similarities below this value are treated as "no edge".
SIMILARITY_THRESHOLD = 0.75

# Euclidean distance between every pair of ROWS of df (cdist compares rows).
A = cdist(df, df, 'euclidean')
print(A.shape)

# Crudely convert distances to similarities in [0, 1]: 1 for identical rows,
# 0 for the most distant pair.
maxval = A.max()
if maxval > 0:
    A = 1 - (A / maxval)
else:
    # Every distance is zero (all rows identical): treat everything as
    # maximally similar instead of computing 0/0 = NaN.
    A = np.ones_like(A)

plt.figure()
# Flatten first: given a 2-D array, plt.hist treats every column as a separate
# data set, which is slow and is not a histogram of the matrix values.
plt.hist(A.ravel())
plt.title('Histogram of values in the adjacency matrix')
plt.show()

# Zero out every similarity below the threshold. The boolean mask replaces the
# Python-2-only nested xrange loops with the same result.
A[A < SIMILARITY_THRESHOLD] = 0

plt.figure()
plt.matshow(A)
plt.colorbar()
plt.title('Adjacency matrix after thresholding', y=1.2)
plt.show()

In [None]:
# Reinterpret A as a structured array with a single float field named 'len'.
# presumably this makes networkx store each matrix value as an edge attribute
# called 'len' -- confirm against the installed networkx version.
edge_dtype = [('len', float)]
A = A.view(edge_dtype)
G = nx.from_numpy_matrix(A)

# Spring layout with the same settings as before, then draw small unlabelled
# nodes.
layout_options = dict(k=0.01, iterations=20)
pos = nx.spring_layout(G, **layout_options)
nx.draw(G, pos, node_size=10, with_labels=False)

Now we see some connected components in the network graph. Below we print the 20 largest components.

Parent peaks in the same component are connected in the graph above, i.e. they form clusters of a sort, which may indicate that they share topics.

In [None]:
# Print the MS1 rows belonging to each of the largest connected components of G.
n_largest = 20

components = sorted(nx.connected_components(G), key=len, reverse=True)
for counter, comp in enumerate(components[:n_largest], 1):
    print("Component " + str(counter))
    print("==============")
    # from_numpy_matrix labels nodes 0 .. N-1, so the node labels can be used
    # directly as row positions. The previous "- 1" shifted every lookup by one
    # row and wrapped node 0 around to the last row.
    # sorted() also turns the component into a list when networkx yields a set,
    # and makes the printed row order deterministic.
    # NOTE(review): assumes row i of the matrix behind G corresponds to row i
    # of ms2lda.ms1 -- confirm.
    idx = np.array(sorted(comp), dtype=int)
    ms1_rows = ms2lda.ms1.iloc[idx]
    print(ms1_rows[['peakID', 'mz', 'rt', 'intensity']].to_string(index=False, justify='left'))
    # Blank separator line; print('') behaves the same on Python 2 and 3.
    print('')

4. Document-Topics Distribution
----------------------------------

Visualise the document-topic distributions

In [None]:
# Heatmap of the transposed ms2lda.docdf: matrix rows run along the y-axis and
# columns along the x-axis.
df = ms2lda.docdf.transpose()
# print() call form works on both Python 2 and Python 3.
print(df.shape)
plt.pcolor(df, norm=None, cmap='Blues')
plt.xlabel('Topics')
plt.ylabel('Parent peaks')
plt.title('Documents-topics distributions')
# tight_layout() only accounts for labels and titles that already exist, so it
# is called after they are set rather than before.
plt.tight_layout()
plt.show()

Nothing useful can be seen in this plot.