Example Notebook
============

In [1]:
# Notebook setup: enable automatic module reloading and inline matplotlib
# figures. Comments are kept on their own lines because a line magic treats
# the rest of its line as arguments.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
# Add the parent directory to the import path. `basedir` is also reused
# below as the prefix for the input file paths.
# NOTE(review): presumably lda_for_fragments and visualisation live in this
# directory -- confirm.
basedir = '../'
sys.path.append(basedir)

from lda_for_fragments import Ms2Lda
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import sys
import visualisation.pyLDAvis as pyLDAvis

In [2]:
# Run configuration: sampler settings, LDA hyper-parameters and input files.
n_topics = 400 # number of topics, 400 seems to be good from cross-validation
n_samples = 200 # how many samples to get during Gibbs sampling

n_burn = 0 # no. of burn-in samples to discard, unused
n_thin = 1 # thinning parameter, unused

# follow the recommendation from Griffiths & Steyvers (alpha = 50/K, beta = 0.1)
alpha = 50.0/n_topics # hyper-parameter for document-topic distributions
beta = 0.1 # hyper-parameter for topic-word distributions

# Input tables, all resolved relative to basedir.
fragment_filename = basedir + 'input/relative_intensities/Beer_3_T10_POS_fragments_rel.csv'
neutral_loss_filename = basedir + 'input/relative_intensities/Beer_3_T10_POS_losses_rel.csv'
mzdiff_filename = None # no m/z-difference file is supplied in this run

ms1_filename = basedir + 'input/relative_intensities/Beer_3_T10_POS_ms1_rel.csv'
ms2_filename = basedir + 'input/relative_intensities/Beer_3_T10_POS_ms2_rel.csv'

In [None]:
# Build the Ms2Lda object from the input files configured above
# (mzdiff_filename is None, so no m/z-difference table is passed).
ms2lda = Ms2Lda(
    fragment_filename,
    neutral_loss_filename,
    mzdiff_filename,
    ms1_filename,
    ms2_filename,
    relative_intensity=True
)

# preprocess() returns the (df, vocab) pair that run_lda() takes later on.
df, vocab = ms2lda.preprocess()

Data shape (856, 1664)


In [None]:
# Fit the LDA model with the sampler settings and hyper-parameters from the
# configuration cell. This is the long-running step of the notebook.
# NOTE(review): the captured log reports 'CGS LDA' and 'Using Numba', which
# presumably correspond to use_own_model / use_native -- confirm in run_lda.
ms2lda.run_lda(
    df,
    vocab,
    n_topics,
    n_samples,
    n_burn,
    n_thin,
    alpha,
    beta,
    use_own_model=True,
    use_native=True
)

Fitting model...
CGS LDA initialising
......................................................................................
Using Numba for LDA sampling
Preparing words
Preparing Z matrix
DONE
Sample 1   Log joint likelihood = -3290884.011 
Sample 2   Log joint likelihood = -2101706.039 
Sample 3   Log joint likelihood = -1558905.402 
Sample 4   Log joint likelihood = -1388784.302 
Sample 5   Log joint likelihood = -1316030.825 
Sample 6   Log joint likelihood = -1276246.474 
Sample 7   Log joint likelihood = -1253508.982 
Sample 8   Log joint likelihood = -1236643.844 
Sample 9   Log joint likelihood = -1221421.259 
Sample 10   Log joint likelihood = -1209065.510 
Sample 11   Log joint likelihood = -1200262.033 
Sample 12   Log joint likelihood = -1191446.933 
Sample 13   Log joint likelihood = -1183412.672 
Sample 14   Log joint likelihood = -1178194.408 
Sample 15   Log joint likelihood = -1172773.213 
Sample 16   Log joint likelihood = -1168721.917 
Sample 17   Log joint likelihoo

In [None]:
# Save the results of the run, tagged with the name 'beer3_pos_rel'.
# NOTE(review): which files are written, and where, is decided inside
# write_results -- confirm before relying on a particular location.
ms2lda.write_results('beer3_pos_rel')

2. LDAVis
----------

Visualise topics using LDAVis. First prepare the data to display.

In [None]:
# Collect the inputs for LDAvis in one dict. The keys are later expanded as
# keyword arguments to pyLDAvis.prepare(**data), so they must stay as-is.
# NOTE(review): `cd` and `ckn` are presumably the per-document counts and the
# topic-by-word count matrix (so the axis-0 sum gives per-term totals) -- confirm.
data = {
    'topic_term_dists': ms2lda.model.topic_word_,
    'doc_topic_dists': ms2lda.model.doc_topic_,
    'doc_lengths': ms2lda.model.cd,
    'vocab': ms2lda.model.vocab,
    'term_frequency': np.sum(ms2lda.model.ckn, axis=0),
}

In [None]:
# Sanity check: shape of the axis-0 sum of the model's `ckn` counts (the same
# expression stored as data['term_frequency']).
# Uses the call form of print so the cell parses on Python 3 as well; with a
# single argument the output on Python 2 is unchanged.
print(np.sum(ms2lda.model.ckn, axis=0).shape)

Check that the prepared data has the expected types and shapes before passing it to LDAvis.

In [None]:
# Sanity checks on the LDAvis inputs. Every print uses the call form with a
# single argument, which prints identically on Python 2 and also parses on
# Python 3 (the original mixed both forms).
print('Topic-Term shape: %s' % str(data['topic_term_dists'].shape))
print('Doc-Topic shape: %s' % str(data['doc_topic_dists'].shape))

# Number of documents and the distribution of their lengths.
print(len(data['doc_lengths']))
plt.hist(data['doc_lengths'])

# Container type of the vocabulary and its first two entries.
print(type(data['vocab']))
print(data['vocab'][0])
print(data['vocab'][1])

# Raw shapes of every array handed to LDAvis.
print(data['topic_term_dists'].shape)
print(data['doc_topic_dists'].shape)
print(data['doc_lengths'].shape)
print(data['term_frequency'].shape)

In [None]:
# Expand the dict into keyword arguments: its keys must match the parameter
# names of prepare(). `pyLDAvis` here is the project's visualisation.pyLDAvis.
vis_data = pyLDAvis.prepare(**data)

In [None]:
# Display the prepared visualisation.
# NOTE(review): show() may start a local server and block the kernel until
# interrupted -- confirm against visualisation.pyLDAvis.
pyLDAvis.show(vis_data)

In [None]:
# Print the words associated with the fitted topics.
# NOTE(review): how many words per topic, and in what order, is decided by
# print_topic_words -- confirm.
ms2lda.model.print_topic_words()

3. Shared Fragments within a Topic
--------------------------------------

For every topic, we visualise the interesting 'words' shared across its documents. Topics are listed below in order of their h-index. The consistency score (0.50 in the example below) is then used to highlight interesting peaks in the plot: a score of 0.50 means the word (fragment or loss) is present in at least 50% of the selected parent peaks for this topic.

In [None]:
# Plotting is left commented out; uncomment ONE of the calls below to run it.
# The two calls differ only in the sort key ("h_index" or "in_degree").
# ms2lda.plot_lda_fragments(consistency=0.50, sort_by="h_index")
# ms2lda.plot_lda_fragments(consistency=0.50, sort_by="in_degree")