In [4]:
import warnings
from time import time

import lda
import numpy as np
import scipy
import scipy.sparse
from scipy.io import loadmat

In [5]:
# Read NIPS Dataset
# The file starts with three header lines (number of documents, vocabulary
# size, number of non-zero entries); every following line is a
# "docID wordID count" triple whose ids are 1-based.
with open('../data/docword.nips.txt', 'r') as df:
    num_docs = int(df.readline())
    num_words = int(df.readline())
    nnz = int(df.readline())

    # LIL format supports cheap incremental assignment while filling.
    X = scipy.sparse.lil_matrix((num_docs, num_words))

    num_entries = 0
    for line in df:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines, e.g. an empty line at end of file
        doc_id, word_id, count = (int(field) for field in fields)
        # An id of 0 would become index -1 and silently land in the last
        # row/column, so reject anything outside the declared 1-based range.
        if not (1 <= doc_id <= num_docs and 1 <= word_id <= num_words):
            raise ValueError(
                "Entry out of range: doc {} (max {}), word {} (max {})".format(
                    doc_id, num_docs, word_id, num_words))
        X[doc_id - 1, word_id - 1] = count
        num_entries += 1

# A mismatch with the header usually means a truncated or corrupted file.
if num_entries != nnz:
    warnings.warn("Header declares {} entries but {} were read.".format(
        nnz, num_entries))

# read NIPS vocabulary
with open('../data/vocab.nips.txt', 'r') as vf:
    vocab = tuple(vf.read().split())

# Column j of X is looked up as vocab[j] later, so the sizes must agree.
if len(vocab) != num_words:
    warnings.warn("Vocabulary has {} words but the matrix has {} columns.".format(
        len(vocab), num_words))

print("Vocabulary: {} words".format(len(vocab)))
print('Done reading NIPS dataset.')

Vocabulary: 12419 words
Done reading NIPS dataset.


In [6]:
# LDA to find topics
# Fit a 10-topic model on the document-term matrix, report the wall-clock
# fitting time, then print the highest-weighted words of each topic.
model = lda.LDA(n_topics=10, n_iter=1500, random_state=1)

print('Start fitting.')
fit_start = time()
# X is cast to integer counts before fitting.
model.fit(X.astype(int))  # model.fit_transform(X) is also available
fit_seconds = time() - fit_start
print("Done fitting in {:.1f} minutes.".format(fit_seconds/60))

topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8

# Convert the vocabulary once instead of once per topic.
vocab_array = np.array(vocab)

for i, topic_dist in enumerate(topic_word):
    # argsort is ascending; the reversed tail slice yields the indices of the
    # n_top_words largest weights, highest first.
    top_word_ids = np.argsort(topic_dist)[:-(n_top_words+1):-1]
    topic_words = vocab_array[top_word_ids]
    print('Topic {}: {}'.format(i+1, ' '.join(topic_words)))

Start fitting.
Done fitting in 5.9 minutes.
Topic 1: function algorithm vector problem learning point method space
Topic 2: network input unit weight output training learning layer
Topic 3: model data distribution gaussian probability parameter likelihood density
Topic 4: circuit signal analog chip neural output system input
Topic 5: image object images model visual motion representation field
Topic 6: learning action algorithm policy reinforcement control system task
Topic 7: network model system neural dynamic control recurrent learning
Topic 8: function bound set algorithm theorem number examples learning
Topic 9: neuron cell model input activity pattern synaptic response
Topic 10: training recognition set speech data system word classifier
