# Data

In [1]:
# Download the corpus of 1740 NIPS papers used in the tutorial.
# Each document is a Unicode string; when running LDA on your own corpus,
# make sure it is likewise provided as a list of Unicode strings.
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the text of every paper stored in the tarball at ``url``.

    The archive is opened in binary mode through ``smart_open`` and read with
    ``tarfile``. Only regular files whose member name matches
    ``nipstxt/nips<digits>/<digits>.txt`` are yielded; every other member is
    skipped. Each document is decoded as UTF-8, with undecodable bytes
    replaced instead of raising.

    This is a generator: nothing is opened until iteration starts.
    """
    paper_path = re.compile(r'nipstxt/nips\d+/\d+\.txt')
    with smart_open.open(url, "rb") as stream, tarfile.open(fileobj=stream) as archive:
        for entry in archive.getmembers():
            # Skip directories, links and other non-regular members.
            if not entry.isfile():
                continue
            # Skip files that are not one of the per-paper text files.
            if paper_path.search(entry.name) is None:
                continue
            raw = archive.extractfile(entry).read()
            yield raw.decode('utf-8', errors='replace')

# Materialize the generator into a list so later cells can index it and iterate it repeatedly.
docs = list(extract_documents())

In [2]:
# Sanity check: number of documents, then the first 500 characters of the first document.
print(len(docs))
print(docs[0][:500])

1740
387 
Neural Net and Traditional Classifiers  
William Y. Huang and Richard P. Lippmann 
MIT Lincoln Laboratory 
Lexington, MA 02173, USA 
Abstract
Previous work on nets with continuous-valued inputs led to generative 
procedures to construct convex decision regions with two-layer percepttons (one hidden 
layer) and arbitrary decision regions with three-layer percepttons (two hidden layers). 
Here we demonstrate that two-layer perceptton classifiers trained with back propagation 
can form both c


# Pre-processing and vectorizing documents

## Tokenization

In [3]:
# We use a regular expression tokenizer from NLTK, then filter out
# numeric tokens and single-character tokens

# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Lowercase each document, split it into \w+ tokens, and keep only the tokens
# that are longer than one character and not purely numeric (tokens that
# merely contain digits are kept). Each entry of docs becomes a list of tokens.
tokenizer = RegexpTokenizer(r'\w+')
docs = [
    [
        token
        for token in tokenizer.tokenize(text.lower())
        if len(token) > 1 and not token.isnumeric()
    ]
    for text in docs
]

## Lemmatizing tokens

In [4]:
# Lemmatization is the process of grouping together different inflected forms of the
# same word. Used instead of stemming, which removes characters (suffixes/inflections)
# from the end of a word, because lemmatizing produces more readable output (which
# is important for topic modeling).

# Lemmatize documents.
from nltk.stem.wordnet import WordNetLemmatizer

# Replace every token with the lemma WordNet returns for it (called with the
# token only, so the lemmatizer's default settings apply).
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

## Compute bigrams

In [5]:
# Bigrams are pairs of adjacent words (e.g. machine_learning). When we find
# bigrams, we add them to the original data because we want to keep the individual
# words (machine and learning) in the dataset.

# Compute bigrams.
from gensim.models import Phrases

# Learn bigrams from the corpus (only ones that appear 20 times or more) and
# append each detected bigram to its document; the original unigrams stay.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # A token containing '_' is treated as a bigram (e.g. machine_learning).
    # NOTE(review): the \w+ tokenizer also admits unigrams that already contain
    # '_'; those are re-appended here as well — confirm that is acceptable.
    # Collect first, then extend, so the document is not grown while it is
    # being transformed.
    detected = [token for token in bigram[doc] if '_' in token]
    doc.extend(detected)

In [6]:
# We also remove rare words (words that appear in fewer than 20 documents) and common
# words (words that appear in more than 50% of the documents)

# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
# The call's return value is not used; the pruned vocabulary is read back from `dictionary` itself.
dictionary.filter_extremes(no_below=20, no_above=0.5)

## Bag-of-words representation

In [7]:
# We transform the documents to a vectorized form; compute the frequency of each
# word, including bigrams

# Convert every tokenized document into its bag-of-words vector, using the
# filtered dictionary; one entry per document, in the same order as docs.
corpus = list(map(dictionary.doc2bow, docs))

In [8]:
# Report the vocabulary size left after filtering and the number of vectorized documents.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 8644
Number of documents: 1740


# Training

In [10]:
# Enable logging to see the progress of training.
# INFO level lets the per-pass progress records through; each record is
# rendered as "<timestamp> : <level> : <message>".
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [11]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000  # Number of docs processed at a time (more than the 1740-doc corpus, so one chunk per pass)
passes = 20  # How often we train the model on the entire corpus
iterations = 400  # How often we repeat a particular loop over each doc
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed: LDA training is stochastic, so pin it for reproducible topics.

# Make an index to word dictionary.
# dictionary.id2token is only populated lazily; indexing the dictionary once
# forces it to be built before we hand it to the model.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# alpha='auto' and eta='auto' let the model learn both priors from the data.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

2023-06-15 14:20:57,682 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2023-06-15 14:20:57,694 : INFO : using serial LDA version on this node
2023-06-15 14:20:57,701 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1740 documents, updating model once every 1740 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2023-06-15 14:20:57,715 : INFO : PROGRESS: pass 0, at document #1740/1740
2023-06-15 14:21:07,741 : INFO : optimized alpha [0.0625251, 0.08549464, 0.09212624, 0.07895554, 0.075591445, 0.054502595, 0.08670515, 0.07642625, 0.07267764, 0.0831401]
2023-06-15 14:21:07,747 : INFO : topic #5 (0.055): 0.013*"image" + 0.004*"layer" + 0.004*"hidden" + 0.003*"noise" + 0.003*"recognition" + 0.003*"net" + 0.003*"map" + 0.003*"class" + 0.003*"cell" + 0.003*"loss"
2023-06-15 14:21:07,748 : INFO : topic #0 (0.063): 0.012*"neuron" + 0.003*"

2023-06-15 14:21:28,054 : INFO : topic #1 (0.050): 0.008*"hidden" + 0.005*"gradient" + 0.005*"generalization" + 0.005*"hidden_unit" + 0.005*"noise" + 0.004*"dynamic" + 0.004*"solution" + 0.004*"field" + 0.003*"recurrent" + 0.003*"layer"
2023-06-15 14:21:28,055 : INFO : topic #6 (0.053): 0.008*"rule" + 0.004*"class" + 0.003*"bound" + 0.003*"optimal" + 0.003*"let" + 0.003*"layer" + 0.003*"decision" + 0.003*"control" + 0.003*"theorem" + 0.003*"sample"
2023-06-15 14:21:28,055 : INFO : topic #3 (0.056): 0.006*"gaussian" + 0.006*"mixture" + 0.005*"density" + 0.005*"likelihood" + 0.004*"matrix" + 0.004*"prior" + 0.004*"estimate" + 0.004*"approximation" + 0.004*"component" + 0.004*"bayesian"
2023-06-15 14:21:28,055 : INFO : topic diff=0.195084, rho=0.377964
2023-06-15 14:21:28,060 : INFO : PROGRESS: pass 6, at document #1740/1740
2023-06-15 14:21:31,064 : INFO : optimized alpha [0.039382346, 0.048831232, 0.045547783, 0.055046152, 0.04074983, 0.038806535, 0.051917735, 0.036081027, 0.048680328, 

2023-06-15 14:21:44,834 : INFO : topic #7 (0.033): 0.017*"speech" + 0.013*"word" + 0.012*"recognition" + 0.008*"signal" + 0.006*"context" + 0.006*"speaker" + 0.005*"frame" + 0.005*"eye" + 0.005*"cluster" + 0.005*"sequence"
2023-06-15 14:21:44,835 : INFO : topic #4 (0.036): 0.013*"circuit" + 0.011*"chip" + 0.009*"analog" + 0.008*"neuron" + 0.007*"node" + 0.007*"voltage" + 0.006*"signal" + 0.006*"bit" + 0.005*"layer" + 0.005*"vlsi"
2023-06-15 14:21:44,835 : INFO : topic #1 (0.049): 0.010*"hidden" + 0.007*"gradient" + 0.006*"hidden_unit" + 0.006*"generalization" + 0.006*"noise" + 0.004*"solution" + 0.004*"recurrent" + 0.004*"layer" + 0.004*"minimum" + 0.004*"dynamic"
2023-06-15 14:21:44,836 : INFO : topic #6 (0.050): 0.008*"rule" + 0.005*"class" + 0.005*"bound" + 0.005*"decision" + 0.004*"let" + 0.004*"theorem" + 0.004*"node" + 0.004*"tree" + 0.004*"optimal" + 0.004*"sample"
2023-06-15 14:21:44,836 : INFO : topic #3 (0.056): 0.007*"gaussian" + 0.007*"mixture" + 0.006*"density" + 0.005*"li

2023-06-15 14:21:58,628 : INFO : topic diff=0.115364, rho=0.235702
2023-06-15 14:21:58,632 : INFO : PROGRESS: pass 17, at document #1740/1740
2023-06-15 14:22:01,681 : INFO : optimized alpha [0.039316103, 0.052661013, 0.047532573, 0.059529666, 0.035384603, 0.039379288, 0.050718367, 0.032386933, 0.04950802, 0.042188756]
2023-06-15 14:22:01,686 : INFO : topic #7 (0.032): 0.019*"speech" + 0.016*"word" + 0.014*"recognition" + 0.009*"signal" + 0.007*"context" + 0.007*"speaker" + 0.006*"frame" + 0.006*"sequence" + 0.005*"acoustic" + 0.005*"hmm"
2023-06-15 14:22:01,686 : INFO : topic #4 (0.035): 0.015*"circuit" + 0.012*"chip" + 0.010*"analog" + 0.009*"neuron" + 0.007*"voltage" + 0.007*"node" + 0.006*"bit" + 0.006*"signal" + 0.006*"vlsi" + 0.005*"implementation"
2023-06-15 14:22:01,687 : INFO : topic #6 (0.051): 0.006*"rule" + 0.006*"class" + 0.006*"bound" + 0.005*"decision" + 0.005*"let" + 0.004*"theorem" + 0.004*"tree" + 0.004*"node" + 0.004*"policy" + 0.004*"sample"
2023-06-15 14:22:01,687 

In [12]:
from pprint import pprint

# Each entry of top_topics is a (word list, coherence score) pair.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of the coherence scores of all topics,
# divided by the number of topics.
coherence_total = sum(coherence for _words, coherence in top_topics)
avg_topic_coherence = coherence_total / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

2023-06-15 14:23:34,721 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -1.1684.
[([(0.016905755, 'cell'),
   (0.014687324, 'neuron'),
   (0.009116583, 'response'),
   (0.008101081, 'stimulus'),
   (0.0069811973, 'visual'),
   (0.0067367177, 'spike'),
   (0.0066651385, 'activity'),
   (0.005324701, 'signal'),
   (0.0050893207, 'frequency'),
   (0.004930853, 'synaptic'),
   (0.0049284077, 'firing'),
   (0.004896282, 'field'),
   (0.004882054, 'cortex'),
   (0.0044669765, 'motion'),
   (0.004450724, 'direction'),
   (0.0041271923, 'orientation'),
   (0.0038171958, 'connection'),
   (0.0038136553, 'spatial'),
   (0.0038042122, 'fig'),
   (0.00377497, 'cortical')],
  -0.968588005382116),
 ([(0.0077499007, 'gaussian'),
   (0.0069311652, 'mixture'),
   (0.0061188852, 'density'),
   (0.005852694, 'matrix'),
   (0.0056930874, 'likelihood'),
   (0.0051899254, 'estimate'),
   (0.0051225196, 'component'),
   (0.0051025506, 'prior'),
   (0.004516479, 'bayesian'),
   (0.004426193, 'log'),
   (0.004379986, 'approximation'),
   (0.004174605, 'sam