# Topic Modeling
We'll use Gensim to model the topics in a corpus of forum posts from a range of different forums.

In [22]:
from gensim import models, similarities
from gensim.corpora.dictionary import Dictionary
# Plus a few other assorted inputs.
import numpy as np
from sklearn.datasets import fetch_20newsgroups

## Preprocessing Textual Corpora

In [23]:
# We'll use a dataset of posts from 20 distinct Usenet forums.
texts = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))   # Strip metadata that makes the problem too easy.


In [24]:
# We'd typically start by tokenizing the data 
from gensim.utils import tokenize

tokenized_texts = [list(tokenize(text)) for text in texts.data]
tokenized_texts[3][:10]  # Print a random example.

['Do', 'you', 'have', 'Weitek', 's', 'address', 'phone', 'number', 'I', 'd']

In [25]:
# And then stem all words.
from gensim.parsing.porter import PorterStemmer

stemmer = PorterStemmer()
stemmed_texts = [[stemmer.stem(word) for word in text] for text in tokenized_texts]
stemmed_texts[3][:10]

['do', 'you', 'have', 'weitek', 's', 'address', 'phone', 'number', 'i', 'd']

In [26]:
# While this gets us useful data by itself, we'll instead lemmatize the documents
# using another library. We'll need to do a bit of setup to make this work.
!pip install pattern
import nltk
nltk.download('omw-1.4')
# from gensim.utils import lemmatize
from pattern.en import lemma
# from nltk.stem.wordnet import WordNetLemmatizer

# lemmatizer = WordNetLemmatizer()
# docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Erik\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [45]:
# Here we start from full sentences again to help the NLP library understand the types of words. 
# Note: this can take quite a while!
# print(texts.data[0])
# print(lemma(texts.data[0]))  # For some reason I occasionally have to run this several times before things start working.
lemmatized_texts = [[list(lemma(word)) for word in text.split()] for text in texts.data]

print(lemmatized_texts[1])
# lemmatized_texts[3][:10]  # Print a random example.

# from nltk.stem.wordnet import WordNetLemmatizer

# nltk.download('wordnet')

# lemmatizer = WordNetLemmatizer()
# lemmatized_texts = [list(lemmatizer.lemmatize(text)) for text in texts.data]
# lemmatized_texts[3]  # Print a random example.


[['a'], ['f', 'a', 'i', 'r'], ['n', 'u', 'm', 'b', 'e', 'r'], ['o', 'f'], ['b', 'r', 'a', 'v', 'e'], ['s', 'o', 'u', 'l'], ['w', 'h', 'o'], ['u', 'p', 'g', 'r', 'a', 'd', 'e'], ['t', 'h', 'e', 'i', 'r'], ['s', 'i'], ['c', 'l', 'o', 'c', 'k'], ['o', 's', 'c', 'i', 'l', 'l', 'a', 't', 'o', 'r'], ['h', 'a', 'v', 'e'], ['s', 'h', 'a', 'r', 'e'], ['t', 'h', 'e', 'i', 'r'], ['e', 'x', 'p', 'e', 'r', 'i', 'e', 'n', 'c', 'e'], ['f', 'o', 'r'], ['t', 'h', 'i'], ['p', 'o', 'l', 'l', '.'], ['p', 'l', 'e', 'a', 's', 'e'], ['s', 'e', 'n', 'd'], ['a'], ['b', 'r', 'i', 'e', 'f'], ['m', 'e', 's', 's', 'a', 'g', 'e'], ['d', 'e', 't', 'a', 'i', 'l'], ['y', 'o', 'u', 'r'], ['e', 'x', 'p', 'e', 'r', 'i', 'e', 'n', 'c', 'e'], ['w', 'i', 't', 'h'], ['t', 'h', 'e'], ['p', 'r', 'o', 'c', 'e', 'd', 'u', 'r', 'e', '.'], ['t', 'o', 'p'], ['s', 'p', 'e', 'e', 'd'], ['a', 't', 't', 'a', 'i', 'n', 'e', 'd', ','], ['c', 'p', 'u'], ['r', 'a', 't', 'e'], ['s', 'p', 'e', 'e', 'd', ','], ['a', 'd', 'd'], ['o', 'n'], ['c

In [40]:
# Create a corpus from a list of texts.
# The dictionary just extracts and numbers each distinct word.
dictionary = Dictionary(lemmatized_texts)
# A corpus is a sparse datastore containing the number of times each word appears in each document.
corpus = [dictionary.doc2bow(text) for text in lemmatized_texts]

In [41]:
# Print a sample of dictionary items.
top_words_in_doc_0 = sorted(corpus[0], key=lambda e: e[1], reverse=True)[:10]
for word_index, count in top_words_in_doc_0:
  print(f'{dictionary[word_index]}   \tindex: {word_index}\tcount: {count:,}')

i be wonder if anyone out there can enlighten me on thi car i see the other day. it be a 2-door sport car, look to be from the late 60s/ early 70s. it be call a bricklin. the door be really small. in addition, the front bumper be separate from the rest of the body. thi be all i know. if anyone can tellme a model name, engine specs, year of production, where thi car be made, history, or whatever info you have on thi funky look car, please e-mail.   	index: 0	count: 1


## Models Trained on Text

In [13]:
# Let's start with a simple TF-IDF model. Worth noting: SKLearn also has a Tfidf vectorizer.
model = models.TfidfModel(corpus)

In [14]:
# What are the most _salient_ words in document 0?
top_tfidf_words = sorted(model[corpus[0]], key=lambda e: e[1], reverse=True)[:10]
for word_index, score in top_tfidf_words:
  print(f'{dictionary[word_index]}   \tindex: {word_index}\ttf-idf: {score:.2f}')

# What's different from the previous ranking? What do you think the topic label might be?

    	index: 1	tf-idf: 0.39
0   	index: 6	tf-idf: 0.31
7   	index: 9	tf-idf: 0.26
/   	index: 5	tf-idf: 0.23
e   	index: 17	tf-idf: 0.23
,   	index: 2	tf-idf: 0.23
6   	index: 8	tf-idf: 0.22
I   	index: 11	tf-idf: 0.22
o   	index: 26	tf-idf: 0.19
a   	index: 13	tf-idf: 0.18


In [None]:
print(texts.target_names[texts.target[0]])  # 7 = 'rec.autos'

In [None]:
# We could conver this to a giant vector spanning all documents and words.
# It'll be mostly zeros, of course.
tfidf_vector = np.zeros([len(texts.target), len(dictionary)])
for doc_idx, doc in enumerate(corpus):
  for word_ix, word_tfidf in model[doc]:
    tfidf_vector[doc_idx, word_ix] = word_tfidf
tfidf_vector.shape  # How many entries are there in this vector? What % do you guess is zero?

In [None]:
print(f'Percentage of 0 entries: {(tfidf_vector == 0).sum() / tfidf_vector.size:.3%}')

In [None]:
# Then we can train a model that predicts the topic from these values.
# A quick tangent: we'd normally use sklearn.feature_extraction.text.TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=.01)
clf.fit(tfidf_vector, texts.target)

In [None]:
# Let's index our test data in the same way and test the model.
test_texts, test_targets = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), return_X_y=True)
# We'll save ourselves a bit of time by focusing on 1K documents.
test_texts = test_texts[:1000]
test_targets = test_targets[:1000]

# Lemmatize text, index with the same dictionary, and create a TF-IDF vector.
test_lemmatized = [list(lemmatize(text)) for text in test_texts]
test_corpus = [dictionary.doc2bow(text) for text in test_lemmatized]
test_tfidf_vector = np.zeros([len(test_targets), len(dictionary)])
for doc_idx, doc in enumerate(test_corpus):
  for word_ix, word_tfidf in model[doc]:
    test_tfidf_vector[doc_idx, word_ix] = word_tfidf

print(f'Accuracy: {clf.score(test_tfidf_vector, test_targets):.2%}')

In [None]:
# What are the most important features for each category?
for ix, category in enumerate(texts.target_names):
    top10 = np.argsort(clf.coef_[ix])[-10:]
    print(f'{category}:   \t{" ".join([dictionary[index] for index in top10])}')
# Pretty reasonable for the most part, but note the first few in almost every category.

In [None]:
# Okay, let's do something more exciting: build an LDA model.
# Note: we could also do this with SKLearn using LinearDiscriminantAnalysis.

# Let's start with 20 topics.
num_topics = 20
model = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=4, dtype=np.float64)

In [None]:
# Let's again look at the most salient words per topic, but note that we no longer
# have "labels" now.
for ix in range(num_topics):
  top10 = np.argsort(model.get_topics()[ix])[-10:]
  print(f'{ix}:  {" ".join([dictionary[index] for index in top10])}')  # See any patterns?

In [None]:
# Is this useful? One thing you can now do is find relevant neighbors for a given document
def topics_to_vector(topics):
  """Little util method for creating vectors."""
  vector = np.zeros(num_topics)
  for index, score in topics:
    vector[index] = score
  return vector

# Print the first document with its category.
print('Label:', texts.target_names[texts.target[0]])
print(texts.data[0])
print('\n\n')

# Conver to a vector and find nearest neighbors.
min_dist = 1e9
nearest_neighbor = -1
topic_vector = topics_to_vector(model.get_document_topics(corpus[0]))
print(topic_vector)
for ix, doc in enumerate(corpus[1:], start=1):
  doc_vector = topics_to_vector(model.get_document_topics(doc))
  dist = np.linalg.norm(topic_vector - doc_vector)
  if dist < min_dist:
    min_dist = dist
    nearest_neighbor = ix
    print(f'New nearest neighbor: {ix} with distance {dist:.4f} and label {texts.target_names[texts.target[ix]]}')

print('\n\nNearest neighbor: ')
print('Label:', texts.target_names[texts.target[nearest_neighbor]])
print(texts.data[nearest_neighbor])
print(topics_to_vector(model.get_document_topics(corpus[nearest_neighbor])))

In [None]:
# How coherent are these topics?
from gensim.models.coherencemodel import CoherenceModel

cm = CoherenceModel(model=model, texts=lemmatized_texts, dictionary=dictionary, coherence='u_mass')
coherence = cm.get_coherence()
coherence

In [None]:
# How about HAC?
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=20, affinity='euclidean', linkage='single', compute_distances=True)
cluster.fit(tfidf_vector[:2000])  # Note: this takes a very long time, so just using a small subset.

In [None]:
# Let's visualize the clusters.
# Using code from https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html#sphx-glr-auto-examples-cluster-plot-agglomerative-dendrogram-py
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram


def plot_dendrogram(model, **kwargs):
    """Create linkage matrix and then plot the dendrogram."""

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

# setting distance_threshold=0 ensures we compute the full tree.
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(cluster, truncate_mode="level", p=20)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
# Not very informative; might just require more samples.