## Topic modelling with scikit-learn and gensim

In [1]:
from nltk.corpus import brown

In [3]:
# Flatten every Brown corpus file into one whitespace-joined string.
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[0])

500


### Using LDA and LSI to model topics - gensim

In [6]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

In [7]:
# Number of latent topics each gensim model is asked to extract.
NUM_TOPICS = 10
# Kept as a frozenset so the per-token membership test in clean_text is a
# constant-time lookup instead of a linear scan over the stopword list.
STOPWORDS = frozenset(stopwords.words('english'))

In [8]:
def clean_text(text):
    """Tokenize ``text`` and keep only the tokens used for topic modelling.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    when it is not in ``STOPWORDS`` and starts with at least three characters
    drawn from ASCII letters and hyphens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Raw string: in a plain literal ``\-`` is an invalid escape sequence and
    # raises a DeprecationWarning/SyntaxWarning on recent Python versions.
    # The regular expression itself is unchanged.
    token_pattern = r'[a-zA-Z\-][a-zA-Z\-]{2,}'
    # re.match only anchors at the start of the token, so the pattern checks
    # the token's first three characters, not the whole token.
    return [
        token
        for token in word_tokenize(text.lower())
        if token not in STOPWORDS and re.match(token_pattern, token)
    ]

In [9]:
# gensim expects pre-tokenized documents, so every raw text is passed through
# clean_text (tokenization plus stopword filtering).
# The result is a list of token lists, one inner list per document.
tokenized_data = [clean_text(text) for text in data]

In [10]:
# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(tokenized_data)

# Encode each tokenized document as a bag-of-words vector.
corpus = list(map(dictionary.doc2bow, tokenized_data))

# Inspect the document at index 20: a list of (word_id, count) pairs.
print(corpus[20])

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [11]:
# Build the LDA model.
# LDA training is stochastic; a fixed seed keeps the learned topics (and the
# printed output below) reproducible across kernel restarts.
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS,
                            id2word=dictionary, random_state=42)

# Build the LSI model on the same bag-of-words corpus and vocabulary.
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [12]:
# Show every topic of both gensim models, each rendered with its 10
# highest-weighted words.
for heading, model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(heading)

    for topic_id in range(NUM_TOPICS):
        # The second argument of print_topic limits the output to 10 words.
        print("Topic #%s:" % topic_id, model.print_topic(topic_id, 10))
        print("=" * 20)

LDA Model:
Topic #0: 0.005*"one" + 0.005*"would" + 0.004*"could" + 0.004*"may" + 0.004*"said" + 0.003*"new" + 0.003*"two" + 0.003*"first" + 0.002*"even" + 0.002*"made"
Topic #1: 0.006*"one" + 0.005*"would" + 0.004*"said" + 0.004*"could" + 0.003*"time" + 0.003*"first" + 0.003*"may" + 0.003*"even" + 0.002*"also" + 0.002*"new"
Topic #2: 0.007*"one" + 0.004*"would" + 0.004*"new" + 0.003*"could" + 0.003*"man" + 0.003*"two" + 0.003*"first" + 0.003*"time" + 0.003*"state" + 0.003*"said"
Topic #3: 0.007*"would" + 0.005*"one" + 0.003*"new" + 0.003*"may" + 0.003*"could" + 0.002*"said" + 0.002*"also" + 0.002*"man" + 0.002*"made" + 0.002*"time"
Topic #4: 0.007*"one" + 0.005*"would" + 0.004*"said" + 0.003*"time" + 0.003*"could" + 0.003*"like" + 0.002*"made" + 0.002*"also" + 0.002*"even" + 0.002*"new"
Topic #5: 0.006*"one" + 0.005*"would" + 0.004*"said" + 0.004*"time" + 0.003*"could" + 0.003*"like" + 0.002*"first" + 0.002*"new" + 0.002*"work" + 0.002*"even"
Topic #6: 0.005*"would" + 0.004*"one" + 0.0

In [14]:
# Project a short, unseen sentence into the LSI topic space.
text = "The economy is working better than ever"
query_tokens = clean_text(text)
bow = dictionary.doc2bow(query_tokens)

print(lsi_model[bow])

[(0, 0.09161249216492842), (1, -0.008641202453661087), (2, 0.01626542542902395), (3, -0.04114231734630266), (4, 0.016083720176433437), (5, 0.009292099074032242), (6, -0.030090038439968284), (7, -0.019053079559659625), (8, -0.057151468944598514), (9, 0.021854497450139867)]


In [15]:
# Topic weights of the same bag-of-words vector under the LDA model.
lda_topic_weights = lda_model[bow]
print(lda_topic_weights)

[(0, 0.020006863), (1, 0.020006163), (2, 0.020006515), (3, 0.020007012), (4, 0.81994313), (5, 0.020006055), (6, 0.020005455), (7, 0.020006636), (8, 0.02000676), (9, 0.020005416)]


## Using scikit-learn to do topic modelling

In [16]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
NUM_TOPICS = 10

# Bag-of-words counts over the raw documents. Terms that occur in fewer than
# 5 documents or in more than 90% of them are dropped, along with English
# stop words.
# The token pattern is a raw string: in a plain literal ``\-`` is an invalid
# escape sequence and raises a DeprecationWarning/SyntaxWarning on recent
# Python versions. The regular expression itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

In [18]:
# Build a Latent Dirichlet Allocation Model.
# `n_components` replaces the `n_topics` keyword, which was deprecated in
# scikit-learn 0.19 and removed in 0.21. The fixed random_state keeps the
# fitted topics reproducible across runs.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online', random_state=42)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS, random_state=42)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)



(500, 10)
(500, 10)
(500, 10)


In [19]:
# Compare how the first document (index 0) is represented in each of the
# three topic spaces: LDA, NMF and LSI, in that order.
for doc_topic_matrix in (lda_Z, nmf_Z, lsi_Z):
    print(doc_topic_matrix[0])

[1.05613919e-04 1.05612254e-04 9.88134076e-01 1.05599257e-04
 1.05597135e-04 1.05622652e-04 1.05614138e-04 1.10210368e-02
 1.05613248e-04 1.05614948e-04]
[0.         0.         2.11382321 0.07696888 0.         0.54368194
 1.06699232 0.         0.         0.24523941]
[ 23.30684292   1.59477481  21.79958931  -0.05622958   0.81017835
  11.53978492   4.11809294  -2.24638525   1.55094963 -13.81302971]


In [20]:
# inspect the inferred topics
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing ``components_``, iterated as one
        row of per-term weights for each topic.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` (newer
        scikit-learn) or ``get_feature_names()`` (older releases).
    top_n : int, default 10
        Number of terms to show per topic.

    Returns
    -------
    None
        For each topic, a ``Topic <idx>:`` line and a list of
        ``(term, weight)`` tuples are written to stdout.
    """
    # Resolve the vocabulary once; calling the accessor for every printed
    # term would rebuild the full feature list top_n times per topic.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so the reversed slice yields the indices of
        # the top_n largest weights, largest first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(str(feature_names[i]), topic[i]) for i in top_indices])
 
# Show the top terms per topic for each of the three fitted models.
for heading, fitted_model in (("LDA Model:", lda_model),
                              ("NMF Model:", nmf_model),
                              ("LSI Model:", lsi_model)):
    print(heading)
    print_topics(fitted_model, vectorizer)
    print("=" * 20)

LDA Model:
Topic 0:
[('surface', 138.2209233251389), ('used', 125.95158615785968), ('number', 122.85722509127008), ('temperature', 105.87281668551309), ('time', 103.40042826859889), ('order', 88.16091513169246), ('point', 85.85839660743127), ('line', 84.14916149532947), ('radiation', 82.00307021978278), ('use', 81.9080037816733)]
Topic 1:
[('used', 140.46254919201562), ('cells', 76.61325289635023), ('index', 71.12533422253848), ('feed', 67.00815837679164), ('electronic', 57.02428064943172), ('food', 52.854491044915854), ('reaction', 49.344410664824935), ('cell', 48.92531157439847), ('small', 48.67037526629477), ('form', 48.54800580587242)]
Topic 2:
[('new', 923.3618643484982), ('state', 706.2859967051934), ('time', 571.9616845405476), ('years', 559.9322953960152), ('year', 555.0995136362205), ('states', 540.842810180838), ('said', 498.6198070672877), ('american', 423.94987222368997), ('people', 422.9704393527312), ('united', 419.25172870739755)]
Topic 3:
[('site', 28.93443085401476), (

In [21]:
# Infer NMF topic weights for a sentence the model was not fitted on.
text = "The economy is working better than ever"
unseen_vectorized = vectorizer.transform([text])
# transform returns one row per input document; only one was passed.
x = nmf_model.transform(unseen_vectorized)[0]
print(x)

[0.00289954 0.         0.         0.         0.         0.00439422
 0.         0.         0.         0.00464866]


## Let's try plotting words and documents

In [23]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [26]:
# Project every document onto two SVD components so it can be drawn in 2D.
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)

# Build the frame in one step instead of filling an empty frame column by
# column; `columns` pins the column order explicitly.
df = pd.DataFrame({'x': documents_2d[:, 0],
                   'y': documents_2d[:, 1],
                   'document': range(len(data))},
                  columns=['x', 'y', 'document'])

source = ColumnDataSource(ColumnDataSource.from_df(df))
# Label each point with its document index, drawn slightly above the marker.
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# width/height are used because the plot_width/plot_height spellings were
# removed in Bokeh 3.0.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [27]:
# Transposing the document-term matrix makes each row a term, so the same
# 2-component SVD now places words (rather than documents) in 2D.
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)

# Newer scikit-learn exposes the vocabulary through get_feature_names_out();
# fall back to get_feature_names() on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Build the frame in one step instead of filling an empty frame column by
# column; `columns` pins the column order explicitly.
df = pd.DataFrame({'x': words_2d[:, 0],
                   'y': words_2d[:, 1],
                   'word': list(feature_names)},
                  columns=['x', 'y', 'word'])

source = ColumnDataSource(ColumnDataSource.from_df(df))
# Label each point with its word, drawn slightly above the marker.
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# width/height are used because the plot_width/plot_height spellings were
# removed in Bokeh 3.0.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

## Visualizing LDA

In [28]:
# Build a pyLDAvis panel for the scikit-learn LDA model. At this point
# `lda_model` is the LatentDirichletAllocation instance, since the
# scikit-learn section rebinds the name used earlier for the gensim model.
# NOTE(review): newer pyLDAvis releases may expose this module under a
# different name (pyLDAvis.lda_model) — confirm for the installed version.
import pyLDAvis.sklearn
# Presumably switches pyLDAvis to inline notebook rendering — confirm.
pyLDAvis.enable_notebook()
# prepare() receives the fitted model, the document-term matrix it was fitted
# on, and the vectorizer that produced that matrix. mds='tsne' presumably
# selects t-SNE for the 2-D topic layout — confirm against the pyLDAvis docs.
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# Leaving `panel` as the cell's last expression makes the notebook display it.
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
