In [1]:
# Code borrowed from scikit-learn
# https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
# Author: Luke Kumar

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import pickle

## Params

In [3]:
# Tunable parameters for the whole notebook.
# n_samples: None means every document is used; an integer keeps only the first n_samples.
n_samples = None # 2000
# Vocabulary size cap, passed to CountVectorizer as max_features.
n_features = 10000
# Number of topics, passed to LatentDirichletAllocation as n_components.
n_components = 25
# Number of highest-weighted words printed per topic by print_top_words.
n_top_words = 20

# Data Loading

In [4]:
# Load the 20 Newsgroups corpus, asking the loader to drop headers, footers
# and quoted text; the fixed random_state keeps the shuffle reproducible.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

# Keep the full corpus unless a sample size was configured.
data_samples = dataset.data if n_samples is None else dataset.data[:n_samples]

In [5]:
# Peek at the first document to sanity-check what the loader returned.
data_samples[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [6]:
# Number of documents that will be vectorized.
len(data_samples)

11314

# Encode Text

In [7]:
# max_df : float in range [0.0, 1.0] or int, default=1.0
# When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
# (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

# min_df : float in range [0.0, 1.0] or int, default=1
# When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. 
# This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, 
# integer absolute counts. This parameter is ignored if vocabulary is not None.

In [8]:
# Bag-of-words encoder for LDA (raw term counts, not tf-idf).
tf_vectorizer = CountVectorizer(
    max_df=0.95,                # drop terms appearing in more than 95% of documents
    min_df=2,                   # drop terms appearing in fewer than 2 documents
    max_features=n_features,    # cap on vocabulary size
    stop_words='english',       # use the built-in English stop-word list
)

In [9]:
# Learn the vocabulary from the corpus and encode every document as a row of term counts.
tf = tf_vectorizer.fit_transform(data_samples)

# LDA Model

In [11]:
# Configure the topic model; nothing is trained until .fit() is called.
lda = LatentDirichletAllocation(
    n_components=n_components,  # number of topics
    max_iter=100,
    learning_method='batch',    # alternative: 'online'
    random_state=0,             # fixed seed for reproducible topics
    verbose=0,
    n_jobs=-1,                  # use all available cores
    mean_change_tol=0.001,
)

In [12]:
# Fit the topic model on the term-count matrix; this is the expensive step of the notebook.
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_components=25, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [13]:
# Save the fitted model to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises; the bare open() used previously left the handle open.
with open('lda.pkl', 'wb') as model_file:
    pickle.dump(lda, model_file)

In [14]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each topic, one topic per line.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` must yield one weight vector per topic,
        each supporting ``argsort()``.
    feature_names : sequence of str
        Term for each column index of the weight vectors.
    n_top_words : int
        How many terms to list per topic.

    A blank line is printed after the last topic. Returns None.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked_ids = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_id] for term_id in ranked_ids]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Prefer the new accessor and fall back to
# the old one so the cell runs on both old and new releases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

# Show the n_top_words highest-weighted terms of every fitted topic.
print_top_words(lda, feature_names, n_top_words)

Topic #0: file israel entry israeli output program arab section rules entries printf int oname char build title info stream number return
Topic #1: good year think just like don game better time did got hit really know players make ll didn years team
Topic #2: team games game hockey season new league san nhl vs teams gm division year players city chicago st play york
Topic #3: ax max g9v b8f a86 pl 145 1d9 1t 0t giz bhj 3t 2di 75u 34u 2tm wm 7ey bxn
Topic #4: people think don does say god believe just know way like true question point life make things time good right
Topic #5: edu com information mail available ftp pub list send email university software graphics address computer data cs contact ca internet
Topic #6: like just don good car time ve know problem think work way does make use new used better really want
Topic #7: said people know didn time just went don did like told came say saw going started left home took years
Topic #8: db mov cs al bh si word pain com byte dave delete

In [16]:
# Build the pyLDAvis view from the fitted model, the term-count matrix and the
# vectorizer; as the last expression of the cell, the result is rendered inline.
# NOTE(review): newer pyLDAvis releases appear to have renamed `pyLDAvis.sklearn`
# to `pyLDAvis.lda_model` — confirm against the installed version.
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
