In [1]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis.sklearn
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Read the article records (keyed by index in the JSON file) and keep only
# the 'text' field of each one as the corpus to model.
data = (
    pd.read_json('articles.json', orient='index')
    .loc[:, 'text']
    .values
)

In [3]:
# Shared English Snowball stemmer used by the vectorizer class below.
stemmer = SnowballStemmer("english")


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it emits."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens.

        The parent class's analyzer runs first; each token it produces is
        then passed through the module-level ``stemmer``.
        """
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

In [4]:
# Tokenize, lowercase, drop English stop words, and build the document-term
# count matrix. Terms are ignored when they appear in fewer than 2% of the
# documents (min_df) or in more than 80% of them (max_df).
# A token is a run of at least three letters, digits, or hyphens.
vectorizer = StemmedCountVectorizer(
    min_df=0.02,
    max_df=0.80,
    stop_words='english',
    lowercase=True,
    analyzer="word",
    # Raw string: '\-' is not a valid Python string escape, so the non-raw
    # form triggers an invalid-escape warning. The regex itself is unchanged.
    token_pattern=r'[a-zA-Z\-0-9][a-zA-Z\-0-9]{2,}'
)
data_vectorized = vectorizer.fit_transform(data)
 
# Build a Latent Dirichlet Allocation model with 10 topics, fitted with the
# online learning method for at most 50 iterations.
# random_state is fixed so that a fresh "Restart & Run All" reproduces the
# same topics; without it each run starts from a different random state.
lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=50,
    learning_method='online',
    random_state=0
)
# Fit the model and keep the transformed (document-topic) output.
lda_Z = lda_model.fit_transform(data_vectorized)

In [5]:
# Switch pyLDAvis to notebook mode so the prepared panel displays inline.
pyLDAvis.enable_notebook()

# Prepare the visualization from the fitted model, the document-term matrix
# and the vectorizer, using t-SNE ('tsne') as the scaling method.
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

# Last expression of the cell: displays the panel.
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
