In [1]:
from collections import OrderedDict
from pathlib import Path
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from pyLDAvis.sklearn import prepare
import pyLDAvis
from wordcloud import WordCloud
from termcolor import colored

ModuleNotFoundError: No module named 'pyLDAvis'

In [None]:
# Disabled alternative plot styling, kept for reference; `jtplot` is not among the imports in the first cell.
# jtplot.style(theme='onedork', context='talk', fscale=1.4, spines=False, gridlines='--', ticks=True, grid=False, figsize=(14, 8))
# Draw matplotlib figures inline, below the cell that creates them.
%matplotlib inline
plt.style.use('ggplot')
# NOTE(review): presumably switches pyLDAvis to inline notebook rendering - confirm against the pyLDAvis docs.
pyLDAvis.enable_notebook()
# Silences every warning for the rest of the session, so deprecation notices from the libraries used below are hidden too.
warnings.filterwarnings('ignore')

In [None]:
def load_articles(root):
    """Read every article stored as ``<root>/<topic>/<name>.txt``.

    The first line of a file is taken as its heading; the remaining lines are
    stripped and joined with single spaces to form the body.

    Parameters
    ----------
    root : str or Path
        Directory that holds one sub-directory per topic.

    Returns
    -------
    list of [topic, heading, body]
        One entry per article, ordered by file path so that repeated runs see
        the documents in the same order.
    """
    root = Path(root)
    articles = []
    # Directory listing order is OS-dependent; sorting keeps the row order
    # (and everything fitted on it later) reproducible.
    for file in sorted(root.glob('**/*.txt')):
        location = file.relative_to(root).parts
        # Only <topic>/<file> is a valid layout. Anything else (for example a
        # read-me sitting next to the topic folders) has no topic, so skip it
        # instead of failing on tuple unpacking.
        if len(location) != 2:
            continue
        topic = location[0]
        with file.open(encoding='latin1') as f:
            lines = f.readlines()
        if not lines:
            continue  # an empty file has neither heading nor body
        heading = lines[0].strip()
        body = ' '.join(line.strip() for line in lines[1:])
        articles.append([topic, heading, body])
    return articles


path = Path('bbc')
doc_list = load_articles(path)

In [None]:
# Corpus table: one row per article, plus the number of whitespace-separated
# tokens in each article body.
docs = (
    pd.DataFrame(data=doc_list, columns=['topic', 'heading', 'article'])
    .assign(**{'word count': lambda frame: frame['article'].str.split().str.len()})
)
docs.info()

In [None]:
# Display names for the topics, numbered from 1 to 5.
topic_labels = list(map('Topic {}'.format, range(1, 6)))

In [None]:
# Bag-of-words counts over the article bodies: English stop words removed,
# terms filtered by document frequency (max_df / min_df) and the vocabulary
# capped at 2000 features.
vectorizer = CountVectorizer(stop_words='english',
                             max_df=.5,
                             min_df=5,
                             max_features=2000)
dtm = vectorizer.fit_transform(docs.article)

# The number of components follows `topic_labels`, because those labels are
# used later as column names for the model's output; deriving the count here
# keeps the two from drifting apart.
lda = LatentDirichletAllocation(n_components=len(topic_labels),
                                max_iter=500,
                                learning_method='batch',
                                evaluate_every=10,
                                random_state=42)  # fixed seed for repeatable topics
lda.fit(dtm)

In [None]:
# Build the pyLDAvis view from the fitted model, the document-term matrix and
# the vectorizer; as the cell's last expression the result is displayed.
prepare(lda, dtm, vectorizer)

### Topics as WordClouds

In [None]:
# Normalise each topic's word weights so that every row sums to 1.
topics_prob = lda.components_ / lda.components_.sum(axis=1, keepdims=True)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new name and fall back for old installs.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# Rows are vocabulary words, columns are topics.
topics = pd.DataFrame(topics_prob.T,
                      index=feature_names,
                      columns=topic_labels)

In [None]:
# One word cloud per topic, stacked vertically. The grid is sized from the
# `topics` table rather than a literal 5, so it follows the model.
n_topics = topics.shape[1]
w = WordCloud()
# squeeze=False always returns a 2-D array of axes, so flatten() also works
# when there is a single topic. Six inches per row gives 15x30 for five topics.
fig, axes = plt.subplots(nrows=n_topics, figsize=(15, 6 * n_topics), squeeze=False)
axes = axes.flatten()
for ax, (topic, freq) in zip(axes, topics.items()):
    # The same WordCloud object is regenerated for each topic; imshow copies
    # the rendered image at call time, so earlier panels are not affected.
    w.generate_from_frequencies(freq.to_dict())
    ax.imshow(w, interpolation='bilinear')
    ax.set_title(topic, fontsize=18)
    ax.axis('off')

### Visualize topic-word associations per document

In [None]:
# Topic weights of every document as returned by the fitted model; rows are
# labelled with each article's original topic, columns with the topic labels.
doc_topic_weights = lda.transform(dtm)
dtm_ = pd.DataFrame(doc_topic_weights, index=docs.topic, columns=topic_labels)

In [None]:
# Keyword arguments passed to `colored(...)` for each topic label, in display order.
color_dict = OrderedDict(
    (label, {'color': text_color, 'on_color': highlight})
    for label, text_color, highlight in [
        ('Topic 1', 'white', 'on_blue'),
        ('Topic 2', 'white', 'on_green'),
        ('Topic 3', 'white', 'on_red'),
        ('Topic 4', 'white', 'on_magenta'),
        ('Topic 5', 'blue', 'on_yellow'),
    ]
)

In [None]:
# Attach the article text and heading to the topic-weight table. `.values`
# copies by position, since `dtm_` is indexed by topic rather than row number.
for column in ('article', 'heading'):
    dtm_[column] = docs[column].values

# Keep only documents whose weight exceeds 0.1 on every topic.
above_threshold = dtm_[topic_labels].gt(.1).all(axis=1)
sample = dtm_[above_threshold]
sample

In [None]:
# Print the first sampled article with every known word highlighted in the
# colour of the topic that gives that word its highest probability.
doc = sample.iloc[0]

# Strongest topic per vocabulary word, computed once instead of per token.
dominant_topic = topics.idxmax(axis=1)

colored_text = []
for word in doc['article'].split():
    # Lookup is on the lower-cased token; .get returns None for words that
    # are not in the vocabulary.
    topic = dominant_topic.get(word.lower())
    if topic in color_dict:
        colored_text.append(colored(word, **color_dict[topic]))
    else:
        # Unknown word, or a topic with no colour configured: leave unstyled.
        colored_text.append(word)

# Legend: each topic label rendered in its own colours.
print(' '.join([colored(k, **v) for k, v in color_dict.items()]))
print('\n', doc['heading'], '\n')
text = ' '.join(colored_text)
print(text)