# Citation

Much of the code and many of the examples are copied or modified from:

> Blueprints for Text Analytics Using Python by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler (O'Reilly, 2021), 978-1-492-07408-3.
>

- https://github.com/blueprints-for-text-analytics-python/blueprints-text
- https://github.com/blueprints-for-text-analytics-python/blueprints-text/blob/master/ch08/Topic_Modeling_Clustering.ipynb

---

# Configuration

In [1]:
# (min_n, max_n) range of n-gram sizes intended for CountVectorizer/TfidfVectorizer
# and, therefore, the n-grams the topic modeling will use
n_gram_range = (1, 3)
# stop words specific to this dataset; these are merged with spaCy's stop words in the Prep section
custom_stop_words = {'united', 'nations', 'nation'}
# number of topics the NMF/LDA models are meant to create
number_of_topics = 10

# Setup

In [2]:
cd ../..

/Users/shanekercheval/repos/nlp-template


In [3]:
%run "source/config/notebook_settings.py"

In [4]:
pd.set_option('display.max_colwidth', None)

In [6]:
from source.library.utilities import Timer, get_logger
from source.library.text_analysis import count_tokens, tf_idf, get_context_from_keyword, count_keywords, count_keywords_by, impurity
from source.library.sklearn_topic_modeling import *

In [7]:
# Load the paragraph-level dataset; Timer prints the start message and the elapsed time.
# The path is relative, so it depends on the working-directory change made in the Setup section.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with Timer("Loading Data"):
    path = 'artifacts/data/processed/un-general-debates-paragraphs.pkl'
    paragraphs = pd.read_pickle(path)

Started: Loading Data
Finished (0.22 seconds)


---

# Exploratory Data Analysis

This section provides a basic exploration of the text and dataset.

## Dataset Summary

In [8]:
hlp.pandas.numeric_summary(paragraphs)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
year,279045,0,0.0%,0,0.0%,1992.4,12.6,0.0,0.1,-1.1,1970,1975.0,1982.0,1993.0,2003.0,2010.0,2015


In [9]:
hlp.pandas.non_numeric_summary(paragraphs)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
country,279045,0,0.0%,Russian Federation,199,0.1%
text,279045,0,0.0%,The President returned to the [...],278820,99.9%


In [10]:
assert not (paragraphs['text'].str.strip() == '').any()

# Prep

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [12]:
# Build the stop-word collection WITHOUT mutating spaCy's shared STOP_WORDS set: an in-place
# `|=` would leak the dataset-specific words into every other user of that set in this kernel.
# The result is a sorted list because scikit-learn documents `stop_words` as 'english', a list,
# or None. Wrapping in set() keeps the cell safe to re-run after `stopwords` has become a list.
stopwords = sorted(set(stopwords) | custom_stop_words | {'ll', 've'})

## Sample

In [13]:
# Down-sample so the vectorizers and topic models below run quickly.
# A fixed random_state makes the sample (and therefore every downstream topic) reproducible;
# the min() guard avoids a ValueError when fewer than 2,000 rows are available.
paragraphs = paragraphs.sample(min(2000, len(paragraphs)), random_state=42)
#paragraphs.to_pickle('source/tests/test_files/datasets/un_debates_paragraphs_sample.pkl')

## TF / TF-IDF

NOTE: `LDA` seems to be fit on raw term counts (`TF`, via `CountVectorizer`) rather than on `TF-IDF` weights, so both matrices are built here.

In [14]:
# Build the raw-count (TF) and TF-IDF document-term matrices from the sampled paragraphs.
# The n-gram range comes from the `n_gram_range` configuration value rather than a second
# hard-coded literal, so changing the configuration actually changes the features.
with Timer(f"Calculating TF & TF-IDF ({n_gram_range[0]}-{n_gram_range[1]} ngrams)"):
    # Both vectorizers share identical settings so that their columns (vocabularies) line up.
    # `stop_words` is passed as a list, which is the collection type scikit-learn documents.
    vectorizer_settings = dict(stop_words=list(stopwords), ngram_range=n_gram_range, min_df=5, max_df=0.7)

    count_vectorizer = CountVectorizer(**vectorizer_settings)
    count_vectors = count_vectorizer.fit_transform(paragraphs["text"])
    print(count_vectors.shape)

    tfidf_vectorizer = TfidfVectorizer(**vectorizer_settings)
    tfidf_vectors = tfidf_vectorizer.fit_transform(paragraphs["text"])
    # a bare expression inside a `with` block is never displayed, so print the shape explicitly
    print(tfidf_vectors.shape)

Started: Calculating TF & TF-IDF (1-3 ngrams)
(2000, 3489)
Finished (0.36 seconds)


# Topic Modeling

In [15]:
import matplotlib.pyplot as plt
def plot_top_words(model, feature_names, n_top_words, title):
    """
    Plot the highest-weighted features of every topic in `model` as a grid of horizontal bar charts.

    Adapted from
    https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    Args:
        model: fitted model exposing `components_`, iterated as one weight vector per topic
        feature_names: sequence mapping a position in a topic's weight vector to its token
        n_top_words: number of highest-weighted features to draw per topic
        title: title placed above the whole figure
    """
    n_topics = len(model.components_)
    # Size the grid from the model instead of assuming exactly 10 topics (2 rows x 5 columns);
    # with 10 topics this yields the same 2 x 5 layout and (30, 15) figure size as before.
    n_cols = 5
    n_rows = max(1, -(-n_topics // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # indices of the `n_top_words` largest weights, largest first
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        # largest weight at the top of each panel
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # hide grid cells that have no topic to show
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # the title does not depend on the topic, so set it once rather than on every iteration
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()



In [16]:
def display_topics(model, features, no_top_words=5):
    """
    Print the highest-weighted features of each topic with each feature's share of the topic.

    Args:
        model: fitted model exposing `components_`, iterated as one weight vector per topic
        features: sequence mapping a position in a topic's weight vector to its token
        no_top_words: maximum number of features printed per topic; when a topic has fewer
            features than this, all of them are printed instead of raising IndexError
    """
    for topic, words in enumerate(model.components_):
        total = words.sum()
        # feature positions ordered from largest to smallest weight, capped at the requested count
        largest = words.argsort()[::-1][:max(no_top_words, 0)]
        print("\nTopic %02d" % topic)
        for position in largest:
            # weight expressed as a percentage of the topic's total weight
            print("  %s (%2.2f)" % (features[position], abs(words[position]*100.0/total)))

## NMF

In [20]:
from sklearn.decomposition import NMF

# Fit NMF on the TF-IDF matrix; the number of components (topics) comes from the configuration cell.
nmf_model = NMF(init='nndsvda', n_components=number_of_topics, random_state=42, max_iter=1000)
# the matrix returned by fit_transform is discarded (bound to `_`); only the fitted model is kept
_ = nmf_model.fit_transform(tfidf_vectors)
# feature names reported by the fitted TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# `extract_topic_dataframe` is not defined in this notebook; presumably it comes from the
# star import of source.library.sklearn_topic_modeling — confirm.
topics_df = extract_topic_dataframe(
    model=nmf_model,
    features=feature_names,
    top_n_tokens=10,
    num_tokens_in_label=2
)

In [None]:
# `plot_topics` is not defined in this notebook; presumably a project helper from the star import — confirm.
plot_topics(topics_df)

In [None]:
# The NMF model was fit on the TF-IDF matrix, so topic sizes are computed from that same
# representation rather than from the raw-count matrix the model never saw.
# NOTE(review): assumes calculate_topic_sizes applies `model` to the matrix it is given — confirm.
topic_sizes = calculate_topic_sizes(nmf_model, tfidf_vectors)

In [None]:
# Build one label per topic. Both helpers are defined outside this notebook (presumably in
# source.library.sklearn_topic_modeling — confirm); the labels are taken in the dictionary's value order.
topic_dict = extract_topic_dictionary(nmf_model, feature_names)
topic_labels = list(create_topic_labels(topic_dict).values())

In [None]:
# Pair each topic label with its size and draw the sizes as a bar chart.
# NOTE(review): `px` is used here but the only `plotly_express` import visible in this notebook
# appears in a later cell; presumably notebook_settings.py provides it — confirm.
df = pd.DataFrame({
    'Topics': topic_labels,
    'Topic Size as a Percent of the Dataset': topic_sizes,
})

fig = px.bar(
    df,
    x='Topic Size as a Percent of the Dataset',
    y='Topics',
    title='Size of Topics<br><sup>More than 1 topic can be assigned to a single document; therefore, relative (percentage) sizes are provided.</sup>'
)
# 'p' is the d3-format percent notation, so the x values are presumably fractions (0-1) — confirm
fig.update_layout(xaxis_tickformat = 'p')
fig.show()

Get Topic Weightings for First Doc

In [None]:
# topic weights for the first document (first row of the matrix)
# NOTE(review): `w_matrix_unigrams` is not defined by any cell visible in this notebook, so this
# cell raises NameError on a fresh "Restart & Run All" unless it is defined elsewhere — confirm.
w_first_doc = w_matrix_unigrams[0, ]
w_first_doc

In [None]:
# Should be the same values as before: transform the first document with the fitted model
# NOTE(review): `nmf_unigrams` and `tfidf_vectors_unigrams` are not defined by any cell visible
# in this notebook — confirm where they come from.
predictions_first_doc = nmf_unigrams.transform(tfidf_vectors_unigrams[0,])
predictions_first_doc

In [None]:
# absolute difference, rounded to 4 decimals, between the two weight vectors computed above
[abs(round(x, 4)) for x in (w_first_doc - predictions_first_doc).tolist()[0]]

---

Get Top 10 Words for First Topic

In [None]:
# first row of the H matrix, i.e. the first topic
# NOTE(review): `h_matrix_unigrams` is not defined by any cell visible in this notebook — confirm.
first_topic = h_matrix_unigrams[0,]
first_topic.shape

In [None]:
# positions of the topic's weights sorted from largest to smallest; show the first 10
largest_word_values = first_topic.argsort()[::-1]
largest_word_values[0:10]

In [None]:
# NOTE(review): the only visible assignment of `word_names` is in the later Bigrams cell, so on a
# top-to-bottom run this name is undefined here — confirm.
word_names[largest_word_values[0:10]]

---

Size of Topics (Percent of all Documents)

In [None]:
# each column's share (in percent) of the matrix's total weight
# NOTE(review): `w_matrix_unigrams` is not defined by any cell visible in this notebook — confirm.
w_matrix_unigrams.sum(axis=0)/w_matrix_unigrams.sum()*100.0

---

In [None]:
def topics_to_dictionary(model, features, num_top_words=10):
    """
    Map each topic (numbered from 1) to its highest-weighted features.

    Args:
        model: fitted model exposing `components_`, iterated as one weight vector per topic
        features: sequence mapping a position in a topic's weight vector to its token
        num_top_words: maximum number of features kept per topic; when a topic has fewer
            features than this, all of them are returned instead of raising IndexError

    Returns:
        dict of {topic number: [(feature, percent of the topic's total weight), ...]} with each
        list ordered from largest to smallest weight
    """
    topics = dict()
    for topic, words in enumerate(model.components_, start=1):
        total = words.sum()
        # feature positions ordered from largest to smallest weight, capped at the requested count
        largest = words.argsort()[::-1][:max(num_top_words, 0)]
        topics[topic] = [(features[i], abs(words[i]*100.0/total)) for i in largest]
    return topics

In [None]:
# NOTE(review): `nmf_unigrams` and `tfidf_vectorizer_unigrams` are not defined by any cell visible
# in this notebook — confirm where they come from.
topic_dictionary = topics_to_dictionary(nmf_unigrams, tfidf_vectorizer_unigrams.get_feature_names_out())
#topic_dictionary

In [None]:
# label each topic with its first three words joined by ' | '
# NOTE(review): `name_lookup` is reassigned in the next cell before any visible use.
name_lookup = {topic:' | '.join([y[0] for y in x[0:3]]) for topic, x in topic_dictionary.items()}

In [None]:
def topic_dictionary_to_names(topic_dictionary: dict, num_words_in_name: int=3):
    """
    Build a display name for every topic from the first element of its leading entries.

    Args:
        topic_dictionary: mapping of topic key -> sequence of entries whose first element is a word
        num_words_in_name: how many leading entries contribute to each name

    Returns:
        dict of {topic key: words joined with ' | '}
    """
    names = {}
    for topic, entries in topic_dictionary.items():
        leading_entries = entries[0:num_words_in_name]
        names[topic] = ' | '.join(entry[0] for entry in leading_entries)
    return names

# name each topic after its first two words and display the lookup
name_lookup = topic_dictionary_to_names(topic_dictionary, num_words_in_name=2)
name_lookup

In [None]:
def topics_to_dataframe(model, features: list, num_top_words: int = 10, num_words_in_name: int = 2) -> pd.DataFrame:
    """
    Build a long-format DataFrame (one row per topic/word pair) of each topic's top words.

    Args:
        model: fitted model passed through to `topics_to_dictionary`
        features: feature names passed through to `topics_to_dictionary`
        num_top_words: number of top words kept per topic
        num_words_in_name: number of leading words used to build each topic's label

    Returns:
        DataFrame with columns `word` (row position of the entry within its topic's list),
        `topic` (topic key), `value` (the entry's weight), `words` (the token) and
        `label` (the topic's name).
    """
    per_topic_entries = topics_to_dictionary(model, features, num_top_words)
    labels_by_topic = topic_dictionary_to_names(per_topic_entries, num_words_in_name=num_words_in_name)

    # wide layout: one column per topic, each cell holding a (token, weight) pair
    wide = pd.DataFrame(per_topic_entries)
    topic_columns = list(wide.columns)
    wide = wide.reset_index()
    wide = wide.rename(columns={'index': 'word'})

    # long layout: one row per (row position, topic) combination
    long_form = pd.melt(wide, id_vars='word', value_vars=topic_columns, var_name='topic')

    # split each (token, weight) pair into two columns; `value` is overwritten with the weight
    split_pairs = pd.DataFrame(long_form['value'].tolist(), columns=['words', 'value'])
    long_form = long_form.assign(**split_pairs)

    long_form['label'] = long_form['topic'].apply(lambda topic_key: labels_by_topic[topic_key])
    return long_form

# NOTE(review): `nmf_unigrams` and `tfidf_vectorizer_unigrams` are not defined by any cell visible
# in this notebook — confirm where they come from.
topic_df = topics_to_dataframe(
    model=nmf_unigrams,
    features=tfidf_vectorizer_unigrams.get_feature_names_out(),
    num_top_words=10,
    num_words_in_name=2,
)
topic_df

In [None]:
# topic_words = pd.DataFrame(topic_dictionary)
# topics = topic_words.columns
# topic_words = topic_words.reset_index().rename(columns={'index': 'word'})
# topic_words = pd.melt(topic_words, id_vars='word', value_vars=list(topics), var_name='topic')
# topic_words = topic_words.assign(**pd.DataFrame(topic_words['value'].tolist(), columns=['words', 'value']))
# topic_words['label'] = topic_words['topic'].apply(lambda x: name_lookup[x])
# topic_words

In [None]:
import plotly_express as px

# One bar-chart facet per topic label, three facets per row.
fig = px.bar(
    topic_df,
    x='value',
    y='words',
    facet_col='label',
    facet_col_wrap=3,
    facet_col_spacing=0.2,
    labels={
        'words': '',
        'label': '',
    },
    width=900,
    height=1000,
    title="Topics in NMF model (Unigrams)"
)
# matches=None gives every facet its own y-axis; the axis is reversed so the first row is on top
fig.update_yaxes(matches=None, showticklabels=True, autorange="reversed")
#fig.update_xaxes(matches=None)
fig.show()

In [None]:
# NOTE(review): `nmf_unigrams` and `tfidf_vectorizer_unigrams` are not defined by any cell visible
# in this notebook — confirm where they come from.
plot_top_words(
    model=nmf_unigrams,
    feature_names=tfidf_vectorizer_unigrams.get_feature_names_out(),
    n_top_words=5,
    title="Topics in NMF model (Uni-grams)"
)

---

### Bigrams

In [None]:
# NOTE(review): `tfidf_vectors_bigrams` and `tfidf_vectorizer_bigrams` are not defined by any cell
# visible in this notebook — confirm where they come from. The topic count here is a literal 10
# rather than the `number_of_topics` configuration value.
nmf_bigrams = NMF(n_components=10, random_state=42)
# see Blueprints pg. 214 for an explanation of W x H
w_matrix_bigrams = nmf_bigrams.fit_transform(tfidf_vectors_bigrams)
h_matrix_bigrams = nmf_bigrams.components_
word_names = tfidf_vectorizer_bigrams.get_feature_names_out()

In [None]:
#display_topics(nmf_para_model_bigrams, tfidf_para_vectorizer_bigrams.get_feature_names())

In [None]:

# Rebinds `topic_df` (previously the unigram topics) to the bigram model's topics.
# NOTE(review): `tfidf_vectorizer_bigrams` is not defined by any cell visible in this notebook — confirm.
topic_df = topics_to_dataframe(
    model=nmf_bigrams,
    features=tfidf_vectorizer_bigrams.get_feature_names_out(),
    num_top_words=10,
    num_words_in_name=2,
)
topic_df

In [None]:
import plotly_express as px

# One bar-chart facet per topic label, two facets per row.
fig = px.bar(
    topic_df,
    x='value',
    y='words',
    facet_col='label',
    facet_col_wrap=2,
    facet_col_spacing=0.2,
    labels={
        'words': '',
        'label': '',
    },
    width=900,
    height=1000,
    title="Topics in NMF model (Bigrams)"
)
# matches=None gives every facet its own y-axis; the axis is reversed so the first row is on top
fig.update_yaxes(matches=None, showticklabels=True, autorange="reversed")
#fig.update_xaxes(matches=None)
fig.show()

In [None]:
# NOTE(review): `tfidf_vectorizer_bigrams` is not defined by any cell visible in this notebook — confirm.
plot_top_words(
    model=nmf_bigrams,
    feature_names=tfidf_vectorizer_bigrams.get_feature_names_out(),
    n_top_words=5,
    title="Topics in NMF model (Bi-grams)"
)

https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

---

Neither the book nor the example above uses TF-IDF with LDA, and neither explains why. Both use TF-IDF (`TfidfVectorizer`) with NMF and then switch to raw term counts (`CountVectorizer`) for LDA.


https://stackoverflow.com/questions/44781047/necessary-to-apply-tf-idf-to-new-documents-in-gensim-lda-model/44789327#44789327

> LDA only needs a bag-of-word vector.



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts over 2- and 3-grams for the LDA "bigrams" model.
# `stop_words` is passed as a list, which is the collection type scikit-learn documents.
count_para_vectorizer_bigrams = CountVectorizer(stop_words=list(stopwords), min_df=5, max_df=0.7, ngram_range=(2,3))
count_para_vectors_bigrams = count_para_vectorizer_bigrams.fit_transform(paragraphs["text"])
count_para_vectors_bigrams.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts over the vectorizer's default n-gram range for the LDA unigram model.
# The 2-3-gram vectorizer is already fit in the preceding cell, so it is not refit here.
# `stop_words` is passed as a list, which is the collection type scikit-learn documents.
count_para_vectorizer = CountVectorizer(stop_words=list(stopwords), min_df=5, max_df=0.7)
count_para_vectors = count_para_vectorizer.fit_transform(paragraphs["text"])
count_para_vectors.shape

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Use the notebook-level `number_of_topics` setting (documented as controlling NMF/LDA)
# instead of a second hard-coded 10.
lda_para_model = LatentDirichletAllocation(n_components=number_of_topics, random_state=42)
# W: result of fit_transform on the count matrix; H: the fitted model's components_
W_lda_para_matrix = lda_para_model.fit_transform(count_para_vectors)
H_lda_para_matrix = lda_para_model.components_

In [None]:
# top 5 features per topic for the LDA model fit on `count_para_vectors`
plot_top_words(
    model=lda_para_model,
    feature_names=count_para_vectorizer.get_feature_names_out(),
    n_top_words=5,
    title="Topics in LDA model (Uni-grams)"
)

In [None]:
# each column's share (in percent) of the matrix's total weight
W_lda_para_matrix.sum(axis=0)/W_lda_para_matrix.sum()*100.0

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Use the notebook-level `number_of_topics` setting (documented as controlling NMF/LDA)
# instead of a second hard-coded 10.
lda_para_model_bigrams = LatentDirichletAllocation(n_components=number_of_topics, random_state=42)
# W: result of fit_transform on the 2-3-gram count matrix; H: the fitted model's components_
W_lda_para_matrix_bigrams = lda_para_model_bigrams.fit_transform(count_para_vectors_bigrams)
H_lda_para_matrix_bigrams = lda_para_model_bigrams.components_

In [None]:
import matplotlib.pyplot as plt
def plot_top_words(model, feature_names, n_top_words, title):
    """
    Plot the highest-weighted features of every topic in `model` as a grid of horizontal bar charts.

    NOTE(review): this notebook defines `plot_top_words` twice with the same signature; this later
    definition is the one in effect for the cells that follow it. Consider keeping a single copy.

    Adapted from
    https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    Args:
        model: fitted model exposing `components_`, iterated as one weight vector per topic
        feature_names: sequence mapping a position in a topic's weight vector to its token
        n_top_words: number of highest-weighted features to draw per topic
        title: title placed above the whole figure
    """
    n_topics = len(model.components_)
    # Size the grid from the model instead of assuming exactly 10 topics (2 rows x 5 columns);
    # with 10 topics this yields the same 2 x 5 layout and (30, 15) figure size as before.
    n_cols = 5
    n_rows = max(1, -(-n_topics // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # indices of the `n_top_words` largest weights, largest first
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        # largest weight at the top of each panel
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # hide grid cells that have no topic to show
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # the title does not depend on the topic, so set it once rather than on every iteration
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()




In [None]:
# top 5 features per topic for the LDA model fit on the 2-3-gram counts
plot_top_words(
    model=lda_para_model_bigrams,
    feature_names=count_para_vectorizer_bigrams.get_feature_names_out(),
    n_top_words=5,
    title="Topics in LDA model (Bi-grams)"
)

In [None]:
import pyLDAvis.sklearn

# Prepare the pyLDAvis visualisation for the unigram LDA model and save it as an HTML file.
# The output path is relative to the current working directory.
# NOTE(review): newer pyLDAvis releases appear to expose this module as `pyLDAvis.lda_model`
# instead of `pyLDAvis.sklearn` — confirm against the installed version.
lda_display = pyLDAvis.sklearn.prepare(lda_para_model, count_para_vectors, count_para_vectorizer, sort_topics=False)
#pyLDAvis.display(lda_display)
pyLDAvis.save_html(lda_display, 'docs/models/lda.html')

In [None]:
import pyLDAvis.sklearn

# Prepare the pyLDAvis visualisation for the 2-3-gram LDA model and save it as an HTML file.
# This rebinds `lda_display`, replacing the unigram visualisation object from the previous cell.
lda_display = pyLDAvis.sklearn.prepare(lda_para_model_bigrams, count_para_vectors_bigrams, count_para_vectorizer_bigrams, sort_topics=False)
#pyLDAvis.display(lda_display)
pyLDAvis.save_html(lda_display, 'docs/models/lda_bigrams.html')