# Citation

Much of the code and many of the examples are copied or adapted from:

> Blueprints for Text Analytics Using Python by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler (O'Reilly, 2021), 978-1-492-07408-3.
>

- https://github.com/blueprints-for-text-analytics-python/blueprints-text
- https://github.com/blueprints-for-text-analytics-python/blueprints-text/blob/master/ch08/Topic_Modeling_Clustering.ipynb

---

# Setup

In [1]:
# Execute the shared notebook settings script inside this kernel.
# NOTE(review): names used below without a visible import (`pd`, `hlp`, `px`, `Timer`) presumably
# come from this script — confirm.
%run "/code/source/config/notebook_settings.py"

In [2]:
# Do not truncate long text values when DataFrames are displayed.
pd.options.display.max_colwidth = None

In [3]:
from source.library.text_analysis import count_tokens, tf_idf, get_context_from_keyword, count_keywords, count_keywords_by, impurity
from source.library.sklearn_topic_modeling import *
from helpsk.utility import read_pickle

In [4]:
# Parameters that identify which saved topic-model artifacts are used; they are interpolated
# into the artifact file names in the cells below.
num_clusters = 10
ngrams_low, ngrams_high = 1, 3

In [5]:
def _topic_artifact_path(model_name: str, artifact: str) -> str:
    """Return the pickle path of a saved topic-model artifact.

    Args:
        model_name: file-name prefix of the model family ('nmf', 'lda' or 'k_means').
        artifact: which object to load ('vectorizer', 'vectors' or 'model').

    Returns:
        The relative path, built from `num_clusters`, `ngrams_low` and `ngrams_high`.
    """
    return (
        f'artifacts/models/topics/{model_name}-topics-{num_clusters}'
        f'-ngrams-{ngrams_low}-{ngrams_high}__{artifact}.pkl'
    )


# NOTE: unpickling can execute arbitrary code; only load artifacts produced by a trusted source.
with Timer("Loading Data"):
    paragraphs = pd.read_pickle('artifacts/data/processed/un-general-debates-paragraphs.pkl')

# The NMF vectors are intentionally not loaded; nothing in the cells below references them.
with Timer("Loading NMF"):
    nmf_vectorizer = read_pickle(_topic_artifact_path('nmf', 'vectorizer'))
    nmf_model = read_pickle(_topic_artifact_path('nmf', 'model'))

with Timer("Loading LDA"):
    lda_vectorizer = read_pickle(_topic_artifact_path('lda', 'vectorizer'))
    lda_vectors = read_pickle(_topic_artifact_path('lda', 'vectors'))
    lda_model = read_pickle(_topic_artifact_path('lda', 'model'))

with Timer("Loading K-Means"):
    k_means_vectorizer = read_pickle(_topic_artifact_path('k_means', 'vectorizer'))
    k_means_vectors = read_pickle(_topic_artifact_path('k_means', 'vectors'))
    k_means_model = read_pickle(_topic_artifact_path('k_means', 'model'))

2023-02-17 03:38:31 - INFO     | Timer Started: Loading Data
2023-02-17 03:38:33 - INFO     | Timer Finished: (2.13 seconds)
2023-02-17 03:38:33 - INFO     | Timer Started: Loading NMF
2023-02-17 03:38:34 - INFO     | Timer Finished: (0.37 seconds)
2023-02-17 03:38:34 - INFO     | Timer Started: Loading LDA
2023-02-17 03:38:34 - INFO     | Timer Finished: (0.24 seconds)
2023-02-17 03:38:34 - INFO     | Timer Started: Loading K-Means
2023-02-17 03:38:34 - INFO     | Timer Finished: (0.32 seconds)


---

# Exploratory Data Analysis

This section provides a basic exploration of the text and dataset.

## Dataset Summary

In [6]:
# Summary statistics for the numeric columns of `paragraphs`.
hlp.pandas.numeric_summary(paragraphs)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
year,279045,0,0.0%,0,0.0%,1992.4,12.6,0.0,0.1,-1.1,1970,1975.0,1982.0,1993.0,2003.0,2010.0,2015


In [7]:
# Summary of the non-numeric columns of `paragraphs` (nulls, most frequent value, uniqueness).
hlp.pandas.non_numeric_summary(paragraphs)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
country,279045,0,0.0%,Russian Federation,199,0.1%
text,279045,0,0.0%,The President returned to the [...],278820,99.9%


In [8]:
# Sanity check: no paragraph is empty or whitespace-only.
has_blank_paragraph = paragraphs['text'].str.strip().eq('').any()
assert not has_blank_paragraph

---

# Topic Modeling

## NMF

In [10]:
# Wrap the loaded NMF model and its vectorizer in the explorer used by the NMF cells below.
nmf_explorer = TopicModelExplorer(model=nmf_model, vectorizer=nmf_vectorizer)

In [11]:
# One row per (topic, token) with the token's value and the topic's label.
nmf_explorer.extract_topic_dataframe()

Unnamed: 0,topic,token_index,token,value,topic_label
0,0,0,world,0.95,world | peace
1,0,1,peace,0.52,world | peace
2,0,2,states,0.52,world | peace
3,0,3,people,0.52,world | peace
4,0,4,country,0.48,world | peace
...,...,...,...,...,...
95,9,5,goals,0.76,development | social
96,9,6,global,0.67,development | social
97,9,7,development goals,0.59,development | social
98,9,8,agenda,0.59,development | social


### Overview

In [9]:
# Plot the top 8 tokens of each NMF topic; topic labels are built from 2 tokens.
# NOTE(review): the saved output of this cell is a NameError left over from out-of-order
# execution (this cell ran as In [9], before `nmf_explorer` was created in In [10]).
# Restart the kernel and Run All to refresh it.
nmf_explorer.plot_topics(
    top_n_tokens=8,
    num_tokens_in_label=2
)

NameError: name 'nmf_explorer' is not defined

In [None]:
# Plot the NMF topic sizes computed over all paragraphs.
nmf_explorer.plot_topic_sizes(text_series=paragraphs['text'])

In [None]:
# Vectorize the paragraphs, transform them with the NMF model, and total the predicted
# values of each document across all topics.
nmf_features = nmf_vectorizer.transform(paragraphs['text'])
predicted_topics = nmf_model.transform(X=nmf_features)
per_document_totals = predicted_topics.sum(axis=1)

# Horizontal box plot of the per-document totals; the y tick labels are removed.
ax = pd.Series(per_document_totals).plot(kind='box', vert=False, figsize=(10, 1))
ax.set(
    title="Distribution Sum of Predicted Values/Topics Per Document",
    xlabel="Sum of Predicted Values Per Document",
)
ax.set_yticklabels([])
ax;

---

### Trends

In [None]:
# NMF topic sizes per segment, using the `year` column of `paragraphs` as the segment.
topic_sizes_per_year = nmf_explorer.get_topic_sizes_per_segment(
    df=paragraphs,
    text_column='text',
    segment_column='year',
)
# Preview the first rows.
topic_sizes_per_year.head()

In [None]:
# Area chart of `relative_size` by `year`, one color per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.area(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Bar chart of `relative_size` by `year`, one color per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.bar(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Line chart of `relative_size` by `year`, one line per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.line(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Scatter of `relative_size` by `year` with a LOWESS trendline per topic label.
# NOTE(review): opacity=0.0 presumably hides the raw markers so only the trendlines are
# visible — confirm.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.scatter(
    topic_sizes_per_year,
    trendline="lowess",
    opacity=0.0,
    title="Topics Over Time",
    **trend_encoding,
)
fig.show()

---

### Examples

In [None]:
# Extract top example paragraphs from the NMF explorer (`top_n_examples=10`).
top_examples = nmf_explorer.extract_top_examples(
    text_series=paragraphs['text'],
    top_n_examples=10
)
# Render the table without its index column.
top_examples.style.hide(axis='index')

---

## LDA

Neither the book nor the example above uses TF-IDF with LDA, and neither explains why. Both use TF-IDF with NMF and then switch to `CountVectorizer` for LDA.


https://stackoverflow.com/questions/44781047/necessary-to-apply-tf-idf-to-new-documents-in-gensim-lda-model/44789327#44789327

> LDA only needs a bag-of-word vector.



In [None]:
# Wrap the loaded LDA model and its vectorizer in the explorer used by the LDA cells below.
lda_explorer = TopicModelExplorer(model=lda_model, vectorizer=lda_vectorizer)

### Overview

In [None]:
# Plot the top 8 tokens of each LDA topic; labels use 2 tokens joined by ' | '.
lda_explorer.plot_topics(
    top_n_tokens=8,
    num_tokens_in_label=2,
    token_separator=' | '
)

In [None]:
# Plot the LDA topic sizes computed over all paragraphs; labels use 3 tokens joined by ' | '.
lda_explorer.plot_topic_sizes(
    text_series=paragraphs['text'],
    num_tokens_in_label=3,
    token_separator=' | '
)

In [None]:
# Vectorize the paragraphs, transform them with the LDA model, and total the predicted
# values of each document across all topics.
lda_features = lda_vectorizer.transform(paragraphs['text'])
predicted_topics = lda_model.transform(X=lda_features)
per_document_totals = predicted_topics.sum(axis=1)

# Horizontal box plot of the per-document totals; the y tick labels are removed.
ax = pd.Series(per_document_totals).plot(kind='box', vert=False, figsize=(10, 1))
ax.set(
    title="Distribution Sum of Predicted Values/Topics Per Document",
    xlabel="Sum of Predicted Values Per Document",
)
ax.set_yticklabels([])
ax;

---

### Trends

In [None]:
# LDA topic sizes per segment, using the `year` column of `paragraphs` as the segment.
# NOTE: this rebinds `topic_sizes_per_year`, which the NMF section above also assigns.
topic_sizes_per_year = lda_explorer.get_topic_sizes_per_segment(
    df=paragraphs,
    text_column='text',
    segment_column='year',
)
# Preview the first rows.
topic_sizes_per_year.head()

In [None]:
# Area chart of `relative_size` by `year`, one color per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.area(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Bar chart of `relative_size` by `year`, one color per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.bar(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Line chart of `relative_size` by `year`, one line per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.line(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Scatter of `relative_size` by `year` with a LOWESS trendline per topic label.
# NOTE(review): opacity=0.0 presumably hides the raw markers so only the trendlines are
# visible — confirm.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.scatter(
    topic_sizes_per_year,
    trendline="lowess",
    opacity=0.0,
    title="Topics Over Time",
    **trend_encoding,
)
fig.show()

---

### Examples

In [None]:
# Extract top example paragraphs from the LDA explorer (`top_n_examples=10`).
top_examples = lda_explorer.extract_top_examples(
    text_series=paragraphs['text'],
    top_n_examples=10
)
# Render the table without its index column.
top_examples.style.hide(axis='index')

---

In [None]:
from pathlib import Path

import pyLDAvis.sklearn

# Monkey patch the vectorizer: scikit-learn renamed `get_feature_names` to
# `get_feature_names_out`, and pyLDAvis still calls the old name.
lda_vectorizer.get_feature_names = lda_vectorizer.get_feature_names_out
lda_display = pyLDAvis.sklearn.prepare(lda_model, lda_vectors, lda_vectorizer, sort_topics=False)
# Uncomment to render the visualization inline instead of only saving it:
# pyLDAvis.display(lda_display)

file_name = f"output/models/topics/lda-{num_clusters}-n-grams-{ngrams_low}-{ngrams_high}.html"
# Create the output directory first; writing the HTML file fails if it does not exist.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
pyLDAvis.save_html(lda_display, file_name)

---

## K-Means

In [None]:
# K-Means explorer; unlike the NMF/LDA explorers above, it is also given the loaded vectors.
k_means_explorer = KMeansTopicExplorer(
    model=k_means_model,
    vectorizer=k_means_vectorizer,
    vectors=k_means_vectors
)

### Overview

In [None]:
# Plot the top 8 tokens of each K-Means topic; topic labels are built from 2 tokens.
k_means_explorer.plot_topics(
    top_n_tokens=8,
    num_tokens_in_label=2
)

In [None]:
# Plot the K-Means topic sizes computed over all paragraphs.
k_means_explorer.plot_topic_sizes(text_series=paragraphs['text'])

### Trends

In [None]:
# K-Means topic sizes per segment, using the `year` column of `paragraphs` as the segment.
# NOTE: this rebinds `topic_sizes_per_year`, which the NMF and LDA sections above also assign.
topic_sizes_per_year = k_means_explorer.get_topic_sizes_per_segment(
    df=paragraphs,
    text_column='text',
    segment_column='year',
)
# Preview the first rows.
topic_sizes_per_year.head()

In [None]:
# Area chart of `relative_size` by `year`, one color per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.area(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Bar chart of `relative_size` by `year`, one color per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.bar(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Line chart of `relative_size` by `year`, one line per topic label.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.line(topic_sizes_per_year, title="Topics Over Time", **trend_encoding)
fig.show()

In [None]:
# Scatter of `relative_size` by `year` with a LOWESS trendline per topic label.
# NOTE(review): opacity=0.0 presumably hides the raw markers so only the trendlines are
# visible — confirm.
trend_encoding = dict(x="year", y="relative_size", color="topic_labels")
fig = px.scatter(
    topic_sizes_per_year,
    trendline="lowess",
    opacity=0.0,
    title="Topics Over Time",
    **trend_encoding,
)
fig.show()

---

### Examples

In [None]:
# Extract top example paragraphs from the K-Means explorer (`top_n_examples=10`).
top_examples = k_means_explorer.extract_top_examples(
    text_series=paragraphs['text'],
    top_n_examples=10
)
# Render the table without its index column.
top_examples.style.hide(axis='index')

---