# Topic Modeling

<a href="https://colab.research.google.com/github/chu-ise/411A-2022/blob/main/notebooks/09/09-02_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
# Suppress all warnings for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings(action="ignore")

## Topic modeling using scikit-learn

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroups to keep; only posts from these six groups are loaded.
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
    "comp.sys.ibm.pc.hardware",
    "sci.crypt",
]

# Load the training split restricted to the categories above.
newsgroups_train = fetch_20newsgroups(categories=categories, subset="train")

n_train_docs = len(newsgroups_train.data)
print("#Train set size:", n_train_docs)
print("#Selected categories:", newsgroups_train.target_names)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer for the newsgroup posts.
cv = CountVectorizer(
    # Raw string: in a normal literal "\w" is an invalid escape sequence, which
    # Python reports with a warning. The pattern itself is unchanged: runs of at
    # least three word characters or apostrophes.
    token_pattern=r"[\w']{3,}",
    stop_words="english",
    max_features=2000,
    min_df=5,
    max_df=0.5,
)
# Document-term count matrix that the LDA cells below are fitted on.
review_cv = cv.fit_transform(newsgroups_train.data)


### LDA Model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Print NumPy floats with three decimals so the topic vectors stay readable.
np.set_printoptions(precision=3)

lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=5,
    topic_word_prior=0.1,
    doc_topic_prior=1.0,
    learning_method="online",
    n_jobs=-1,
    random_state=0,
)

# Fit the model and get one topic-weight row per document.
review_topics = lda.fit_transform(review_cv)
print("#shape of review_topics:", review_topics.shape)
print("#Sample of review_topics:", review_topics[0])

# Average over documents (axis 0). The label says "Mean" because np.mean is what
# is computed here; the previous label called this value a sum.
gross_topic_weights = np.mean(review_topics, axis=0)
print("#Mean topic weights of documents:", gross_topic_weights)

print("#shape of topic word distribution:", lda.components_.shape)


In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic, one line per topic.

    Args:
        model: object exposing ``components_``; it is iterated row by row and
            each row must provide ``argsort()``.
        feature_names: sequence indexed with the positions returned by
            ``argsort()`` to obtain the word for each position.
        n_top_words: how many words to list for each topic.

    Each line has the form ``Topic #<index>: word1, word2, ...``; a single
    blank line is printed after the last topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading positions.
        best_positions = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[pos] for pos in best_positions]
        print("Topic #%d: " % topic_no + ", ".join(words))
    print()


# List the ten highest-weighted vocabulary terms of each fitted topic.
print_top_words(
    model=lda,
    feature_names=cv.get_feature_names_out(),
    n_top_words=10,
)


### Optimal number of topics


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def show_perplexity(
    cv,
    start=10,
    end=30,
    max_iter=5,
    topic_word_prior=0.1,
    doc_topic_prior=1.0,
):
    """Fit one LDA model per topic count and plot perplexity against the count.

    Args:
        cv: data passed to both ``fit`` and ``perplexity`` of every model
            (the caller in this notebook passes the document-term matrix).
        start: first topic count to try (inclusive).
        end: last topic count to try (inclusive).
        max_iter: forwarded to ``LatentDirichletAllocation``.
        topic_word_prior: forwarded to ``LatentDirichletAllocation``.
        doc_topic_prior: forwarded to ``LatentDirichletAllocation``.

    Returns:
        The topic count whose model produced the smallest perplexity.

    Every model uses batch learning and ``random_state=7``. One progress line
    is printed per topic count and the curve is shown as a green line.
    """
    topic_counts = list(range(start, end + 1))
    perplexities = []

    for n_topics in topic_counts:
        candidate = LatentDirichletAllocation(
            n_components=n_topics,
            max_iter=max_iter,
            topic_word_prior=topic_word_prior,
            doc_topic_prior=doc_topic_prior,
            learning_method='batch',
            n_jobs=-1,
            random_state=7,
        )
        candidate.fit(cv)
        score = candidate.perplexity(cv)
        perplexities.append(score)
        print(f'n_components: {n_topics}, perplexity: {score:0.3f}')

    plt.plot(topic_counts, perplexities, 'g-')
    plt.show()
    # The position of the minimum is an offset from the first topic count tried.
    return start + perplexities.index(min(perplexities))

# Sweep 6..15 topics and report the count with the lowest perplexity.
best_n_components = show_perplexity(review_cv, start=6, end=15)
print("n_components with minimum perplexity:", best_n_components)

In [None]:
# Refit with 8 topics, more iterations and batch learning.
# NOTE(review): 8 is presumably taken from the perplexity sweep above; confirm.
lda = LatentDirichletAllocation(
    n_components=8,
    learning_method="batch",
    max_iter=20,
    doc_topic_prior=1.0,
    topic_word_prior=0.1,
    random_state=7,
    n_jobs=-1,
)
review_topics = lda.fit_transform(review_cv)

print_top_words(lda, cv.get_feature_names_out(), n_top_words=10)

## Topic modeling using Gensim


In [None]:
# %pip install --upgrade gensim

In [None]:
import nltk
  

# Download the NLTK 'stopwords' resource; the tokenizer cell below reads it via
# nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Raw string: in a normal literal "\w" is an invalid escape sequence, which
# Python reports with a warning. The pattern is unchanged: runs of at least
# three word characters or apostrophes.
RegTok = RegexpTokenizer(r"[\w']{3,}")
# Set of English stop words for fast membership tests in tokenizer().
english_stops = set(stopwords.words("english"))


def tokenizer(text):
    """Lower-case ``text``, split it with ``RegTok`` and drop English stop words.

    Returns:
        list of the remaining tokens, keeping only those longer than two
        characters.
    """
    lowered = text.lower()
    # Exclude stopwords.
    return [
        token
        for token in RegTok.tokenize(lowered)
        if len(token) > 2 and token not in english_stops
    ]


# One token list per newsgroup post, in the same order as the posts.
texts = list(map(tokenizer, newsgroups_train.data))

In [None]:
from gensim.corpora.dictionary import Dictionary

# Map every distinct token in the tokenised posts to an integer id.
dictionary = Dictionary(texts)
print('#Number of initial unique words in documents:', len(dictionary))

# Prune the vocabulary in place (thresholds: no_below=5, no_above=0.5, keep_n=2000).
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=2000)
print('#Number of unique words after removing rare and common words:', len(dictionary))

# Bag-of-words representation of every post using the pruned dictionary.
corpus = list(map(dictionary.doc2bow, texts))
print('#Number of unique tokens: %d' % len(dictionary))
print('#Number of documents: %d' % len(corpus))

In [None]:
%%time
from gensim.models import LdaModel

# Training settings for the gensim LDA model.
num_topics = 10
passes = 5

# Fixed random_state so repeated runs give the same model.
model = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    passes=passes,
    id2word=dictionary,
    random_state=7,
)


In [None]:
# Display the fitted topics, limited to 10 words per topic; as the last
# expression of the cell, the return value is rendered as the cell output.
model.print_topics(num_words=10)

In [None]:
# Topic distribution inferred for each document; only the first one is shown.
doc_topics = model.get_document_topics(corpus)
print("#topic distribution of the first document: ", doc_topics[0])


In [None]:
import pandas as pd


def make_topictable_per_doc(ldamodel, corpus):
    """Build a table holding the dominant topic of every document.

    Args:
        ldamodel: model that can be indexed with a corpus (``ldamodel[corpus]``)
            and exposes a ``per_word_topics`` flag. When the flag is truthy,
            the first item of each per-document result is used as the
            ``(topic, weight)`` list; otherwise the result itself is used.
        corpus: documents to score.

    Returns:
        pandas.DataFrame with integer column labels 0, 1 and 2 and one row per
        document: the id of the highest-weighted topic, that weight rounded to
        four decimals, and the unmodified per-document result. Documents whose
        topic list is empty produce no row.
    """
    # Collect plain rows and build the frame once. ``DataFrame.append`` was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    rows = []
    for topic_list in ldamodel[corpus]:
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        # max() returns the first of several equal maxima, which is the same
        # entry a stable descending sort would place first.
        top = max(doc, key=lambda pair: pair[1], default=None)
        if top is None:
            continue
        topic_num, prop_topic = top
        rows.append([int(topic_num), round(prop_topic, 4), topic_list])
    return pd.DataFrame(rows)


In [None]:
# reset_index() turns the row index into the first column (the document number).
topictable = make_topictable_per_doc(model, corpus).reset_index()
topictable.columns = ['Doc No.', 'Top topic', 'Top topic weight', 'topic weights']
topictable.head()

In [None]:
%%capture
%pip install pyLDAvis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

# Switch pyLDAvis to notebook rendering.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the gensim model, its corpus and dictionary.
vis = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
# Render the prepared visualisation as the cell output.
pyLDAvis.display(vis)


### Select number of topics by coherence and perplexity

In [None]:
from gensim.models import CoherenceModel

# Score the fitted gensim model with the "u_mass" coherence measure on the
# same corpus it was trained on.
cm = CoherenceModel(model=model, corpus=corpus, coherence="u_mass")
coherence = cm.get_coherence()
print(coherence)

In [None]:
def show_coherence(corpus, dictionary, start=6, end=15):
    """Train one gensim LDA model per topic count and plot two quality curves.

    Args:
        corpus: bag-of-words corpus used for training and for both scores.
        dictionary: id-to-word mapping passed to ``LdaModel`` as ``id2word``.
        start: first topic count to try (inclusive).
        end: last topic count to try (inclusive).

    For every topic count the value of ``log_perplexity(corpus)`` and the
    "u_mass" coherence are printed. Two figures follow: the first (green solid
    line) is labelled "perplexity", the second (red dashed line) "coherence".
    Nothing is returned.

    NOTE(review): the "perplexity" label is applied to the raw return value of
    ``log_perplexity``; confirm against the gensim documentation whether that
    value is a perplexity or a log-likelihood bound.
    """
    topic_counts = list(range(start, end + 1))
    perplexity_scores = []
    coherence_scores = []

    for n_topics in topic_counts:
        candidate = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            chunksize=1000,
            num_topics=n_topics,
            random_state=7,
        )
        perplexity_score = candidate.log_perplexity(corpus)
        perplexity_scores.append(perplexity_score)

        scorer = CoherenceModel(model=candidate, corpus=corpus, coherence="u_mass")
        coherence_score = scorer.get_coherence()
        coherence_scores.append(coherence_score)
        print(
            f"num_topics: {n_topics}, perplexity: {perplexity_score:0.3f}, "
            f"coherence: {coherence_score:0.3f}"
        )

    # One figure per metric, drawn in the same order as before.
    curves = (
        (perplexity_scores, "g-", "perplexity"),
        (coherence_scores, "r--", "coherence"),
    )
    for values, line_style, y_label in curves:
        plt.plot(topic_counts, values, line_style)
        plt.xlabel("num_topics")
        plt.ylabel(y_label)
        plt.show()


# Compare models with 6 to 15 topics.
show_coherence(corpus=corpus, dictionary=dictionary, start=6, end=15)

## Topic trends

In [None]:
%%capture
%pip install ekorpkit[dataset]

In [None]:
from ekorpkit import eKonf

# Compose a configuration from the 'corpus' config group, then fill in the
# fields for the FOMC corpus before instantiating it.
cfg = eKonf.compose(config_group='corpus')
cfg.name = 'fomc'
# NOTE(review): the "${cached_path:...}" value looks like an interpolation that
# downloads and caches the zip archive; confirm against ekorpkit.
cfg.data_dir = "${cached_path:'https://github.com/entelecheia/ekorpkit-config/raw/main/data/fomc.zip',true,false}"
cfg.automerge = True
# Build the corpus object described by the configuration and print it.
fomc = eKonf.instantiate(cfg)
print(fomc)

In [None]:
# Keep only rows whose content type is an FOMC statement, with a fresh 0..n-1 index.
is_statement = fomc.data.content_type == 'fomc_statement'
fomc_statements = fomc.data[is_statement].reset_index(drop=True)
# Calendar year of each statement, used later to group topic weights by year.
fomc_statements['year'] = fomc_statements.timestamp.dt.year
fomc_statements.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer for the FOMC statements.
# NOTE(review): this rebinds `cv`, replacing the vectorizer fitted on the
# newsgroup posts earlier in the notebook.
cv = CountVectorizer(
    # Raw string: in a normal literal "\w" is an invalid escape sequence, which
    # Python reports with a warning. The pattern itself is unchanged.
    token_pattern=r"[\w']{3,}",
    stop_words="english",
    max_features=1000,
    min_df=5,
    max_df=0.5,
)
# Document-term count matrix of the statement texts.
vec = cv.fit_transform(fomc_statements.text)
print(vec.shape)

In [None]:
%%time
from sklearn.decomposition import LatentDirichletAllocation

# Ten-topic LDA on the FOMC document-term matrix, seeded for repeatability.
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=0,
    n_jobs=-1,
)
# One topic-weight row per statement.
fomc_topics = lda.fit_transform(vec)


In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic, one line per topic.

    NOTE(review): this redefines the function of the same name from an earlier
    cell; unlike that version, no blank line is printed after the last topic.

    Args:
        model: object exposing ``components_``; it is iterated row by row and
            each row must provide ``argsort()``.
        feature_names: sequence indexed with the positions returned by
            ``argsort()`` to obtain the word for each position.
        n_top_words: how many words to list for each topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # Descending order of weight: reverse the ascending argsort result.
        ranked = weights.argsort()[::-1]
        label = "Topic #%d: " % topic_no
        print(label + ", ".join(feature_names[pos] for pos in ranked[:n_top_words]))

# List the ten highest-weighted terms of every FOMC topic.
print_top_words(
    model=lda,
    feature_names=cv.get_feature_names_out(),
    n_top_words=10,
)


In [None]:
import pandas as pd 

# One column per fitted topic. The column count comes from the width of the
# topic matrix rather than a hard-coded 10, so the cell keeps working when the
# number of LDA components is changed.
trend_data = pd.DataFrame(
    fomc_topics,
    columns=["Topic" + str(i) for i in range(1, fomc_topics.shape[1] + 1)],
)
# Attach each statement's year; rows are aligned on the shared 0..n-1 index.
trend_data = pd.concat([trend_data, fomc_statements.year], axis=1)
trend_data.head()

In [None]:
# Average topic weight per calendar year.
trend = trend_data.groupby(by=['year']).mean()
trend.head()

In [None]:
import matplotlib.pyplot as plt

# 5 x 2 grid: one panel per topic column of `trend`, filled row by row.
fig, axes = plt.subplots(nrows=5, ncols=2, sharex="col", figsize=(12, 16))
for ax, col in zip(axes.ravel(), trend.columns.tolist()):
    ax.plot(trend[col])
    ax.set_title(col)
    # The x axis is hidden on every panel.
    ax.axes.xaxis.set_visible(False)
plt.show()