In [None]:
# ! pip install seaborn

In [12]:
import joblib
import os
import json

In [29]:
# Directory names. `lda_log_dir` is not referenced by any cell shown here.
data_dir_name, models_dir_name, lda_dir_name = "data", "models", "lda"
documents_dir_name = "corpus"
lda_log_dir = "lda_topics"

# Input file names: the lemmatized corpus and the LDA-specific stop-word list.
# The "lifestye" spelling is kept because later cells read this exact name.
documents_file_name = "lemmatized_lifestyle_documents.json"
lifestye_stopwords_file_name = "lifestyle_stop_words_4_lda.json"

# Coherence-score files, one per LDA variant compared in this notebook.
sk_coherence_name = "sk_coherence_scores_top25.json"
gensim_true_coherence_name = "gensim_true_coherence_scores_top25.json"
gensim_fake_coherence_name = "gensim_fake_coherence_scores_top25.json"

# Paths derived from the names above.
lda_dir_path = os.path.join(data_dir_name, models_dir_name, lda_dir_name)
documents_file = os.path.join(data_dir_name, documents_dir_name, documents_file_name)

In [30]:
# Load the lemmatized documents. JSON is read as UTF-8 explicitly so the
# result does not depend on the platform's locale default encoding.
with open(documents_file, "r", encoding="utf-8") as file:
    lifestyle_documents = json.load(file)

# Load the lifestyle-specific stop words stored next to the LDA models.
stopwords_path = os.path.join(lda_dir_path, lifestye_stopwords_file_name)
with open(stopwords_path, "r", encoding="utf-8") as file:
    lifestye_stopwords = json.load(file)

# Remove the lifestyle stop words from the lemmatized texts for consistency.
# Membership is tested against a set so each token costs O(1) instead of a
# scan over the whole stop-word container; `lifestye_stopwords` itself is left
# exactly as loaded.
stopword_lookup = set(lifestye_stopwords)
lifestyle_tokens = [
    [token for token in document.split() if token not in stopword_lookup]
    for document in lifestyle_documents
]


# Recreating gensim's vocabulary
from gensim.corpora import Dictionary

id2word = Dictionary(lifestyle_tokens)
# Drop tokens seen in fewer than 2 documents or in more than 95% of them.
id2word.filter_extremes(no_below=2, no_above=0.95)

len(id2word)

11060

In [31]:
# Importing coherence scores
def _load_coherence_scores(file_name):
    """Read one coherence-score JSON file located in `lda_dir_path`.

    The file is opened as UTF-8 explicitly so the result does not depend on
    the platform's locale default encoding.
    """
    with open(os.path.join(lda_dir_path, file_name), "r", encoding="utf-8") as file:
        return json.load(file)


sk_coherence = _load_coherence_scores(sk_coherence_name)
gensim_true_coherence = _load_coherence_scores(gensim_true_coherence_name)
gensim_fake_coherence = _load_coherence_scores(gensim_fake_coherence_name)

In [None]:
import matplotlib.pyplot as plt

def sort_scores(score_dict):
    """Split a score mapping into parallel tuples ordered by numeric key.

    Args:
        score_dict: Mapping whose keys can be converted with ``int`` (JSON
            object keys arrive as strings) and whose values are the scores.

    Returns:
        ``(x, y)``: a tuple of integer keys in ascending order and a tuple of
        the matching scores. Both are empty tuples when ``score_dict`` is empty.
    """
    items = sorted((int(k), v) for k, v in score_dict.items())
    if not items:
        # zip(*[]) yields nothing, so unpacking it into x, y would raise.
        return (), ()
    x, y = zip(*items)
    return x, y

# Order each score series by number of topics before plotting.
x1, y1 = sort_scores(sk_coherence)
x2, y2 = sort_scores(gensim_true_coherence)
x3, y3 = sort_scores(gensim_fake_coherence)

# One entry per curve: (topic counts, scores, marker, legend label).
coherence_curves = (
    (x1, y1, 'o', 'sklearn LDA'),
    (x2, y2, 's', 'gensim LDA (native Dictionary)'),
    (x3, y3, '^', 'gensim LDA (CV Dictionary)'),
)

plt.figure(figsize=(10, 6))
for curve_x, curve_y, curve_marker, curve_label in coherence_curves:
    plt.plot(curve_x, curve_y, marker=curve_marker, label=curve_label)

plt.title("Coherence Scores by Number of Topics")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Importing the sklearn lda data for heatmaps

import re

count_vectorizer_file_name = "count_vectorizer.pkl"
doc_term_matrix_file_name = "doc_term_matrix.pkl"

# Load every saved sklearn LDA model, keyed by the topic count in its file name.
lda_models = {}

for item in os.listdir(lda_dir_path):
    if item.startswith("lda_") and item.endswith("_topics.pkl"):
        n_topics_match = re.search(r"lda_(\d+)_topics\.pkl", item)
        if n_topics_match:
            n_topics = int(n_topics_match.group(1))
            lda_models[n_topics] = joblib.load(os.path.join(lda_dir_path, item))

# os.listdir returns entries in arbitrary order; sort by topic count so that
# iteration is deterministic and matches the two gensim model dictionaries.
lda_models = dict(sorted(lda_models.items(), key=lambda item: item[0], reverse=False))


# Load the vectorizer
vectorizer = joblib.load(os.path.join(lda_dir_path, count_vectorizer_file_name))

# Load the doc-term matrix
doc_term_matrix = joblib.load(os.path.join(lda_dir_path, doc_term_matrix_file_name))

In [33]:
# Importing "true" gensim LDA models data for heatmap

gensim_true_lda_models = {}

for item in os.listdir(lda_dir_path):
    # Skip anything that is not a saved "true" gensim model.
    if not item.startswith("gensim_true_lda_"):
        continue
    if not item.endswith("_topics.pkl"):
        continue
    n_topics_match = re.search(r"gensim_true_lda_(\d+)_topics\.pkl", item)
    if not n_topics_match:
        continue
    n_topics = int(n_topics_match.group(1))
    gensim_true_lda_models[n_topics] = joblib.load(os.path.join(lda_dir_path, item))

# Rebuild the dictionary with its keys (topic counts) in ascending order.
gensim_true_lda_models = {
    topic_count: gensim_true_lda_models[topic_count]
    for topic_count in sorted(gensim_true_lda_models)
}

In [34]:
"""
Loading 'fake' gensim models
(models fitted using id2word based on count vectorizer's vocab)
"""
gensim_fake_lda_models = {}

for item in os.listdir(lda_dir_path):
    # Skip anything that is not a saved "fake" gensim model.
    if not item.startswith("gensim_fake_lda_"):
        continue
    if not item.endswith("_topics.pkl"):
        continue
    n_topics_match = re.search(r"gensim_fake_lda_(\d+)_topics\.pkl", item)
    if not n_topics_match:
        continue
    n_topics = int(n_topics_match.group(1))
    gensim_fake_lda_models[n_topics] = joblib.load(os.path.join(lda_dir_path, item))

# Rebuild the dictionary with its keys (topic counts) in ascending order.
gensim_fake_lda_models = {
    topic_count: gensim_fake_lda_models[topic_count]
    for topic_count in sorted(gensim_fake_lda_models)
}

In [35]:
def get_data_for_heatmap_sk(lda_model, vocabulary=vectorizer):
    """Collect the heatmap inputs for an sklearn LDA model.

    Args:
        lda_model: Object exposing a ``components_`` attribute.
        vocabulary: Object exposing ``get_feature_names_out()``. The default
            is bound to the ``vectorizer`` that exists when this cell runs, so
            re-run the cell after reloading the vectorizer.

    Returns:
        ``(words, word_weights)``: the feature names and ``components_``.
    """
    return vocabulary.get_feature_names_out(), lda_model.components_

def get_data_for_heatmap_true_gensim(lda_model, vocabulary=id2word):
    """Collect the heatmap inputs for a gensim LDA model with its native dictionary.

    Args:
        lda_model: Object exposing ``get_topics()``.
        vocabulary: Returned unchanged as the word lookup. The default is bound
            to the ``id2word`` that exists when this cell runs.

    Returns:
        ``(vocabulary, word_weights)`` where ``word_weights`` is the result of
        ``lda_model.get_topics()``.
    """
    topic_term_weights = lda_model.get_topics()
    return vocabulary, topic_term_weights

def get_data_for_heatmap_fake_gensim(lda_model, vocabulary=vectorizer):
    """Collect the heatmap inputs for a gensim LDA model fitted on the vectorizer's vocabulary.

    Args:
        lda_model: Object exposing ``get_topics()``.
        vocabulary: Object exposing ``get_feature_names_out()``. The default
            is bound to the ``vectorizer`` that exists when this cell runs.

    Returns:
        ``(words, word_weights)``: the feature names and the result of
        ``lda_model.get_topics()``.
    """
    return vocabulary.get_feature_names_out(), lda_model.get_topics()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [51]:
def plot_heatmap_for_topics(words, word_weights, top_n=20, width=10, height=12):
    """Draw a word-by-topic heatmap of each topic's highest-weighted words.

    Args:
        words: Lookup indexed by word position (``words[i]`` gives the word
            for column ``i`` of ``word_weights``).
        word_weights: Iterable of per-topic weight arrays; each must support
            ``argsort()`` and integer indexing.
        top_n: Number of highest-weighted words taken from every topic.
        width: Figure width passed to ``plt.figure``.
        height: Figure height passed to ``plt.figure``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    data_for_heatmap = []
    topic_labels = []

    for topic_idx, topic_weights in enumerate(word_weights):
        topic_label = f"Topic {topic_idx+1}"
        topic_labels.append(topic_label)
        # Indices of the top_n largest weights, largest first.
        top_word_indices = topic_weights.argsort()[::-1][:top_n]
        for word_index in top_word_indices:
            word = words[word_index]
            weight = topic_weights[word_index]
            data_for_heatmap.append((topic_label, word, weight))

    # Create pivot table for heatmap
    df = pd.DataFrame(data_for_heatmap, columns=["Topic", "Word", "Weight"])
    heatmap_data = df.pivot(index="Word", columns="Topic", values="Weight")

    # pivot sorts the column labels as strings, which puts "Topic 10" before
    # "Topic 2"; restore the numeric topic order.
    heatmap_data = heatmap_data.reindex(columns=topic_labels)

    # Order rows by total weight across topics, heaviest words first
    heatmap_data = heatmap_data.loc[heatmap_data.sum(axis=1).sort_values(ascending=False).index]

    # Plot heatmap
    plt.figure(figsize=(width, height))

    ax = sns.heatmap(heatmap_data, cmap="YlGnBu")
    # Place one tick at the centre of every row so every word gets a label.
    ax.set_yticks([i + 0.5 for i in range(len(heatmap_data.index))])
    ax.set_yticklabels(heatmap_data.index, fontsize=7, rotation=0)

    plt.title("Top Words per Topic")
    plt.tight_layout()
    plt.show()

In [None]:
# sklearn LDA, 6 topics: unpack (words, weights) straight into the plotter.
plot_heatmap_for_topics(*get_data_for_heatmap_sk(lda_models[6]))

In [None]:
# sklearn LDA, 13 topics: taller figure passed via height=20.
plot_heatmap_for_topics(*get_data_for_heatmap_sk(lda_models[13]), height=20)

In [None]:
# sklearn LDA, 4 topics: unpack (words, weights) straight into the plotter.
plot_heatmap_for_topics(*get_data_for_heatmap_sk(lda_models[4]))

In [None]:
# gensim LDA fitted on the CountVectorizer vocabulary, 6 topics.
plot_heatmap_for_topics(*get_data_for_heatmap_fake_gensim(gensim_fake_lda_models[6]))