In [None]:
# Dependencies used by this notebook (uncomment to install into the kernel's environment):
# %pip install joblib scikit-learn gensim numpy

In [1]:
import joblib
import os
import re
import json

In [2]:
# --- Directory names (joined below into relative paths) ---
data_dir_name = "data"
documents_dir_name = "corpus"
models_dir_name = "models"
lda_dir_name = "lda"
lda_log_dir = "lda_topics"

# --- File names ---
documents_file_name = "lemmatized_lifestyle_documents.json"
lifestye_stopwords_file_name = "lifestyle_stop_words_4_lda.json"

# --- Resolved locations ---
# Directory holding the pickled models, the vectorizer and the stop-word list.
lda_dir_path = os.path.join(data_dir_name, models_dir_name, lda_dir_name)
# JSON file with the lemmatized documents.
documents_file = os.path.join(data_dir_name, documents_dir_name, documents_file_name)

In [3]:
# Load the lemmatized documents (a JSON file read into `lifestyle_documents`).
with open(documents_file, "r") as file:
    lifestyle_documents = json.load(file)

count_vectorizer_file_name = "count_vectorizer.pkl"
doc_term_matrix_file_name = "doc_term_matrix.pkl"

# Load an LDA model (or LDA models), keyed by the topic count encoded in the file name.
# `re.fullmatch` anchors the pattern to the whole name: with an unanchored search,
# a file such as "lda_tmp_lda_5_topics.pkl" would be accepted and stored under 5.
lda_models = {}

for item in os.listdir(lda_dir_path):
    n_topics_match = re.fullmatch(r"lda_(\d+)_topics\.pkl", item)
    if n_topics_match:
        n_topics = int(n_topics_match.group(1))
        lda_models[n_topics] = joblib.load(os.path.join(lda_dir_path, item))

# Re-create the dict in ascending topic-count order so later loops iterate predictably.
lda_models = dict(sorted(lda_models.items(), key=lambda item: item[0], reverse=False))

# Load the vectorizer
vectorizer = joblib.load(os.path.join(lda_dir_path, count_vectorizer_file_name))

# Load the doc-term matrix
doc_term_matrix = joblib.load(os.path.join(lda_dir_path, doc_term_matrix_file_name))

In [4]:
# Load the 'lifestyle' stop-word list stored next to the LDA models.
stopwords_path = os.path.join(lda_dir_path, lifestye_stopwords_file_name)
with open(stopwords_path, "r") as file:
    lifestye_stopwords = json.load(file)

# Remove the stop words from the lemmatized texts for consistency.
# Membership is tested against a set: the loaded JSON container is checked once per
# token, and a set lookup avoids scanning the whole stop-word list every time.
stopword_lookup = set(lifestye_stopwords)
lifestyle_tokens = [
    [token for token in document.split() if token not in stopword_lookup]
    for document in lifestyle_documents
]

In [None]:
# Total number of tokens left after stop-word removal.
# Summing per-document lengths avoids materialising two throwaway lists.
sum(len(document_tokens) for document_tokens in lifestyle_tokens)

In [6]:
"""
Mimicking CountVectorizer's vocabulary inside a Gensim Dictionary
to be able to use sklearn LDA models to get coherence scores
"""

from gensim.corpora import Dictionary

# Vocabulary of the fitted vectorizer; a token's position in this array becomes its id.
feature_names = vectorizer.get_feature_names_out()

# Start from an empty gensim Dictionary and fill in both mappings by hand.
id2word = Dictionary()
id2word.token2id = dict(zip(feature_names, range(len(feature_names))))
# The reverse mapping is derived from token2id so the two always agree.
id2word.id2token = dict(zip(id2word.token2id.values(), id2word.token2id.keys()))

In [7]:
# Mimic a gensim lda model using the data from an lda model and count_vectorizer's vocab
import numpy as np

class SklearnLdaGensimWrapper:
    """Expose a fitted sklearn LDA model through a small gensim-like interface.

    Attributes:
        num_topics: the wrapped model's ``n_components``.
        feature_names: sequence mapping a term index to its token.
        id2word: dictionary object kept alongside the model (stored, not used here).
        topic_term_matrix: the wrapped model's ``components_`` (one row per topic).
    """

    def __init__(self, sklearn_lda_model, feature_names, id2word):
        self.id2word = id2word
        self.feature_names = feature_names
        self.topic_term_matrix = sklearn_lda_model.components_
        self.num_topics = sklearn_lda_model.n_components

    def show_topic(self, topicid, topn=10):
        """Return the ``topn`` highest-weighted terms of a topic.

        Args:
            topicid: row index into ``topic_term_matrix``.
            topn: number of (token, weight) pairs to return.

        Returns:
            List of ``(token, weight)`` tuples in descending weight order. The
            weights are the raw matrix entries (not normalised).
        """
        weights = self.topic_term_matrix[topicid]
        # argsort is ascending, so reverse it before taking the leading entries.
        best_first = weights.argsort()[::-1][:topn]
        ranked_terms = []
        for term_index in best_first:
            ranked_terms.append((self.feature_names[term_index], weights[term_index]))
        return ranked_terms

    def get_topics(self):
        """Return the topic-term matrix with every row scaled to sum to 1."""
        row_totals = self.topic_term_matrix.sum(axis=1)
        return self.topic_term_matrix / row_totals[:, np.newaxis]


In [None]:
from gensim.models import CoherenceModel

def batch_coherence_score(lda_model, feature_names, id2word, texts, topn=25):
    """Compute the 'c_v' coherence of a fitted sklearn LDA model via gensim.

    Args:
        lda_model: fitted sklearn LDA model (its ``n_components`` and
            ``components_`` are read through ``SklearnLdaGensimWrapper``).
        feature_names: sequence mapping a term index to its token.
        id2word: gensim dictionary whose vocabulary contains the topic words.
        texts: tokenized documents passed to ``CoherenceModel`` as ``texts``.
        topn: number of top words per topic used for the score.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    wrapper = SklearnLdaGensimWrapper(
        sklearn_lda_model=lda_model,
        feature_names=feature_names,
        id2word=id2word
    )

    topics = [[word for word, _ in wrapper.show_topic(i, topn=topn)]
              for i in range(wrapper.num_topics)]

    # `topn` must be forwarded: CoherenceModel defaults to topn=20 and trims longer
    # topic word lists to that length, so without it a "top 25" score would
    # silently be computed on the top 20 words only.
    coherence_model = CoherenceModel(
        topics=topics,
        texts=texts,
        dictionary=id2word,
        coherence='c_v',
        topn=topn
    )

    score = coherence_model.get_coherence()
    return score


In [None]:
# Coherence score of every sklearn LDA model, keyed by its number of topics.
sk_coherence_scores_top25 = {}
for n_topics, model in lda_models.items():
    score = batch_coherence_score(
        lda_model=model,
        feature_names=feature_names,
        id2word=id2word,
        texts=lifestyle_tokens,
        topn=25
    )
    sk_coherence_scores_top25[n_topics] = score

# Rank best-first once, after all models are scored. Sorting inside the loop
# re-sorted the whole dict on every iteration without changing the final order.
sk_coherence_scores_top25 = dict(sorted(sk_coherence_scores_top25.items(), key=lambda item: item[1], reverse=True))

# Persist the ranking next to the models (JSON turns the integer keys into strings).
with open(os.path.join(lda_dir_path, "sk_coherence_scores_top25.json"), "w") as file:
    json.dump(sk_coherence_scores_top25, file, indent=4, ensure_ascii=False)

In [10]:
"""
Loading 'fake' gensim models
(models fitted using id2word based on count vectorizer's vocab)
"""
# "Fake" gensim LDA models, keyed by the topic count encoded in the file name.
gensim_fake_lda_models = {}

for item in os.listdir(lda_dir_path):
    # fullmatch anchors the pattern to the whole file name; an unanchored search
    # could pick a topic count out of the middle of an unrelated name.
    n_topics_match = re.fullmatch(r"gensim_fake_lda_(\d+)_topics\.pkl", item)
    if n_topics_match:
        n_topics = int(n_topics_match.group(1))
        gensim_fake_lda_models[n_topics] = joblib.load(os.path.join(lda_dir_path, item))

# Re-create the dict in ascending topic-count order.
gensim_fake_lda_models = dict(sorted(gensim_fake_lda_models.items(), key=lambda item: item[0], reverse=False))

In [11]:
"""
Let's see coherence scores using
(1) id2word built from count_vectorizer's vocab (inherited from doing coherence scores for sklearn models above) and
(2) gensim models fitted using that vocab
"""

from gensim.models import CoherenceModel

# 'c_v' coherence (top 25 words per topic) of every "fake" gensim model,
# keyed by its number of topics.
gensim_fake_coherence_scores_top25 = {}

for n_topics, model in gensim_fake_lda_models.items():
    coherence_model = CoherenceModel(
        model=model,
        texts=lifestyle_tokens,
        dictionary=id2word,
        coherence='c_v',
        topn=25
    )
    coherence_score = coherence_model.get_coherence()
    gensim_fake_coherence_scores_top25[n_topics] = coherence_score

# Rank best-first once, after all models are scored. Sorting inside the loop
# re-sorted the whole dict on every iteration without changing the final order.
gensim_fake_coherence_scores_top25 = dict(sorted(gensim_fake_coherence_scores_top25.items(), key=lambda item: item[1], reverse=True))

# Persist the ranking next to the models (JSON turns the integer keys into strings).
with open(os.path.join(lda_dir_path, "gensim_fake_coherence_scores_top25.json"), "w") as file:
    json.dump(gensim_fake_coherence_scores_top25, file, indent=4, ensure_ascii=False)

In [None]:
"""
Let's see coherence scores using
(1) id2word built by gensim natively and
(2) gensim models fitted using that vocab ("true" models)
"""

from gensim.corpora import Dictionary

# Build the vocabulary natively with gensim from the stop-word-filtered tokens.
# NOTE(review): this rebinds `id2word`, which earlier cells set to the
# vectorizer-based dictionary; re-running those cells after this one would
# score against this vocabulary instead.
id2word = Dictionary(documents=lifestyle_tokens)

# Prune by document frequency (per gensim: tokens in fewer than `no_below`
# documents or in more than the `no_above` fraction of documents are removed).
# NOTE(review): presumably these thresholds match the ones used when the "true"
# models were fitted; verify against the training notebook.
id2word.filter_extremes(no_above=0.95, no_below=2)

# Vocabulary size after pruning (displayed as the cell output).
len(id2word)

In [13]:
# Load "true" gensim LDA models

# "True" gensim LDA models, keyed by the topic count encoded in the file name.
gensim_true_lda_models = {}

for item in os.listdir(lda_dir_path):
    # fullmatch anchors the pattern to the whole file name; an unanchored search
    # could pick a topic count out of the middle of an unrelated name.
    n_topics_match = re.fullmatch(r"gensim_true_lda_(\d+)_topics\.pkl", item)
    if n_topics_match:
        n_topics = int(n_topics_match.group(1))
        gensim_true_lda_models[n_topics] = joblib.load(os.path.join(lda_dir_path, item))

# Re-create the dict in ascending topic-count order.
gensim_true_lda_models = dict(sorted(gensim_true_lda_models.items(), key=lambda item: item[0], reverse=False))

In [14]:
from gensim.models import CoherenceModel

# 'c_v' coherence (top 25 words per topic) of every "true" gensim model,
# keyed by its number of topics.
gensim_true_coherence_scores_top25 = {}

for n_topics, model in gensim_true_lda_models.items():
    coherence_model = CoherenceModel(
        model=model,
        texts=lifestyle_tokens,
        dictionary=id2word,
        coherence='c_v',
        topn=25
    )
    coherence_score = coherence_model.get_coherence()
    gensim_true_coherence_scores_top25[n_topics] = coherence_score

# Rank best-first once, after all models are scored. Sorting inside the loop
# re-sorted the whole dict on every iteration without changing the final order.
gensim_true_coherence_scores_top25 = dict(sorted(gensim_true_coherence_scores_top25.items(), key=lambda item: item[1], reverse=True))

# Persist the ranking next to the models (JSON turns the integer keys into strings).
with open(os.path.join(lda_dir_path, "gensim_true_coherence_scores_top25.json"), "w") as file:
    json.dump(gensim_true_coherence_scores_top25, file, indent=4, ensure_ascii=False)