In [None]:
# ! pip install gensim

In [3]:
import os
import json

In [4]:
# --- Directory names (joined into relative paths below and in later cells) ---
data_dir_name = "data"
documents_dir_name = "corpus"
models_dir_name = "models"
lda_dir_name = "lda"
lda_log_dir = "lda_topics"

# --- File names ---
documents_file_name = "lemmatized_lifestyle_documents.json"
lifestye_stopwords_file_name = "lifestyle_stop_words_4_lda.json"

# --- Derived paths for the document corpus ---
working_dir = os.path.join(data_dir_name, documents_dir_name)
documents_file = os.path.join(working_dir, documents_file_name)

In [5]:
# Read the document collection from its JSON file.
# The encoding is given explicitly: open() otherwise falls back to a
# platform-dependent default, which can mis-decode non-ASCII text.
with open(documents_file, "r", encoding="utf-8") as file:
    lifestyle_documents = json.load(file)

In [6]:
# Read the stop-word collection stored under data/models/lda.
# As with any JSON file, decode it as UTF-8 rather than the platform default.
stopwords_path = os.path.join(data_dir_name, models_dir_name, lda_dir_name, lifestye_stopwords_file_name)
with open(stopwords_path, "r", encoding="utf-8") as file:
    lifestye_stopwords = json.load(file)

# Remove the lifestyle stop words from the lemmatized texts for consistency.
# Membership is checked against a set so each token costs one hash lookup
# instead of a scan of the whole stop-word collection; `lifestye_stopwords`
# itself is left exactly as loaded.
stopword_lookup = set(lifestye_stopwords)
lifestyle_tokens = [
    [token for token in document.split() if token not in stopword_lookup]
    for document in lifestyle_documents
]

In [8]:
from gensim.corpora import Dictionary

# Build the token <-> id vocabulary directly from the tokenised documents.
id2word = Dictionary(documents=lifestyle_tokens)

# Keep only tokens that occur in at least 2 documents and in no more than
# 95% of all documents.
id2word.filter_extremes(no_above=0.95, no_below=2)

# Encode every document as a bag-of-words list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, lifestyle_tokens))

In [10]:
from gensim.models import LdaModel

# Fitted models keyed by their number of topics (3 through 15).
gensim_true_lda_models = dict()

# Settings shared by every fit; only the number of topics varies per run.
true_lda_settings = {
    "corpus": corpus,
    "id2word": id2word,
    "random_state": 42,
    "passes": 10,
    "iterations": 100,
    "eval_every": None,
}

for n_topics in range(3, 16):
    lda_model = LdaModel(num_topics=n_topics, **true_lda_settings)
    gensim_true_lda_models[n_topics] = lda_model

In [11]:
# Column-oriented log with one row per (model, topic) pair. The last four
# columns are left blank so they can be filled in by hand afterwards.
# All columns are created up front, so the log keeps its header even when
# there are no models to report.
top_n_words = 25  # number of highest-weighted words listed per topic

gensim_true_lda_topic_modeling_log = {
    "Trial #": [],
    "# of Topics": [],
    "Topic #": [],
    "Top Words": [],
    "Label(s)": [],
    "Interpretability": [],
    "Related or Similar to": [],
    "Notes": [],
}

for trial_number, (topic_count, fitted_model) in enumerate(gensim_true_lda_models.items(), start=1):
    # Resolve word ids through the model's own dictionary rather than the
    # notebook-level `id2word`, which a later cell rebinds to a different
    # vocabulary; this keeps the cell correct when it is re-run out of order.
    model_vocabulary = fitted_model.id2word

    for topic_idx, topic_vector in enumerate(fitted_model.get_topics()):
        # Indices of the highest-weighted words, largest weight first.
        top_word_ids = topic_vector.argsort()[::-1][:top_n_words]

        gensim_true_lda_topic_modeling_log["Trial #"].append(f"Trial #{trial_number}")
        gensim_true_lda_topic_modeling_log["# of Topics"].append(int(topic_count))
        gensim_true_lda_topic_modeling_log["Topic #"].append(topic_idx + 1)
        gensim_true_lda_topic_modeling_log["Top Words"].append(
            ", ".join([model_vocabulary[word_id] for word_id in top_word_ids])
        )
        gensim_true_lda_topic_modeling_log["Label(s)"].append("")
        gensim_true_lda_topic_modeling_log["Interpretability"].append("")
        gensim_true_lda_topic_modeling_log["Related or Similar to"].append("")
        gensim_true_lda_topic_modeling_log["Notes"].append("")

In [12]:
import pandas as pd
import joblib

# Write the topic log as CSV. The target directory is created first because
# to_csv() does not create missing parent directories.
log_dir_path = os.path.join(data_dir_name, lda_log_dir)
os.makedirs(log_dir_path, exist_ok=True)

gensim_true_lda_topic_modeling_log_pd = pd.DataFrame.from_dict(gensim_true_lda_topic_modeling_log)
gensim_true_lda_topic_modeling_log_pd.to_csv(
    os.path.join(log_dir_path, "gensim_true_lda_topic_modeling_log.csv"),
    index=True,
    encoding="utf-8",
)

save_path = os.path.join(data_dir_name, models_dir_name, lda_dir_name)
os.makedirs(save_path, exist_ok=True)

# Save each fitted LDA model, one pickle per topic count.
for n_topics, model in gensim_true_lda_models.items():
    joblib.dump(model, os.path.join(save_path, f"gensim_true_lda_{n_topics}_topics.pkl"))

In [13]:
# Import the vocab from count_vectorizer to use it to mimic an id2word vocab

import joblib

# Location of the pickled vectorizer: data/models/lda/count_vectorizer.pkl
count_vectorizer_file_name = "count_vectorizer.pkl"
retrieval_path = os.path.join(data_dir_name, models_dir_name, lda_dir_name)
vectorizer_path = os.path.join(retrieval_path, count_vectorizer_file_name)

# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
vectorizer = joblib.load(vectorizer_path)

In [14]:
from gensim.corpora import Dictionary

# Mimicking CountVectorizer's vocabulary inside a Gensim Dictionary

# Terms known to the loaded vectorizer; each term's position becomes its token id.
feature_names = vectorizer.get_feature_names_out()
token_ids = dict(zip(feature_names, range(len(feature_names))))

# Start from an empty Dictionary and fill in both lookup directions by hand.
id2word = Dictionary()
id2word.token2id = token_ids
id2word.id2token = {token_id: token for token, token_id in token_ids.items()}

# Re-encode every document as bag-of-words against this vocabulary.
corpus = list(map(id2word.doc2bow, lifestyle_tokens))

In [16]:
from gensim.models import LdaModel

# Fitted models keyed by their number of topics (3 through 15).
gensim_fake_lda_models = dict()

# Settings shared by every fit; only the number of topics varies per run.
fake_lda_settings = {
    "corpus": corpus,
    "id2word": id2word,
    "random_state": 42,
    "passes": 10,
    "iterations": 100,
    "eval_every": None,
}

for n_topics in range(3, 16):
    lda_model = LdaModel(num_topics=n_topics, **fake_lda_settings)
    gensim_fake_lda_models[n_topics] = lda_model

In [17]:
# Column-oriented log with one row per (model, topic) pair. The last four
# columns are left blank so they can be filled in by hand afterwards.
# All columns are created up front, so the log keeps its header even when
# there are no models to report.
top_n_words = 25  # number of highest-weighted words listed per topic

gensim_fake_lda_topic_modeling_log = {
    "Trial #": [],
    "# of Topics": [],
    "Topic #": [],
    "Top Words": [],
    "Label(s)": [],
    "Interpretability": [],
    "Related or Similar to": [],
    "Notes": [],
}

for trial_number, (topic_count, fitted_model) in enumerate(gensim_fake_lda_models.items(), start=1):
    # Resolve word ids through the model's own dictionary rather than the
    # notebook-level `id2word`, which is rebound more than once in this
    # notebook; this keeps the cell correct when it is re-run out of order.
    model_vocabulary = fitted_model.id2word

    for topic_idx, topic_vector in enumerate(fitted_model.get_topics()):
        # Indices of the highest-weighted words, largest weight first.
        top_word_ids = topic_vector.argsort()[::-1][:top_n_words]

        gensim_fake_lda_topic_modeling_log["Trial #"].append(f"Trial #{trial_number}")
        gensim_fake_lda_topic_modeling_log["# of Topics"].append(int(topic_count))
        gensim_fake_lda_topic_modeling_log["Topic #"].append(topic_idx + 1)
        gensim_fake_lda_topic_modeling_log["Top Words"].append(
            ", ".join([model_vocabulary[word_id] for word_id in top_word_ids])
        )
        gensim_fake_lda_topic_modeling_log["Label(s)"].append("")
        gensim_fake_lda_topic_modeling_log["Interpretability"].append("")
        gensim_fake_lda_topic_modeling_log["Related or Similar to"].append("")
        gensim_fake_lda_topic_modeling_log["Notes"].append("")

In [18]:
import pandas as pd
import joblib

# Write the topic log as CSV. The target directory is created first because
# to_csv() does not create missing parent directories.
log_dir_path = os.path.join(data_dir_name, lda_log_dir)
os.makedirs(log_dir_path, exist_ok=True)

gensim_fake_lda_topic_modeling_log_pd = pd.DataFrame.from_dict(gensim_fake_lda_topic_modeling_log)
gensim_fake_lda_topic_modeling_log_pd.to_csv(
    os.path.join(log_dir_path, "gensim_fake_lda_topic_modeling_log.csv"),
    index=True,
    encoding="utf-8",
)

save_path = os.path.join(data_dir_name, models_dir_name, lda_dir_name)
os.makedirs(save_path, exist_ok=True)

# Save each fitted LDA model, one pickle per topic count.
for n_topics, model in gensim_fake_lda_models.items():
    joblib.dump(model, os.path.join(save_path, f"gensim_fake_lda_{n_topics}_topics.pkl"))