In [None]:
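# Optional one-time setup: uncomment the lines below to install or upgrade
# the third-party packages before running the rest of the notebook.
# ! pip install tqdm
# ! pip install pandas
# ! pip install -U scikit-learn
# ! pip install numpy -U
# ! pip install gensim -U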

In [2]:
import os
import json

In [3]:
# --- Directory and file name components -------------------------------------
data_dir_name = "data"
documents_dir_name = "corpus"
models_dir_name = "models"
lda_dir_name = "lda"
lda_log_dir = "lda_topics"
documents_file_name = "lemmatized_lifestyle_documents.json"

# --- Paths derived from the components above --------------------------------
# Folder that holds the corpus files.
working_dir = os.path.join(data_dir_name, documents_dir_name)
# JSON file with the documents that are loaded and vectorized below.
documents_file = os.path.join(working_dir, documents_file_name)

In [4]:
# Load the corpus from disk. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
# NOTE(review): the rest of the notebook calls `.split()` on each entry and
# passes the whole object to CountVectorizer, so this is presumably a list of
# strings -- confirm against the notebook that writes the file.
with open(documents_file, "r", encoding="utf-8") as file:
    lifestyle_documents = json.load(file)

In [5]:
# Flatten the corpus into one long list of whitespace-separated tokens,
# keeping document order and token order within each document.
lifestyle_bag_of_words = [
    token
    for document in lifestyle_documents
    for token in document.split()
]

In [6]:
from collections import Counter

# Tally how often every token occurs across the whole corpus.
lifestyle_words_freqs = Counter()
lifestyle_words_freqs.update(lifestyle_bag_of_words)

# The 60 most frequent (word, count) pairs, most frequent first.
lifestyle_most_common_words = lifestyle_words_freqs.most_common(n=60)

In [7]:
# Seed the custom stop-word list with the 37 most frequent words
# (the count part of each (word, count) pair is discarded).
lifestyle_stop_words = [word for word, _frequency in lifestyle_most_common_words[:37]]

In [8]:
# Manual adjustments to the frequency-based stop-word list.
#
# This cell is written so it can be re-run safely: running it twice neither
# appends duplicate entries nor raises ValueError because a word was already
# removed (or was never among the most frequent words in the first place).

# Extra words to filter out in addition to the most frequent ones.
extra_stop_words = ["kind", "yeah", "bro", "gent", "buy", "maybe", "probably", "oh"]

# Frequent words that must NOT be treated as stop words.
# NOTE(review): presumably kept because they are informative for the
# lifestyle topics -- confirm with the author.
words_to_keep = {"style", "shirt", "wear"}

lifestyle_stop_words = [
    word for word in lifestyle_stop_words if word not in words_to_keep
]
for word in extra_stop_words:
    if word not in lifestyle_stop_words:
        lifestyle_stop_words.append(word)

In [9]:
# Persist the stop-word list next to the LDA models so it can be reused later.
# NOTE(review): the variable name is misspelled ("lifestye"); it is kept as-is
# because other cells may refer to it.
lifestye_stopwords_file_name = "lifestyle_stop_words_4_lda.json"

# Create the target directory up front: on a fresh checkout it does not exist
# yet, and open(..., "w") would fail with FileNotFoundError.
lifestyle_stopwords_dir = os.path.join(data_dir_name, models_dir_name, lda_dir_name)
os.makedirs(lifestyle_stopwords_dir, exist_ok=True)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is set to UTF-8 explicitly instead of relying on the platform default.
with open(os.path.join(lifestyle_stopwords_dir, lifestye_stopwords_file_name), "w", encoding="utf-8") as file:
    json.dump(lifestyle_stop_words, file, indent=4, ensure_ascii=False)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer used as input for the LDA models:
#   stop_words -- the custom list assembled above
#   max_df     -- drop terms that occur in more than 95% of the documents
#   min_df     -- drop terms that occur in fewer than 2 documents
vectorizer = CountVectorizer(
    stop_words=lifestyle_stop_words,
    max_df=0.95,
    min_df=2,
)

# Learn the vocabulary and build the document-term count matrix in one step.
doc_term_matrix = vectorizer.fit_transform(lifestyle_documents)

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm

# Number of highest-weighted words recorded for every topic.
N_TOP_WORDS = 25

# Running trial number; one trial == one fitted model (one value of n_topics).
count = 1

# Column-oriented log: one row per (trial, topic). The last four columns are
# left empty here, presumably to be filled in by hand when reviewing the CSV.
lda_topic_modeling_log = {
    column: []
    for column in (
        "Trial #",
        "# of Topics",
        "Topic #",
        "Top Words",
        "Label(s)",
        "Interpretability",
        "Related or Similar to",
        "Notes",
    )
}

# Fitted models keyed by their number of topics.
lda_models = {}

# The vocabulary is fixed once the vectorizer is fitted, so it is looked up a
# single time instead of once per model.
words = vectorizer.get_feature_names_out()

# Fit one model for every topic count from 3 to 15 (inclusive).
for n_topics in tqdm(range(3, 16), desc="Fitting LDA Models"):
    # Fixed random_state so repeated runs give the same topics.
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(doc_term_matrix)
    lda_models[n_topics] = lda

    for topic_idx, topic in enumerate(lda.components_):
        # Indices of the N_TOP_WORDS largest weights, largest first.
        top_word_ids = topic.argsort()[::-1][:N_TOP_WORDS]

        lda_topic_modeling_log["Trial #"].append(f"Trial #{count}")
        lda_topic_modeling_log["# of Topics"].append(n_topics)
        lda_topic_modeling_log["Topic #"].append(topic_idx + 1)
        lda_topic_modeling_log["Top Words"].append(", ".join([words[i] for i in top_word_ids]))
        lda_topic_modeling_log["Label(s)"].append("")
        lda_topic_modeling_log["Interpretability"].append("")
        lda_topic_modeling_log["Related or Similar to"].append("")
        lda_topic_modeling_log["Notes"].append("")

    count += 1

Fitting LDA Models: 100%|██████████| 13/13 [00:54<00:00,  4.20s/it]


In [15]:
import pandas as pd

# Turn the column-oriented log into a table: one row per (trial, topic).
lda_topic_modeling_log_pd = pd.DataFrame.from_dict(lda_topic_modeling_log)

# Make sure the log directory exists before writing; to_csv does not create
# missing parent directories.
lda_log_path = os.path.join(data_dir_name, lda_log_dir)
os.makedirs(lda_log_path, exist_ok=True)
lda_topic_modeling_log_pd.to_csv(
    os.path.join(lda_log_path, "lda_topic_modeling_log.csv"),
    index=True,
    encoding="utf-8",
)

# Preview the log. This has to be the last expression of the cell, otherwise
# the notebook does not render it.
lda_topic_modeling_log_pd.head()

In [16]:
import joblib

# All LDA artefacts are written to <data>/<models>/<lda>; create it on demand.
save_path = os.path.join(data_dir_name, models_dir_name, lda_dir_name)
os.makedirs(save_path, exist_ok=True)

# One pickle per fitted LDA model, named after its number of topics.
for n_topics, model in lda_models.items():
    model_path = f"{save_path}/lda_{n_topics}_topics.pkl"
    joblib.dump(model, model_path)

# The fitted vectorizer (vocabulary + stop words) that produced the matrix.
vectorizer_path = f"{save_path}/count_vectorizer.pkl"
joblib.dump(vectorizer, vectorizer_path)

# The document-term matrix itself. This call stays last so that its return
# value (the list of written file names) is what the cell displays.
doc_term_matrix_path = f"{save_path}/doc_term_matrix.pkl"
joblib.dump(doc_term_matrix, doc_term_matrix_path)

['data/models/lda/doc_term_matrix.pkl']