In [None]:
# ! pip install bertopic
# ! pip install sentence-transformers
# ! pip install -U nbformat

In [1]:
import os
import json

# --- Directory and file names (every path below is relative to the working directory) ---
data_dir_name = "data"
documents_dir_name = "corpus"
models_dir_name = "models"
bertopic_dir_name = "bert_topics"
bert_topics_dir_name = "bert_topics"  # holds the same value as bertopic_dir_name
default_model_dir_name = "default"

documents_file_name = "tokenized_lifestyle_documents.json"
transcripts_no_stopwords_name = "transcripts_no_stopwords.json"
transcripts_as_tokens_name = "transcripts_as_complete_tokens.json"
database_file_name_1 = "lifestyle_channels_full_db.json"
database_file_name_2 = "lifestyle_channels_full_db_2.json"

# --- Input files: all of them sit in <data>/<corpus> ---
documents_dir_path = os.path.join(data_dir_name, documents_dir_name)
documents_file = os.path.join(documents_dir_path, documents_file_name)
transcripts_no_stopwords_file = os.path.join(documents_dir_path, transcripts_no_stopwords_name)
transcripts_as_tokens_file = os.path.join(documents_dir_path, transcripts_as_tokens_name)
database_file_1 = os.path.join(documents_dir_path, database_file_name_1)
database_file_2 = os.path.join(documents_dir_path, database_file_name_2)

# --- Output directories: created on the spot if they do not exist yet ---
models_dir_path = os.path.join(data_dir_name, models_dir_name)
bertopic_dir_path = os.path.join(data_dir_name, bertopic_dir_name)
os.makedirs(models_dir_path, exist_ok=True)
os.makedirs(bertopic_dir_path, exist_ok=True)

# <data>/<models>/<bert_topics>/<default>; this directory is NOT created by this cell.
default_model_path = os.path.join(models_dir_path, bertopic_dir_name, default_model_dir_name)

In [3]:
# Load the two channel databases and the three pre-processed corpora from JSON.
#
# The encoding is passed explicitly: JSON text is UTF-8, whereas a bare open()
# decodes with the platform's locale encoding. On a non-UTF-8 locale that would
# mis-decode non-ASCII characters (e.g. the em dash in the "File missing" marker
# used below), and the marker filter would then silently stop matching.
with open(database_file_1, "r", encoding="utf-8") as file:
    lifestyle_database_1 = json.load(file)

with open(database_file_2, "r", encoding="utf-8") as file:
    lifestyle_database_2 = json.load(file)

with open(documents_file, "r", encoding="utf-8") as file:
    tokenized_documents = json.load(file)

# NOTE: the variable name is misspelled ("transcrips"); it is kept as is because
# later cells refer to it by this exact name.
with open(transcripts_no_stopwords_file, "r", encoding="utf-8") as file:
    transcrips_no_stopwords = json.load(file)

with open(transcripts_as_tokens_file, "r", encoding="utf-8") as file:
    transcripts_as_tokens = json.load(file)

# Both databases are concatenated with `+` and iterated item by item below
# (presumably each is a list of dict records — confirm against the data files).
lifestyle_database = lifestyle_database_1 + lifestyle_database_2

# Keep only the transcripts that exist and do not carry the "could not transcribe"
# placeholder text written for missing source files.
full_transcripts = [
    item["transcript"]
    for item in lifestyle_database
    if "transcript" in item
    and "File missing — could not transcribe." not in item["transcript"]
]

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN

# Transcripts with stopwords pre-removed

In [None]:
# Sentence-embedding model, loaded by its model name.
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Clustering model handed to BERTopic instead of letting it build its own.
hdbscan_model = HDBSCAN(min_cluster_size=5, min_samples=1, prediction_data=True)

# Topic model wired to the two components above; each topic keeps 30 words.
topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    top_n_words=30,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the transcripts that had their stopwords removed beforehand.
topics, probs = topic_model.fit_transform(transcrips_no_stopwords)

# Last expression of the cell, so the topic table is rendered as the cell output.
topic_model.get_topic_info()

In [None]:
# Snapshot the current topic table and persist it (index column included)
# as n_topics_log.csv inside the bert_topics output directory.
log_n_topics_pd = topic_model.get_topic_info()
log_n_topics_pd.to_csv(
    os.path.join(bertopic_dir_path, "n_topics_log.csv"),
    encoding="utf-8",
    index=True,
)

In [None]:
# Reduce the number of topics of the already-fitted model, using the same documents.
# No target topic count is passed, so the library default applies.
# NOTE(review): confirm that default for the installed BERTopic version and
# consider passing it explicitly so the result does not depend on the version.
# NOTE(review): `topics` / `probs` returned by fit_transform are not reassigned
# here; presumably the updated assignments live on the model — confirm before reuse.
topic_model.reduce_topics(transcrips_no_stopwords)

In [None]:
# Display the topic table again; this cell sits after the reduce_topics cell,
# so on a top-to-bottom run it shows the table after reduction.
topic_model.get_topic_info()

In [None]:
# Snapshot the topic table as it stands in this cell and persist it (index column
# included) as from_n_topics_log.csv inside the bert_topics output directory.
from_n_topics_pd = topic_model.get_topic_info()
from_n_topics_pd.to_csv(
    os.path.join(bertopic_dir_path, "from_n_topics_log.csv"),
    encoding="utf-8",
    index=True,
)