In [8]:
#pip install bertopic

In [1]:
# Import required libraries
from bertopic import BERTopic
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Initialize stop words
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read the CSV file into a DataFrame
#df = pd.read_csv('../Data/YT_title_test_data.csv')
df = pd.read_csv('../Data/Final_Synthetic_Patient_Feedback_Dataset.csv')
documents = df['Patient Feedback']


In [4]:
# Step 3: Preprocess the text data
def preprocess_text(texts):
    return [simple_preprocess(doc, deacc=True) for doc in texts]

print("Preprocessing text data...")
tokenized_docs = preprocess_text(documents)

# Step 4: Train the BERTopic model
print("Training BERTopic model...")
topic_model = BERTopic(language="english", verbose=True)
topics, probs = topic_model.fit_transform(documents)

2025-02-05 15:09:20,491 - BERTopic - Embedding - Transforming documents to embeddings.


Preprocessing text data...
Training BERTopic model...


Batches: 100%|██████████| 15/15 [00:03<00:00,  4.16it/s]
2025-02-05 15:09:25,727 - BERTopic - Embedding - Completed ✓
2025-02-05 15:09:25,728 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-05 15:09:33,667 - BERTopic - Dimensionality - Completed ✓
2025-02-05 15:09:33,667 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-05 15:09:33,704 - BERTopic - Cluster - Completed ✓
2025-02-05 15:09:33,713 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-05 15:09:33,743 - BERTopic - Representation - Completed ✓


In [5]:
# Define the range of topics and top words per topic
num_topics_list = [3, 5, 10, 15, 20]
top_n_list = [5, 10, 15, 20]

# Store coherence scores
coherence_scores = []

for desired_num_topics in num_topics_list:
    print(f"\nTraining BERTopic model for {desired_num_topics} topics...")
    topic_model = BERTopic(language="english", verbose=True)
    topics, probs = topic_model.fit_transform(documents)

    print(f"Reducing topics to {desired_num_topics}...")
    topic_model = topic_model.reduce_topics(documents, nr_topics=desired_num_topics)

    print("Extracting top words for each topic...")
    topic_words = topic_model.get_topics()

    print("Checking number of topics...")
    topic_freq = topic_model.get_topic_freq()
    num_topics = len(topic_freq[topic_freq["Topic"] != -1])  # Exclude outliers (-1)
    print(f"Number of topics generated: {num_topics}")

    # Process topics for coherence evaluation
    for top_n in top_n_list:
        topic_word_lists = []
        for topic, words in topic_words.items():
            if topic != -1:  # Exclude outlier topics
                topic_word_lists.append([word[0] for word in words[:top_n]])

        if not topic_word_lists:
            print("No valid topics were generated. Skipping coherence calculation.")
            continue

        print("Preparing dictionary and corpus...")
        dictionary = Dictionary(tokenized_docs)
        corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

        print(f"Calculating coherence score for {top_n} words per topic...")
        coherence_model = CoherenceModel(
            topics=topic_word_lists,
            texts=tokenized_docs,
            dictionary=dictionary,
            coherence="c_v"
        )
        coherence_score = coherence_model.get_coherence()

        coherence_scores.append({
            "num_topics": desired_num_topics,
            "top_n": top_n,
            "coherence": coherence_score
        })

# Convert results into a dataframe for better visualization
df_coherence = pd.DataFrame(coherence_scores)

2025-02-05 15:09:36,514 - BERTopic - Embedding - Transforming documents to embeddings.



Training BERTopic model for 3 topics...


Batches: 100%|██████████| 15/15 [00:00<00:00, 23.62it/s]
2025-02-05 15:09:38,910 - BERTopic - Embedding - Completed ✓
2025-02-05 15:09:38,911 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-05 15:09:39,484 - BERTopic - Dimensionality - Completed ✓
2025-02-05 15:09:39,485 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-05 15:09:39,501 - BERTopic - Cluster - Completed ✓
2025-02-05 15:09:39,504 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-05 15:09:39,527 - BERTopic - Representation - Completed ✓
2025-02-05 15:09:39,559 - BERTopic - Topic reduction - Reducing number of topics
2025-02-05 15:09:39,582 - BERTopic - Topic reduction - Reduced number of topics from 15 to 3


Reducing topics to 3...
Extracting top words for each topic...
Checking number of topics...
Number of topics generated: 2
Preparing dictionary and corpus...
Calculating coherence score for 5 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 10 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 15 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 20 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Training BERTopic model for 5 topics...


Batches: 100%|██████████| 15/15 [00:00<00:00, 23.90it/s]
2025-02-05 15:09:49,546 - BERTopic - Embedding - Completed ✓
2025-02-05 15:09:49,546 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-05 15:09:50,115 - BERTopic - Dimensionality - Completed ✓
2025-02-05 15:09:50,116 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-05 15:09:50,153 - BERTopic - Cluster - Completed ✓
2025-02-05 15:09:50,236 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-05 15:09:50,276 - BERTopic - Representation - Completed ✓
2025-02-05 15:09:50,311 - BERTopic - Topic reduction - Reducing number of topics
2025-02-05 15:09:50,327 - BERTopic - Topic reduction - Reduced number of topics from 18 to 5


Reducing topics to 5...
Extracting top words for each topic...
Checking number of topics...
Number of topics generated: 4
Preparing dictionary and corpus...
Calculating coherence score for 5 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 10 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 15 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 20 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Training BERTopic model for 10 topics...


Batches: 100%|██████████| 15/15 [00:00<00:00, 24.33it/s]
2025-02-05 15:09:59,898 - BERTopic - Embedding - Completed ✓
2025-02-05 15:09:59,899 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-05 15:10:00,475 - BERTopic - Dimensionality - Completed ✓
2025-02-05 15:10:00,476 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-05 15:10:00,493 - BERTopic - Cluster - Completed ✓
2025-02-05 15:10:00,498 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-05 15:10:00,526 - BERTopic - Representation - Completed ✓
2025-02-05 15:10:00,563 - BERTopic - Topic reduction - Reducing number of topics
2025-02-05 15:10:00,585 - BERTopic - Topic reduction - Reduced number of topics from 17 to 10


Reducing topics to 10...
Extracting top words for each topic...
Checking number of topics...
Number of topics generated: 9
Preparing dictionary and corpus...
Calculating coherence score for 5 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 10 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 15 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 20 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Training BERTopic model for 15 topics...


Batches: 100%|██████████| 15/15 [00:00<00:00, 24.09it/s]
2025-02-05 15:10:09,631 - BERTopic - Embedding - Completed ✓
2025-02-05 15:10:09,632 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-05 15:10:10,202 - BERTopic - Dimensionality - Completed ✓
2025-02-05 15:10:10,203 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-05 15:10:10,218 - BERTopic - Cluster - Completed ✓
2025-02-05 15:10:10,221 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-05 15:10:10,244 - BERTopic - Representation - Completed ✓
2025-02-05 15:10:10,278 - BERTopic - Topic reduction - Reducing number of topics
2025-02-05 15:10:10,300 - BERTopic - Topic reduction - Reduced number of topics from 17 to 15


Reducing topics to 15...
Extracting top words for each topic...
Checking number of topics...
Number of topics generated: 14
Preparing dictionary and corpus...
Calculating coherence score for 5 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 10 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 15 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 20 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Training BERTopic model for 20 topics...


Batches: 100%|██████████| 15/15 [00:00<00:00, 25.04it/s]
2025-02-05 15:10:19,304 - BERTopic - Embedding - Completed ✓
2025-02-05 15:10:19,304 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-05 15:10:19,896 - BERTopic - Dimensionality - Completed ✓
2025-02-05 15:10:19,897 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-05 15:10:19,911 - BERTopic - Cluster - Completed ✓
2025-02-05 15:10:19,914 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-05 15:10:19,936 - BERTopic - Representation - Completed ✓
2025-02-05 15:10:19,969 - BERTopic - Topic reduction - Reducing number of topics
2025-02-05 15:10:19,970 - BERTopic - Topic reduction - Reduced number of topics from 16 to 16


Reducing topics to 20...
Extracting top words for each topic...
Checking number of topics...
Number of topics generated: 15
Preparing dictionary and corpus...
Calculating coherence score for 5 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 10 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 15 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preparing dictionary and corpus...
Calculating coherence score for 20 words per topic...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [6]:
# Display the dataframe
print("\nCoherence Scores:")
print(df_coherence)


Coherence Scores:
    num_topics  top_n  coherence
0            3      5   0.512585
1            3     10   0.468651
2            3     15   0.468651
3            3     20   0.468651
4            5      5   0.736372
5            5     10   0.573400
6            5     15   0.573400
7            5     20   0.573400
8           10      5   0.913477
9           10     10   0.616210
10          10     15   0.616210
11          10     20   0.616210
12          15      5   0.898848
13          15     10   0.675034
14          15     15   0.675034
15          15     20   0.675034
16          20      5   0.861202
17          20     10   0.670397
18          20     15   0.670397
19          20     20   0.670397
