In [8]:
#pip install bertopic

In [22]:
# Import required libraries
from bertopic import BERTopic
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd

In [23]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Initialize stop words
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Read the CSV file into a DataFrame
df = pd.read_csv('../Data/YT_title_test_data.csv')
#df = pd.read_csv('../Data/YT_title_test_data_500.csv')
documents = df['title']


In [39]:
# Step 3: Preprocess the text data
def preprocess_text(texts):
    return [simple_preprocess(doc, deacc=True) for doc in texts]

print("Preprocessing text data...")
tokenized_docs = preprocess_text(documents)

# Step 4: Train the BERTopic model
print("Training BERTopic model...")
topic_model = BERTopic(language="english", verbose=True)
topics, probs = topic_model.fit_transform(documents)

# Step 5: Reduce the number of topics
desired_num_topics = 2500  # Change this value to the number of topics you want
print(f"Reducing topics to {desired_num_topics}...")
topic_model = topic_model.reduce_topics(documents, nr_topics=desired_num_topics)

# Step 5: Extract top 10 words per topic
print("Extracting top words for each topic...")
topic_words = topic_model.get_topics()

# Step 5: Check the number of topics
print("Checking number of topics...")
topic_freq = topic_model.get_topic_freq()  # Get the frequency of topics
num_topics = len(topic_freq[topic_freq['Topic'] != -1])  # Exclude outlier (-1)
print(f"Number of topics generated: {num_topics}")

# Display topic frequencies
print("Topic frequencies:")
print(topic_freq)

2025-01-22 20:56:59,342 - BERTopic - Embedding - Transforming documents to embeddings.


Preprocessing text data...
Training BERTopic model...


Batches: 100%|██████████| 313/313 [00:08<00:00, 36.73it/s]
2025-01-22 20:57:09,902 - BERTopic - Embedding - Completed ✓
2025-01-22 20:57:09,903 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-22 20:57:15,189 - BERTopic - Dimensionality - Completed ✓
2025-01-22 20:57:15,190 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-22 20:57:15,598 - BERTopic - Cluster - Completed ✓
2025-01-22 20:57:15,607 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-22 20:57:15,821 - BERTopic - Representation - Completed ✓
2025-01-22 20:57:16,083 - BERTopic - Topic reduction - Reducing number of topics
2025-01-22 20:57:16,084 - BERTopic - Topic reduction - Reduced number of topics from 166 to 166


Reducing topics to 2500...
Extracting top words for each topic...
Checking number of topics...
Number of topics generated: 165
Topic frequencies:
     Topic  Count
0       -1   3740
11       0    391
37       1    330
20       2    167
50       3    153
..     ...    ...
156    160     10
159    161     10
84     162     10
139    163     10
82     164     10

[166 rows x 2 columns]


In [37]:
# Prepare top words for coherence score calculation
top_n_words = 10
topic_word_lists = []
for topic, words in topic_words.items():
    if topic != -1:  # Exclude outlier topics
        topic_word_lists.append([word[0] for word in words[:top_n_words]])

# Check if there are topics generated
if not topic_word_lists:
    print("No valid topics were generated. Try increasing the sample size or adjusting model parameters.")
else:
    # Step 6: Prepare corpus and dictionary for coherence calculation
    print("Preparing dictionary and corpus...")
    dictionary = Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

    # Step 7: Compute Coherence Score (Cv)
    print("Calculating coherence score...")
    coherence_model = CoherenceModel(
        topics=topic_word_lists, 
        texts=tokenized_docs, 
        dictionary=dictionary, 
        coherence='c_v'
    )
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence Score (Cv): {coherence_score}")

    # Display results
    print("\nTop 5 topics with their top words:")
    for i, words in enumerate(topic_word_lists[:30]):
        print(f"Topic {i}: {', '.join(words)}")

    # Optional: Visualize topics
    #print("Visualizing topics...")
    #topic_model.visualize_topics().show()

Preparing dictionary and corpus...
Calculating coherence score...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Coherence Score (Cv): 0.287265892633566

Top 5 topics with their top words:
Topic 0: the, you, to, of, your, that, are, and, this, from
Topic 1: in, of, president, after, to, us, on, over, and, for
Topic 2: to, in, iphone, million, of, for, is, apple, says, the
Topic 3: of, the, championship, victory, for, out, in, cup, woods, win
