In [15]:
#pip install bertopic

In [1]:
# Import required libraries
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [53]:
# Step 1: Load the 20 Newsgroups dataset and create a DataFrame
#print("Fetching 20 Newsgroups dataset...")
#newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
#df = pd.DataFrame({'text': newsgroups.data})
#print(f"Total documents: {len(df)}")

# Step 2: Take a random sample of 50 documents
documents = pd.read_csv('../Data/20NewsGroup.csv')
#documents = pd.read_csv('../Data/20NewsGroup_500.csv')

documents.head()

if 'Text' not in documents.columns:
    raise ValueError("The CSV file does not contain a 'Text' column. Please check the column name.")

documents['Text'] = documents['Text'].astype(str).fillna('')
# Convert to a list of strings for BERTopic
texts = documents['Text'].tolist()


In [54]:
# Step 3: Preprocess the text data
def preprocess_text(texts):
    return [simple_preprocess(doc, deacc=True) for doc in texts]

print("Preprocessing text data...")
tokenized_docs = preprocess_text(texts)

# Step 4: Train the BERTopic model
print("Training BERTopic model...")
topic_model = BERTopic(language="english", verbose=True)
topics, probs = topic_model.fit_transform(texts)

# Step 5: Reduce the number of topics
desired_num_topics = 1500  # Change this value to the number of topics you want
#print(f"Reducing topics to {desired_num_topics}...")
topic_model = topic_model.reduce_topics(texts, nr_topics=desired_num_topics)

# Step 5: Extract top 10 words per topic
print("Extracting top words for each topic...")
topic_words = topic_model.get_topics()

# Step 5: Check the number of topics
print("Checking number of topics...")
topic_freq = topic_model.get_topic_freq()  # Get the frequency of topics
num_topics = len(topic_freq[topic_freq['Topic'] != -1])  # Exclude outlier (-1)
print(f"Number of topics generated: {num_topics}")

# Display topic frequencies
print("Topic frequencies:")
print(topic_freq)

Preprocessing text data...


2025-01-22 20:58:07,693 - BERTopic - Embedding - Transforming documents to embeddings.


Training BERTopic model...


Batches: 100%|██████████| 589/589 [02:18<00:00,  4.26it/s]
2025-01-22 21:00:33,340 - BERTopic - Embedding - Completed ✓
2025-01-22 21:00:33,344 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-22 21:00:39,728 - BERTopic - Dimensionality - Completed ✓
2025-01-22 21:00:39,735 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` befor

Extracting top words for each topic...
Checking number of topics...
Number of topics generated: 209
Topic frequencies:
     Topic  Count
2       -1   6424
0        0   1848
28       1    622
49       2    473
1        3    439
..     ...    ...
118    204     11
204    205     10
205    206     10
183    207     10
208    208     10

[210 rows x 2 columns]


In [52]:
# Prepare top words for coherence score calculation
top_n_words = 10
topic_word_lists = []
for topic, words in topic_words.items():
    if topic != -1:  # Exclude outlier topics
        topic_word_lists.append([word[0] for word in words[:top_n_words]])

# Check if there are topics generated
if not topic_word_lists:
    print("No valid topics were generated. Try increasing the sample size or adjusting model parameters.")
else:
    # Step 6: Prepare corpus and dictionary for coherence calculation
    print("Preparing dictionary and corpus...")
    dictionary = Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

    # Step 7: Compute Coherence Score (Cv)
    print("Calculating coherence score...")
    coherence_model = CoherenceModel(
        topics=topic_word_lists, 
        texts=tokenized_docs, 
        dictionary=dictionary, 
        coherence='c_v'
    )
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence Score (Cv): {coherence_score}")

    # Display results
    print("\nTop 5 topics with their top words:")
    for i, words in enumerate(topic_word_lists[:50]):
        print(f"Topic {i}: {', '.join(words)}")

    # Optional: Visualize topics
    #print("Visualizing topics...")
    #topic_model.visualize_topics().show()

Preparing dictionary and corpus...
Calculating coherence score...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Coherence Score (Cv): 0.39039478011053835

Top 5 topics with their top words:
Topic 0: the, to, of, and, in, is, for, that, it, you
Topic 1: the, in, to, is, of, that, and, be, game, he
Topic 2: nan, consistently, wanted, know, just, to, , , , 
