# Using BERTopic for topic modelling 

In [None]:
import re
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

# Step 1: Extract and preprocess documents
documents = []
for i in range(475):  # Your 475 files
    file_path = f"../corpus/program{i}.txt"
    with open(file_path, 'r') as file:
        content = file.read()
        # Extract content between DOC tags
        docs = re.findall(r'<DOC>(.*?)</DOC>', content, re.DOTALL)
        for doc in docs:
            # Split by line breaks or into sentences
            segments = [segment.strip() for segment in doc.split('\n') if segment.strip()]
            documents.extend(segments)

# Step 2: Configure BERTopic
# Remove stopwords after embedding to preserve context
vectorizer_model = CountVectorizer(stop_words="english", min_df=10)

# Step 3: Create and train the model
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    min_topic_size=50,  # Adjust based on your needs
    low_memory=True,    # For memory efficiency
    calculate_probabilities=False  # For speed and memory efficiency
)

# Step 4: Fit the model
topics, probs = topic_model.fit_transform(documents)


In [45]:
from gensim.models import CoherenceModel
from gensim import corpora

# Example: Assuming `topic_model` is the trained BERTopic model and `documents` is the list of your original documents

# Retrieve topics and their top words
topics = topic_model.get_topics()

# Extract the list of top words for each topic
topics_words = []
for topic_num, words in topics.items():
    if topic_num != -1:  # Exclude outlier topic
        topic_words = [word for word, _ in words]
        topics_words.append(topic_words)

# Preprocess documents (ensure they are tokenized and in the right format for CoherenceModel)
texts = [[word for word in doc.split()] for doc in documents]  # Assuming basic tokenization here

# Create a dictionary and corpus from the documents
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Compute coherence score
coherence_model = CoherenceModel(topics=topics_words, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score}")


Coherence Score: 0.5478099941520221


In [None]:
# Create an interactive visualization of topics
fig = topic_model.visualize_topics()

# To save the visualization as HTML
fig.write_html("topics_visualization.html")

In [37]:
topic_model.visualize_distribution(probs, min_probability=0.05)

In [38]:
topic_model.visualize_barchart()


In [54]:
topic_model.visualize_documents(documents)

In [50]:
topic_model.visualize_hierarchy()


In [51]:
topic_model.visualize_heatmap(n_clusters=20)