# Modern Topic Modeling: BERTopic
- Handles all text preprocessing: stop words, capitalization, etc.
- Transforms each document into its own embedding.

In [None]:
# import sys
# print(sys.executable)
# !pip install bertopic #install in terminal
from bertopic import BERTopic 
from sklearn.datasets import fetch_20newsgroups
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics.pairwise import cosine_distances
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess

## Data Read In
`fetch_20newsgroups` loads the 20 Newsgroups dataset, a large corpus of newsgroup posts; this notebook uses nine of its categories.

In [None]:
# Unfiltered load with the loader's default arguments.
# NOTE(review): `newsgroups` is not referenced by any cell visible here - confirm it is still needed.
newsgroups = fetch_20newsgroups()

# Newsgroups kept for topic modeling: space, PC/Mac hardware, vehicles, sports, medicine.
input_categories = [
    "sci.space",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.med",
]

# Raw post texts for the selected categories, with headers, footers and quoted
# replies stripped so that only the body of each post is modeled.
docs = fetch_20newsgroups(
    subset="all",
    categories=input_categories,
    remove=("headers", "footers", "quotes"),
)["data"]
len(docs)

## Model Application:
Embedding transformation and topic assignment by clustering

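Topic -1 is BERTopic's outlier topic: it collects documents that were not assigned to any cluster. Its top words tend to be stop words that would traditionally be filtered out in pre-processing, such as when using LDA topic modeling.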

In [None]:
# Vectorizer used for the topic representations: English stop words are removed
# for interpretability, and only purely alphabetic tokens of 3+ letters are kept.
vectorizer_model = CountVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]{2,}\b",
    stop_words="english",
)

# Fitting takes a few minutes (embedding transformation, dimensionality reduction,
# clustering and tf-idf representations all happen inside fit_transform), so the
# fitted model was saved once and is reloaded below. To refit, uncomment:
# topic_model = BERTopic(nr_topics=50, top_n_words=30,vectorizer_model=vectorizer_model)
# topics, probs = topic_model.fit_transform(docs)
# topic_model.save("bert_model")
topic_model = BERTopic.load("bert_model")

# Per-topic summary table; preview the first ten rows.
topic_df = topic_model.get_topic_info()
topic_df.head(10)

In [None]:
# Print the representations of topics 8 and 9, one item per line,
# with an empty line separating the two topics.
print(
    "topic 8:",
    topic_model.get_topic(8),
    "",
    "topic 9:",
    topic_model.get_topic(9),
    sep="\n",
)

In [None]:
# Show BERTopic's built-in topic plot for the loaded model (rendered as the cell output).
topic_model.visualize_topics()

In [None]:
# Show BERTopic's built-in topic hierarchy plot for the loaded model (rendered as the cell output).
topic_model.visualize_hierarchy()

## Improvements: Metric-Driven Model Evaluation, Hyperparameter Tuning, Pre-Processing

In [None]:
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_distances
from scipy.cluster.hierarchy import linkage, leaves_list, optimal_leaf_ordering
from scipy.spatial.distance import squareform

In [None]:
# Build the word lists fed to the coherence metrics: the first `topk` words of
# every topic, dropping the weights and skipping topic -1.
topk = 10
topics_dict = topic_model.get_topics()
topic_words = [
    [term for term, _weight in ranked_terms[:topk]]
    for label, ranked_terms in topics_dict.items()
    if label != -1
]
topic_words[:3]

## Coherence Metric 

In [None]:
# Tokenize every document with gensim and build the matching dictionary;
# both are required by CoherenceModel as the reference corpus.
texts = list(map(simple_preprocess, docs))
dictionary = Dictionary(texts)

# C_v coherence of the topic word lists against that corpus.
cv_model = CoherenceModel(
    topics=topic_words,
    texts=texts,
    dictionary=dictionary,
    coherence="c_v",
)
coherence_cv = cv_model.get_coherence()
coherence_cv

In [None]:
# NPMI coherence of the same topic word lists, reusing the tokenized corpus
# and dictionary built for the C_v score.
npmi_model = CoherenceModel(
    topics=topic_words,
    texts=texts,
    dictionary=dictionary,
    coherence="c_npmi",
)
coherence_npmi = npmi_model.get_coherence()
coherence_npmi

## Inter-topic Distance

In [None]:
# Pairwise cosine distance between the fitted topic embeddings.
# NOTE(review): topic_embeddings_ may include a row for outlier topic -1, which the
# coherence word lists exclude - confirm whether it should be dropped here as well.
topic_embeddings = topic_model.topic_embeddings_
distance_matrix = cosine_distances(topic_embeddings)
# Condensed (1-D) form of the square matrix, as expected by scipy's linkage routines.
condensed_dist = squareform(distance_matrix)

# Mean over the strict upper triangle: every unordered topic pair is counted
# once and the zero self-distances on the diagonal are left out.
mean_intertopic_distance = np.mean(
    distance_matrix[np.triu_indices_from(distance_matrix, k=1)]
)
print("mean_intertopic_distance:", mean_intertopic_distance)

# Average-linkage hierarchical clustering, followed by optimal leaf ordering so
# that similar topics end up next to each other in the heatmap.
Z = linkage(condensed_dist, method="average")
Z = optimal_leaf_ordering(Z, condensed_dist)
order = leaves_list(Z)
distance_matrix_ordered = distance_matrix[np.ix_(order, order)]

# The figure must be created BEFORE drawing: calling plt.figure() after the
# heatmap opens a new, empty figure and leaves the heatmap at the default size.
plt.figure(figsize=(6, 5))
sns.heatmap(
    distance_matrix_ordered,
    cmap="viridis",
    vmin=0,
    # Cap the color scale at the 90th percentile of all distances so the
    # low-distance structure near the diagonal keeps its contrast.
    vmax=np.percentile(distance_matrix, 90),
)
plt.title("Inter-Topic Cosine Distance (Clustered & Ordered)")
plt.tight_layout()
plt.show()

# Already computed above; repeated as the last expression so the notebook displays it.
mean_intertopic_distance

In [None]:
# Summary of the evaluation metrics computed for the loaded model.
dict(
    coherence_cv=coherence_cv,
    coherence_npmi=coherence_npmi,
    mean_intertopic_distance=mean_intertopic_distance,
    num_topics=len(topic_words),
)

## Optimal Number of Topics

In [None]:
# Sweep the requested number of topics and record the C_v coherence of each fit.
# The tokenized corpus and dictionary are rebuilt here so the cell can be run
# without first running the coherence cells.
texts = [simple_preprocess(doc) for doc in docs]
dictionary = Dictionary(texts)

topk = 10
coherence_scores = {}  # nr_topics -> C_v coherence

for n in [5, 10, 20, 50]:
    print("nr_topics =", n)

    # Fresh model per setting; each iteration refits on the full corpus.
    model_n = BERTopic(nr_topics=n, verbose=False)
    topics_n, _ = model_n.fit_transform(docs)

    topics_dict = model_n.get_topics()
    # Skip topic -1 so that these scores (and the topic count printed below)
    # are computed the same way as the baseline `topic_words`, which also
    # excludes topic -1.
    topic_words_n = [
        [word for word, _ in words[:topk]]
        for topic_id, words in topics_dict.items()
        if topic_id != -1
    ]

    print("actual_topics =", len(topic_words_n))

    coherence = CoherenceModel(
        topics=topic_words_n,
        texts=texts,
        dictionary=dictionary,
        coherence="c_v"
    ).get_coherence()

    coherence_scores[n] = coherence
coherence_scores

In [None]:
# Line plot of C_v coherence against the requested number of topics,
# with the x values in ascending order.
n_topics = sorted(coherence_scores)
scores = [coherence_scores[count] for count in n_topics]

plt.figure(figsize=(6, 4))
plt.plot(n_topics, scores, marker="o")
plt.title("Topic Coherence vs Topic Granularity (BERTopic)")
plt.xlabel("Number of Topics (nr_topics)")
plt.ylabel("Coherence (C_v)")
plt.grid(True)
plt.show()

## Pre-Processing: Stop-words, Alphabetical Characters, Repetition Removal

In [None]:
def remove_repeated_patterns(text):
    """Strip repetitive junk from *text* without gluing neighbouring words.

    Two kinds of repetition are removed:

    * any single character repeated 4 or more times in a row
      (e.g. ``-----``, ``aaaa`` or a run of spaces);
    * a chunk of 1-3 word characters, starting at a word boundary, that
      occurs 4 or more times back to back (e.g. ``axaxaxax``).

    Each removed run is replaced by a single space rather than the empty
    string, so the text on either side of the run stays separate tokens
    (``"foo    bar"`` -> ``"foo bar"``, not ``"foobar"``).

    Args:
        text: The document text to clean.

    Returns:
        The cleaned text.
    """
    # Single character repeated 4+ times in a row ('.' does not match newlines).
    text = re.sub(r'(.)\1{3,}', ' ', text)
    # Short alternating repetition like axaxaxaxax.
    text = re.sub(r'(\b\w{1,3})(\1){3,}', ' ', text)
    return text
# Cleaned copy of the corpus: one cleaned string per original document, same order.
docs_clean = list(map(remove_repeated_patterns, docs))
vectorizer_model = CountVectorizer(
    stop_words="english",
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]{2,}\b"  # only real words, min length 3)