In [1]:
import warnings

# Install the blanket filter BEFORE the heavy third-party imports. Warnings
# raised while those packages are being imported (this cell's recorded output
# shows one leaking through) are only silenced if the filter already exists.
# NOTE(review): "ignore" hides every warning category, including deprecations;
# narrow it with `category=` / `module=` once the noisy source is identified.
warnings.filterwarnings("ignore")

import pandas as pd
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the whole combined transcript into memory as a single UTF-8 string.
with open("uzaki-chanS2_S1_S2_combined.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Chunks separated by a blank line ("\n\n") are treated as paragraphs, and each
# paragraph becomes one document. Every chunk is trimmed first; chunks of 50
# characters or fewer after trimming are discarded as too short to be useful.
docs = [
    paragraph
    for paragraph in map(str.strip, raw_text.split("\n\n"))
    if len(paragraph) > 50
]

print(f"Total documents created: {len(docs)}")



Total documents created: 2422


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Step 1 - sentence-embedding model. The same instance encodes the documents
# below and is handed to BERTopic.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - encode every document once, up front. The resulting vectors are
# passed to fit_transform here and reused by later visualization cells, so the
# corpus does not have to be re-encoded.
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Step 3 - vectorizer used to build the topic keywords: English stop words are
# removed and both single words and two-word phrases are considered.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Step 4 - assemble the topic model from the pieces above.
# NOTE(review): no random seed is set anywhere in this cell. If the default
# dimensionality-reduction step is stochastic, topic ids and sizes may differ
# between runs - consider supplying a seeded reducer. TODO confirm.
topic_model = BERTopic(
    verbose=True,
    min_topic_size=10,
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
)

# Step 5 - fit on the documents, reusing the pre-computed embeddings.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

Batches: 100%|██████████| 76/76 [00:12<00:00,  6.33it/s]
2026-02-15 15:36:56,308 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-15 15:37:13,404 - BERTopic - Dimensionality - Completed ✓
2026-02-15 15:37:13,406 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-15 15:37:13,532 - BERTopic - Cluster - Completed ✓
2026-02-15 15:37:13,542 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-15 15:37:13,716 - BERTopic - Representation - Completed ✓


In [4]:
# Sanity check: show which embedding backend the fitted model is holding.
# The recorded output is a BERTopic backend wrapper object, not the raw
# SentenceTransformer that was passed to the constructor.
print(topic_model.embedding_model)

<bertopic.backend._sentencetransformers.SentenceTransformerBackend object at 0x0000018EFDE5AE10>


In [5]:
# Topic Breakdown

# View the Topic Metadata Table.
# A notebook only auto-renders the LAST expression of a cell, so a bare
# `topic_info.head(10)` in the middle of the cell shows nothing. Use the
# notebook's built-in display() to render the table explicitly.
topic_info = topic_model.get_topic_info()
display(topic_info.head(10))

# Get the top words for a specific topic (e.g. Topic 0).
# This stays as the final expression so it is rendered as the cell's result.
topic_model.get_topic(0)


[('know', np.float64(0.020747405714792343)),
 ('just', np.float64(0.019379304114687117)),
 ('say', np.float64(0.01728413845246891)),
 ('fine', np.float64(0.01584310871665051)),
 ('sorry', np.float64(0.01584310871665051)),
 ('didn', np.float64(0.015613336402836252)),
 ('don', np.float64(0.015105583074333188)),
 ('mean', np.float64(0.014619998581105527)),
 ('right', np.float64(0.014250050537365855)),
 ('okay', np.float64(0.013744350973091943))]

In [None]:
from bertopic.representation import KeyBERTInspired

# Update the model with a more sophisticated representation.
representation_model = KeyBERTInspired()

# update_topics() rebuilds the topic-word representation from scratch. If no
# vectorizer is supplied it falls back to a default CountVectorizer, which
# would silently discard the stop-word removal and the (1, 2) n-gram range
# configured when the model was created. Pass the same vectorizer again so the
# refreshed topics are built from the same vocabulary settings.
topic_model.update_topics(
    docs,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

# Now, when you run visualize_barchart(), the words will be much more descriptive.

# 1. Topic Word Scores (bar charts)
# Kept as the final expression so the notebook renders the returned figure.
topic_model.visualize_barchart()

In [None]:
# 2. Intertopic Distance Map (bubble chart)
# Must remain the last expression in the cell so the returned figure is rendered.
topic_model.visualize_topics()






In [8]:
# 3. Hierarchical Clustering (how topics merge)
# Must remain the last expression in the cell so the returned figure is rendered.
topic_model.visualize_hierarchy()

In [None]:
# 4. Shows a correlation matrix between topics
# NOTE(review): this is presumably a topic-to-topic similarity matrix rather
# than a statistical correlation - confirm against the BERTopic docs and adjust
# the wording above if so.
topic_model.visualize_heatmap()

In [None]:
# 5. Uses the "Elbow Method" for topic keywords
topic_model.visualize_term_rank() 
 
# Reading the term-rank plot: after the second word the scores fall off
# dramatically, so give the most weight to the first and second words in each topic.


In [11]:
# 6. Data Exploration/Document Cloud

# Plots every document in a 2D space and colors it by topic.
# Passing `embeddings` reuses the document vectors computed earlier in the
# notebook instead of encoding `docs` again.
# NOTE(review): the 2D projection is presumably produced with UMAP - confirm
# for the installed BERTopic version.

topic_model.visualize_documents(docs, embeddings = embeddings)



In [None]:
# Search for a specific word to find related topics

query = "support" # replace support/the word in parentheses with the word in your dataset you want to search for. 
similar_topics, similarity_scores = topic_model.find_topics(query, top_n=3)

print(f"The most relevant topics for '{query}' are: {similar_topics}")

# Print the top words for the best match.
# The search can return topic -1 (it appears in this cell's recorded output).
# In BERTopic, -1 is the outlier bucket for documents that were not assigned to
# any real topic, so reporting it as the "best match" would be misleading.
# Pick the highest-ranked topic that is not -1; best_topic is None when the
# search returned nothing usable.
best_topic = next((topic for topic in similar_topics if topic != -1), None)

if best_topic is not None:
    print(f"\nTop words in Topic {best_topic}:")
    print(topic_model.get_topic(best_topic))

The most relevant topics for 'support' are: [7, -1, 20]

Top words in Topic 7:
[('workout', np.float32(0.5832658)), ('fitness', np.float32(0.5569831)), ('exercise', np.float32(0.55507267)), ('gym', np.float32(0.52451384)), ('climbing', np.float32(0.4244014)), ('lifting', np.float32(0.4051817)), ('muscles', np.float32(0.39710164)), ('trainer', np.float32(0.36125675)), ('climb', np.float32(0.35859066)), ('legs', np.float32(0.30359948))]


In [13]:
import os

# Create a folder for your report.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# gap of a separate os.path.exists() test.
output_dir = "topic_analysis_report"
os.makedirs(output_dir, exist_ok=True)

# Build each figure once; the dictionary key becomes the output file name.
figs = {
    "intertopic_distance": topic_model.visualize_topics(),
    "topic_barchart": topic_model.visualize_barchart(top_n_topics=15),
    "topic_hierarchy": topic_model.visualize_hierarchy(),
    "topic_heatmap": topic_model.visualize_heatmap(),
    "document_projections": topic_model.visualize_documents(docs, embeddings=embeddings)
}

# Write every figure to "<output_dir>/<name>.html" and report where it went.
for name, fig in figs.items():
    path = os.path.join(output_dir, f"{name}.html")
    fig.write_html(path)
    print(f"✅ Saved: {path}")

✅ Saved: topic_analysis_report\intertopic_distance.html
✅ Saved: topic_analysis_report\topic_barchart.html
✅ Saved: topic_analysis_report\topic_hierarchy.html
✅ Saved: topic_analysis_report\topic_heatmap.html
✅ Saved: topic_analysis_report\document_projections.html
