## This notebook requires GPU

This lab must be run in Google Colab in order to use GPU acceleration for model training. Click the button below to open this notebook in Colab, then set your runtime to GPU:

**Runtime > Change Runtime Type > T4 GPU**

### Upload the data file first

Before opening this notebook in Colab, be sure to download the `final_preprocessed_reviews.json` file from the data folder and upload it to a folder called `coursera-msds` in your Google Drive.

### Open in Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/scott2b/coursera-msds-public/blob/main/notebooks/MSDSTopicModel_Lesson_BERTopic.ipynb)

In [None]:
# If running in Google Colab, install dependencies (safe to re-run)
!pip -q install -U bertopic[visualization] sentence-transformers umap-learn hdbscan

# 1) Imports
import json
from pathlib import Path
from typing import List

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# 2) Attach Google Drive to the Colab VM so the project folder is reachable
from google.colab import drive
drive.mount('/content/drive')

# 3) Project paths: the input JSON and the output folder both live under the
#    same Drive directory
WORKING_DIR = Path('/content/drive/MyDrive/coursera-msds')
DATA_FILE = WORKING_DIR.joinpath('final_preprocessed_reviews.json')
OUTPUT_DIR = WORKING_DIR.joinpath('processed', 'bertopic')
# Create the output folder (and any missing parents); no error if it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f'Using data file: {DATA_FILE}')

# 4) Load documents (same source as NMF notebook)
with DATA_FILE.open('r', encoding='utf-8') as f:
    reviews = json.load(f)

# Pull the 'cleaned_text' field of every review (missing/None becomes ''),
# trim surrounding whitespace, and keep only texts with at least 3
# whitespace-separated tokens.
stripped_texts = (
    (review.get('cleaned_text') or '').strip()
    for review in reviews.values()
)
docs: List[str] = [text for text in stripped_texts if len(text.split()) >= 3]

print(f'Loaded {len(docs)} documents')

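# 5) Vectorizer. The domain stoplist below (aligned to your NMF setup) is kept
#    for reference only; it is currently disabled (stop_words=None is used).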
# custom_stop = {
#     'wa','nt',"n't","’s","'s",'im','ve','ll','amp',
#     'size','fit','great','good','like','love','just','get','got','really','very','little','bit',
#     'use','time','day','week','month','year','buy','bought','ordered','come','came','want','need',
#     'make','made','find','found','say','said','tell','told','think','know','feel','look','see','go',
#     'went','give','gave','take','took','wear','wore','try','tried','put','keep','start','started',
#     'end','ended','run','ran','walk','walked','work','worked','play','played','train','trained',
#     'exercise','workout','felt','able','lot','way','thing','much','many','sure','true','false',
#     'right','left','up','down','in','out','on','off','over','under','about','above','below',
#     'between','through','around','before','after','while','when','where','how','why','what','which',
#     'who','whom','whose','this','that','these','those','here','there','all','any','some','few',
#     'more','most','other','another','such','only','own','same','different','each','every','no',
#     'nor','not','so','than','too','s','t','can','will','don','should','now'
# }

# Bag-of-words vectorizer handed to BERTopic: unigrams and bigrams, no
# stop-word filtering (stop_words=None), and min_df=2 (previously 5).
vectorizer_model = CountVectorizer(stop_words=None, min_df=2, ngram_range=(1, 2))

# 6) Configure SOTA components
def make_embedding_model(
    model_names=('BAAI/bge-large-en-v1.5', 'BAAI/bge-base-en-v1.5'),
    max_seq_length=512,
    model_factory=None,
):
    """Load the first embedding model from ``model_names`` that initialises.

    Candidates are tried in order. When one fails to load (or to accept the
    ``max_seq_length`` setting) a fallback message is printed and the next
    candidate is tried. The defaults reproduce the original behaviour: try the
    large BGE model first (higher quality, may use more VRAM), then fall back
    to the base BGE model.

    ``trust_remote_code`` is deliberately not passed to the model constructor:
    that flag permits execution of code shipped in the model repository, and
    the default BGE checkpoints are standard models that load without it.

    Args:
        model_names: A model name, or an ordered sequence of model names, to
            try in turn.
        max_seq_length: Value assigned to the loaded model's
            ``max_seq_length`` attribute.
        model_factory: Callable taking a model name and returning a model
            object. Defaults to ``SentenceTransformer`` (resolved at call
            time).

    Returns:
        The first successfully created model, with ``max_seq_length`` set.

    Raises:
        ValueError: If ``model_names`` is empty.
        Exception: Whatever the last candidate raised, if every candidate
            fails.
    """
    # Accept a single name as a convenience.
    if isinstance(model_names, str):
        model_names = (model_names,)
    candidates = list(model_names)
    if not candidates:
        raise ValueError('model_names must contain at least one model name')

    factory = SentenceTransformer if model_factory is None else model_factory
    last_index = len(candidates) - 1

    for index, name in enumerate(candidates):
        try:
            m = factory(name)
            m.max_seq_length = max_seq_length
            return m
        except Exception as e:
            # Nothing left to fall back to: surface the real error.
            if index == last_index:
                raise
            print(f'Falling back to base model due to: {e}')

# Sentence-embedding backbone used to encode the documents.
embedding_model = make_embedding_model()

# UMAP settings: 5 output components, cosine metric, fixed seed (42).
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# HDBSCAN settings: very small clusters are allowed (min_cluster_size=3,
# min_samples=1), so expect many fine-grained topics.
hdbscan_model = HDBSCAN(
    min_cluster_size=3,
    min_samples=1,
    cluster_selection_method='eom',
    metric='euclidean',
    prediction_data=True,
)

# Topic term representation model.
representation_model = KeyBERTInspired()

# Assemble the BERTopic pipeline from the components configured above.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=10,
    # nr_topics='auto',
    calculate_probabilities=True,
    low_memory=True,
    verbose=True,
)

# 7) Encode all documents once, requesting normalized embeddings (recommended)
embeddings = embedding_model.encode(
    docs,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=64,
)

# 8) Fit BERTopic on the documents, passing the precomputed embeddings
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# 9) Inspect and save results
import pandas as pd

# Per-topic summary table: show the first 10 rows and write the full table.
info = topic_model.get_topic_info()
print(info.head(10))
info.to_csv(OUTPUT_DIR / 'topic_info.csv', index=False)

# One row per document with the topic id it was assigned.
doc_topics = pd.DataFrame({'document': docs, 'topic': topics})
doc_topics.to_csv(OUTPUT_DIR / 'documents_topics.csv', index=False)

# 10) Visualizations (HTML saved to Drive)
# Each figure is built and written inside its own try/except, so a plot that
# fails (e.g. because there are too few topics for it) no longer prevents the
# remaining plots from being saved.
# Only attempt visualizations if there are enough topics
if len(topic_model.get_topics()) > 1:
    # Output file name -> zero-argument callable that builds the figure.
    figure_builders = {
        'viz_topics.html': lambda: topic_model.visualize_topics(),
        'viz_hierarchy.html': lambda: topic_model.visualize_hierarchy(),
        # Use precomputed embeddings to speed up
        'viz_documents.html': lambda: topic_model.visualize_documents(
            docs, embeddings=embeddings, hide_annotations=True
        ),
        'viz_barchart.html': lambda: topic_model.visualize_barchart(top_n_topics=12),
    }

    saved_files = []
    for filename, build_figure in figure_builders.items():
        try:
            build_figure().write_html(str(OUTPUT_DIR / filename))
            saved_files.append(filename)
        except Exception as viz_e:
            # Best effort: report which figure failed and keep going.
            print(f'Visualization {filename} skipped due to: {viz_e}')

    if saved_files:
        print(f'Visualizations saved to: {OUTPUT_DIR}')
else:
    print("Skipping visualizations due to insufficient number of topics.")

# 11) Optional: Outlier reduction and topic update
# If you see many -1 (outliers), uncomment the following:
# topics = topic_model.reduce_outliers(docs, topics)
# topic_model.update_topics(docs, topics=topics, vectorizer_model=vectorizer_model)  # pass the reduced topics so the model uses them
# info2 = topic_model.get_topic_info()
# info2.to_csv(OUTPUT_DIR / 'topic_info_after_outlier_reduction.csv', index=False)

### Visualize Topics

This visualization displays the topics in a 2D scatter plot based on their centroid embeddings, projected down using UMAP. It helps to understand the relationships between topics. Topics that are closer together are more semantically similar. The size of the circles represents the number of documents in each topic. Hovering over a circle shows the representative terms for that topic.

**Contextual Observations:**

*   **Clustering:** Observe how tightly clustered the topics are. Tightly clustered groups of topic circles suggest strong semantic similarity between those topics. Widely dispersed topics indicate distinct themes.
*   **Topic Size Distribution:** Notice the varying sizes of the topic circles. A few large circles and many small ones might indicate a few dominant themes and many niche discussions. A more even distribution of sizes could suggest a broader range of equally important topics.
*   **Outlier Topic (-1):** The outlier topic (-1) is not drawn in this plot. Check its document count in `topic_model.get_topic_info()` instead: a large outlier count suggests that a significant portion of documents do not fit well into any defined topic.
*   **Potential Relationships:** Look for visual groupings of topics that might represent sub-themes or related concepts, even if they are not formally merged. Hovering to see representative terms helps validate these visual relationships.

In [None]:
from IPython.display import display

# Render the intertopic map inline. Any failure is reported, not raised.
try:
    if 'topic_model' not in locals():
        # The model cell has not been run in this session.
        print("Skipping visualization: topic_model is not defined. Please run the cell that creates and fits the BERTopic model first.")
    elif len(topic_model.get_topics()) <= 1:
        print("Skipping visualization due to insufficient number of topics.")
    else:
        print("Generating BERTopic visualizations - Topics...")
        fig = topic_model.visualize_topics()
        display(fig)
        print("Visualizations generated.")
except Exception as viz_e:
    print(f'Visualization step skipped due to error: {viz_e}')

### Visualize Hierarchy

This visualization displays the hierarchical structure of the topics. It shows how topics are related to each other at different levels of granularity. This can help in understanding sub-topics and broader themes.

**Insights:**

*   **Topic Relationships:** See how smaller topics group together to form larger, more general topics.
*   **Topic Structure:** Understand the overall organization of themes in your data.
*   **Merging Decisions:** Can inform decisions about merging topics based on their hierarchical relationships.

### Visualize Hierarchy

This visualization displays the hierarchical structure of the topics. It shows how topics are related to each other at different levels of granularity. This can help in understanding sub-topics and broader themes.

**Insights:**

*   **Topic Relationships:** See how smaller topics group together to form larger, more general topics.
*   **Topic Structure:** Understand the overall organization of themes in your data.
*   **Merging Decisions:** Can inform decisions about merging topics based on their hierarchical relationships.

In [None]:
from IPython.display import display

# Render the topic hierarchy inline. Any failure is reported, not raised.
try:
    if 'topic_model' not in locals():
        # The model cell has not been run in this session.
        print("Skipping visualization: topic_model is not defined. Please run the cell that creates and fits the BERTopic model first.")
    elif len(topic_model.get_topics()) <= 1:
        print("Skipping visualization due to insufficient number of topics.")
    else:
        print("Generating BERTopic visualizations - Hierarchy...")
        fig = topic_model.visualize_hierarchy()
        display(fig)
        print("Visualizations generated.")
except Exception as viz_e:
    print(f'Visualization step skipped due to error: {viz_e}')

### Visualize Documents

This visualization displays individual documents projected down to 2D space based on their embeddings, colored by their assigned topic. This helps to see how documents of the same topic cluster together and how topics are separated in the embedding space.

**Insights:**

*   **Topic Separation:** Assess how distinct the topics are in the document embedding space.
*   **Within-Topic Cohesion:** See how tightly clustered documents are within each topic.
*   **Misclassified Documents:** Identify documents that might be assigned to a topic but are located far from the main cluster of that topic.

In [None]:
from IPython.display import display

# Render the per-document scatter plot inline, reusing the precomputed
# embeddings. Any failure is reported, not raised.
try:
    if 'topic_model' not in locals():
        # The model cell has not been run in this session.
        print("Skipping visualization: topic_model is not defined. Please run the cell that creates and fits the BERTopic model first.")
    elif len(topic_model.get_topics()) <= 1:
        print("Skipping visualization due to insufficient number of topics.")
    elif 'embeddings' not in locals():
        print("Skipping document visualization: embeddings not found.")
    else:
        print("Generating BERTopic visualizations - Documents...")
        fig = topic_model.visualize_documents(docs, embeddings=embeddings, hide_annotations=True)
        display(fig)
        print("Visualizations generated.")
except Exception as viz_e:
    print(f'Visualization step skipped due to error: {viz_e}')

### Visualize Barchart

This visualization displays the top words for each topic in a barchart format. It provides a clear view of the most important terms associated with each topic.

**Insights:**

*   **Topic Interpretation:** Helps in understanding the meaning and theme of each topic by examining its key terms.
*   **Term Importance:** See the relative importance of words within a topic.
*   **Comparing Topics:** Easily compare the key terms across different topics.

In [None]:
from IPython.display import display

# Render the top-term bar charts (up to 12 topics) inline. Any failure is
# reported, not raised.
try:
    if 'topic_model' not in locals():
        # The model cell has not been run in this session.
        print("Skipping visualization: topic_model is not defined. Please run the cell that creates and fits the BERTopic model first.")
    elif len(topic_model.get_topics()) <= 1:
        print("Skipping visualization due to insufficient number of topics.")
    else:
        print("Generating BERTopic visualizations - Barchart...")
        fig = topic_model.visualize_barchart(top_n_topics=12)
        display(fig)
        print("Visualizations generated.")
except Exception as viz_e:
    print(f'Visualization step skipped due to: {viz_e}')

### Visualize Heatmap

This visualization displays the similarity between topics as a heatmap. Darker colors indicate higher similarity between topics. This can help identify topics that are closely related.

**Insights:**

*   **Topic Similarity:** Quickly identify pairs or groups of topics that are semantically similar.
*   **Distinct Topics:** Spot topics that are not highly similar to any other topics.
*   **Merging Candidates:** High similarity scores can suggest topics that might be good candidates for merging.

In [None]:
from IPython.display import display

# Render the topic-similarity heatmap inline. Any failure is reported, not
# raised.
try:
    if 'topic_model' not in locals():
        # The model cell has not been run in this session.
        print("Skipping visualization: topic_model is not defined. Please run the cell that creates and fits the BERTopic model first.")
    elif len(topic_model.get_topics()) <= 1:
        print("Skipping visualization due to insufficient number of topics.")
    else:
        print("Generating BERTopic visualizations - Heatmap...")
        fig = topic_model.visualize_heatmap()
        display(fig)
        print("Visualizations generated.")
except Exception as viz_e:
    print(f'Visualization step skipped due to: {viz_e}')

### Visualize Term Rank

This visualization plots, for each topic, the c-TF-IDF score of its terms against their rank (one line per topic). The terms of a topic do not all represent it equally well, so the curve shows how quickly term importance falls off and roughly how many terms are needed to describe a topic.

**Insights:**

*   **Score Decline:** A steep drop after the first few ranks means a handful of terms dominate the topic; a flatter curve means importance is spread across many terms.
*   **How Many Terms to Keep:** The rank at which the curves level off indicates where adding more terms per topic stops being informative.
*   **Comparing Topics:** Hover over a line to identify its topic and compare how concentrated or diffuse term importance is across the topic landscape.

In [None]:
from IPython.display import display

# Render the term-rank plot inline. Any failure is reported, not raised.
try:
    if 'topic_model' not in locals():
        # The model cell has not been run in this session.
        print("Skipping visualization: topic_model is not defined. Please run the cell that creates and fits the BERTopic model first.")
    elif len(topic_model.get_topics()) <= 1:
        print("Skipping visualization due to insufficient number of topics.")
    else:
        print("Generating BERTopic visualizations - Term Rank...")
        fig = topic_model.visualize_term_rank()
        display(fig)
        print("Visualizations generated.")
except Exception as viz_e:
    print(f'Visualization step skipped due to: {viz_e}')