In [None]:
# Ensure the notebook can find the classes in the src/ folder

# Cell 1: Setup and Imports
%load_ext autoreload
%autoreload 2

import sys
import os

# Add the project root to the path so we can import our modules.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path on every execution.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

from src import (
    DatasetManager, EmbeddingEngine, Clusterer,
    ClusterEvaluator, ResultStore, Visualizer
)

#### Task 1: Text Transformation & Embedding Comparison
     We will focus on the "transformation" phase—moving from raw text to numerical vectors using our three distinct engines.
     Word2Vec and FastText mathematically require the text to be split into a list of words (tokens). Our EmbeddingEngine handles
     this internally using .split().

In [None]:
# Task 1: Transform the Data
print("--- Task 1: Starting Data Transformation ---")

# 1. Set up the data manager and the embedding engine.
# The CSV referenced below must exist in the Datasets folder relative to the project root.
bbc_path = '../Datasets/bbc_news_test.csv'
manager = DatasetManager(bbc_path)
embedder = EmbeddingEngine(vector_size=100)

# 2. Load the raw data; the result is iterated below as (name, table) pairs
datasets = manager.prepare_data()

# 3. Transformation & storage.
# Results are nested as dataset name -> embedding name -> vectors so that
# later cells can compare shapes and feed the vectors to clustering.
embedding_results = {}

for name, df in datasets.items():
    print(f"\nTransforming {name.upper()} dataset...")

    # The raw text column is spelled 'text' or 'Text' depending on the CSV header
    text_column = 'text' if 'text' in df.columns else 'Text'
    raw_text = df[text_column]

    # Build the three embeddings in a fixed order: TF-IDF, Word2Vec, FastText
    tfidf_vectors = embedder.get_tfidf_embeddings(raw_text)
    w2v_vectors = embedder.get_word2vec_embeddings(raw_text)
    ft_vectors = embedder.get_fasttext_embeddings(raw_text)

    embedding_results[name] = {
        'TF-IDF': tfidf_vectors,
        'Word2Vec': w2v_vectors,
        'FastText': ft_vectors,
    }

# 4. Comparison summary: one line per (dataset, embedding) with its shape
banner = "=" * 40
print("\n" + banner)
print("EMBEDDING COMPARISON (Feature Shapes)")
print(banner)
for ds_name, per_model in embedding_results.items():
    print(f"\nDataset: {ds_name.upper()}")
    for model_name, matrix in per_model.items():
        print(f" - {model_name:10}: Shape {matrix.shape} (Rows, Features)")

#### TF-IDF Shape:
     We notice a very high number of features. This is because it creates a column for every unique word.
#### Word2Vec/FastText Shape:
     We see a consistent shape (e.g., (2225, 100)). This is because these are "Dense" embeddings where we've compressed the
     meaning of the document into a fixed-width 100-dimension vector.

This output immediately shows why PCA matters much more for the TF-IDF vectors than for the neural ones.

---

#### Task 2: Apply Clustering Algorithms
     We take the vectors created in Task 1 and pass them through our clustering algorithms. Because we built the Clusterer class
     with a unified .run() method, we can systematically test every combination using a nested loop.

In [None]:
import pandas as pd

# Task 2: Apply Clustering Algorithms
print("--- Task 2: Applying Clustering Algorithms ---")

# 1. A single Clusterer instance serves every run
cluster_engine = Clusterer()

# 2. Results container: one dict per dataset/embedding/algorithm combination,
# kept as a flat list so Task 3 can iterate over it directly
experiment_data = []

for ds_name, embeddings in embedding_results.items():
    print(f"\nProcessing Dataset: {ds_name.upper()}")

    # Ground-truth labels decide 'k' (n_clusters) for this dataset
    df = datasets[ds_name]
    if ds_name != 'bbc':
        true_labels = df['label']
    else:
        # The header is spelled 'category' or 'Category' depending on the CSV
        label_col = 'category' if 'category' in df.columns else 'Category'
        # factorize() turns the category values into integer codes
        true_labels = pd.factorize(df[label_col])[0]

    n_k = len(set(true_labels))
    print(f"Targeting {n_k} clusters based on ground truth labels.")

    for embed_name, X in embeddings.items():
        for algo_name in ('kmeans', 'agglomerative', 'hdbscan'):
            print(f" - Running {algo_name} on {embed_name}...")

            # n_clusters is passed for every algorithm.
            # Note: HDBSCAN will ignore n_clusters internally as per our class design
            preds = cluster_engine.run(algo_name, X, n_clusters=n_k)

            # Keep everything Task 3 needs to score this run
            record = {
                'Dataset': ds_name,
                'Embedding': embed_name,
                'Algorithm': algo_name,
                'Features': X,
                'True_Labels': true_labels,
                'Predicted_Labels': preds,
            }
            experiment_data.append(record)

print("\n--- Task 2 Complete: All combinations clustered ---")

### 1. Selection of Representative Paradigms
We didn't just pick three random algorithms; we picked three different philosophies of clustering:
#### K-Means (Centroid-based):
     It assumes clusters are spherical and equal in size. It's the "baseline" for efficiency.
#### Agglomerative (Hierarchical):
     It builds a tree of relationships. It's excellent for seeing if "Sport" and "Business" news share a branch before splitting.
#### HDBSCAN (Density-based):
     Unlike the others, it doesn't force every point into a cluster. If an article is "weird" or doesn't fit, HDBSCAN labels it as noise (-1). This is more
     realistic for real-world news.

### 2. Parameterization Strategy
    For K-Means and Agglomerative, we explicitly passed the number of clusters ($k$) from our ground-truth labels (5 for BBC, 20 for 20News). This
    allows us to measure how well the mathematical groupings align with human categories when the "playing field" is level.
### HDBSCAN Decision:
     We allowed HDBSCAN to discover the number of clusters on its own. If it finds only 2 clusters when there are actually 5, that tells us something important
      about the "density" of our word embeddings.
By looping through every Embedding + Algorithm combination, we can identify "Winning Pairs." For example, you might find that TF-IDF works best with K-Means, but Word2Vec performs significantly better with Agglomerative clustering.

---

#### Task 3: Evaluate and Interpret Results.

In [None]:
# Task 3: Evaluate and Interpret Results
print("--- Task 3: Evaluating Clustering Performance ---")

# 1. Evaluator computes the metrics; ResultStore accumulates one row per run
evaluator = ClusterEvaluator()
results_storage = ResultStore()

# 2. Score every run recorded in Task 2 and log it
for run in experiment_data:
    scores = evaluator.evaluate(
        data=run['Features'],
        true_labels=run['True_Labels'],
        predicted_labels=run['Predicted_Labels'],
    )
    results_storage.add_result(
        dataset_name=run['Dataset'],
        embedding_name=run['Embedding'],
        algo_name=run['Algorithm'],
        metrics=scores,
    )

# 3. Final leaderboard: grouped by dataset, best NMI first within each group
summary_df = results_storage.get_summary()
leaderboard = summary_df.sort_values(by=['Dataset', 'NMI'], ascending=[True, False])
display(leaderboard)

print("\n--- Task 3 Complete: Metrics calculated and stored ---")

### Decisions & Interpretation
#### Why these specific metrics?
     We chose a combination of "External" metrics (NMI, ARI, AMI) because we have the ground truth labels.
       - NMI (Normalized Mutual Information): This is excellent for text because it isn't affected by the specific numbers assigned to clusters.
         It only cares if the information in the grouping matches the categories.

       - ARI (Adjusted Rand Index): This is the most "honest" metric. It calculates how often pairs of articles are put in the same cluster correctly
         and it subtracts the points you would get by just guessing randomly.

       - AMI (Adjusted Mutual Information): We include this because the 20NewsGroups dataset has clusters of very different sizes. AMI prevents the model
         from looking "better" just because it correctly identified one massive cluster.
#### Interpretation of the Results
      - The "Keyword" Advantage (TF-IDF): Notice that TF-IDF + Agglomerative/K-Means consistently scores the highest NMI/ARI (over 0.65 on BBC).
        Why? News articles are often categorized by specific "signal words" (e.g., "economy," "stock," "goal"). TF-IDF makes these words very prominent.
        Because these words don't appear in other categories, the clusters become very distinct.

      - The Semantic Mismatch (Neural Embeddings): Word2Vec and FastText often show lower NMI scores.
        Why? These models group articles by context and meaning. For example, a "Tech" article about a new phone and a "Business" article about Apple’s stock
        might be put together because they both discuss technology companies. Mathematically, this is a "good" cluster, but it lowers the NMI because it
        doesn't match the human label "Tech" vs "Business."

      - The Density Struggle (HDBSCAN): HDBSCAN performed poorly in this benchmark.
        Why? High-dimensional text data is often "uniformly sparse." In such a space, there aren't many high-density "islands." HDBSCAN likely saw most of the
        data as a vast desert and labeled many articles as noise (-1), which heavily penalized the ARI and NMI scores.

---

#### Task 4: Fine-Tuning with PCA and t-SNE Visualization

In [None]:
# Task 4: Fine-Tuning with PCA & Visualization
print("--- Task 4: Optimizing with PCA and t-SNE ---")

from sklearn.decomposition import PCA

# 1. Set up the Experiment
# Candidate PCA dimensions; for each dataset the one with the highest NMI wins
pca_dimensions = [2, 10, 50, 100, 200]
viz = Visualizer()
pca_results = []

# To keep the notebook concise, only the 'TF-IDF' + 'KMeans' combination is tuned
# as it's usually the most sensitive to dimensionality.
target_embedding = 'TF-IDF'
target_algo = 'kmeans'

# Ground-truth labels per dataset, taken from the Task 2 records.
# Every record of a dataset carries the same labels, so the first one seen is kept.
# (Indexing experiment_data[0] would reuse the FIRST dataset's labels for all datasets.)
labels_by_dataset = {}
for record in experiment_data:
    labels_by_dataset.setdefault(record['Dataset'], record['True_Labels'])

for ds_name, embeddings in embedding_results.items():
    X_raw = embeddings[target_embedding]
    true_labels = labels_by_dataset[ds_name]
    n_k = len(set(true_labels))

    best_nmi = -1
    best_dim = None
    best_X_pca = None

    # PCA cannot extract more components than min(n_samples, n_features)
    max_components = min(X_raw.shape)

    print(f"\nFinding optimal PCA for {ds_name.upper()} ({target_embedding}):")

    for dim in pca_dimensions:
        if dim > max_components:
            continue

        # Apply PCA
        pca = PCA(n_components=dim, random_state=42)
        X_pca = pca.fit_transform(X_raw)

        # Run Clustering on the reduced features
        preds = cluster_engine.run(target_algo, X_pca, n_clusters=n_k)

        # Evaluate against the ground truth of THIS dataset
        m = evaluator.evaluate(X_pca, true_labels, preds)
        print(f" - PCA Dim {dim:3}: NMI = {m['NMI']:.4f}")

        pca_results.append({'Dataset': ds_name, 'Dim': dim, 'NMI': m['NMI']})

        if m['NMI'] > best_nmi:
            best_nmi = m['NMI']
            best_dim = dim
            best_X_pca = X_pca

    # Every candidate dimension may have been skipped for a very small dataset;
    # in that case there is nothing to visualize.
    if best_X_pca is None:
        print(f" >> No PCA dimension could be evaluated for {ds_name}; skipping visualization.")
        continue

    # Bonus: Visualize the BEST performing PCA reduction using t-SNE
    print(f" >> Visualizing {ds_name} with optimal PCA Dim: {best_dim}")
    save_path = f"../outputs/plots/task4_{ds_name}_tsne.png"
    # Make sure the output folder exists so the save does not fail on a fresh checkout
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    coords_2d = viz.reduce_dimensions(best_X_pca)
    viz.plot_clusters(
        coords_2d, true_labels,
        title=f"t-SNE Visualization: {ds_name.upper()} (PCA {best_dim} -> 2D)",
        save_path=save_path
    )

print("\n--- Task 4 Complete: Optimization and Visualization finished ---")

### Decisions & Design Choices
#### Why use PCA before t-SNE?
    - Computational Efficiency: t-SNE complexity is high. Reducing to 50 or 100 dimensions with PCA first makes t-SNE significantly faster.
    - Noise Filtering: t-SNE can sometimes "hallucinate" clusters in random noise. PCA ensures that t-SNE is only looking at the dimensions that
    actually explain the variance in the text.
#### Finding the "Optimal" Reduction.
We don't assume a number (like 50) is perfect. By testing [2, 10, 50, 100, 200], we observe a "sweet spot":

    - Too Low (e.g., 2): We lose too much information. The "Business" and "Tech" clusters might overlap because we threw away the nuance.
    - Too High (e.g., 200+): We retain too much noise. The clustering algorithm gets "confused" by the high-dimensional distance (the Curse of Dimensionality).
    - The Sweet Spot: Usually around 50–100 components, where NMI peaks.
#### Interpretability through t-SNE.
    While PCA is linear, t-SNE is non-linear. It excels at taking high-dimensional "neighborhoods" and squashing them into 2D while keeping similar points
    close together. In this notebook, the resulting plot allows us to see if the categories are truly distinct "islands" or if there is a "bridge" of articles
    connecting two categories (for example, articles whose vocabulary sits between "Business" and "Tech").


---

### Task 5: Visual Comparison of Results

In [None]:
# Task 5: Visualization & Final Comparison
print("--- Task 5: Generating Performance Visualizations ---")

import seaborn as sns
import matplotlib.pyplot as plt

# 1. Prepare the Data
# summary_df was built in Task 3; preview its first rows before plotting.
summary_preview = summary_df.head()
display(summary_preview)

# 2. Plotting Function (built on Seaborn)
def create_comparison_plot(df, metric_name):
    """Draw a faceted bar chart of one metric across algorithms, embeddings and datasets.

    Args:
        df: Long-format table with the columns "Algorithm", "Embedding",
            "Dataset" and the column named by ``metric_name``.
        metric_name: Name of the score column to plot on the y axis
            (the notebook calls this with "NMI", "ARI" and "AMI").

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # catplot is a figure-level function and creates its own figure, so no
    # plt.figure() call is made here; a separate one would only leave an
    # empty extra figure in the cell output.
    #
    # Mapping of the plot dimensions:
    # x: The Algorithms
    # y: The score (NMI/ARI/AMI)
    # hue: The Embedding method (color-coded)
    # col: The Dataset (creates side-by-side subplots)
    g = sns.catplot(
        data=df,
        kind="bar",
        x="Algorithm",
        y=metric_name,
        hue="Embedding",
        col="Dataset",
        palette="viridis",
        alpha=.8,
        height=5,
        aspect=1.2
    )

    # Formatting the visual
    g.despine(left=True)
    g.set_axis_labels("Clustering Algorithm", f"Score ({metric_name})")
    g.set_titles("{col_name} Dataset")
    # Guard against a grid that was drawn without a legend
    if g.legend is not None:
        g.legend.set_title("Embedding Type")

    # Work on the grid's own figure instead of pyplot's "current figure" state.
    # `figure` is the preferred attribute; fall back to the older `fig` alias.
    fig = getattr(g, "figure", None)
    if fig is None:
        fig = g.fig
    # Leave headroom above the facets for the overall title
    fig.subplots_adjust(top=0.85)
    fig.suptitle(f'Comparative Analysis: {metric_name} across Models', fontsize=16)

    plt.show()

# 3. Render one comparison chart per key metric
metrics_to_plot = ("NMI", "ARI", "AMI")
for metric in metrics_to_plot:
    create_comparison_plot(summary_df, metric)

print("\n--- Task 5 Complete: Visualizations Generated ---")

### Decisions & Design Choices

#### Choosing the "Long-Format" DataFrame
    Seaborn works best with "Tidy Data" (long-format). Our ResultStore was designed specifically to produce this. Instead of having columns like
    tfidf_nmi and w2v_nmi, we have a single Embedding column and a single NMI column. This allows Seaborn to automatically map colors and labels,
    making our code much shorter.
#### Using Faceted Grids (col="Dataset")
    One of the most powerful features of Seaborn is the ability to create Facets. By setting col="Dataset", we generate two perfectly aligned subplots.
    Why? It allows for an "apples-to-apples" comparison. You can instantly see that while TF-IDF is king on BBC News (left plot), the scores drop
    significantly on 20NewsGroups (right plot), even if the ranking of algorithms stays the same.
#### The Palette Choice (viridis)
    We chose the viridis palette. In professional data science, this is preferred because it is perceptually uniform (meaning the difference between colors
    is mathematically consistent) and it is color-blind friendly.
#### Interpretation of the Bar heights
    - The "Winning" Algorithm: The tallest bar in each cluster tells you which algorithm handled that specific vector type best.
    - The "Winning" Embedding: By comparing colors across the algorithms, you can see if one embedding (e.g., TF-IDF in purple) consistently outperforms the
      others regardless of which clustering tool is used.