In [None]:
# Ensure the notebook can find the classes in the src/ folder

# Cell 1: Setup and Imports
%load_ext autoreload
%autoreload 2

import sys
import os

# Add the project root (the parent of the notebook's working directory) to
# the import path so that the `src` package can be imported.
# The membership check keeps the cell idempotent: re-running it does not
# append the same directory to sys.path again.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src import (
    DatasetManager, EmbeddingEngine, Clusterer,
    ClusterEvaluator, ResultStore, Visualizer
)

#### Task 1: Text Transformation & Embedding Comparison
     We will focus on the "transformation" phase—moving from raw text to numerical vectors using our three distinct engines.
     Word2Vec and FastText require the text to be split into a list of words (tokens). Our EmbeddingEngine handles
     this internally using `.split()`.

In [None]:
# Task 1: Transform the Data
# Loads the datasets, builds TF-IDF / Word2Vec / FastText embeddings for each
# one, and prints the resulting feature shapes side by side.
print("--- Task 1: Starting Data Transformation ---")

# 1. Initialize Managers
# The CSV referenced by `bbc_path` must exist in the Datasets folder,
# relative to the project root.
bbc_path = '../Datasets/bbc_news_test.csv'
VECTOR_SIZE = 100  # value passed to EmbeddingEngine as `vector_size`
TEXT_COLUMN_CANDIDATES = ('text', 'Text')  # accepted spellings, in priority order

manager = DatasetManager(bbc_path)
embedder = EmbeddingEngine(vector_size=VECTOR_SIZE)

# 2. Load the Raw Data
# prepare_data() is iterated below as a mapping of dataset name -> DataFrame.
datasets = manager.prepare_data()

# 3. Transformation & Storage
# Results are kept in a dict (dataset name -> embedding name -> vectors) so the
# shapes/dimensions can be compared afterwards.
embedding_results = {}

for name, df in datasets.items():
    print(f"\nTransforming {name.upper()} dataset...")

    # Choose the raw text column; fail with an explicit message instead of a
    # bare KeyError('Text') when neither spelling is present.
    text_col = next((col for col in TEXT_COLUMN_CANDIDATES if col in df.columns), None)
    if text_col is None:
        raise KeyError(
            f"Dataset '{name}' has none of the expected text columns "
            f"{TEXT_COLUMN_CANDIDATES}; available columns: {list(df.columns)}"
        )
    raw_text = df[text_col]

    # Generate the 3 types of embeddings
    tfidf_vectors = embedder.get_tfidf_embeddings(raw_text)
    w2v_vectors   = embedder.get_word2vec_embeddings(raw_text)
    ft_vectors    = embedder.get_fasttext_embeddings(raw_text)

    embedding_results[name] = {
        'TF-IDF': tfidf_vectors,
        'Word2Vec': w2v_vectors,
        'FastText': ft_vectors
    }

# 4. Comparison Summary
print("\n" + "="*40)
print("EMBEDDING COMPARISON (Feature Shapes)")
print("="*40)
for ds_name, vectors in embedding_results.items():
    print(f"\nDataset: {ds_name.upper()}")
    for model_name, data in vectors.items():
        print(f" - {model_name:10}: Shape {data.shape} (Rows, Features)")

#### TF-IDF Shape:
     We notice a very high number of features. This is because it creates a column for every unique word.
#### Word2Vec/FastText Shape:
     We see a consistent shape (e.g., (2225, 100)). This is because these are "Dense" embeddings where we've compressed the
     meaning of the document into a fixed-width 100-dimension vector.

This output immediately shows why dimensionality reduction with PCA is much more important for the sparse TF-IDF vectors than for the dense neural ones.

---

#### Task 2: Apply Clustering Algorithms
     We take the vectors created in Task 1 and pass them through our clustering algorithms. Because we built the Clusterer class
     with a unified `.run()` method, we can systematically test every combination using a nested loop.

In [None]:
import pandas as pd

# Task 2: Apply Clustering Algorithms
# Runs every (dataset, embedding, algorithm) combination and records the
# inputs and predictions so that Task 3 can evaluate them.
print("--- Task 2: Applying Clustering Algorithms ---")

# Algorithm names understood by Clusterer.run(), tried in this order.
ALGORITHMS = ('kmeans', 'agglomerative', 'hdbscan')

# 1. Initialize our Clusterer
cluster_engine = Clusterer()

# 2. Results Container
# A list of dictionaries (one per combination) keeps evaluation in Task 3 easy.
# It is re-created on every run, so re-executing this cell does not duplicate rows.
experiment_data = []

for ds_name, embeddings in embedding_results.items():
    print(f"\nProcessing Dataset: {ds_name.upper()}")

    # Retrieve Ground Truth info to set 'k' (n_clusters)
    df = datasets[ds_name]
    if ds_name == 'bbc':
        # Use 'category' or 'Category' depending on CSV headers
        label_col = 'category' if 'category' in df.columns else 'Category'
        # factorize() returns (codes, uniques); keep only the integer codes.
        true_labels = pd.factorize(df[label_col])[0]
    else:
        # Convert to a NumPy array so both branches store the same container
        # type (the 'bbc' branch already yields an ndarray).
        true_labels = df['label'].to_numpy()

    # pd.unique counts each distinct label once; unlike len(set(...)) it does
    # not treat every missing (NaN) label as a separate class.
    n_k = len(pd.unique(true_labels))
    print(f"Targeting {n_k} clusters based on ground truth labels.")

    for embed_name, X in embeddings.items():
        for algo_name in ALGORITHMS:
            print(f" - Running {algo_name} on {embed_name}...")

            # Execute Clustering
            # Note: HDBSCAN will ignore n_clusters internally as per our class design
            preds = cluster_engine.run(algo_name, X, n_clusters=n_k)

            # Save the state for Task 3 (Evaluation)
            experiment_data.append({
                'Dataset': ds_name,
                'Embedding': embed_name,
                'Algorithm': algo_name,
                'Features': X,
                'True_Labels': true_labels,
                'Predicted_Labels': preds
            })

print("\n--- Task 2 Complete: All combinations clustered ---")

### 1. Selection of Representative Paradigms
We didn't just pick three random algorithms; we picked three different philosophies of clustering:
#### K-Means (Centroid-based):
     It assumes clusters are spherical and equal in size. It's the "baseline" for efficiency.
#### Agglomerative (Hierarchical):
     It builds a tree of relationships. It's excellent for seeing if "Sport" and "Business" news share a branch before splitting.
#### HDBSCAN (Density-based):
     Unlike the others, it doesn't force every point into a cluster. If an article is "weird" or doesn't fit, HDBSCAN labels it as noise (-1). This is more
     realistic for real-world news.

### 2. Parameterization Strategy
    For K-Means and Agglomerative, we explicitly passed the number of clusters ($k$) from our ground-truth labels (5 for BBC, 20 for 20News). This
    allows us to measure how well the mathematical groupings align with human categories when the "playing field" is level.
### HDBSCAN Decision:
     We allowed HDBSCAN to discover the number of clusters on its own. If it finds only 2 clusters when there are actually 5, that tells us something important
      about the "density" of our word embeddings.
By looping through every Embedding + Algorithm combination, we can identify "Winning Pairs." For example, we might find that TF-IDF works best with K-Means, but Word2Vec performs significantly better with Agglomerative clustering.

---

#### Task 3: Evaluate and Interpret Results.