In [None]:
# Ensure the notebook can find the classes in the src/ folder

# Cell 1: Setup and Imports
%load_ext autoreload
%autoreload 2

import sys
import os

# Add the project root to the path so we can import our modules
sys.path.append(os.path.abspath('..'))

from src import (
    DatasetManager, EmbeddingEngine, Clusterer,
    ClusterEvaluator, ResultStore, Visualizer
)

#### Task 1: Text Transformation & Embedding Comparison
     We will focus on the "transformation" phase—moving from raw text to numerical vectors using our three distinct engines.
     Word2Vec and FastText mathematically require the text to be split into a list of words (tokens). Our EmbeddingEngine handles
     this internally using `.split()`.

In [None]:
# Task 1: Transform the Data
print("--- Task 1: Starting Data Transformation ---")

# 1. Initialize Managers
# The path below is relative, so it resolves against the notebook's working
# directory: 'bbc_news_test.csv' must be in the Datasets folder one level up.
bbc_path = '../Datasets/bbc_news_test.csv'
manager = DatasetManager(bbc_path)
# NOTE(review): vector_size=100 presumably sets the dimensionality of the dense
# Word2Vec/FastText vectors — confirm against EmbeddingEngine.
embedder = EmbeddingEngine(vector_size=100)

# 2. Load the Raw Data
# The result is consumed later in this cell as a mapping of
# dataset name -> DataFrame, from which the raw text column is read directly
# (the "no preprocessing" constraint).
# NOTE(review): whether prepare_data() itself modifies the text is not visible
# from this notebook — verify in DatasetManager.
datasets = manager.prepare_data()

# 3. Transformation & Storage
# Maps dataset name -> {embedding model name -> embedding matrix} so the
# shapes/dimensions of the three representations can be compared afterwards.
embedding_results = {}

for name, df in datasets.items():
    print(f"\nTransforming {name.upper()} dataset...")

    # Choose the raw text column. In the BBC dataset it is usually 'text' or
    # 'Text'; 'text' takes precedence when both exist. If neither is present,
    # fail with a message naming the dataset and its columns instead of an
    # opaque `KeyError: 'Text'`.
    text_column = next((col for col in ('text', 'Text') if col in df.columns), None)
    if text_column is None:
        raise KeyError(
            f"Dataset '{name}' has no 'text' or 'Text' column; "
            f"available columns: {list(df.columns)}"
        )
    raw_text = df[text_column]

    # Generate the 3 types of embeddings from the same raw text
    tfidf_vectors = embedder.get_tfidf_embeddings(raw_text)
    w2v_vectors   = embedder.get_word2vec_embeddings(raw_text)
    ft_vectors    = embedder.get_fasttext_embeddings(raw_text)

    # Keys double as the display labels printed in the comparison summary
    embedding_results[name] = {
        'TF-IDF': tfidf_vectors,
        'Word2Vec': w2v_vectors,
        'FastText': ft_vectors
    }

# 4. Comparison Summary
# Print a banner, then for every dataset one line per embedding model with the
# `.shape` of its stored vectors.
rule = "=" * 40
print("\n" + rule)
print("EMBEDDING COMPARISON (Feature Shapes)")
print(rule)

for ds_name, vectors in embedding_results.items():
    print(f"\nDataset: {ds_name.upper()}")
    for model_name, data in vectors.items():
        # Model name is padded to 10 characters so the shapes line up
        print(f" - {model_name:10}: Shape {data.shape} (Rows, Features)")