In [1]:
# Colab script for the production-level implementation of the Taxonomy case study.

# =============================================================================
# PART 1: SETUP & IMPORTS
# =============================================================================

# Banner printed once at the start of the run.
print("="*80)
print("ICECAT TAXONOMY CLUSTERING - COMPLETE PIPELINE")
print("Target: 90%+ Accuracy")
print("="*80)

# Mount Google Drive
# The dataset path and the output folder configured below both live under
# /content/drive, so the Drive has to be mounted before anything else runs.
print("\n[1/10] Mounting Google Drive...")
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies
# NOTE: the `!pip` line is notebook shell syntax, not plain Python; this cell
# only runs inside an IPython/Colab kernel.
print("\n[2/10] Installing dependencies (this may take a few minutes)...\n")
!pip install -q sentence-transformers umap-learn hdbscan matplotlib seaborn scikit-learn pandas numpy tqdm

# Imports
print("\n[3/10] Loading libraries...")
import json
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
# Silences every warning category for the rest of the session, including
# any emitted later by the clustering and plotting libraries.
warnings.filterwarnings('ignore')

# ML libraries
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN, Birch, AgglomerativeClustering
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, silhouette_score
import hdbscan
import umap

print("All libraries loaded!")

# =============================================================================
# PART 2: CONFIGURATION
# =============================================================================

print("\n[4/10] Setting up configuration...")

# Input data and generated artefacts share one project folder on the mounted Drive.
_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Case_Studies_Taxonomy'
DATA_PATH = _PROJECT_DIR + '/data/icecat_data_train.json'
OUTPUT_DIR = _PROJECT_DIR + '/outputs'

# Data preprocessing (from corrections_2.txt)
MIN_HIERARCHY_LEVELS = 3          # minimum depth of the category path
MIN_PRODUCTS_PER_CATEGORY = 200   # smaller level-3 categories are discarded
BALANCE_SAMPLE_SIZE = 300         # per-category cap used when balancing

# Embeddings
EMBEDDING_MODEL = 'sentence-transformers/all-mpnet-base-v2'
EMBEDDING_BATCH_SIZE = 32

# Targets (from corrections_3.txt)
TARGET_PURITY = 0.90
TARGET_ACCURACY = 0.90

# Visualization
VIZ_SAMPLE_SIZE = 10000
RANDOM_SEED = 42

# Echo the key settings so they are captured in the run log.
_config_report = [
    "‚úÖ Configuration set",
    f"   Data path: {DATA_PATH}",
    f"   Target accuracy: {TARGET_ACCURACY:.0%}",
    f"   Target purity: {TARGET_PURITY:.0%}",
]
print("\n".join(_config_report))

# Create output directory
# makedirs also creates missing parent folders; exist_ok=True keeps the call
# from raising when the folder is already there, so the cell can be re-run.
import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

# =============================================================================
# PART 3: DATA LOADING & PREPROCESSING
# =============================================================================

print("\n" + "=" * 80 + "\nDATA LOADING & PREPROCESSING\n" + "=" * 80)

print("\n[5/10] Loading dataset...")

# The file holds column-oriented JSON; it is parsed in one go and handed to
# pandas as-is.
with open(DATA_PATH, mode='r', encoding='utf-8') as json_file:
    column_data = json.load(json_file)

# Build the working DataFrame and report its size.
df = pd.DataFrame(column_data)
n_products, n_fields = df.shape
print(f"‚úÖ Loaded {n_products:,} products with {n_fields} fields")

# Split the '>'-separated category path and record how deep each path is.
print("\n   Parsing hierarchy (pathlist_names)...")
split_paths = df['pathlist_names'].str.split('>')
df['hierarchy_levels'] = split_paths
df['num_levels'] = split_paths.str.len()

print(f"   Original products: {len(df):,}")

# Products with a shallower path are dropped rather than padded
# (NO PADDING as per corrections_2.txt).
has_enough_levels = df['num_levels'] >= MIN_HIERARCHY_LEVELS
df = df.loc[has_enough_levels].copy()
print(f"   After filtering ‚â•3 levels: {len(df):,}")

# The first three path elements become level1..level3, whitespace-trimmed.
for depth, level_column in enumerate(('level1', 'level2', 'level3')):
    df[level_column] = df['hierarchy_levels'].str[depth].str.strip()

print(f"   Unique level3 categories: {df['level3'].nunique()}")

# Keep only level-3 categories with at least MIN_PRODUCTS_PER_CATEGORY products.
print(f"\n   Filtering categories (‚â•{MIN_PRODUCTS_PER_CATEGORY} products)...")
category_counts = df['level3'].value_counts()
is_large_enough = category_counts >= MIN_PRODUCTS_PER_CATEGORY
valid_categories = category_counts[is_large_enough].index
in_valid_category = df['level3'].isin(valid_categories)
df = df.loc[in_valid_category].copy()

print(f"   After filtering: {len(df):,} products, {df['level3'].nunique()} categories")

# Balance dataset (cap every category at BALANCE_SAMPLE_SIZE products)
print(f"\n   Balancing dataset ({BALANCE_SAMPLE_SIZE} products per category)...")

if df.empty:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate" raised by pd.concat on an empty list.
    raise ValueError("No products left to balance after category filtering")

balanced_dfs = []
# sort=False keeps the categories in order of first appearance, which is the
# order df['level3'].unique() would yield.
for category, category_df in df.groupby('level3', sort=False):
    n_samples = min(len(category_df), BALANCE_SAMPLE_SIZE)
    # n_samples never exceeds the category size, so always sample WITHOUT
    # replacement. Sampling with replacement here (as was done for categories
    # smaller than the cap) returns a bootstrap sample: duplicated products
    # plus silently dropped unique ones, and no extra rows.
    sampled = category_df.sample(n=n_samples, replace=False, random_state=RANDOM_SEED)
    balanced_dfs.append(sampled)

df = pd.concat(balanced_dfs, ignore_index=True)
# Shuffle so that positional slices taken later (e.g. "the first N rows")
# are not ordered by category.
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

n_categories = df['level3'].nunique()
print(f"   Balanced dataset: {len(df):,} products, {n_categories} categories")
print(f"   Products per category: ~{len(df) // n_categories}")

# Drop temporary columns (errors='ignore' keeps the cell re-runnable)
df = df.drop(columns=['hierarchy_levels', 'num_levels'], errors='ignore')

print("\n‚úÖ Preprocessing complete!")

# =============================================================================
# PART 4: TEXT FEATURE ENGINEERING
# =============================================================================

print("\n" + "="*80)
print("TEXT FEATURE ENGINEERING")
print("="*80)

print("\n[6/10] Creating combined text features...")

# Concatenate LongProductName + LongDesc (from corrections_2.txt).
# Missing values become empty strings and every part is whitespace-trimmed so
# that "empty" can be detected reliably below.
text_columns = ('Description.LongProductName', 'Description.LongDesc')
text_parts = [
    df[column].fillna('').astype(str).str.strip()
    for column in text_columns
    if column in df.columns
]

if not text_parts:
    raise ValueError("No text fields found!")

# Join the parts with ' | ', but only where BOTH sides are non-empty.
# Unconditional concatenation leaves a dangling separator (e.g. "| some text"
# or a bare "|") for products that lack a name or a description, and that
# artefact would then be fed to the embedding model.
combined = text_parts[0]
for part in text_parts[1:]:
    both_present = (combined != '') & (part != '')
    separator = pd.Series('', index=df.index).mask(both_present, ' | ')
    combined = combined + separator + part

df['combined_text'] = combined.str.strip()

avg_len = df['combined_text'].str.len().mean()
print(f"‚úÖ Combined text created")
print(f"   Average length: {avg_len:.0f} characters")

# =============================================================================
# PART 5: EMBEDDING GENERATION
# =============================================================================

print("\n" + "=" * 80 + "\nEMBEDDING GENERATION\n" + "=" * 80)

print(
    f"\n[7/10] Generating embeddings with {EMBEDDING_MODEL}...\n"
    "   (This may take 5-10 minutes depending on dataset size)"
)

# Instantiate the sentence-transformer and report its output dimensionality.
model = SentenceTransformer(EMBEDDING_MODEL)
embedding_dim = model.get_sentence_embedding_dimension()
print(f"   Model loaded: {embedding_dim}-dimensional embeddings")

# Encode every combined text in batches; the result is requested as a NumPy
# array, one row per product in DataFrame order.
texts = df['combined_text'].to_list()
encode_options = {
    'batch_size': EMBEDDING_BATCH_SIZE,
    'show_progress_bar': True,
    'convert_to_numpy': True,
}
embeddings = model.encode(texts, **encode_options)

print(f"‚úÖ Embeddings generated: {embeddings.shape}")



ICECAT TAXONOMY CLUSTERING - COMPLETE PIPELINE
Target: 90%+ Accuracy

[1/10] Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

[2/10] Installing dependencies (this may take a few minutes)...


[3/10] Loading libraries...
All libraries loaded!

[4/10] Setting up configuration...
‚úÖ Configuration set
   Data path: /content/drive/MyDrive/Colab Notebooks/Case_Studies_Taxonomy/data/icecat_data_train.json
   Target accuracy: 90%
   Target purity: 90%

DATA LOADING & PREPROCESSING

[5/10] Loading dataset...
‚úÖ Loaded 489,902 products with 45 fields

   Parsing hierarchy (pathlist_names)...
   Original products: 489,902
   After filtering ‚â•3 levels: 489,902
   Unique level3 categories: 231

   Filtering categories (‚â•200 products)...
   After filtering: 481,893 products, 114 categories

   Balancing dataset (300 products per category)...
   Balanced dataset: 32,885 products, 114 categories

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


   Model loaded: 768-dimensional embeddings


Batches:   0%|          | 0/1028 [00:00<?, ?it/s]

‚úÖ Embeddings generated: (32885, 768)


In [None]:
# =============================================================================
# PART 6: CLUSTERING (OPTIMIZED FOR ACCURACY)
# =============================================================================
# NOTE(review): the progress counter goes from [7/10] (embeddings) straight to
# [9/10] (evaluation); this section never prints an "[8/10]" step.
print("\n" + "=" * 80 + "\nCLUSTERING ALGORITHMS (OPTIMIZED FOR ACCURACY)\n" + "=" * 80)

# Ground truth: the level-3 category of every product, aligned row-for-row
# with the embedding matrix.
true_labels = df['level3'].values
n_true_categories = len(pd.unique(true_labels))

print(f"   True categories: {n_true_categories}")
print(f"   Total products: {len(embeddings):,}")

# Maps algorithm name -> array of predicted cluster ids (one per product).
clustering_results = {}

# --------------------------------------------------
# Algorithm 1: HDBSCAN (OPTIMIZED)
# --------------------------------------------------
print("\n   [Algorithm 1/4] HDBSCAN (optimized)...")
import time
start = time.time()

# Minimum cluster size scales with the data: one third of the average
# category size, but never below 30.
hdbscan_min_cluster_size = max(30, len(embeddings) // (n_true_categories * 3))

hdbscan_clusterer = hdbscan.HDBSCAN(
    min_cluster_size=hdbscan_min_cluster_size,
    min_samples=3,
    metric='euclidean',
    cluster_selection_epsilon=0.0,    # no merging of nearby clusters
    cluster_selection_method='leaf',  # leaf selection gives more granular clusters
    core_dist_n_jobs=-1,
)
labels_hdbscan = hdbscan_clusterer.fit_predict(embeddings)

# Label -1 marks noise and is not counted as a cluster.
n_clusters_hdbscan = len(set(labels_hdbscan) - {-1})
noise_hdbscan = int(np.count_nonzero(labels_hdbscan == -1))

elapsed = time.time() - start
print(f"      Found {n_clusters_hdbscan} clusters in {elapsed:.1f}s")
print(f"      Noise: {noise_hdbscan} ({noise_hdbscan / len(labels_hdbscan) * 100:.1f}%)")
clustering_results['HDBSCAN'] = labels_hdbscan

# --------------------------------------------------
# Algorithm 2: BIRCH (OPTIMIZED)
# --------------------------------------------------
print("\n   [Algorithm 2/4] BIRCH (optimized)...")
start = time.time()

# Try multiple thresholds on a fixed subset and keep the one with the best
# silhouette score. The subset is simply the first 5000 rows.
birch_sample = embeddings[:5000]
best_threshold = None
best_silhouette = -1

for threshold in [0.3, 0.5, 0.7]:
    birch = Birch(threshold=threshold, n_clusters=n_true_categories)
    test_labels = birch.fit_predict(birch_sample)

    try:
        # random_state pins the internal 1000-point subsample so the chosen
        # threshold is reproducible from run to run.
        score = silhouette_score(
            birch_sample,
            test_labels,
            sample_size=1000,
            random_state=RANDOM_SEED,
        )
    except ValueError as err:
        # silhouette_score rejects degenerate labelings (e.g. a single
        # cluster); report and move on instead of hiding the failure.
        print(f"      Skipping threshold {threshold}: {err}")
        continue

    if score > best_silhouette:
        best_silhouette = score
        best_threshold = threshold

if best_threshold is None:
    # Every candidate failed to score. Fall back to a mid-range value rather
    # than passing threshold=None to Birch, which would raise.
    best_threshold = 0.5
    print("      No threshold could be scored; falling back to 0.5")

print(f"      Best threshold: {best_threshold}")

birch_clusterer = Birch(threshold=best_threshold, n_clusters=n_true_categories)
labels_birch = birch_clusterer.fit_predict(embeddings)
n_clusters_birch = len(set(labels_birch))

elapsed = time.time() - start
print(f"      Found {n_clusters_birch} clusters in {elapsed:.1f}s")
clustering_results['BIRCH'] = labels_birch

# --------------------------------------------------
# Algorithm 3: DBSCAN (MUCH BETTER EPSILON)
# --------------------------------------------------
print("\n   [Algorithm 3/4] DBSCAN (improved epsilon)...")
start = time.time()

from sklearn.neighbors import NearestNeighbors


def _count_clusters(labels):
    """Number of distinct cluster ids, not counting the noise label -1."""
    return len(set(labels) - {-1})


# Epsilon is estimated from k-nearest-neighbour distances on a reproducible
# random subset of at most 10000 embeddings.
sample_size = min(10000, len(embeddings))
rng = np.random.RandomState(RANDOM_SEED)
sample_idx = rng.choice(len(embeddings), sample_size, replace=False)
embeddings_sample = embeddings[sample_idx]

k = 4
nbrs = NearestNeighbors(n_neighbors=k, n_jobs=-1)
nbrs.fit(embeddings_sample)
distances, indices = nbrs.kneighbors(embeddings_sample)

# Keep only the distance to the k-th returned neighbour, in ascending order.
distances = np.sort(distances[:, -1])

# The starting estimate is the 70th percentile of those distances (a fixed
# percentile rather than an actual elbow search); a lower percentile favours
# denser clusters.
optimal_eps = np.percentile(distances, 70)

print(f"      Estimated optimal epsilon: {optimal_eps:.3f}")

# Probe a handful of multiples of the estimate on the subset and keep the one
# whose cluster count lands closest to the number of true categories. The
# baseline is "0 clusters at the unscaled estimate", so a candidate must do
# strictly better than that to be selected.
# NOTE(review): epsilon is tuned on the subset but applied to the full, denser
# embedding set below - confirm the cluster count carries over.
best_eps = optimal_eps
best_n_clusters = 0
best_gap = abs(best_n_clusters - n_true_categories)

for eps_multiplier in (0.7, 0.85, 1.0, 1.15, 1.3):
    test_eps = optimal_eps * eps_multiplier
    dbscan_test = DBSCAN(eps=test_eps, min_samples=4, n_jobs=-1)
    test_labels = dbscan_test.fit_predict(embeddings_sample)
    test_n_clusters = _count_clusters(test_labels)

    gap = abs(test_n_clusters - n_true_categories)
    if gap < best_gap:
        best_gap, best_n_clusters, best_eps = gap, test_n_clusters, test_eps

print(f"      Using epsilon: {best_eps:.3f} (expecting ~{best_n_clusters} clusters)")

# Final run on the full embedding matrix with the selected epsilon.
dbscan = DBSCAN(eps=best_eps, min_samples=4, n_jobs=-1)
labels_dbscan = dbscan.fit_predict(embeddings)
n_clusters_dbscan = _count_clusters(labels_dbscan)

elapsed = time.time() - start
print(f"      Found {n_clusters_dbscan} clusters in {elapsed:.1f}s")
clustering_results['DBSCAN'] = labels_dbscan

# --------------------------------------------------
# Algorithm 4: Agglomerative (OPTIMIZED)
# --------------------------------------------------
print("\n   [Algorithm 4/4] Agglomerative (optimized)...")
start = time.time()

# Cosine distance with average linkage; the number of clusters is fixed to
# the number of true categories.
# NOTE(review): without a connectivity constraint this presumably needs the
# full pairwise distance matrix - check memory use on the full dataset.
agglomerative_options = {
    'n_clusters': n_true_categories,
    'metric': 'cosine',
    'linkage': 'average',
}
agg_clusterer = AgglomerativeClustering(**agglomerative_options)
labels_agg = agg_clusterer.fit_predict(embeddings)
n_clusters_agg = len(np.unique(labels_agg))

elapsed = time.time() - start
print(f"      Created {n_clusters_agg} clusters in {elapsed:.1f}s")
clustering_results['Agglomerative'] = labels_agg

print("\n‚úÖ All clustering algorithms complete!")

# =============================================================================
# PART 7: EVALUATION (corrections_3.txt methodology)
# =============================================================================

# Section banner followed by the progress line for the evaluation step.
print("\n" + "=" * 80 + "\nEVALUATION - 2-STEP VALIDATION\n" + "=" * 80)

print("\n[9/10] Evaluating clustering quality...")

def calculate_purity(predicted_labels, true_labels):
    """STEP 2 from corrections_3.txt: Calculate purity.

    Purity is the share of non-noise points that belong to the most common
    true category of their predicted cluster.

    Args:
        predicted_labels: Cluster id per point (array or plain sequence);
            -1 marks noise and is excluded from the purity computation.
        true_labels: True category per point, aligned with predicted_labels.

    Returns:
        Tuple ``(purity, noise)``: purity as a float in [0, 1] (0.0 when no
        point was assigned to a cluster) and the number of noise points.
    """
    # Coerce so that plain lists work as well as NumPy arrays.
    predicted_labels = np.asarray(predicted_labels)
    true_labels = np.asarray(true_labels)

    mask = predicted_labels != -1
    predicted_filtered = predicted_labels[mask]
    true_filtered = true_labels[mask]

    # Computed before the early return so an all-noise labeling reports its
    # real noise count instead of 0.
    noise = len(predicted_labels) - len(predicted_filtered)

    if len(predicted_filtered) == 0:
        return 0.0, noise

    total_correct = 0
    for cluster_id in np.unique(predicted_filtered):
        cluster_true_labels = true_filtered[predicted_filtered == cluster_id]
        # Size of the dominant true category inside this cluster.
        total_correct += Counter(cluster_true_labels.tolist()).most_common(1)[0][1]

    purity = total_correct / len(predicted_filtered)

    return purity, noise

def calculate_accuracy(predicted_labels, true_labels):
    """STEP 3 from corrections_3.txt: Accuracy metric.

    A point counts as correctly placed when its true category is the
    dominant true category of its predicted cluster. Unlike purity, the
    denominator is ALL points, so points labelled as noise (-1) lower the
    score.

    Args:
        predicted_labels: Cluster id per point (array or plain sequence);
            -1 marks noise.
        true_labels: True category per point, aligned with predicted_labels.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when no point was assigned to a
        cluster.
    """
    # Coerce so that plain lists work as well as NumPy arrays.
    predicted_labels = np.asarray(predicted_labels)
    true_labels = np.asarray(true_labels)

    mask = predicted_labels != -1
    predicted_filtered = predicted_labels[mask]
    true_filtered = true_labels[mask]

    if len(predicted_filtered) == 0:
        return 0.0

    # The points matching their cluster's dominant category are exactly the
    # members of that dominant category, so summing its size per cluster
    # gives the number of correctly placed points.
    correctly_placed = 0
    for cluster_id in np.unique(predicted_filtered):
        cluster_true_labels = true_filtered[predicted_filtered == cluster_id]
        correctly_placed += Counter(cluster_true_labels.tolist()).most_common(1)[0][1]

    return correctly_placed / len(predicted_labels)

# Score every algorithm and collect its metrics under the algorithm's name.
evaluation_results = {}

for algo_name, predicted_labels in clustering_results.items():
    rule = "=" * 80
    print("\n" + rule)
    print(algo_name)
    print(rule)

    # STEP 1: compare the number of clusters found (noise label -1 is not a
    # cluster) with the number of true categories.
    n_predicted = len(set(predicted_labels) - {-1})
    print(f"   Predicted clusters: {n_predicted}")
    print(f"   True categories: {n_true_categories}")
    print(f"   Difference: {abs(n_predicted - n_true_categories)}")

    # STEP 2: purity over the non-noise points, plus the noise share.
    purity, noise = calculate_purity(predicted_labels, true_labels)
    noise_share = noise / len(predicted_labels) * 100
    print(f"\n   Purity: {purity:.2%}")
    print(f"   Noise points: {noise} ({noise_share:.1f}%)")

    # STEP 3: accuracy over all points.
    accuracy = calculate_accuracy(predicted_labels, true_labels)
    print(f"   Accuracy: {accuracy:.2%}")

    # Supplementary label-agreement scores; they receive the raw predictions,
    # so noise (-1) is treated like any other label.
    nmi = normalized_mutual_info_score(true_labels, predicted_labels)
    ari = adjusted_rand_score(true_labels, predicted_labels)
    print(f"   NMI: {nmi:.2%}")
    print(f"   ARI: {ari:.2%}")

    # Both thresholds have to be reached for a pass.
    passes_target = purity >= TARGET_PURITY and accuracy >= TARGET_ACCURACY
    verdict = '‚úÖ YES' if passes_target else '‚ùå NO'
    print(f"\n   Target Met: {verdict}")

    evaluation_results[algo_name] = dict(
        n_clusters=n_predicted,
        purity=purity,
        accuracy=accuracy,
        nmi=nmi,
        ari=ari,
        noise=noise,
        passes_target=passes_target,
    )

print("\n‚úÖ Evaluation complete!")

# =============================================================================
# PART 8: RESULTS SUMMARY
# =============================================================================

print("\n" + "=" * 80 + "\nRESULTS SUMMARY\n" + "=" * 80)


def _summary_row(metrics):
    """Format one algorithm's metrics as display strings for the summary table."""
    return {
        'Accuracy': f"{metrics['accuracy']:.2%}",
        'Purity': f"{metrics['purity']:.2%}",
        'NMI': f"{metrics['nmi']:.2%}",
        'N_Clusters': metrics['n_clusters'],
        'Target_Met': '‚úÖ' if metrics['passes_target'] else '‚ùå',
    }


# One column per algorithm, then transposed so that each algorithm is a row.
summary_columns = {
    algo: _summary_row(algo_metrics)
    for algo, algo_metrics in evaluation_results.items()
}
summary_df = pd.DataFrame(summary_columns).T

print("\n", summary_df)

# The winner is the algorithm with the highest accuracy (first one on ties).
best_name = max(evaluation_results, key=lambda name: evaluation_results[name]['accuracy'])
best_metrics = evaluation_results[best_name]
best_algo = (best_name, best_metrics)

rule = "=" * 80
target_status = '‚úÖ YES!' if best_metrics['passes_target'] else '‚ùå Not yet'

print("\n" + rule)
print(f"üèÜ BEST ALGORITHM: {best_name}")
print(rule)
print(f"   Accuracy: {best_metrics['accuracy']:.2%}")
print(f"   Purity: {best_metrics['purity']:.2%}")
print(f"   NMI: {best_metrics['nmi']:.2%}")
print(f"   Clusters: {best_metrics['n_clusters']} (true: {n_true_categories})")
print(f"\n   Target Achieved: {target_status}")
print(rule)

# =============================================================================
# PART 9: VISUALIZATIONS
# =============================================================================

print("\n" + "="*80)
print("GENERATING VISUALIZATIONS")
print("="*80)

print("\n[10/10] Creating plots...")

# Set style
# Applies to every figure created from here on.
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100

# 1. Category Distribution
# Left panel: the 30 largest categories; right panel: histogram of category sizes.
# NOTE: `df` is the balanced sample at this point, so the counts reflect the
# capped per-category sizes, not the raw dataset distribution.
print("\n   [Plot 1/5] Category distribution...")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

counts = df['level3'].value_counts()
counts.head(30).plot(kind='barh', ax=ax1)
ax1.set_title('Top 30 Categories', fontsize=14, fontweight='bold')
ax1.set_xlabel('Number of Products')
# Flip the axis so the first (largest) category is drawn at the top.
ax1.invert_yaxis()

ax2.hist(counts.values, bins=50, edgecolor='black')
ax2.set_title('Distribution of Products per Category', fontsize=14, fontweight='bold')
ax2.set_xlabel('Number of Products')
ax2.set_ylabel('Number of Categories')
# Dashed red line marks the median category size.
ax2.axvline(counts.median(), color='red', linestyle='--', label=f'Median: {counts.median():.0f}')
ax2.legend()

plt.tight_layout()
# Save before show() so the file is written from the fully drawn figure.
plt.savefig(f'{OUTPUT_DIR}/01_category_distribution.png', bbox_inches='tight', dpi=300)
plt.show()
print(f"      Saved: {OUTPUT_DIR}/01_category_distribution.png")

# 2. UMAP 2D Projection (sample for speed)
# Projects a random subset of at most VIZ_SAMPLE_SIZE embeddings to 2-D and
# colours each point by its true level-3 category.
print("\n   [Plot 2/5] UMAP 2D projection (this may take a few minutes)...")
# NOTE: rebinds `sample_size`, which the DBSCAN section also assigns.
sample_size = min(VIZ_SAMPLE_SIZE, len(embeddings))
# Seeds NumPy's global RNG so the sampled indices are reproducible.
np.random.seed(RANDOM_SEED)
# `idx` is reused by the next plot to pick the matching cluster labels.
idx = np.random.choice(len(embeddings), sample_size, replace=False)

reducer = umap.UMAP(n_components=2, random_state=RANDOM_SEED)
embeddings_2d = reducer.fit_transform(embeddings[idx])

fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    # factorize turns category names into integer codes for the colour map.
    # NOTE(review): 'tab20' has 20 colours; with more categories than that,
    # several categories presumably end up sharing a colour - confirm this is acceptable.
    c=pd.factorize(df['level3'].iloc[idx])[0],
    cmap='tab20',
    alpha=0.6,
    s=10
)
ax.set_title('UMAP Projection (Colored by True Categories)', fontsize=14, fontweight='bold')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/02_embeddings_umap.png', bbox_inches='tight', dpi=300)
plt.show()
print(f"      Saved: {OUTPUT_DIR}/02_embeddings_umap.png")

# 3. Best Algorithm Clusters
# Same 2-D coordinates as the previous plot (`embeddings_2d`, `idx`), now
# coloured by the cluster ids of the best-scoring algorithm.
print(f"\n   [Plot 3/5] {best_name} cluster visualization...")
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    # Cluster ids for the sampled rows only; noise (-1), if any, is coloured
    # like any other id.
    c=clustering_results[best_name][idx],
    cmap='tab20',
    alpha=0.6,
    s=10
)
ax.set_title(f'{best_name} Clustering Results', fontsize=14, fontweight='bold')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.tight_layout()
# The algorithm name is part of the file name, so it varies between runs.
plt.savefig(f'{OUTPUT_DIR}/03_{best_name}_clusters.png', bbox_inches='tight', dpi=300)
plt.show()
print(f"      Saved: {OUTPUT_DIR}/03_{best_name}_clusters.png")

# 4. Algorithm Comparison
# One bar chart per metric, each with a dashed red reference line.
print("\n   [Plot 4/5] Algorithm comparison...")
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

metrics_to_plot = ['accuracy', 'purity', 'nmi']
# Reference line per metric; metrics without a configured target use 0.8.
target_by_metric = {'accuracy': TARGET_ACCURACY, 'purity': TARGET_PURITY}

for i, metric in enumerate(metrics_to_plot):
    panel = axes[i]
    data = [scores[metric] for scores in evaluation_results.values()]

    panel.bar(evaluation_results.keys(), data, color='steelblue')
    panel.set_title(metric.upper(), fontsize=12, fontweight='bold')
    panel.set_ylabel('Score')
    panel.set_ylim(0, 1)
    panel.axhline(
        target_by_metric.get(metric, 0.8),
        color='red',
        linestyle='--',
        label='Target',
    )
    panel.legend()
    panel.tick_params(axis='x', rotation=45)

plt.suptitle('Clustering Algorithm Comparison', fontsize=16, fontweight='bold')
plt.tight_layout()

comparison_png = f'{OUTPUT_DIR}/04_algorithm_comparison.png'
plt.savefig(comparison_png, bbox_inches='tight', dpi=300)
plt.show()
print(f"      Saved: {comparison_png}")

# 5. Results Table
# Renders the per-algorithm metrics as a table image on an otherwise empty axes.
print("\n   [Plot 5/5] Results table...")
fig, ax = plt.subplots(figsize=(12, 6))
ax.axis('tight')
ax.axis('off')

column_labels = ['Algorithm', 'Accuracy', 'Purity', 'NMI', 'Clusters', 'Target']
target_column = 5  # position of the pass/fail marker within each row

# One row of display strings per algorithm.
table_data = [
    [
        algo,
        f"{metrics['accuracy']:.1%}",
        f"{metrics['purity']:.1%}",
        f"{metrics['nmi']:.1%}",
        f"{metrics['n_clusters']}",
        '‚úÖ' if metrics['passes_target'] else '‚ùå',
    ]
    for algo, metrics in evaluation_results.items()
]

table = ax.table(
    cellText=table_data,
    colLabels=column_labels,
    cellLoc='center',
    loc='center',
    colWidths=[0.2, 0.15, 0.15, 0.15, 0.15, 0.1],
)

table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2)

# Header cells sit in table row 0: blue background, bold white text.
for column in range(len(column_labels)):
    header_cell = table[(0, column)]
    header_cell.set_facecolor('#4472C4')
    header_cell.set_text_props(weight='bold', color='white')

# Data rows start at table row 1: green when the target was met, red otherwise.
for row_number, row in enumerate(table_data, start=1):
    met_target = row[target_column] == '‚úÖ'
    table[(row_number, target_column)].set_facecolor('#C6EFCE' if met_target else '#FFC7CE')

plt.title('Final Results Summary', fontsize=14, fontweight='bold', pad=20)

summary_png = f'{OUTPUT_DIR}/05_results_summary.png'
plt.savefig(summary_png, bbox_inches='tight', dpi=300)
plt.show()
print(f"      Saved: {summary_png}")

print("\n‚úÖ All visualizations created!")

# =============================================================================
# PART 10: SAVE RESULTS
# =============================================================================

print("\n" + "=" * 80 + "\nSAVING RESULTS\n" + "=" * 80)

# Metrics table, one row per algorithm.
summary_csv = f"{OUTPUT_DIR}/clustering_results_summary.csv"
summary_df.to_csv(summary_csv)
print(f"‚úÖ Saved: {summary_csv}")

# Per-product predictions of the best algorithm, next to the true category
# and the text that was embedded.
df['predicted_cluster'] = clustering_results[best_name]
export_columns = ['Brand', 'level3', 'predicted_cluster', 'combined_text']
predictions_csv = f"{OUTPUT_DIR}/predictions.csv"
df.loc[:, export_columns].to_csv(predictions_csv, index=False)
print(f"‚úÖ Saved: {predictions_csv}")

# =============================================================================
# FINAL MESSAGE
# =============================================================================

# Closing recap: winning algorithm, its headline scores and the output folder.
closing_rule = "=" * 80
if best_metrics['passes_target']:
    target_outcome = 'YES! üéâ'
else:
    target_outcome = 'Not yet - try tuning parameters'

print("\n" + closing_rule)
print("‚úÖ PIPELINE COMPLETE!")
print(closing_rule)
print(f"\nüèÜ Best Algorithm: {best_name}")
print(f"   Accuracy: {best_metrics['accuracy']:.2%}")
print(f"   Purity: {best_metrics['purity']:.2%}")
print(f"   Target Achieved: {target_outcome}")
print(f"\nüìÅ All results saved to: {OUTPUT_DIR}/")
print("\n" + closing_rule + "\n")


CLUSTERING ALGORITHMS (OPTIMIZED FOR ACCURACY)
   True categories: 114
   Total products: 32,885

   [Algorithm 1/4] HDBSCAN (optimized)...
