# Clustering Analysis

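This notebook performs clustering analysis using UMAP for dimensionality reduction and HDBSCAN/K-means for clustering. It loads the cleaned entries and their precomputed embeddings from `data/processed/`, runs both clustering methods, prints a summary of each result, and writes the cluster assignments (K-means if available, otherwise HDBSCAN) back into `cleaned_data.json`.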


In [None]:
import json
import numpy as np
import sys
from pathlib import Path

# Make the project's src/ directory importable.
# The project root is taken to be the parent of the current working directory,
# so this assumes the kernel runs one level below the root -- TODO confirm.
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root.joinpath("src")))

from clustering import perform_clustering_analysis

# Inputs and outputs all live in the same data/processed/ directory.
output_dir = project_root.joinpath("data", "processed")
data_path = output_dir / "cleaned_data.json"
embeddings_path = output_dir / "embeddings.npy"

print(f"Loading data from: {data_path}")
print(f"Loading embeddings from: {embeddings_path}")


In [None]:
# Load the cleaned entries (JSON) and the embeddings array (.npy).
# The JSON file is opened explicitly as UTF-8: without `encoding`, open() falls
# back to the platform's locale encoding, which is not UTF-8 on every system and
# can fail on (or misread) non-ASCII text.
with open(data_path, 'r', encoding='utf-8') as f:
    cleaned_data = json.load(f)

embeddings = np.load(embeddings_path)

# Quick sanity output so a reader can see how much data was loaded.
print(f"Loaded {len(cleaned_data)} entries")
print(f"Embeddings shape: {embeddings.shape}")


In [None]:
# Run the clustering helper with method="both". The cells below look for
# 'hdbscan' and 'kmeans' keys in the returned `results` mapping.
# NOTE(review): output_dir is handed straight to the helper; presumably it writes
# its artifacts there -- confirm in src/clustering.
results = perform_clustering_analysis(
    cleaned_data, embeddings, method="both", output_dir=output_dir
)


In [None]:
# Report the HDBSCAN outcome: overall counts first, then one short profile per cluster.
# The whole cell is skipped when `results` has no 'hdbscan' entry.
if 'hdbscan' in results:
    print("=== HDBSCAN CLUSTERING RESULTS ===")
    hdbscan_analysis = results['hdbscan']['analysis']

    print(f"\nNumber of clusters: {hdbscan_analysis['n_clusters']}")
    print(f"Noise points: {hdbscan_analysis['n_noise']}")

    print("\nCluster summaries:")
    for summary in hdbscan_analysis['cluster_summaries']:
        cluster_id, size = summary['cluster_id'], summary['size']
        print(f"\nCluster {cluster_id} ({size} entries):")

        # Only the keys of the top_domains / top_categories mappings are shown.
        domain_keys = list(summary['top_domains'].keys())
        print(f"  Top domains: {domain_keys}")
        category_keys = list(summary['top_categories'].keys())
        print(f"  Top categories: {category_keys}")

        # Show at most five member names, then how many were left out.
        member_names = summary['names']
        print(f"  Names: {', '.join(member_names[:5])}")
        hidden_count = len(member_names) - 5
        if hidden_count > 0:
            print(f"  ... and {hidden_count} more")


In [None]:
# Report the K-means outcome: the selected k, the cluster count, then one short
# profile per cluster. The whole cell is skipped when `results` has no 'kmeans' entry.
if 'kmeans' in results:
    print("=== K-MEANS CLUSTERING RESULTS ===")
    kmeans_result = results['kmeans']
    kmeans_analysis = kmeans_result['analysis']
    optimal_k = kmeans_result['optimal_k']

    print(f"\nOptimal k: {optimal_k}")
    print(f"Number of clusters: {kmeans_analysis['n_clusters']}")

    print("\nCluster summaries:")
    for summary in kmeans_analysis['cluster_summaries']:
        cluster_id, size = summary['cluster_id'], summary['size']
        print(f"\nCluster {cluster_id} ({size} entries):")

        # Only the keys of the top_domains / top_categories mappings are shown.
        domain_keys = list(summary['top_domains'].keys())
        print(f"  Top domains: {domain_keys}")
        category_keys = list(summary['top_categories'].keys())
        print(f"  Top categories: {category_keys}")

        # Show at most five member names, then how many were left out.
        member_names = summary['names']
        print(f"  Names: {', '.join(member_names[:5])}")
        hidden_count = len(member_names) - 5
        if hidden_count > 0:
            print(f"  ... and {hidden_count} more")


In [None]:
# Attach a cluster assignment to every entry and save the updated data.
#
# K-means labels are preferred; HDBSCAN labels are the fallback. Each entry gets a
# new key named 'cluster_<method>' holding its label as a plain int.
if 'kmeans' in results:
    method = 'kmeans'
elif 'hdbscan' in results:
    method = 'hdbscan'
else:
    # The display cells treat both result sets as optional, so fail with a clear
    # message instead of a bare KeyError on 'hdbscan'.
    raise KeyError(
        "No clustering results available: expected 'kmeans' or 'hdbscan' in results"
    )
cluster_labels = results[method]['labels']

# Labels are matched to entries by position, so the two must be the same length.
# Check before mutating anything so a mismatch cannot leave the data half-labelled
# or silently misaligned.
if len(cluster_labels) != len(cleaned_data):
    raise ValueError(
        f"Got {len(cluster_labels)} {method} labels for {len(cleaned_data)} entries"
    )

# int(...) converts each label to a built-in int before it is serialized.
for entry, label in zip(cleaned_data, cluster_labels):
    entry[f'cluster_{method}'] = int(label)

# Save updated data. This is the same file the entries were loaded from, so write
# to a sibling temp file first and only swap it in once the JSON has been written
# completely; a failed dump then leaves the original file intact.
output_path = project_root / "data" / "processed" / "cleaned_data.json"
tmp_path = output_path.with_name(output_path.name + ".tmp")
try:
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, indent=2)
    tmp_path.replace(output_path)
finally:
    # After a successful replace the temp file is gone; remove it only on failure.
    if tmp_path.exists():
        tmp_path.unlink()

print(f"Saved data with cluster assignments to: {output_path}")
