# Clustering and Visualization Demo

This notebook demonstrates document clustering on a synthetic text dataset: TF-IDF vectorization, dimensionality reduction with LSA and t-SNE for visualization, K-Means clustering, and analysis of the resulting clusters.

## Setup and Data Loading

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.analysis import (
    load_data, preprocess_df, compute_tfidf_matrix, lsa_embeddings,
    tsne_project, plot_scatter, cluster_and_characterize
)

# Location of the corpus CSV; when it is missing, the project generator is
# invoked first (presumably it writes this file -- confirm in generate_data).
DATA = Path('data', 'synthetic_texts.csv')
if not DATA.exists():
    from generate_data import generate
    generate(800)  # 800 is passed straight to the generator (likely the row count -- TODO confirm)

# Read the CSV and run the project's preprocessing over it in one step.
df = preprocess_df(load_data(DATA))

# Quick sanity check: overall size and how many rows each label has.
print(f"Dataset shape: {df.shape}")
print("Label distribution:")
print(df['label'].value_counts())

## TF-IDF Vectorization and Dimensionality Reduction

In [None]:
# Stage 1 -- TF-IDF features built from the 'joined_tokens' column,
# with the vocabulary capped via max_features.
vectorizer, X_tfidf = compute_tfidf_matrix(
    df['joined_tokens'],
    max_features=2000,
)
print(f"TF-IDF matrix shape: {X_tfidf.shape}")

# Stage 2 -- LSA down to 50 components; the fitted reducer is kept as `svd`.
embeddings, svd = lsa_embeddings(
    X_tfidf,
    n_components=50,
)
print(f"LSA embeddings shape: {embeddings.shape}")

# Stage 3 -- t-SNE on the LSA embeddings to get 2-D coordinates for plotting.
tsne_projection = tsne_project(
    embeddings,
    n_components=2,
    perplexity=30,
)
print(f"t-SNE projection shape: {tsne_projection.shape}")

## Visualization by Original Labels

In [None]:
# Static plot: colour each point by the integer category code of its label.
labels = df['label'].astype('category').cat.codes.to_numpy()
plot_scatter(
    tsne_projection,
    labels,
    Path('results') / 'tsne_by_label.png',  # output path handed to plot_scatter
    title='t-SNE Projection by Original Label',
)
print("t-SNE visualization by label saved")

# Optional interactive version; skipped with a message when Plotly is absent.
try:
    import plotly.express as px

    fig = px.scatter(
        x=tsne_projection[:, 0],
        y=tsne_projection[:, 1],
        color=df['label'],
        title='t-SNE Projection by Label',
        labels={'color': 'Label'},
    )
    fig.show()
except ImportError:
    print("Plotly not available")

## Document Clustering with K-Means

In [None]:
# Cluster the documents into six groups. The helper returns three things:
# per-document labels, a mapping of cluster id -> terms, and a mapping of
# cluster id -> sample texts (both mappings are iterated with .items() below
# and in the samples cell).
cluster_labels, cluster_terms, cluster_samples = cluster_and_characterize(
    df,
    n_clusters=6,
    max_features=2000,
)

print("Cluster analysis completed:")
print(f"Number of clusters: {len(set(cluster_labels))}")
print("Top terms per cluster:")
for cluster_id, terms in cluster_terms.items():
    # Only the first five terms are shown to keep the summary compact.
    headline_terms = ', '.join(terms[:5])
    print(f"Cluster {cluster_id}: {headline_terms}")

# Store the assignment on the dataframe so later cells can group by it.
df['cluster'] = cluster_labels

## Visualize Clusters

In [None]:
# Static plot: same 2-D projection as before, now coloured by cluster id.
plot_scatter(
    tsne_projection,
    cluster_labels,
    Path('results') / 'tsne_clusters.png',  # output path handed to plot_scatter
    title='t-SNE Projection with Clusters',
)
print("Cluster visualization saved")

# Optional interactive version; skipped with a message when Plotly is absent.
try:
    import plotly.express as px

    fig = px.scatter(
        x=tsne_projection[:, 0],
        y=tsne_projection[:, 1],
        # String names make Plotly treat clusters as discrete categories
        # rather than a continuous colour scale.
        color=[f'Cluster {c}' for c in cluster_labels],
        title='t-SNE Projection with Document Clusters',
        labels={'color': 'Cluster'},
    )
    fig.show()
except ImportError:
    print("Plotly not available")

## Cluster Analysis and Characterization

In [None]:
def _most_frequent(values):
    """Return the most frequent value in ``values`` (first entry of value_counts)."""
    return values.value_counts().index[0]


# Per-cluster summary: size, plus the most common label and domain.
# Named aggregation yields the final column names directly.
cluster_analysis = df.groupby('cluster').agg(
    count=('label', 'count'),
    dominant_label=('label', _most_frequent),
    dominant_domain=('domain', _most_frequent),
).round(2)  # kept from the original; has no visible effect on integer counts

print("Cluster composition:")
print(cluster_analysis)

# Cluster x category count tables backing the two stacked bar charts.
label_dist = df.groupby(['cluster', 'label']).size().unstack(fill_value=0)
domain_dist = df.groupby(['cluster', 'domain']).size().unstack(fill_value=0)

# One figure, two side-by-side panels: labels on the left, domains on the right.
plt.figure(figsize=(12, 6))
panels = (
    (label_dist, 'Label Distribution by Cluster'),
    (domain_dist, 'Domain Distribution by Cluster'),
)
for slot, (dist, panel_title) in enumerate(panels, start=1):
    panel_ax = plt.subplot(1, 2, slot)
    dist.plot(kind='bar', stacked=True, ax=panel_ax)
    plt.title(panel_title)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## Cluster Samples and Interpretation

In [None]:
# For every cluster, print its five leading terms followed by up to two
# sample texts, each cut to its first 100 characters.
print("Cluster samples:")
for cluster_id, samples in cluster_samples.items():
    print(f"\nCluster {cluster_id}:")
    leading_terms = cluster_terms[cluster_id][:5]
    print(f"Top terms: {', '.join(leading_terms)}")
    print("Sample texts:")
    for position, text in enumerate(samples[:2], start=1):
        snippet = text[:100]
        print(f"  {position}. {snippet}...")

## Advanced Clustering: Optimal Cluster Number

In [None]:
# Elbow method: fit K-Means for k = 2..10 on the LSA embeddings and record
# the inertia of each fit.
from sklearn.cluster import KMeans

k_range = range(2, 11)
inertia = [
    # fit() returns the fitted estimator, so inertia_ can be read off directly.
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings).inertia_
    for k in k_range
]

# Plot inertia against k; the "elbow" is where the curve stops dropping sharply.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_range, inertia, 'bo-')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal Cluster Number')
elbow_ax.set_xticks(k_range)
elbow_ax.grid(True)
plt.show()

# Same numbers as text, for readers who want exact values.
print("Inertia values:")
for k, value in zip(k_range, inertia):
    print(f"k={k}: {value:.2f}")

## Interactive Cluster Exploration

In [None]:
# Interactive cluster exploration
#
# Plotly Express can only resolve column *names* (such as 'label' or 'domain'
# in hover_data) when it is given a data_frame. The previous version passed raw
# arrays for x/y and still referred to columns by name, which raises a
# ValueError -- and that error is not caught by `except ImportError`.
# Passing `df` itself would not help either: the truncated text was supplied
# under the key 'text', which clashes with the existing 'text' column.
# Building one small plotting frame avoids both problems.
try:
    import plotly.express as px
except ImportError:
    # Only a missing Plotly installation is treated as "skip this cell";
    # genuine plotting errors are allowed to surface.
    print("Plotly not available for interactive exploration")
else:
    # .to_numpy() drops the pandas index so every column lines up by position
    # with the rows of tsne_projection, whatever index df carries.
    explore_df = pd.DataFrame({
        'x': tsne_projection[:, 0],
        'y': tsne_projection[:, 1],
        # String names make Plotly use a discrete colour per cluster.
        'color': [f'Cluster {c}' for c in cluster_labels],
        'label': df['label'].to_numpy(),
        'domain': df['domain'].to_numpy(),
        'cluster': np.asarray(cluster_labels),
        # First 100 characters of each document, shown in the tooltip.
        'text': (df['text'].str.slice(0, 100) + '...').to_numpy(),
    })

    # Columns to show in the hover tooltip, resolved against explore_df.
    hover_data = ['label', 'domain', 'cluster', 'text']

    fig = px.scatter(
        explore_df,
        x='x',
        y='y',
        color='color',
        hover_data=hover_data,
        title='Interactive Cluster Exploration',
        labels={'color': 'Cluster'},
    )
    fig.show()

## Clustering Summary

This notebook demonstrates:
1. TF-IDF vectorization and dimensionality reduction
2. Document clustering with K-Means
3. Cluster visualization with t-SNE
4. Cluster characterization and analysis
5. Optimal cluster number determination
6. Interactive cluster exploration

Key insights:
- Documents naturally cluster based on content similarity
- Clusters often correspond to domains or specific themes
- Visualization helps understand cluster structure and relationships
- Cluster analysis reveals patterns not visible in original labels