# Topographical Analysis of Semantic Spaces

This notebook provides an interactive exploration of topographical learning methods for semantic verification.

## Overview
- Project high-dimensional embeddings to 2D/3D spaces
- Compare different projection methods (PCA, UMAP, t-SNE); SOM is covered only in the parameter-selection section
- Analyze projection quality metrics
- Track semantic evolution over time

In [None]:
# Setup and imports
import sys
from pathlib import Path
# Make the parent of the current working directory importable.
# NOTE(review): presumably this is the repository root that holds the `pot`
# package -- this only works when the notebook is launched from its own folder.
sys.path.append(str(Path.cwd().parent))

import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import warnings
# Blanket filter: hides every warning category for the rest of the session,
# including deprecation and convergence warnings from the projection libraries.
warnings.filterwarnings('ignore')

# Configure plotting
# The 'seaborn-v0_8-*' style names were introduced in matplotlib 3.6; older
# releases ship the same style as 'seaborn-darkgrid'.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Import PoT modules
from pot.semantic import (
    ConceptLibrary,
    SemanticMatcher,
    TopographicalProjector,
    TopographicalEvolutionTracker,
    create_topographical_semantic_system
)
from pot.semantic.topography_utils import (
    prepare_latents_for_projection,
    compute_trustworthiness,
    compute_continuity,
    select_optimal_parameters,
    identify_clusters_in_projection
)
from pot.semantic.topography_visualizer import (
    plot_projection,
    compare_projections,
    create_interactive_plot
)

print("Setup complete!")

## 1. Generate Sample Data

Let's create some synthetic embeddings with cluster structure to demonstrate the topographical methods.

In [None]:
def generate_clustered_embeddings(n_samples=500, dim=128, n_clusters=4, seed=42):
    """Generate synthetic embeddings with Gaussian cluster structure.

    Each cluster gets a random center (standard normal scaled by 3) and its
    members are drawn around that center with standard deviation 0.8.
    Exactly ``n_samples`` rows are returned: when ``n_samples`` is not a
    multiple of ``n_clusters`` the leftover samples go to the first clusters,
    one each.

    Args:
        n_samples: Total number of embeddings to generate.
        dim: Dimensionality of each embedding.
        n_clusters: Number of clusters; must be at least 1.
        seed: Seed applied to the *global* NumPy and PyTorch RNGs so that
            this call (and any random draws made after it) is reproducible.

    Returns:
        Tuple ``(embeddings, labels)``: ``embeddings`` is a tensor of shape
        ``(n_samples, dim)`` and ``labels`` is an integer NumPy array of
        length ``n_samples``. Rows are ordered by cluster index.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1 or ``n_samples`` is
            negative.
    """
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be >= 1, got {n_clusters}")
    if n_samples < 0:
        raise ValueError(f"n_samples must be >= 0, got {n_samples}")

    # Seeding is deliberately global: later notebook cells draw from the same
    # RNG streams and rely on them being reproducible.
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Spread the remainder over the first clusters instead of dropping it.
    base_size, remainder = divmod(n_samples, n_clusters)

    embeddings = []
    labels = []

    for i in range(n_clusters):
        cluster_size = base_size + (1 if i < remainder else 0)

        # Create cluster center
        center = torch.randn(dim) * 3

        # Generate samples around center
        cluster = torch.randn(cluster_size, dim) * 0.8 + center
        embeddings.append(cluster)
        labels.extend([i] * cluster_size)

    # dtype=int keeps the label array integer-typed even when it is empty.
    return torch.cat(embeddings), np.array(labels, dtype=int)

# Generate data
embeddings, labels = generate_clustered_embeddings(n_samples=600, dim=128, n_clusters=4)
print(f"Generated embeddings: {embeddings.shape}")
print(f"Number of clusters: {len(np.unique(labels))}")

## 2. Basic Projection with Different Methods

Let's project the embeddings using different methods and compare them.

In [None]:
# Results shared with later cells, keyed by method name:
#   projections[method] -> projected coordinates
#   metrics[method]     -> quality scores plus the labels of the projected rows
projections = {}
metrics = {}

# Prepare data
embeddings_np = prepare_latents_for_projection(embeddings, normalize=True)

# Methods to test
methods = ['pca', 'umap', 'tsne']

# t-SNE is slow, so it only sees a subsample of at most this many points
TSNE_MAX_SAMPLES = 300

for method in methods:
    print(f"\nProjecting with {method.upper()}...")

    # Create projector
    projector = TopographicalProjector(method)

    if method == 'tsne':
        # The embeddings are ordered by cluster, so a leading slice would only
        # contain the first clusters. An evenly strided subsample keeps every
        # cluster represented. Ceil-division keeps the count <= the cap.
        step = max(1, -(-len(labels) // TSNE_MAX_SAMPLES))
        subset = embeddings[::step]
        subset_np = embeddings_np[::step]
        subset_labels = labels[::step]
    else:
        subset = embeddings
        subset_np = embeddings_np
        subset_labels = labels

    # Project
    # NOTE(review): the projector is fed the raw embeddings while the metrics
    # below compare against the normalized `embeddings_np` -- confirm that
    # project_latents applies equivalent preprocessing internally.
    projected = projector.project_latents(subset)
    projections[method] = projected

    # Compute metrics
    trust = compute_trustworthiness(subset_np, projected, n_neighbors=10)
    cont = compute_continuity(subset_np, projected, n_neighbors=10)

    metrics[method] = {
        'trustworthiness': trust,
        'continuity': cont,
        'labels': subset_labels
    }

    print(f"  Trustworthiness: {trust:.3f}")
    print(f"  Continuity: {cont:.3f}")

## 3. Visualize Projections

In [None]:
# Create visualization: one panel per projection that was actually computed.
# constrained_layout is used instead of tight_layout because tight_layout
# cannot place a colorbar that spans several axes.
n_panels = len(projections)
fig, axes = plt.subplots(
    1,
    n_panels,
    figsize=(5 * n_panels, 5),
    squeeze=False,
    constrained_layout=True,
)
axes = axes[0]  # squeeze=False always returns a 2D array; take the single row

# Use one color scale for every panel so the shared colorbar is valid for all
# of them, even when a panel only shows a subset of the clusters.
all_labels = np.concatenate(
    [np.asarray(metrics[name]['labels']) for name in projections]
)
label_min, label_max = all_labels.min(), all_labels.max()

for ax, (method, projection) in zip(axes, projections.items()):
    # Labels of exactly the rows that were projected with this method
    method_labels = metrics[method]['labels']

    # Scatter plot
    scatter = ax.scatter(
        projection[:, 0],
        projection[:, 1],
        c=method_labels,
        cmap='tab10',
        vmin=label_min,
        vmax=label_max,
        alpha=0.7,
        s=20
    )

    ax.set_title(f"{method.upper()}\nTrust: {metrics[method]['trustworthiness']:.3f}")
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    ax.grid(True, alpha=0.3)

fig.colorbar(scatter, ax=axes, label='Cluster')
plt.show()

## 4. Semantic System Integration

Now let's create a complete semantic system with topographical capabilities.

In [None]:
# Build the library / matcher / projector trio in a single call
library, matcher, projector = create_topographical_semantic_system(
    projection_method='umap',
    dim=128,
)

# Register one concept per synthetic cluster: the position of a name in this
# list is the cluster label it is built from.
concept_names = ['TypeA', 'TypeB', 'TypeC', 'TypeD']
for cluster_id, concept_name in enumerate(concept_names):
    members = embeddings[labels == cluster_id]
    # Only the first 30 members of each cluster are added
    library.add_concept(concept_name, members[:30])

print(f"Added {len(library.concepts)} concepts to library")

# Concept positions, requested with PCA (the original author chose it for speed)
positions = library.get_concept_positions(method='pca')
print("\nConcept positions:")
for concept_name, coords in positions.items():
    print(f"  {concept_name}: {coords}")

## 5. Semantic Trajectory Tracking

Track how embeddings move through semantic space.

In [None]:
# Number of points along the synthetic path
n_steps = 20

# Endpoints: the first member of cluster 0 and the first member of cluster 2
start_embedding = embeddings[labels == 0][0]
end_embedding = embeddings[labels == 2][0]

# Linear interpolation between the endpoints with a little Gaussian jitter
trajectory_embeddings = []
for step in range(n_steps):
    alpha = step / (n_steps - 1)
    blend = (1 - alpha) * start_embedding + alpha * end_embedding
    jitter = torch.randn_like(blend) * 0.1
    trajectory_embeddings.append(blend + jitter)

# Hand the path to the matcher for tracking
trajectory_result = matcher.track_semantic_trajectory(
    trajectory_embeddings,
    projection_method='pca',
    smooth=True
)

print(f"Trajectory length: {trajectory_result['trajectory_length']}")
print(f"Total distance: {trajectory_result['total_distance']:.3f}")
print(f"Mean velocity: {trajectory_result['mean_velocity']:.3f}")

# Draw the tracked path
fig, ax = plt.subplots(figsize=(8, 6))

traj = trajectory_result['trajectory']
path_x, path_y = traj[:, 0], traj[:, 1]
ax.plot(path_x, path_y, 'b-', alpha=0.5, linewidth=2, label='Trajectory')
# Points are colored by their step index along the path
ax.scatter(path_x, path_y, c=range(len(traj)), cmap='viridis', s=50, zorder=5)

# Overlay the concept positions as labelled square markers
# NOTE(review): these come from a separate get_concept_positions call --
# confirm they share a coordinate system with the trajectory projection.
for name, pos in positions.items():
    marker_x, marker_y = pos[0], pos[1]
    ax.scatter(marker_x, marker_y, s=200, marker='s', label=name, alpha=0.7)
    ax.text(marker_x, marker_y, name, fontsize=10, ha='center')

ax.set_title("Semantic Trajectory")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 6. Evolution Tracking

Track how the embedding space evolves over time.

In [None]:
# Simulate evolution
n_epochs = 5
base_embeddings = embeddings.clone()

# The epoch index doubles as the snapshot timestamp
evolution_timestamps = [float(epoch) for epoch in range(n_epochs)]

# Each snapshot is the base embeddings plus Gaussian noise whose scale
# shrinks as 1 / (epoch + 1), so later snapshots sit closer to the base.
evolution_snapshots = [
    (base_embeddings + torch.randn_like(base_embeddings) * (1.0 / (epoch + 1))).numpy()
    for epoch in range(n_epochs)
]

# Feed the snapshots to the tracker in chronological order
tracker = TopographicalEvolutionTracker()
for snap, stamp in zip(evolution_snapshots, evolution_timestamps):
    tracker.add_snapshot(snap, timestamp=stamp, compute_metrics=True)

# Summarize drift across the snapshots
drift_metrics = tracker.compute_drift_metrics()
centroid_shifts = drift_metrics['centroid_shift']

print("Evolution Drift Metrics:")
print(f"  Cumulative drift: {drift_metrics['cumulative_drift']:.3f}")
print(f"  Centroid shifts: {len(centroid_shifts)}")
print(f"  Mean centroid shift: {np.mean(centroid_shifts):.3f}")

# Regime-change detection using the tracker's 'gradient' method
regime_changes = tracker.detect_regime_changes(method='gradient')
print(f"\nRegime changes detected: {regime_changes}")

## 7. Interactive Visualization

Create an interactive plot using plotly (if available).

In [None]:
try:
    # Prefer the UMAP projection and fall back to PCA when UMAP was not run.
    # The labels are read from the same entry as the projection so they always
    # describe exactly the rows being plotted.
    plot_method = 'umap' if 'umap' in projections else 'pca'
    # Name kept for compatibility even though it may hold the PCA fallback
    projected_umap = projections[plot_method]
    plot_labels = metrics[plot_method]['labels']

    # Create hover texts
    hover_texts = [
        f"Cluster: {label}<br>Index: {i}"
        for i, label in enumerate(plot_labels)
    ]

    # Create interactive figure
    fig = create_interactive_plot(
        projected_umap,
        labels=plot_labels,
        hover_texts=hover_texts,
        title="Interactive Embedding Visualization"
    )

    # Display in notebook
    fig.show()

except ImportError as e:
    # NOTE(review): assumes a missing plotly surfaces as ImportError -- confirm
    # against create_interactive_plot.
    print(f"Could not create interactive plot: {e}")
    print("Install plotly for interactive visualizations: pip install plotly")
except Exception as e:
    # Best-effort cell: report any other failure without the install hint,
    # which would be misleading for errors unrelated to plotly.
    print(f"Could not create interactive plot: {e}")

## 8. Parameter Optimization

Automatically select optimal parameters for projection methods.

In [None]:
# Select optimal parameters
data_sample = embeddings_np[:200]  # Use subset

print("Optimal Parameters:")
print("="*50)

for method in ['umap', 'tsne', 'som', 'pca']:
    params = select_optimal_parameters(data_sample, method)
    print(f"\n{method.upper()}:")
    for key, value in params.items():
        print(f"  {key}: {value}")

## 9. Cluster Analysis in Projected Space

In [None]:
# Identify clusters in projected space
projected_pca = projections['pca']

# Try different clustering methods
clustering_methods = ['kmeans', 'dbscan']

fig, axes = plt.subplots(1, len(clustering_methods), figsize=(12, 5))

for idx, method in enumerate(clustering_methods):
    ax = axes[idx]
    
    # Identify clusters
    if method == 'kmeans':
        cluster_labels = identify_clusters_in_projection(
            projected_pca, method=method, n_clusters=4
        )
    else:
        cluster_labels = identify_clusters_in_projection(
            projected_pca, method=method
        )
    
    # Plot
    scatter = ax.scatter(
        projected_pca[:, 0],
        projected_pca[:, 1],
        c=cluster_labels,
        cmap='tab10',
        alpha=0.7,
        s=20
    )
    
    ax.set_title(f"Clustering: {method.upper()}\n{len(np.unique(cluster_labels[cluster_labels >= 0]))} clusters found")
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Compare with true labels
from sklearn.metrics import adjusted_rand_score

for method in clustering_methods:
    if method == 'kmeans':
        cluster_labels = identify_clusters_in_projection(
            projected_pca, method=method, n_clusters=4
        )
    else:
        cluster_labels = identify_clusters_in_projection(
            projected_pca, method=method
        )
    
    # Filter out noise for DBSCAN
    valid_mask = cluster_labels >= 0
    if np.sum(valid_mask) > 0:
        ari = adjusted_rand_score(labels[valid_mask], cluster_labels[valid_mask])
        print(f"{method.upper()} ARI score: {ari:.3f}")

## Summary

This notebook demonstrated:
- **Projection Methods**: UMAP, t-SNE, and PCA for dimensionality reduction
- **Quality Metrics**: Trustworthiness and continuity for evaluating projections
- **Semantic Integration**: Building concept libraries with topographical views
- **Trajectory Tracking**: Following paths through semantic space
- **Evolution Analysis**: Monitoring changes in embedding spaces over time
- **Clustering**: Identifying structure in projected spaces

### Key Takeaways:
1. **PCA** is fast and preserves global structure but may miss nonlinear patterns
2. **UMAP** balances speed and quality, good for most applications
3. **t-SNE** excels at preserving local structure but is slower
4. Quality metrics help choose the right method for your use case
5. Topographical views enable intuitive exploration of semantic spaces