# Visualizing ML Embeddings in VR

Explore high-dimensional embeddings in 3D space using t-SNE, UMAP, or PCA.

In [None]:
# Alternative: Manual installation (uncomment if needed)
# %pip install numpy scikit-learn umap-learn matplotlib immersivepoints

In [None]:
# Environment bootstrap: import everything the notebook needs, installing
# immersivepoints (required) and umap-learn (optional) into the running
# kernel when they are missing.
import sys
import subprocess
import importlib


def _pip_install(package):
    """Install ``package`` into the interpreter that runs this kernel.

    ``sys.executable -m pip`` is used so the package lands in the kernel's own
    environment rather than whichever ``pip`` happens to be first on PATH.
    Import state is refreshed afterwards so the package can be imported
    without restarting the kernel.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package, '--quiet'])
    import site
    # Re-run site initialisation so freshly created site dirs / .pth files are seen.
    importlib.reload(site)
    # Import finders cache directory listings; drop them so the new package is found.
    importlib.invalidate_caches()


try:
    import immersivepoints as ip
except ImportError:
    print("Installing immersivepoints...")
    _pip_install('immersivepoints')
    import immersivepoints as ip
    print("✓ Installation complete!")

# Other imports
import numpy as np
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# UMAP is optional: the UMAP cell checks `umap is not None` and falls back to
# t-SNE, so a failed install must leave `umap` set to None instead of aborting.
try:
    import umap
except ImportError:
    print("UMAP not installed. Installing...")
    try:
        _pip_install('umap-learn')
        import umap
    except (subprocess.CalledProcessError, ImportError) as exc:
        print(f"UMAP installation failed ({exc}); the UMAP cell will fall back to t-SNE.")
        umap = None

print("✓ ML Embedding VR Visualization Ready!")

## Load Dataset

We'll start with the scikit-learn digits dataset (1797 samples, 64 dimensions).

In [None]:
# Fetch the bundled handwritten-digits set (a small stand-in for MNIST).
digits = load_digits()
X, y = digits.data, digits.target  # features: 1797 × 64, labels: 0-9

print("Dataset shape: {}".format(X.shape))
print("Number of classes: {}".format(len(np.unique(y))))
print("Samples per class: {}".format(np.bincount(y)))

## Option to Load MNIST (Full Dataset)

Uncomment to use MNIST instead (70,000 samples in total; the cell below keeps only the first 10,000 for speed). **Warning: t-SNE will be slow!**

In [None]:
# from sklearn.datasets import fetch_openml
# print("Downloading MNIST... this may take a minute")
# mnist = fetch_openml('mnist_784', parser='auto')
# X = mnist.data[:10000]  # Use subset for speed
# y = mnist.target[:10000].astype(int)
# print(f"Loaded {len(X)} MNIST samples")

## Reduce to 3D - Method 1: PCA (Fast)

In [None]:
print("Running PCA to reduce to 3D...")

# Linear projection onto the three directions of largest variance.
pca = PCA(n_components=3)
X_3d_pca = pca.fit_transform(X)

# Fraction of the total variance that survives the projection.
explained_var = pca.explained_variance_ratio_.sum()
print("PCA complete! Explained variance: {:.1f}%".format(explained_var * 100))
print("Shape: {}".format(X_3d_pca.shape))

## Reduce to 3D - Method 2: t-SNE (Better, Slower)

In [None]:
print("Running t-SNE to reduce to 3D...")
print("This may take 30-120 seconds depending on dataset size")

# The iteration-count keyword was renamed from `n_iter` to `max_iter` in
# scikit-learn 1.5 and the old spelling was removed afterwards, so naming
# either one explicitly raises TypeError on some installed versions.
# 1000 iterations is the default under both names, so it is left implicit.
# To change it, pass `max_iter=` (scikit-learn >= 1.5) or `n_iter=` (older).
tsne = TSNE(n_components=3, random_state=42, perplexity=30)
X_3d_tsne = tsne.fit_transform(X)

print("t-SNE complete!")
print(f"Shape: {X_3d_tsne.shape}")

## Reduce to 3D - Method 3: UMAP (Best of Both Worlds)

In [None]:
if umap is None:
    # No UMAP module available: alias the t-SNE result so later cells that
    # reference X_3d_umap still work (requires the t-SNE cell to have run).
    print("UMAP not available. Using t-SNE instead.")
    X_3d_umap = X_3d_tsne
else:
    print("Running UMAP to reduce to 3D...")
    reducer = umap.UMAP(n_components=3, random_state=42, n_neighbors=15)
    X_3d_umap = reducer.fit_transform(X)
    print("UMAP complete!")
    print("Shape: {}".format(X_3d_umap.shape))

## Choose Your Embedding Method

In [None]:
# Active embedding and its display name — change both together when switching.
# Alternatives for the array: X_3d_pca or X_3d_umap.
X_3d, method_name = X_3d_tsne, "t-SNE"

print("Using " + method_name + " embeddings for visualization")

## Color Coding - Option 1: By True Labels

In [None]:
def labels_to_hue(labels, n_classes=10):
    """
    Map class labels to evenly spaced hues.

    Each label ``k`` is mapped to ``k / n_classes * 0.9``. The 0.9 factor keeps
    the largest hue strictly below 0.9 so the top class does not wrap back
    around to the red used by class 0.

    Args:
        labels: Numeric class labels; any array-like (list, tuple, ndarray,
            pandas Series) is accepted.
        n_classes: Number of distinct classes; must be positive.

    Returns:
        numpy.ndarray of float hues with the same shape as ``labels``.

    Raises:
        ValueError: if ``n_classes`` is not positive.
    """
    if n_classes <= 0:
        raise ValueError(f"n_classes must be positive, got {n_classes}")
    # Coerce up front so plain Python sequences support the arithmetic below.
    labels = np.asarray(labels, dtype=float)
    hue = (labels / n_classes) * 0.9  # 0 to 0.9 to avoid red wrapping
    return hue

# One hue per sample; the class count is derived from the labels themselves.
hue_labels = labels_to_hue(y, n_classes=len(np.unique(y)))
print("Colored {} points by class label".format(len(hue_labels)))

## Color Coding - Option 2: By Prediction Correctness

Train a simple classifier and visualize errors.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

print("Training classifier to find misclassifications...")

# 3-fold cross-validated predictions: every sample is predicted by a forest
# that did not see it during training.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred = cross_val_predict(clf, X, y, cv=3)

# Hue 0.33 (green) marks a correct prediction, hue 0.0 (red) a wrong one.
correct = (y_pred == y)
hue_correctness = np.where(correct, 0.33, 0.0)

accuracy = correct.mean()
print("Classifier accuracy: {:.1f}%".format(accuracy * 100))
print("Misclassifications: {} / {}".format((~correct).sum(), len(y)))

## Color Coding - Option 3: By Prediction Confidence

In [None]:
# Imported here as well so this cell runs even if the Option 2 cell was skipped.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

# Get out-of-fold prediction probabilities.
# Scoring a random forest on the very samples it was trained on yields
# near-certain probabilities for almost every point, which makes the
# confidence colouring uninformative. With cross_val_predict each sample is
# scored by a forest that never saw it. Columns follow the sorted class labels.
clf_full = RandomForestClassifier(n_estimators=100, random_state=42)
y_proba = cross_val_predict(clf_full, X, y, cv=3, method='predict_proba')
confidence = y_proba.max(axis=1)

# cross_val_predict works on clones, so fit the named estimator on all data
# to keep `clf_full` usable as a trained model, as it was before.
clf_full.fit(X, y)

# Map confidence to hue (low=red, high=blue)
hue_confidence = confidence * 0.6  # 0 (red) to 0.6 (blue)

print(f"Confidence range: [{confidence.min():.2f}, {confidence.max():.2f}]")
print(f"Low confidence (<0.7) points: {(confidence < 0.7).sum()}")

## Prepare Points for VR

In [None]:
# Pick the per-point colouring (swap in one of the alternatives below):
hue = hue_labels         # By class labels
# hue = hue_correctness  # By prediction correctness
# hue = hue_confidence   # By prediction confidence

# Centre each axis on zero, then rescale by the single overall standard
# deviation (not per axis, so relative proportions are kept) times 5.
X_norm = X_3d - X_3d.mean(axis=0)
X_norm = X_norm / X_norm.std() * 5  # Scale to reasonable VR size

# Assemble rows of (x, y, z, hue) as 32-bit floats.
points = np.column_stack([X_norm[:, :3], hue]).astype(np.float32)

print("\nPoint cloud ready:")
print("  Points: {:,}".format(len(points)))
print("  Bounding box: [{:.1f}, {:.1f}]".format(X_norm.min(), X_norm.max()))

## Visualize in Jupyter

In [None]:
# Show the (x, y, z, hue) point cloud on a dark grey background.
ip.renderPoints(
    points,
    point_size=0.08,
    background_color=0x1a1a1a,
)

## Generate VR Link

In [None]:
# Same point size as the inline render above, for a consistent look.
ip.showVR(
    points,
    point_size=0.08,
)

## Analysis: Find Interesting Patterns

Let's identify some interesting regions.

In [None]:
# Labels as a plain array so positional indexing works for any array-like y.
y_arr = np.asarray(y)
class_labels = np.unique(y_arr)

# Find most confused points.
# Requires the confidence cell (Option 3) to have been run. The prediction
# shown is derived from the same probabilities as the confidence, so the two
# always describe the same model; probability columns follow the sorted
# class labels, which is what np.unique returns.
if 'confidence' in globals() and 'y_proba' in globals():
    proba_pred = class_labels[y_proba.argmax(axis=1)]
    uncertain_idx = confidence.argsort()[:20]  # 20 most uncertain
    print("Most uncertain predictions:")
    for i in uncertain_idx[:10]:
        print(f"  Sample {i}: True={y_arr[i]}, Pred={proba_pred[i]}, Confidence={confidence[i]:.2f}")

# Per-class centre and average per-axis spread in the normalised embedding.
# Iterating over the labels actually present avoids assuming they are 0..n-1.
print("\nClass distribution in embedding space:")
for class_id in class_labels:
    class_mask = (y_arr == class_id)
    class_center = X_norm[class_mask].mean(axis=0)
    class_spread = X_norm[class_mask].std(axis=0).mean()
    print(f"  Class {class_id}: center={class_center}, spread={class_spread:.2f}")

## Save for Upload

In [None]:
output_file = f"ml_embeddings_{method_name.lower()}.xyzi"

# Write raw big-endian float32 values, four per point (x, y, z, hue).
# An explicit '>f4' dtype yields the same bytes the previous unconditional
# byteswap() produced on little-endian hosts, but no longer depends on the
# byte order of the machine running the notebook.
# NOTE(review): big-endian is inferred from that original byteswap — confirm
# against the .xyzi format description.
points_be = points.astype('>f4')
points_be.tofile(output_file)

print(f"Saved to {output_file}")
print(f"File size: {points_be.nbytes / 1024:.1f} KB")

## What to Look For in VR

When exploring embeddings in VR:

1. **Cluster Separation**: Are different classes well-separated?
2. **Misclassification Patterns**: Do errors cluster together?
3. **Outliers**: Isolated points far from their class cluster
4. **Uncertain Regions**: Boundaries where classes overlap
5. **Sub-clusters**: Multiple groups within the same class (e.g., different handwriting styles)

**Try This**:
- With the correctness coloring (`hue = hue_correctness`), walk to a red point (misclassification) and see what surrounds it
- Find the boundary between two classes
- Look for outliers (points far from their cluster)