# Reconstruction Error Analysis

Analyzing information loss in PCA:
1. Reconstruction error vs number of components
2. Visualizing the tradeoff
3. Choosing optimal k

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_olivetti_faces, load_digits

# Notebook-wide plotting defaults: seaborn's 'darkgrid' style first, then the
# default figure size (set afterwards so the style call cannot override it).
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

---
## Understanding Reconstruction Error

When we reduce dimensions, we lose information. How much?

In [None]:
def reconstruction_error(X_original, X_reconstructed):
    """Mean squared error between original and reconstructed data.

    Both inputs are converted to floating-point arrays before they are
    differenced. Without that, narrow integer dtypes (e.g. ``uint8`` pixel
    values) silently wrap around when subtracted and squared, and plain
    Python lists cannot be subtracted at all.

    Parameters
    ----------
    X_original : array-like
        The reference data.
    X_reconstructed : array-like
        The reconstruction; same shape as ``X_original`` (NumPy broadcasting
        rules apply otherwise).

    Returns
    -------
    numpy.float64
        Mean of the element-wise squared differences.
    """
    original = np.asarray(X_original, dtype=float)
    reconstructed = np.asarray(X_reconstructed, dtype=float)
    return np.mean((original - reconstructed) ** 2)

def compute_errors_for_components(X, max_components):
    """Compute reconstruction error for different numbers of components.

    PCA is fitted once with ``max_components`` components. Because principal
    components are ordered and nested, the k-component reconstruction is
    obtained from the first k columns of the projection and the first k rows
    of ``components_`` -- there is no need to refit for every k.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to analyse.
    max_components : int
        Largest number of components to evaluate; every k in
        ``1..max_components`` is evaluated.

    Returns
    -------
    errors : list
        ``errors[k - 1]`` is the reconstruction MSE using k components.
    var_explained : list
        ``var_explained[k - 1]`` is the cumulative explained-variance ratio
        of the first k components.
    """
    # Nothing to evaluate; mirrors the empty result of an empty k-range.
    if max_components < 1:
        return [], []

    pca = PCA(n_components=max_components)
    X_transformed = pca.fit_transform(X)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    errors = []
    var_explained = []
    for n_comp in range(1, max_components + 1):
        # Same arithmetic as PCA.inverse_transform (whiten=False), restricted
        # to the leading n_comp components.
        X_reconstructed = (
            X_transformed[:, :n_comp] @ pca.components_[:n_comp] + pca.mean_
        )
        errors.append(reconstruction_error(X, X_reconstructed))
        var_explained.append(cumulative_variance[n_comp - 1])

    return errors, var_explained

---
## Experiment 1: Digits Dataset (8x8 images)

In [None]:
# Load digits
digits = load_digits()
X_digits = digits.data

print(f'Digits dataset: {X_digits.shape}')
print('Each digit: 8x8 = 64 pixels')

# PCA can keep at most min(n_samples, n_features) components, so derive the
# limit from the data instead of hard-coding the feature count.
max_comp = min(X_digits.shape)
errors_digits, var_exp_digits = compute_errors_for_components(X_digits, max_comp)
component_counts = range(1, max_comp + 1)

# Plot: reconstruction error (left) and cumulative explained variance (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(component_counts, errors_digits, linewidth=2)
axes[0].set_xlabel('Number of Components', fontsize=12)
axes[0].set_ylabel('Reconstruction Error (MSE)', fontsize=12)
axes[0].set_title('Reconstruction Error vs Components (Digits)', fontsize=13, fontweight='bold')
axes[0].grid(True, alpha=0.3)

axes[1].plot(component_counts, var_exp_digits, linewidth=2, color='orange')
axes[1].axhline(y=0.95, color='r', linestyle='--', alpha=0.7, label='95% variance')
axes[1].set_xlabel('Number of Components', fontsize=12)
axes[1].set_ylabel('Variance Explained', fontsize=12)
axes[1].set_title('Variance Explained vs Components', fontsize=13, fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Find optimal: smallest k whose cumulative variance reaches 95%.
# np.argmax would return 0 (i.e. "1 component") when the threshold is never
# reached, so look up the matching indices explicitly and handle that case.
reached_95 = np.flatnonzero(np.array(var_exp_digits) >= 0.95)
if reached_95.size:
    n_95 = int(reached_95[0]) + 1
    print(f'\nFor 95% variance: {n_95} components needed')
    print(f'Reconstruction error at 95%: {errors_digits[n_95-1]:.6f}')
else:
    print(f'\n95% variance not reached with {max_comp} components')

---
## Experiment 2: Visual Reconstruction

In [None]:
# Pick one digit
digit_idx = 10
digit_row = X_digits[digit_idx:digit_idx + 1]  # keep it 2-D, shape (1, 64), for PCA
original_digit = digit_row.reshape(8, 8)

# Reconstruct with different components
components_to_test = [2, 5, 10, 20, 30, 64]
# PCA cannot keep more components than min(n_samples, n_features)
max_allowed = min(X_digits.shape)

fig, axes = plt.subplots(2, 3, figsize=(12, 8))
axes = axes.ravel()

for ax, n_comp in zip(axes, components_to_test):
    n_comp = min(n_comp, max_allowed)

    # Fit on the WHOLE dataset, then project only the chosen digit.
    # Principal components describe variance across samples; fitting on a
    # single image is invalid (scikit-learn rejects n_components greater than
    # min(n_samples, n_features) = 1) and carries no variance information.
    pca = PCA(n_components=n_comp)
    pca.fit(X_digits)
    digit_encoded = pca.transform(digit_row)
    digit_reconstructed = pca.inverse_transform(digit_encoded).reshape(8, 8)

    # Both arrays are 8x8, so the MSE can be taken directly.
    error = reconstruction_error(original_digit, digit_reconstructed)
    var_exp = np.sum(pca.explained_variance_ratio_) * 100

    ax.imshow(digit_reconstructed, cmap='gray')
    ax.set_title(f'{n_comp} comp | Error: {error:.4f}\n{var_exp:.1f}% variance', fontsize=10)
    ax.axis('off')

plt.suptitle(f'Digit Reconstruction (Original label: {digits.target[digit_idx]})', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

---
## Experiment 3: Faces (Higher Dimensional)

In [None]:
# Load the Olivetti faces (shuffled with a fixed seed); keep the first 100
# images so the sweep below stays fast.
faces_data = fetch_olivetti_faces(shuffle=True, random_state=42)
X_faces = faces_data.data[:100]

print(f'Faces dataset: {X_faces.shape}')
print(f'Each face: 64x64 = 4096 pixels')

# Sweep k = 5, 10, ..., 100 and record error / explained variance for each k
components_range = list(range(5, 101, 5))
errors_faces = []
var_exp_faces = []

for k in components_range:
    face_pca = PCA(n_components=k)
    faces_restored = face_pca.inverse_transform(face_pca.fit_transform(X_faces))

    errors_faces.append(reconstruction_error(X_faces, faces_restored))
    var_exp_faces.append(np.sum(face_pca.explained_variance_ratio_))

# Plot: error curve on the left, cumulative variance on the right
fig, (ax_err, ax_var) = plt.subplots(1, 2, figsize=(14, 5))

ax_err.plot(components_range, errors_faces, 'o-', linewidth=2, markersize=5)
ax_err.set_xlabel('Number of Components', fontsize=12)
ax_err.set_ylabel('Reconstruction Error', fontsize=12)
ax_err.set_title('Reconstruction Error vs Components (Faces)', fontsize=13, fontweight='bold')
ax_err.grid(True, alpha=0.3)

ax_var.plot(components_range, var_exp_faces, 'o-', linewidth=2, markersize=5, color='green')
ax_var.axhline(y=0.95, color='r', linestyle='--', alpha=0.7, label='95% variance')
ax_var.set_xlabel('Number of Components', fontsize=12)
ax_var.set_ylabel('Variance Explained', fontsize=12)
ax_var.set_title('Variance Explained vs Components', fontsize=13, fontweight='bold')
ax_var.legend()
ax_var.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---
## Choosing Optimal k

### Method 1: Variance Threshold
Keep components until reaching desired variance (e.g., 95%).

### Method 2: Elbow Method  
Look for "elbow" in reconstruction error curve.

### Method 3: Application-Specific
Based on downstream task performance.

In [None]:
# For each variance target, report the smallest tested component count that
# reaches it, together with its reconstruction error and compression.
thresholds = [0.90, 0.95, 0.99]
variance_curve = np.array(var_exp_faces)

for threshold in thresholds:
    hits = np.flatnonzero(variance_curve >= threshold)

    # No tested component count reaches this target
    if hits.size == 0:
        print(f'{threshold*100:.0f}% variance: Need more than {max(components_range)} components')
        continue

    first_hit = hits[0]
    actual_comp = components_range[first_hit]
    error = errors_faces[first_hit]
    compression = (1 - actual_comp / 4096) * 100
    print(f'{threshold*100:.0f}% variance: {actual_comp} components | Error: {error:.6f} | Compression: {compression:.1f}%')

---
## Summary

### Reconstruction Error:
- Decreases as we add more components
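- Reaches zero once k equals the rank of the centered data (at most min(n_samples − 1, n_features)), which can be far below the original dimensionality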
- Tradeoff between compression and accuracy

### Key Insights:
1. **First few PCs capture most information** (80-90% with <20% of dims)
2. **Diminishing returns** - later PCs add little value
3. **Elbow in error curve** - good indicator of optimal k
4. **95% variance** is common practical threshold

### Choosing k:
- **Variance threshold**: e.g., keep 95% of variance
- **Elbow method**: Look for bend in error curve
- **Cross-validation**: Test downstream task performance
- **Computational budget**: Balance accuracy vs speed

### Error Formula:
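$$\text{Reconstruction Error (MSE)} = \frac{1}{n\,d}\,\|X - \hat{X}\|_F^2$$

Where $\hat{X} = X_{PCA} W^T + \mu$ (reconstructed data), $W$ holds the $k$ retained principal directions as columns, $\mu$ is the per-feature mean removed before projection, and $n \times d$ is the shape of $X$.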

### In Practice:
- Start with high k (e.g., 95% variance)
- Evaluate on your task
- Reduce k if performance doesn't suffer
- Monitor train vs test performance

**Interview Tip**: "Reconstruction error measures information loss from dimensionality reduction. It decreases with more components, but we see diminishing returns. I'd use the 95% variance threshold as a starting point, then validate with cross-validation on the actual task. The elbow in the error curve also helps identify a good k."