# Eigenvalue Analysis: Olivetti Faces Dataset
## Discovering Eigenfaces: The Hidden Structure of Faces

**Goal**: Use eigenvalue analysis to decompose face images into "eigenfaces" (principal components of faces) and demonstrate that 50 components can reconstruct recognizable faces while discarding roughly 99% of the raw pixel dimensions (50 coefficients instead of 4096 pixels).

**Key Questions**:
1. What do eigenfaces look like?
2. How many eigenfaces are needed for recognition?
3. What features do dominant eigenfaces capture?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import warnings

# Notebook-wide plotting defaults and warning policy
plt.rcParams.update({
    'figure.figsize': (16, 10),
    'font.size': 11,
})
sns.set_style("white")
# Silence all warnings so the cell outputs stay uncluttered
warnings.filterwarnings('ignore')

## 1. Load and Explore the Olivetti Faces Dataset

In [None]:
# Load the Olivetti faces dataset (shuffled with a fixed seed for reproducibility)
faces_data = fetch_olivetti_faces(shuffle=True, random_state=42)
X = faces_data.data          # data matrix: one flattened image per row
y = faces_data.target        # person label for each row
images = faces_data.images   # the same faces, indexed as (sample, row, column)

# Derive every dimension once and reuse it in the report below
n_samples, n_features = X.shape
h, w = images.shape[1], images.shape[2]
n_people = len(np.unique(y))

print("Dataset Information:")
print(f"Number of samples: {n_samples}")
print(f"Number of features (pixels): {n_features}")
print(f"Image shape: {h}x{w}")
print(f"Number of people: {n_people}")
print(f"Images per person: {n_samples // n_people}")

print(f"\nData matrix shape: {n_samples} samples × {n_features} features")
print(f"Each row is a flattened {h}×{w} grayscale image")

In [None]:
# Display sample faces
def plot_gallery(images, titles=None, h=64, w=64, n_row=4, n_col=6):
    """Plot a grid of grayscale portraits and return the figure.

    Parameters
    ----------
    images : sequence of array-like
        Images to show; each one is reshaped to ``(h, w)`` before display.
    titles : sequence of str, optional
        Per-image titles. An image with no matching title is shown untitled.
    h, w : int
        Height and width used to reshape each image.
    n_row, n_col : int
        Grid dimensions. Grid cells beyond ``len(images)`` are left blank.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that holds the gallery.
    """
    # squeeze=False keeps ``axes`` a 2-D array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes without ``.flat``.
    fig, axes = plt.subplots(n_row, n_col, figsize=(1.8 * n_col, 2.2 * n_row),
                             subplot_kw={'xticks': [], 'yticks': []},
                             squeeze=False)

    for i, ax in enumerate(axes.flat):
        if i >= len(images):
            # No image for this cell: hide it entirely
            ax.axis('off')
            continue
        # Pixel values are displayed on a fixed [0, 1] gray scale
        ax.imshow(np.reshape(images[i], (h, w)), cmap='gray', vmin=0, vmax=1)
        if titles is not None and i < len(titles):
            ax.set_title(titles[i], size=10, fontweight='bold')

    plt.tight_layout()
    return fig

# Show a random sample of faces. The generator is seeded so that the same
# faces appear on every run, matching the seeded dataset shuffle.
n_samples_show = min(24, n_samples)
rng = np.random.default_rng(42)
sample_indices = rng.choice(n_samples, size=n_samples_show, replace=False)
sample_images = X[sample_indices]
sample_titles = [f"Person {y[i]}" for i in sample_indices]

plot_gallery(sample_images, sample_titles, h, w)
plt.suptitle('Sample Faces from Olivetti Dataset',
             fontsize=16, fontweight='bold', y=1.0)
plt.show()

## 2. Center the Data (Mean Face Subtraction)

In [None]:
# Compute the mean face: the per-pixel average over all samples
mean_face = np.mean(X, axis=0)

# Center the data by subtracting the mean face from every row
X_centered = X - mean_face

print(f"Original data range: [{X.min():.3f}, {X.max():.3f}]")
print(f"Centered data range: [{X_centered.min():.3f}, {X_centered.max():.3f}]")
# After centering, the overall mean should vanish up to floating-point error
print(f"\nMean of centered data: {np.abs(X_centered.mean()):.10f} (≈ 0)")

In [None]:
# Visualize the mean face and what centering does to one example face
example_idx = 0

# Adding the mean back to the centered face must give the original again
reconstructed = mean_face + X_centered[example_idx]

# One entry per panel: (image, title, colormap, vmin, vmax).
# The centered face is signed, so it gets a diverging colormap around zero.
panels = [
    (X[example_idx], 'Original Face', 'gray', 0, 1),
    (mean_face, 'Mean Face (Average)', 'gray', 0, 1),
    (X_centered[example_idx], 'Centered Face\n(Difference from Mean)',
     'RdBu_r', -0.5, 0.5),
    (reconstructed, 'Mean + Centered\n(= Original)', 'gray', 0, 1),
]

fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, (img, title, cmap, lo, hi) in zip(axes, panels):
    ax.imshow(img.reshape(h, w), cmap=cmap, vmin=lo, vmax=hi)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.axis('off')

plt.tight_layout()
plt.show()

print("\n🔍 CENTERING CONCEPT:")
print("- The mean face captures common features across all faces")
print("- Centered data highlights unique deviations from the average")
print("- Eigenvalue analysis finds patterns in these deviations")

## 3. Compute Covariance Matrix and Eigenvalues
### Note: For high-dimensional data, we use SVD instead of direct covariance

In [None]:
# For faces: n_samples (400) << n_features (4096)
# Computing covariance directly would create a 4096×4096 matrix
# Instead, use SVD on the data matrix: X = U·Σ·V^T

print("Computing eigendecomposition using SVD...")
print(f"Data matrix: {X_centered.shape[0]} samples × {X_centered.shape[1]} features")

# Thin SVD: only min(n_samples, n_features) singular triplets are computed
U, s, Vt = np.linalg.svd(X_centered, full_matrices=False)

# Eigenvalues of covariance matrix = (singular values)^2 / (n-1)
eigenvalues = (s ** 2) / (n_samples - 1)

# Eigenvectors (principal components) = rows of V^T = columns of V
eigenvectors = Vt.T  # Each column is an eigenface

# The rows of centered data sum to zero, so its rank is at most n_samples - 1
# and at least one singular value is only numerically non-zero. Count the
# values above the usual rank tolerance: s_max * max(shape) * machine epsilon.
rank_tol = s.max() * max(X_centered.shape) * np.finfo(s.dtype).eps
n_nonzero = int(np.count_nonzero(s > rank_tol))

print(f"\nNumber of non-zero eigenvalues: {n_nonzero} (of {len(eigenvalues)} computed)")
print(f"Shape of eigenvectors matrix: {eigenvectors.shape}")
print(f"\nTop 10 eigenvalues:")
# Slicing keeps this safe when fewer than 10 components exist
for rank, lam in enumerate(eigenvalues[:10], start=1):
    print(f"  λ_{rank} = {lam:.6f}")

## 4. Variance Explained Analysis

In [None]:
# Calculate the fraction of total variance carried by each eigenface
total_variance = np.sum(eigenvalues)
variance_explained = eigenvalues / total_variance
cumulative_variance = np.cumsum(variance_explained)


def _n_components_for(threshold):
    """Return the smallest component count whose cumulative variance reaches ``threshold``.

    If rounding leaves the cumulative sum below ``threshold`` everywhere,
    all components are reported. A bare ``argmax`` on an all-False mask
    would return index 0 and wrongly report a single component.
    """
    reached = cumulative_variance >= threshold
    if not reached.any():
        return len(cumulative_variance)
    return int(np.argmax(reached)) + 1


# Find number of components for different thresholds
n_50_var = _n_components_for(0.50)
n_80_var = _n_components_for(0.80)
n_90_var = _n_components_for(0.90)
n_95_var = _n_components_for(0.95)
n_99_var = _n_components_for(0.99)

print("Eigenfaces needed to explain variance:")
for pct, n_needed in ((50, n_50_var), (80, n_80_var), (90, n_90_var),
                      (95, n_95_var), (99, n_99_var)):
    print(f"  {pct}% variance: {n_needed} components "
          f"({n_needed/n_features*100:.2f}% of features)")

print(f"\n✨ KEY INSIGHT:")
print(f"Using just {n_95_var} eigenfaces (vs {n_features} pixels) captures 95% of variation!")
print(f"Data reduction: {(1 - n_95_var/n_features)*100:.2f}% compression")

In [None]:
# Four-panel overview of how variance is spread across the eigenfaces
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax_scree, ax_share = axes[0, 0], axes[0, 1]
ax_cum, ax_zoom = axes[1, 0], axes[1, 1]
axis_font = {'fontsize': 12, 'fontweight': 'bold'}
title_font = {'fontsize': 14, 'fontweight': 'bold'}

# Panel 1: scree plot of the leading eigenvalues on a log-scaled y axis
n_show = 100
leading_idx = range(1, n_show+1)
ax_scree.plot(leading_idx, eigenvalues[:n_show],
              'o-', linewidth=2, markersize=4, color='steelblue')
ax_scree.set_xlabel('Eigenface Index', **axis_font)
ax_scree.set_ylabel('Eigenvalue Magnitude', **axis_font)
ax_scree.set_title(f'Top {n_show} Eigenvalues (Scree Plot)', **title_font)
ax_scree.set_yscale('log')
ax_scree.grid(True, alpha=0.3)

# Panel 2: percentage of variance contributed by each leading eigenface
ax_share.bar(leading_idx, variance_explained[:n_show] * 100,
             color='orange', edgecolor='black', linewidth=0.5, alpha=0.7)
ax_share.set_xlabel('Eigenface Index', **axis_font)
ax_share.set_ylabel('Variance Explained (%)', **axis_font)
ax_share.set_title(f'Individual Variance Contribution (Top {n_show})', **title_font)
ax_share.grid(True, alpha=0.3, axis='y')

# Panel 3: cumulative variance over all components, with reference levels
n_total = len(cumulative_variance)
ax_cum.plot(range(1, n_total+1), cumulative_variance * 100,
            linewidth=2.5, color='green')
for level, shade in ((50, 'blue'), (80, 'purple'), (90, 'orange'), (95, 'red')):
    ax_cum.axhline(y=level, color=shade, linestyle='--', linewidth=2,
                   alpha=0.6, label=f'{level}%')
ax_cum.set_xlabel('Number of Eigenfaces', **axis_font)
ax_cum.set_ylabel('Cumulative Variance (%)', **axis_font)
ax_cum.set_title('Cumulative Variance Explained', **title_font)
ax_cum.set_xlim([0, min(200, n_total)])
ax_cum.legend(loc='lower right', fontsize=10)
ax_cum.grid(True, alpha=0.3)

# Panel 4: the first 150 components, marking where 95% is reached
n_zoom = 150
ax_zoom.plot(range(1, n_zoom+1), cumulative_variance[:n_zoom] * 100,
             linewidth=2.5, color='darkgreen')
ax_zoom.axhline(y=95, color='red', linestyle='--', linewidth=2, alpha=0.7)
ax_zoom.axvline(x=n_95_var, color='red', linestyle='--', linewidth=2, alpha=0.7)
ax_zoom.plot(n_95_var, 95, 'ro', markersize=12, markeredgecolor='black',
             markeredgewidth=2, label=f'{n_95_var} components for 95%')
ax_zoom.set_xlabel('Number of Eigenfaces', **axis_font)
ax_zoom.set_ylabel('Cumulative Variance (%)', **axis_font)
ax_zoom.set_title(f'95% Variance Threshold (Zoomed)', **title_font)
ax_zoom.legend(loc='lower right', fontsize=10)
ax_zoom.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Visualizing Eigenfaces
### What Do Principal Components Look Like?

In [None]:
# Display the leading eigenfaces, each titled with its share of the variance
n_eigenfaces_show = 24
eigenface_titles = [f'Eigenface {i+1}\n{variance_explained[i]*100:.1f}%'
                    for i in range(n_eigenfaces_show)]

# Eigenvector entries are signed and small, so rescale each eigenface to
# [0, 1] to match the fixed gray scale used by plot_gallery.
eigenfaces_normalized = []
for i in range(n_eigenfaces_show):
    ef = eigenvectors[:, i]
    span = ef.max() - ef.min()
    if span > 0:
        ef_norm = (ef - ef.min()) / span
    else:
        # A constant component has no contrast; show flat mid-gray
        # instead of dividing by zero.
        ef_norm = np.full_like(ef, 0.5)
    eigenfaces_normalized.append(ef_norm)

plot_gallery(eigenfaces_normalized, eigenface_titles, h, w, n_row=4, n_col=6)
plt.suptitle(f'Top {n_eigenfaces_show} Eigenfaces (Principal Components)',
             fontsize=16, fontweight='bold', y=1.0)
plt.show()

print("\n🔍 EIGENFACE INTERPRETATION:")
print("- Eigenface 1: Captures lighting direction (left-right illumination)")
print("- Eigenface 2-3: Face shape variations and facial structure")
print("- Later eigenfaces: Capture finer details like hair, glasses, expressions")
print("- Each eigenface is orthogonal (independent) from others")

## 6. Face Reconstruction with Different Numbers of Eigenfaces

In [None]:
def reconstruct_face(face_centered, eigenvectors, n_components, mean_face):
    """Approximate a face from its leading ``n_components`` eigenfaces.

    The centered face is projected onto the first ``n_components`` columns
    of ``eigenvectors``, mapped back to pixel space, and the mean face is
    added back on.
    """
    basis = eigenvectors[:, :n_components]
    # Coordinates of the face in the truncated eigenface basis
    weights = face_centered @ basis
    # Back to pixel space, then undo the centering
    return mean_face + weights @ basis.T

# Pick one face to study, in raw and centered form
test_idx = 15
test_face, test_face_centered = X[test_idx], X_centered[test_idx]

# Rebuild that face with increasingly many eigenfaces and score each attempt
n_components_list = [1, 5, 10, 25, 50, 100, 150, n_95_var]
reconstructions = [
    reconstruct_face(test_face_centered, eigenvectors, k, mean_face)
    for k in n_components_list
]
# Mean squared pixel error of each reconstruction against the original
mse_errors = [np.mean((test_face - approx) ** 2) for approx in reconstructions]

In [None]:
# Visualize the original face next to each truncated reconstruction
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
axes = axes.ravel()

# Original
axes[0].imshow(test_face.reshape(h, w), cmap='gray', vmin=0, vmax=1)
axes[0].set_title('Original Face', fontsize=13, fontweight='bold')
axes[0].axis('off')

# Reconstructions occupy the cells after the original
for idx, (n_comp, recon, mse) in enumerate(zip(n_components_list,
                                                 reconstructions,
                                                 mse_errors), 1):
    axes[idx].imshow(recon.reshape(h, w), cmap='gray', vmin=0, vmax=1)
    compression = (1 - n_comp/n_features) * 100
    axes[idx].set_title(f'{n_comp} Eigenfaces\n'
                        f'MSE: {mse:.6f}\n'
                        f'Compression: {compression:.1f}%',
                        fontsize=11, fontweight='bold')
    axes[idx].axis('off')

# Blank any grid cell that received no image
for ax in axes[len(n_components_list) + 1:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

# Look the 50-component entry up by value instead of a hard-coded position,
# so reordering n_components_list cannot silently report the wrong MSE.
idx_50 = n_components_list.index(50)

print(f"\n✨ RECONSTRUCTION QUALITY:")
print(f"- With 50 eigenfaces: MSE = {mse_errors[idx_50]:.6f}, "
      f"Compression = {(1 - 50/n_features)*100:.1f}%")
print(f"- With {n_95_var} eigenfaces (95% var): MSE = {mse_errors[-1]:.6f}")
print(f"\n→ Face is recognizable with just 50 components (~1.2% of data)!")

## 7. Reconstruction Quality vs Number of Components

In [None]:
# Compute reconstruction error for many values of n_components.
# The grid 1, 6, 11, ... never lands on 50, so 50 is added explicitly: it is
# the reference point marked in the plots and looked up in the summary.
n_comp_range = sorted(set(range(1, 201, 5)) | {50, n_95_var})

# Reconstruct all faces at once per component count (reconstruct_face
# broadcasts over rows). Every face has the same number of pixels, so the
# mean over the whole matrix equals the mean of the per-face MSEs.
avg_mse_errors = [
    np.mean((X - reconstruct_face(X_centered, eigenvectors, n_comp, mean_face)) ** 2)
    for n_comp in n_comp_range
]

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# MSE vs components
axes[0].plot(n_comp_range, avg_mse_errors, 'o-', linewidth=2.5,
             markersize=6, color='red', markeredgecolor='black', markeredgewidth=1)
axes[0].axvline(x=50, color='blue', linestyle='--', linewidth=2,
                alpha=0.7, label='50 components')
axes[0].axvline(x=n_95_var, color='green', linestyle='--', linewidth=2,
                alpha=0.7, label=f'{n_95_var} components (95% var)')
axes[0].set_xlabel('Number of Eigenfaces', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Average MSE', fontsize=12, fontweight='bold')
axes[0].set_title('Reconstruction Error vs Components', fontsize=14, fontweight='bold')
axes[0].legend(loc='upper right', fontsize=11)
axes[0].grid(True, alpha=0.3)
axes[0].set_yscale('log')

# Information retained, measured against the error of using NO eigenfaces
# (reconstructing every face as the mean face). Using the 1-component error
# as the baseline would wrongly report 0% retained for the first eigenface.
max_error = np.mean(X_centered ** 2)
info_retained = (1 - np.array(avg_mse_errors) / max_error) * 100
axes[1].plot(n_comp_range, info_retained, 'o-', linewidth=2.5,
             markersize=6, color='green', markeredgecolor='black', markeredgewidth=1)
axes[1].axvline(x=50, color='blue', linestyle='--', linewidth=2,
                alpha=0.7, label='50 components')
axes[1].axhline(y=95, color='red', linestyle='--', linewidth=2,
                alpha=0.7, label='95% quality')
axes[1].set_xlabel('Number of Eigenfaces', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Information Retained (%)', fontsize=12, fontweight='bold')
axes[1].set_title('Reconstruction Quality', fontsize=14, fontweight='bold')
axes[1].legend(loc='lower right', fontsize=11)
axes[1].grid(True, alpha=0.3)
axes[1].set_ylim([0, 105])

plt.tight_layout()
plt.show()

## 8. Gallery: Multiple Faces Reconstructed with 50 Eigenfaces

In [None]:
# Reconstruct multiple faces with 50 components
n_gallery = 12
n_comp_demo = 50

# Seeded so the same faces are shown on every run
gallery_indices = np.random.default_rng(42).choice(n_samples, size=n_gallery,
                                                   replace=False)

# squeeze=False keeps ``axes`` 2-D even if n_gallery is set to 1
fig, axes = plt.subplots(3, n_gallery, figsize=(20, 6), squeeze=False)


def _hide_axes_frame(ax):
    """Remove ticks and spines but keep the axis label visible.

    ``ax.axis('off')`` would also hide the y-label used as the row caption.
    """
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)


for col, idx in enumerate(gallery_indices):
    recon_50 = reconstruct_face(X_centered[idx], eigenvectors, n_comp_demo, mean_face)
    diff = np.abs(X[idx] - recon_50)

    # One entry per row: (image, colormap, vmax, row caption)
    rows = [
        (X[idx], 'gray', 1, 'Original'),
        (recon_50, 'gray', 1, f'{n_comp_demo} Eigenfaces'),
        (diff, 'Reds', 0.5, 'Abs Difference'),
    ]
    for row, (img, cmap, vmax, caption) in enumerate(rows):
        ax = axes[row, col]
        ax.imshow(img.reshape(h, w), cmap=cmap, vmin=0, vmax=vmax)
        _hide_axes_frame(ax)
        if col == 0:
            # Caption each row once, on its left-most image
            ax.set_ylabel(caption, fontsize=12, fontweight='bold')

    axes[0, col].set_title(f'Person {y[idx]}', fontsize=10)

plt.suptitle(f'Face Reconstruction with {n_comp_demo} Eigenfaces '
             f'({(1-n_comp_demo/n_features)*100:.1f}% compression)',
             fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

print(f"\n✨ Demonstration:")
print(f"All {n_gallery} faces are recognizable using only {n_comp_demo} eigenfaces!")
print(f"Storage reduction: {n_features} pixels → {n_comp_demo} coefficients")
print(f"Compression ratio: {n_features/n_comp_demo:.1f}:1")

## 9. Eigenface Coefficients: Face Encoding

In [None]:
# Project all faces onto the top 50 eigenfaces: each row of face_encodings
# holds one face's coefficients in the truncated eigenface basis
n_encoding = 50
face_encodings = X_centered @ eigenvectors[:, :n_encoding]

print(f"Original data: {X.shape[0]} samples × {X.shape[1]} features")
print(f"Encoded data: {face_encodings.shape[0]} samples × {face_encodings.shape[1]} features")
print(f"\nCompression: {X.shape[1]} → {face_encodings.shape[1]} features")
print(f"Data reduction: {(1 - face_encodings.shape[1]/X.shape[1])*100:.2f}%")

In [None]:
# Visualize encoding coefficients for sample faces: portrait on the left,
# signed bar chart of its eigenface coefficients on the right
sample_people = [0, 10, 20, 30]
fig, axes = plt.subplots(len(sample_people), 2, figsize=(14, 12))

for row, person_id in enumerate(sample_people):
    face_ax, coeff_ax = axes[row, 0], axes[row, 1]

    # Find first image of this person
    person_idx = np.where(y == person_id)[0][0]

    # Show face
    face_ax.imshow(X[person_idx].reshape(h, w), cmap='gray', vmin=0, vmax=1)
    face_ax.set_title(f'Person {person_id}', fontsize=12, fontweight='bold')
    face_ax.axis('off')

    # Show encoding (coefficients); green bars are positive, red otherwise
    coeffs = face_encodings[person_idx]
    colors = ['green' if c > 0 else 'red' for c in coeffs]
    coeff_ax.bar(range(n_encoding), coeffs, color=colors,
                 edgecolor='black', linewidth=0.5, alpha=0.7)
    coeff_ax.axhline(y=0, color='black', linewidth=1.5)
    coeff_ax.set_xlabel('Eigenface Index', fontsize=11, fontweight='bold')
    coeff_ax.set_ylabel('Coefficient Value', fontsize=11, fontweight='bold')
    coeff_ax.set_title(f'Eigenface Coefficients for Person {person_id}',
                       fontsize=12, fontweight='bold')
    coeff_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\n🔍 COEFFICIENT INTERPRETATION:")
print(f"- Each face is represented by {n_encoding} numbers (eigenface coefficients)")
print("- Positive coefficient → face has more of that eigenface feature")
print("- Negative coefficient → face has less of that eigenface feature")
print("- Different people have distinct coefficient patterns")

## 10. Key Insights Summary

In [None]:
# Final report: restates the notebook's main quantitative findings
print("="*80)
print("KEY INSIGHTS FROM EIGENFACE ANALYSIS")
print("="*80)

print(f"\n1. DIMENSIONALITY REDUCTION:")
print(f"   - Original: {n_features} pixels per face")
print(f"   - Compressed: {n_encoding} eigenface coefficients")
print(f"   - Reduction: {(1 - n_encoding/n_features)*100:.2f}% (compression ratio {n_features/n_encoding:.1f}:1)")

print(f"\n2. VARIANCE EXPLANATION:")
print(f"   - Top eigenface explains {variance_explained[0]*100:.2f}% of variance")
print(f"   - Top 10 eigenfaces explain {cumulative_variance[9]*100:.2f}% of variance")
print(f"   - {n_50_var} eigenfaces needed for 50% variance")
print(f"   - {n_95_var} eigenfaces needed for 95% variance")

print(f"\n3. EIGENFACE INTERPRETATION:")
print(f"   - First eigenfaces capture major facial features:")
print(f"     * Lighting direction and shadows")
print(f"     * Overall face shape and structure")
print(f"   - Later eigenfaces capture finer details:")
print(f"     * Specific facial features (eyes, nose, mouth)")
print(f"     * Hair style and texture")
print(f"     * Accessories (glasses, etc.)")

print(f"\n4. RECONSTRUCTION QUALITY:")
# Compute the average error directly from the truncated basis. Looking it up
# with n_comp_range.index(50) fails, because the sampled grid of component
# counts (1, 6, 11, ...) does not contain 50.
_basis = eigenvectors[:, :n_encoding]
_residual = X_centered - (X_centered @ _basis) @ _basis.T
avg_mse_50 = np.mean(_residual ** 2)
print(f"   - With {n_encoding} eigenfaces: Average MSE = {avg_mse_50:.6f}")
print(f"   - Faces are clearly recognizable")
print(f"   - Fine details may be smoothed but identity preserved")

print(f"\n5. PRACTICAL APPLICATIONS:")
print(f"   - Face recognition: Compare eigenface coefficients instead of raw pixels")
print(f"   - Storage: Store {n_encoding} numbers instead of {n_features} pixels")
print(f"   - Fast comparison: {n_encoding}-dimensional space vs {n_features}-dimensional")
print(f"   - Noise reduction: Reconstruction filters out high-frequency noise")

print(f"\n6. MATHEMATICAL INSIGHT:")
print(f"   - Eigenfaces form an orthonormal basis for the 'face space'")
print(f"   - Any face can be expressed as: face = mean + Σ(coefficient_i × eigenface_i)")
print(f"   - Truncating the sum provides optimal low-rank approximation (SVD theorem)")

print("\n" + "="*80)
print("CONCLUSION: Eigenvalue analysis reveals that faces lie in a low-dimensional")
print("subspace. Using just ~50 'eigenfaces' (vs 4096 pixels) captures the essential")
print("structure needed for recognition, demonstrating the power of PCA for high-")
print("dimensional data compression and feature extraction.")
print("="*80)