# Eigenvalue Analysis: Iris Flower Dataset
## Discovering the Hidden 'Size' Factor in Flower Growth

**Goal**: Use eigenvalue analysis to uncover that flower "size" is the dominant hidden factor explaining variance in the Iris dataset.

**Key Questions**:
1. How much variance does the first eigenvalue explain?
2. What does the first principal component represent?
3. Can we visualize species separation using eigenvalue decomposition?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from scipy import linalg

# Global plotting defaults: seaborn's "whitegrid" theme, then the default
# figure size and base font size for every matplotlib figure created below.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

# Silence all warnings for the remainder of the session.
import warnings
warnings.filterwarnings('ignore')

## 1. Load and Explore the Data

In [None]:
# Pull the Iris bunch from scikit-learn and unpack the pieces used below:
# the feature matrix, the integer class codes, and the human-readable names.
iris = load_iris()
X, y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Tabular view of the same data, with the class codes mapped to species
# names as a categorical column.
df = pd.DataFrame(X, columns=feature_names).assign(
    species=pd.Categorical.from_codes(y, target_names)
)

print("Dataset Shape:", X.shape)
print("\nFirst few rows:", df.head(10), sep="\n")
print("\nBasic Statistics:", df.describe(), sep="\n")

In [None]:
# One histogram panel per feature (2x2 grid), with the species overlaid
# in each panel so their raw distributions can be compared.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for panel, feature in zip(axes, feature_names):
    for species in target_names:
        values = df.loc[df['species'] == species, feature]
        panel.hist(values, alpha=0.6, label=species, bins=15)

    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')
    panel.set_title(f'Distribution of {feature}')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Compute Covariance Matrix and Correlation

In [None]:
# Standardize the features (mean=0, std=1).
# NOTE: StandardScaler scales by the population standard deviation (ddof=0).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Covariance of the standardized data. np.cov defaults to the sample
# estimator (ddof=1), which would make every entry n/(n-1) times the
# correlation coefficient because the scaling above used ddof=0.
# bias=True switches np.cov to ddof=0 so that the covariance matrix of the
# standardized data really is the correlation matrix (unit diagonal).
cov_matrix = np.cov(X_scaled, rowvar=False, bias=True)
corr_matrix = np.corrcoef(X_scaled, rowvar=False)

print("Covariance Matrix (Standardized Data):")
print(cov_matrix)
print("\nCorrelation Matrix:")
print(corr_matrix)

In [None]:
# Annotated heatmap of the feature correlation matrix, with the colour
# scale centred on zero and pinned to the full [-1, 1] range.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm',
            xticklabels=feature_names, yticklabels=feature_names,
            center=0, vmin=-1, vmax=1, square=True, linewidths=1)
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Report the petal length / petal width correlation from the matrix itself
# (columns 2 and 3) rather than hard-coding the number in the message.
petal_r = corr_matrix[2, 3]

print("\n🔍 KEY OBSERVATION:")
print(f"Petal length and petal width are highly correlated (r ≈ {petal_r:.2f})")
print("This suggests these features share a common underlying factor: FLOWER SIZE")

## 3. Eigenvalue Decomposition

In [None]:
# Eigendecomposition of the covariance matrix.
# cov_matrix is symmetric, so use eigh, the solver intended for symmetric
# matrices: its eigenvalues are always real and its eigenvectors orthonormal.
# The general-purpose eig gives no such guarantees.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# eigh returns eigenvalues in ascending order; reorder so that PC1 is the
# direction of largest variance. Columns of `eigenvectors` follow the same order.
order = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[order]
eigenvectors = eigenvectors[:, order]

# An eigenvector is only defined up to sign, and the solver's choice is
# arbitrary. Fix it deterministically: flip each column so that its
# largest-magnitude loading is positive. Later cells interpret loading signs
# and score directions, so the orientation must be reproducible.
dominant_rows = np.abs(eigenvectors).argmax(axis=0)
column_signs = np.sign(eigenvectors[dominant_rows, np.arange(eigenvectors.shape[1])])
eigenvectors = eigenvectors * column_signs

print("Eigenvalues (sorted):")
for i, ev in enumerate(eigenvalues, 1):
    print(f"  λ_{i} = {ev:.6f}")

# Rows are the original features, columns are the principal components.
print("\nEigenvectors (Principal Components):")
eigenvector_df = pd.DataFrame(
    eigenvectors,
    index=feature_names,
    columns=[f'PC{i}' for i in range(1, len(eigenvalues) + 1)]
)
print(eigenvector_df)

## 4. Variance Explained Analysis
### The Critical Insight: First Eigenvalue Dominance

In [None]:
# Share of total variance carried by each component: every eigenvalue
# divided by the sum of all eigenvalues, plus the running total.
total_variance = np.sum(eigenvalues)
variance_explained = eigenvalues / total_variance
cumulative_variance = np.cumsum(variance_explained)

print("Variance Explained by Each Component:")
for i, (var, cum) in enumerate(zip(variance_explained, cumulative_variance), 1):
    print(f"  PC{i}: {var*100:.2f}% (Cumulative: {cum*100:.2f}%)")

print("\n✨ KEY INSIGHT:")
print(f"The FIRST eigenvalue explains {variance_explained[0]*100:.2f}% of total variance!")
print("This dominant factor represents the overall 'SIZE' of the flower.")

In [None]:
# Three side-by-side views of the eigenvalue spectrum:
# raw eigenvalues, per-component variance share, and cumulative share.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_spectrum, ax_share, ax_cumulative = axes

component_ids = range(1, len(eigenvalues) + 1)
label_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=14, fontweight='bold')

# One viridis colour per component, shared by both bar charts.
colors = plt.cm.viridis(np.linspace(0, 1, len(eigenvalues)))
bar_style = dict(color=colors, edgecolor='black', linewidth=1.5, alpha=0.8)

# Panel 1: eigenvalue magnitudes, with the leading eigenvalue labelled.
bars = ax_spectrum.bar(component_ids, eigenvalues, **bar_style)
ax_spectrum.set_xlabel('Principal Component', **label_font)
ax_spectrum.set_ylabel('Eigenvalue Magnitude', **label_font)
ax_spectrum.set_title('Eigenvalue Spectrum', **title_font)
ax_spectrum.set_xticks(component_ids)
ax_spectrum.grid(True, alpha=0.3, axis='y')
ax_spectrum.text(
    1, eigenvalues[0] + 0.05,
    f'{eigenvalues[0]:.3f}\n({variance_explained[0]*100:.1f}%)',
    ha='center', va='bottom', fontsize=11, fontweight='bold', color='red',
)

# Panel 2: variance share of each individual component, in percent.
ax_share.bar(component_ids, variance_explained * 100, **bar_style)
ax_share.set_xlabel('Principal Component', **label_font)
ax_share.set_ylabel('Variance Explained (%)', **label_font)
ax_share.set_title('Individual Variance Contribution', **title_font)
ax_share.set_xticks(component_ids)
ax_share.grid(True, alpha=0.3, axis='y')

# Panel 3: cumulative variance, with 90% and 95% reference lines.
ax_cumulative.plot(
    component_ids, cumulative_variance * 100, 'o-',
    linewidth=2.5, markersize=10, color='darkorange',
    markeredgecolor='black', markeredgewidth=1.5,
)
for level, line_color in ((90, 'red'), (95, 'green')):
    ax_cumulative.axhline(y=level, color=line_color, linestyle='--',
                          linewidth=2, alpha=0.7, label=f'{level}% threshold')
ax_cumulative.set_xlabel('Number of Components', **label_font)
ax_cumulative.set_ylabel('Cumulative Variance (%)', **label_font)
ax_cumulative.set_title('Cumulative Variance Explained', **title_font)
ax_cumulative.set_xticks(component_ids)
ax_cumulative.set_ylim([0, 105])
ax_cumulative.grid(True, alpha=0.3)
ax_cumulative.legend(loc='lower right')

plt.tight_layout()
plt.show()

## 5. Interpreting the First Principal Component
### Understanding the 'Size' Factor

In [None]:
# Loadings of the first principal component: column 0 of the eigenvector
# matrix, one value per original feature.
pc1_loadings = eigenvectors[:, 0]
abs_loadings = np.abs(pc1_loadings)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: signed loadings (green = positive, red = non-positive).
colors_pc1 = ['green' if x > 0 else 'red' for x in pc1_loadings]
bars = axes[0].barh(feature_names, pc1_loadings, color=colors_pc1,
                    edgecolor='black', linewidth=1.5, alpha=0.7)
axes[0].axvline(x=0, color='black', linewidth=1.5)
axes[0].set_xlabel('Loading Value', fontsize=12, fontweight='bold')
axes[0].set_title('First Principal Component Loadings\n(The "Size" Factor)',
                  fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3, axis='x')

# Value labels sit just outside the tip of each bar, on whichever side
# the bar extends to.
for i, loading in enumerate(pc1_loadings):
    axes[0].text(loading + 0.01 if loading > 0 else loading - 0.01, i,
                 f'{loading:.3f}', va='center',
                 ha='left' if loading > 0 else 'right', fontweight='bold')

# Right panel: magnitude only, i.e. how strongly each feature drives PC1
# regardless of direction.
axes[1].barh(feature_names, abs_loadings, color='steelblue',
             edgecolor='black', linewidth=1.5, alpha=0.7)
axes[1].set_xlabel('Absolute Loading', fontsize=12, fontweight='bold')
axes[1].set_title('Feature Importance in PC1', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='x')

for i, loading in enumerate(abs_loadings):
    axes[1].text(loading + 0.01, i, f'{loading:.3f}',
                 va='center', ha='left', fontweight='bold')

plt.tight_layout()
plt.show()

# Derive the interpretation from the loadings instead of asserting it:
# eigenvector signs are solver-dependent, and the features need not all
# load in the same direction.
is_positive = pc1_loadings > 0
is_negative = pc1_loadings < 0

print("\n🔍 INTERPRETATION:")
if is_positive.all() or is_negative.all():
    print("All loadings share the SAME sign → PC1 represents overall SIZE")
    sepal_note = "Sepal dimensions contribute less but in the same direction"
else:
    # Name the features on the minority side of the sign split.
    minority = is_positive if is_positive.sum() < is_negative.sum() else ~is_positive
    opposing = ", ".join(
        name for name, flagged in zip(feature_names, minority) if flagged
    )
    print(f"Loadings have MIXED signs ({opposing} loads opposite to the rest)")
    print("→ PC1 is a size-like contrast, not a pure SIZE factor")
    sepal_note = "Sepal dimensions do not all move in the same direction as the petals"
print(f"Petal length ({abs_loadings[2]:.3f}) and petal width ({abs_loadings[3]:.3f}) dominate")
print(sepal_note)

# The direction of "larger" depends on the sign of the petal loadings.
petal_trend = "Larger" if pc1_loadings[2] > 0 else "Smaller"
print(f"\n→ Larger PC1 score = {petal_trend} petals")

## 6. Project Data onto Principal Components

In [None]:
# Scores of every sample on every principal component: the standardized
# data multiplied by the eigenvector matrix (one column per PC).
X_pca = X_scaled @ eigenvectors

# Wrap the scores in a DataFrame with PC1..PCk column labels and attach
# the species names for grouping and plotting.
pc_labels = [f'PC{k}' for k in range(1, X_pca.shape[1] + 1)]
pca_df = pd.DataFrame(X_pca, columns=pc_labels)
pca_df['species'] = pd.Categorical.from_codes(y, target_names)

print("Projected Data (First 5 rows):")
print(pca_df.head())

pc1_by_species = pca_df.groupby('species')['PC1'].describe()
print("\nPC1 Statistics by Species:")
print(pc1_by_species)

In [None]:
# Three views of the projected data: PC1 vs PC2 scatter (left), PC1
# histogram per species (middle), PC1 vs PC3 scatter (right).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

label_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=14, fontweight='bold')
pc1_axis_label = f'PC1 ({variance_explained[0]*100:.1f}% variance)'

# The two scatter panels differ only in the y-axis component and the title.
scatter_panels = (
    (axes[0], 'PC2', variance_explained[1], 'Species Separation in PC Space'),
    (axes[2], 'PC3', variance_explained[2], 'PC1 vs PC3'),
)
for panel, y_pc, y_share, panel_title in scatter_panels:
    for species in target_names:
        rows = pca_df['species'] == species
        panel.scatter(pca_df.loc[rows, 'PC1'], pca_df.loc[rows, y_pc],
                      label=species, s=80, alpha=0.7,
                      edgecolors='black', linewidth=1)
    panel.set_xlabel(pc1_axis_label, **label_font)
    panel.set_ylabel(f'{y_pc} ({y_share*100:.1f}% variance)', **label_font)
    panel.set_title(panel_title, **title_font)
    panel.legend(loc='best', fontsize=10)
    panel.grid(True, alpha=0.3)
    # Faint cross-hairs through the origin of the PC plane.
    panel.axhline(y=0, color='k', linestyle='-', alpha=0.2, linewidth=1)
    panel.axvline(x=0, color='k', linestyle='-', alpha=0.2, linewidth=1)

# Middle panel: distribution of PC1 scores, one histogram per species.
hist_panel = axes[1]
for species in target_names:
    pc1_scores = pca_df.loc[pca_df['species'] == species, 'PC1']
    hist_panel.hist(pc1_scores, alpha=0.6, label=species, bins=15, edgecolor='black')
hist_panel.set_xlabel('PC1 (Size Factor)', **label_font)
hist_panel.set_ylabel('Frequency', **label_font)
hist_panel.set_title('Distribution of "Size" by Species', **title_font)
hist_panel.legend()
hist_panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 7. Reconstruction Analysis
### How Much Information is Lost?

In [None]:
def reconstruct_from_pcs(X_scaled, eigenvectors, n_components):
    """Approximate the data using only the leading principal components.

    The rows of ``X_scaled`` are projected onto the first ``n_components``
    columns of ``eigenvectors`` and then mapped back into the original
    feature space with the transpose of those same columns.

    Returns an array with the same shape as ``X_scaled``.
    """
    basis = eigenvectors[:, :n_components]
    scores = X_scaled @ basis
    return scores @ basis.T

# Mean squared reconstruction error when keeping the first 1, 2, ... k
# principal components.
reconstruction_errors = [
    np.mean((X_scaled - reconstruct_from_pcs(X_scaled, eigenvectors, n)) ** 2)
    for n in range(1, len(eigenvalues) + 1)
]

# Visualize reconstruction quality
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Reconstruction error
axes[0].plot(range(1, len(reconstruction_errors) + 1), reconstruction_errors,
             'o-', linewidth=2.5, markersize=10, color='red',
             markeredgecolor='black', markeredgewidth=1.5)
axes[0].set_xlabel('Number of Components', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Mean Squared Error', fontsize=12, fontweight='bold')
axes[0].set_title('Reconstruction Error vs Components', fontsize=14, fontweight='bold')
axes[0].set_xticks(range(1, len(eigenvalues) + 1))
axes[0].grid(True, alpha=0.3)

# Information retained, relative to keeping NO components at all.
# With zero components the reconstruction is all zeros, so the baseline error
# is the mean square of the data itself. Normalizing by the one-component
# error instead would force PC1 to report 0% retained.
baseline_error = np.mean(X_scaled ** 2)
info_retained = (1 - np.array(reconstruction_errors) / baseline_error) * 100
axes[1].plot(range(1, len(info_retained) + 1), info_retained,
             'o-', linewidth=2.5, markersize=10, color='green',
             markeredgecolor='black', markeredgewidth=1.5)
axes[1].axhline(y=95, color='red', linestyle='--', linewidth=2, alpha=0.7, label='95% threshold')
axes[1].set_xlabel('Number of Components', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Information Retained (%)', fontsize=12, fontweight='bold')
axes[1].set_title('Information Retention', fontsize=14, fontweight='bold')
axes[1].set_xticks(range(1, len(eigenvalues) + 1))
axes[1].set_ylim([0, 105])
axes[1].grid(True, alpha=0.3)
axes[1].legend()

plt.tight_layout()
plt.show()

print("\nUsing only PC1 (the SIZE factor):")
print(f"  - Retains {info_retained[0]:.2f}% of information")
print(f"  - Reconstruction MSE: {reconstruction_errors[0]:.6f}")
print("\nUsing PC1 + PC2:")
print(f"  - Retains {info_retained[1]:.2f}% of information")
print(f"  - Reconstruction MSE: {reconstruction_errors[1]:.6f}")

## 8. Biplot: Feature-Sample Relationship

In [None]:
# Biplot: samples as points in the PC1-PC2 plane, original features as
# arrows whose components are that feature's loadings on PC1 and PC2.
fig, ax = plt.subplots(figsize=(12, 10))

# Plot samples, one fixed colour per species.
colors_map = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'}
for species in target_names:
    mask = pca_df['species'] == species
    ax.scatter(pca_df.loc[mask, 'PC1'], pca_df.loc[mask, 'PC2'],
               label=species, s=60, alpha=0.6,
               c=colors_map[species], edgecolors='black', linewidth=0.5)

# Plot feature vectors. Loadings are unit-scale, so stretch them by
# scale_factor to make them visible against the sample scores.
scale_factor = 3
for i, feature in enumerate(feature_names):
    tip_x = eigenvectors[i, 0] * scale_factor
    tip_y = eigenvectors[i, 1] * scale_factor
    ax.arrow(0, 0, tip_x, tip_y,
             head_width=0.15, head_length=0.15,
             fc='darkred', ec='darkred', linewidth=2.5, alpha=0.8)
    # Label sits slightly beyond the arrow tip.
    ax.text(tip_x * 1.15, tip_y * 1.15,
            feature.replace(' (cm)', ''),
            fontsize=11, fontweight='bold',
            bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7))

ax.set_xlabel(f'PC1 ({variance_explained[0]*100:.1f}% variance)',
              fontsize=13, fontweight='bold')
ax.set_ylabel(f'PC2 ({variance_explained[1]*100:.1f}% variance)',
              fontsize=13, fontweight='bold')
ax.set_title('Biplot: Samples and Feature Loadings',
             fontsize=16, fontweight='bold', pad=20)
ax.legend(loc='best', fontsize=11, framealpha=0.9)
ax.grid(True, alpha=0.3)
ax.axhline(y=0, color='k', linestyle='-', alpha=0.3, linewidth=1)
ax.axvline(x=0, color='k', linestyle='-', alpha=0.3, linewidth=1)

plt.tight_layout()
plt.show()

# Compute the statements about the arrows from the loadings that were
# actually drawn, rather than asserting them.
pc1_components = eigenvectors[:, 0]
arrow_lengths = np.hypot(eigenvectors[:, 0], eigenvectors[:, 1])
longest_feature = feature_names[int(np.argmax(arrow_lengths))]

print("\n🔍 BIPLOT INTERPRETATION:")
print("- Arrow direction shows which features contribute to each PC")
print("- Arrow length represents the strength of contribution")
if np.all(pc1_components > 0) or np.all(pc1_components < 0):
    print("- All arrows point the same way along PC1 → All features measure SIZE")
else:
    print("- Arrows do NOT all point the same way along PC1 → PC1 is not a pure SIZE axis")
print(f"- Longest arrow: {longest_feature} → best represented in the PC1-PC2 plane")

## 9. Key Insights Summary

In [None]:
# Final text summary, assembled from the quantities computed in the
# earlier cells.
print("="*70)
print("KEY INSIGHTS FROM EIGENVALUE ANALYSIS")
print("="*70)
print(f"\n1. DOMINANT FACTOR: The first eigenvalue (λ₁ = {eigenvalues[0]:.4f})")
print(f"   explains {variance_explained[0]*100:.2f}% of total variance.")

# Only claim a pure "size" reading when every feature loads on PC1 with
# the same sign; otherwise report the mixed signs.
if np.all(pc1_loadings > 0) or np.all(pc1_loadings < 0):
    print("\n2. THE 'SIZE' INTERPRETATION: All features load in the same direction")
    print("   on PC1, meaning it represents overall flower size.")
else:
    print("\n2. THE 'SIZE' INTERPRETATION: PC1 loadings have mixed signs, so PC1 is a")
    print("   size-like factor (driven by its largest loadings), not pure size.")

print("\n3. FEATURE IMPORTANCE:")
for i, feature in enumerate(feature_names):
    print(f"   - {feature}: {abs(pc1_loadings[i]):.4f}")
print("\n4. DIMENSIONALITY REDUCTION: Using just 2 components captures")
print(f"   {cumulative_variance[1]*100:.2f}% of variance, enabling effective")
print("   visualization and species separation.")
print("\n5. SPECIES SEPARATION: PC1 (size) is the primary discriminator:")
for species in target_names:
    mean_pc1 = pca_df[pca_df['species'] == species]['PC1'].mean()
    print(f"   - {species}: mean PC1 = {mean_pc1:.3f}")
print(f"\n6. DATA COMPRESSION: PC1 alone retains {info_retained[0]:.2f}%")
print("   of information, showing that 'size' is the dominant signal.")
print("\n" + "="*70)
print("CONCLUSION: Eigenvalue analysis reveals that a single hidden factor")
print("(overall SIZE) drives most of the variation in iris measurements.")
print("This validates the biological intuition that larger flowers have")
print("proportionally larger petals and sepals.")
print("="*70)