# Linear Discriminant Analysis on Iris Dataset
## The 'Hello World' of LDA

**Dataset Overview:**
- 150 samples, 4 features (sepal length/width, petal length/width)
- 3 classes (Setosa, Versicolor, Virginica)
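- Features are approximately normally distributed within each class (tested in Section 3)
- Setosa is clearly separated from the other two species; Versicolor and Virginica overlap slightly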

**Focus:** Classification Basics & LDA Fundamentals

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    roc_auc_score,
    roc_curve
)
from scipy import stats

# Global look-and-feel applied to every figure produced below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Keep cell output tidy by silencing warnings.
# NOTE: this filter hides every warning category, not just cosmetic ones.
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 1. Data Loading and Exploration

In [None]:
# Load the Iris dataset bundled with scikit-learn
iris = load_iris()
X = iris.data      # feature matrix
y = iris.target    # integer class codes that index iris.target_names

# Create DataFrame for easier manipulation; species becomes a readable
# categorical column built from the integer codes
df = pd.DataFrame(X, columns=iris.feature_names)
df['species'] = pd.Categorical.from_codes(y, iris.target_names)

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nClass Distribution:")
print(df['species'].value_counts())

In [None]:
# Descriptive statistics for the numeric columns
print("Statistical Summary:")
summary_table = df.describe()
display(summary_table)

# Number of missing entries in each column
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

## 2. Exploratory Data Analysis

In [None]:
# Overlaid per-species histograms, one panel per feature
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for panel, feature in zip(axes, iris.feature_names):
    for species_name in iris.target_names:
        values = df.loc[df['species'] == species_name, feature]
        panel.hist(values, alpha=0.5, label=species_name, bins=20)
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')
    panel.set_title(f'Distribution of {feature}')
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots with KDE curves on the diagonal, coloured by species
pairplot_options = dict(hue='species', diag_kind='kde', height=2.5)
sns.pairplot(df, **pairplot_options)
plt.suptitle('Pairplot of Iris Features by Species', y=1.01)
plt.show()

In [None]:
# Heatmap of pairwise correlations between the feature columns
correlation = df.loc[:, iris.feature_names].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Feature Correlation Matrix')
plt.show()

In [None]:
# One box plot per feature, grouped by species
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for panel, feature in zip(axes, iris.feature_names):
    df.boxplot(column=feature, by='species', ax=panel)
    panel.set_title(f'Box Plot of {feature}')
    panel.set_xlabel('Species')
    panel.set_ylabel(feature)

# Blank out the figure-level super-title before laying out the panels
plt.suptitle('')
plt.tight_layout()
plt.show()

## 3. Test Assumptions for LDA

In [None]:
# Shapiro-Wilk normality check for every (species, feature) pair.
# A p-value above 0.05 is reported as "Normal" (normality not rejected).
print("Normality Tests (Shapiro-Wilk):")
print("=" * 60)

for species_name in iris.target_names:
    print(f"\n{species_name.upper()}:")
    species_rows = df[df['species'] == species_name]
    for feature in iris.feature_names:
        _, p_value = stats.shapiro(species_rows[feature])
        if p_value > 0.05:
            verdict = "Normal"
        else:
            verdict = "Non-normal"
        print(f"  {feature}: p-value = {p_value:.4f} ({verdict})")

In [None]:
# Normal Q-Q plots: one row per species, one column per feature
fig, axes = plt.subplots(3, 4, figsize=(18, 12))

for row, species_name in enumerate(iris.target_names):
    species_rows = df[df['species'] == species_name]
    for column, feature in enumerate(iris.feature_names):
        panel = axes[row, column]
        stats.probplot(species_rows[feature], dist="norm", plot=panel)
        panel.set_title(f'{species_name} - {feature}')

plt.suptitle('Q-Q Plots for Normality Check', y=1.0)
plt.tight_layout()
plt.show()

In [None]:
# Per-feature check that the species share a common variance.
# NOTE: Levene's test is applied one feature at a time as a simple stand-in
# for Box's M test; it does not compare full covariance matrices.
print("\nHomogeneity of Variance Tests (Levene's test):")
print("=" * 60)

# Split the table by species once, then reuse the pieces for each feature
rows_by_species = [df[df['species'] == name] for name in iris.target_names]

for feature in iris.feature_names:
    samples = [rows[feature].to_numpy() for rows in rows_by_species]
    _, p_value = stats.levene(*samples)
    if p_value > 0.05:
        verdict = "Equal variances"
    else:
        verdict = "Unequal variances"
    print(f"{feature}: p-value = {p_value:.4f} ({verdict})")

## 4. Data Preparation

In [None]:
# Stratified 70/30 hold-out split with a fixed seed for reproducibility
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Confirm that stratification kept the class counts balanced in both splits
for split_name, labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set class distribution:")
    print(pd.Series(labels).value_counts().sort_index())

In [None]:
# Standardize features (optional for LDA, but good practice).
# The scaler learns its statistics from the training split only and the same
# transformation is then applied to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling complete!")

column_means = X_train_scaled.mean(axis=0)
print("\nScaled feature means (should be ~0):")
print(column_means)

column_stds = X_train_scaled.std(axis=0)
print("\nScaled feature std devs (should be ~1):")
print(column_stds)

## 5. Linear Discriminant Analysis

In [None]:
# Train LDA model (default settings) on the standardized training split
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_scaled, y_train)

# Hard class predictions for both splits
y_train_pred = lda.predict(X_train_scaled)
y_test_pred = lda.predict(X_test_scaled)

# Per-class probability predictions for both splits
y_train_proba = lda.predict_proba(X_train_scaled)
y_test_proba = lda.predict_proba(X_test_scaled)

# `lda.n_components` only echoes the constructor argument, which is None when
# left at its default, so the number of discriminant axes actually kept is
# read from the fitted model instead.
n_discriminants = len(lda.explained_variance_ratio_)

print("LDA Model trained successfully!")
print(f"\nNumber of discriminant components: {n_discriminants}")
print(f"Classes: {lda.classes_}")

In [None]:
# Accuracy on both splits; the difference is reported as an overfitting check
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
accuracy_gap = train_accuracy - test_accuracy

performance_lines = [
    "Model Performance:",
    "=" * 60,
    f"Training Accuracy: {train_accuracy:.4f}",
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Overfitting Check: {accuracy_gap:.4f}",
]
print("\n".join(performance_lines))

In [None]:
# 5-fold cross-validation on the training split (estimator's default score)
cv_scores = cross_val_score(lda, X_train_scaled, y_train, cv=5)
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2  # two standard deviations across folds

print("\nCross-Validation Results:")
print("=" * 60)
print(f"CV Scores: {cv_scores}")
print(f"Mean CV Score: {cv_mean:.4f} (+/- {cv_spread:.4f})")

In [None]:
# Per-class metrics on the held-out test set, labelled with species names
report_text = classification_report(
    y_test,
    y_test_pred,
    target_names=iris.target_names,
)
print("\nClassification Report (Test Set):")
print("=" * 60)
print(report_text)

In [None]:
# Raw and row-normalized confusion matrices for the test set
cm = confusion_matrix(y_test, y_test_pred)

# Divide every row by its total so entries read as per-true-class proportions
cm_normalized = cm / cm.sum(axis=1, keepdims=True)

# (matrix, annotation format, colour-bar label, figure title)
heatmap_specs = (
    (cm, 'd', 'Count', 'Confusion Matrix - LDA on Iris Dataset'),
    (cm_normalized, '.2f', 'Proportion',
     'Normalized Confusion Matrix - LDA on Iris Dataset'),
)

for matrix, number_format, bar_label, heading in heatmap_specs:
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt=number_format,
        cmap='Blues',
        xticklabels=iris.target_names,
        yticklabels=iris.target_names,
        cbar_kws={'label': bar_label},
    )
    plt.title(heading)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

## 6. Model Interpretation

In [None]:
# Discriminant weights: rows are features, columns are discriminant axes
print("LDA Coefficients (Scalings):")
print("=" * 60)

n_axes = lda.scalings_.shape[1]
ld_labels = [f'LD{k}' for k in range(1, n_axes + 1)]
scalings_df = pd.DataFrame(
    lda.scalings_,
    index=iris.feature_names,
    columns=ld_labels,
)
display(scalings_df)

# Horizontal bar chart of the weights for each discriminant axis
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for position, ld_label in enumerate(ld_labels):
    panel = axes[position]
    panel.barh(iris.feature_names, scalings_df[ld_label])
    panel.set_xlabel('Coefficient Value')
    panel.set_title(f'Linear Discriminant {position + 1} Coefficients')
    # Dashed zero line makes the sign of each weight easy to read
    panel.axvline(x=0, color='black', linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()

In [None]:
# Share of variance the fitted model attributes to each discriminant
variance_ratios = lda.explained_variance_ratio_

print("\nExplained Variance Ratio:")
print("=" * 60)
for position, ratio in enumerate(variance_ratios, start=1):
    print(f"LD{position}: {ratio:.4f} ({ratio*100:.2f}%)")

# Running total across discriminants
cumulative_var = np.cumsum(variance_ratios)
print(f"\nCumulative variance explained: {cumulative_var}")

# Left panel: per-discriminant bars; right panel: cumulative curve
component_numbers = range(1, len(variance_ratios) + 1)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, line_ax = axes

bar_ax.bar(component_numbers, variance_ratios)
bar_ax.set_xlabel('Linear Discriminant')
bar_ax.set_ylabel('Explained Variance Ratio')
bar_ax.set_title('Explained Variance by Component')
bar_ax.set_xticks(component_numbers)

line_ax.plot(component_numbers, cumulative_var, 'bo-')
line_ax.set_xlabel('Number of Components')
line_ax.set_ylabel('Cumulative Explained Variance')
line_ax.set_title('Cumulative Explained Variance')
line_ax.set_xticks(component_numbers)
line_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Class means in the original space
# The model was fitted on standardized inputs, so `lda.means_` holds the class
# centroids in standardized units. Undo the scaling with the fitted scaler so
# the table really is in the original feature units, as its heading says.
print("\nClass Means (Original Feature Space):")
print("=" * 60)
means_df = pd.DataFrame(
    scaler.inverse_transform(lda.means_),
    index=iris.target_names,
    columns=iris.feature_names
)
display(means_df)

## 7. Dimensionality Reduction Visualization

In [None]:
# Project both splits onto the learned discriminant axes
X_train_lda, X_test_lda = (
    lda.transform(split) for split in (X_train_scaled, X_test_scaled)
)

n_input_dims = X_train_scaled.shape[1]
n_lda_dims = X_train_lda.shape[1]
print(f"Original feature space: {n_input_dims} dimensions")
print(f"LDA feature space: {n_lda_dims} dimensions")

In [None]:
# Side-by-side LD1/LD2 scatter plots for the training and test splits
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (target axes, projected points, labels, panel title)
panels = (
    (axes[0], X_train_lda, y_train, 'Training Data in LDA Space'),
    (axes[1], X_test_lda, y_test, 'Test Data in LDA Space'),
)

for panel, projected, labels, heading in panels:
    # One scatter call per species so each gets its own legend entry
    for class_code, species_name in enumerate(iris.target_names):
        members = projected[labels == class_code]
        panel.scatter(members[:, 0], members[:, 1],
                      label=species_name, alpha=0.7, s=50, edgecolors='black')
    panel.set_xlabel('LD1 (First Linear Discriminant)')
    panel.set_ylabel('LD2 (Second Linear Discriminant)')
    panel.set_title(heading)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Class separation along the first discriminant only (training split)
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
hist_ax, violin_ax = axes
ld1_train = X_train_lda[:, 0]

# Top panel: overlaid histograms, one per species
for class_code, species_name in enumerate(iris.target_names):
    in_class = y_train == class_code
    hist_ax.hist(ld1_train[in_class], alpha=0.5, label=species_name, bins=20)

hist_ax.set_xlabel('LD1 (First Linear Discriminant)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Classes along LD1')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Bottom panel: violin plot of the same LD1 scores, keyed by species name
species_labels = [iris.target_names[code] for code in y_train]
ld1_data = pd.DataFrame({'LD1': ld1_train, 'Species': species_labels})
sns.violinplot(data=ld1_data, x='Species', y='LD1', ax=violin_ax)
violin_ax.set_title('Class Separation on LD1 (Violin Plot)')
violin_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Decision Boundaries (2D Projection)

In [None]:
# Create meshgrid for decision boundary
def plot_decision_boundary(X, y, model, title):
    """Plot a classifier's decision regions over a 2-D point cloud.

    Args:
        X: 2-D array whose first two columns are used as plot coordinates.
        y: Integer class codes, compared against the positions of
            ``iris.target_names`` to pick the points of each species.
        model: Fitted classifier whose ``predict`` accepts two-column input
            and returns the same integer class codes.
        title: Figure title.

    Reads the module-level ``iris`` object for class names. Shows the figure
    and returns nothing.
    """
    h = 0.02  # grid step in data units
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Predict a class for every grid node, then restore the grid shape
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    n_classes = len(iris.target_names)
    # Band edges at -0.5, 0.5, 1.5, ... so each integer class code falls in
    # exactly one filled band instead of matplotlib's auto-chosen levels.
    levels = np.arange(n_classes + 1) - 0.5

    plt.figure(figsize=(12, 8))
    regions = plt.contourf(xx, yy, Z, levels=levels, alpha=0.3, cmap='viridis')

    for i, species in enumerate(iris.target_names):
        mask = y == i
        plt.scatter(X[mask, 0], X[mask, 1], label=species,
                    edgecolors='black', s=100, alpha=0.7)

    plt.xlabel('LD1')
    plt.ylabel('LD2')
    plt.title(title)
    plt.legend()
    # Pass the contour set explicitly: without it the colour bar attaches to
    # the most recent scatter collection rather than the predicted regions.
    plt.colorbar(regions, ticks=np.arange(n_classes), label='Predicted Class')
    plt.grid(True, alpha=0.3)
    plt.show()

# Fit a second LDA on the projected training points so the classifier can be
# evaluated on the two-column grid that plot_decision_boundary builds.
lda_viz = LinearDiscriminantAnalysis().fit(X_train_lda, y_train)

boundary_title = 'Decision Boundaries in LDA Space (Training Data)'
plot_decision_boundary(X_train_lda, y_train, lda_viz, boundary_title)

## 9. Probability Analysis

In [None]:
# Table of test-set class probabilities next to the true and predicted labels
probability_columns = [f'P({name})' for name in iris.target_names]
prob_df = pd.DataFrame(y_test_proba, columns=probability_columns)

# Translate integer class codes into species names for readability
prob_df['True_Class'] = [iris.target_names[code] for code in y_test]
prob_df['Predicted_Class'] = [iris.target_names[code] for code in y_test_pred]
prob_df['Correct'] = prob_df['True_Class'].eq(prob_df['Predicted_Class'])

print("Sample Predictions with Probabilities:")
display(prob_df.head(15))

In [None]:
# Confidence = highest class probability assigned to each test sample
max_probs = y_test_proba.max(axis=1)
mean_confidence = max_probs.mean()

# Split the confidences by whether the prediction matched the true label
is_correct = y_test == y_test_pred
correct_probs = max_probs[is_correct]
incorrect_probs = max_probs[~is_correct]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
overall_ax, split_ax = axes

# Left panel: overall confidence distribution with its mean marked
overall_ax.hist(max_probs, bins=30, edgecolor='black', alpha=0.7)
overall_ax.set_xlabel('Maximum Probability')
overall_ax.set_ylabel('Count')
overall_ax.set_title('Distribution of Prediction Confidence')
overall_ax.axvline(x=mean_confidence, color='red', linestyle='--',
                   label=f'Mean: {mean_confidence:.3f}')
overall_ax.legend()
overall_ax.grid(True, alpha=0.3)

# Right panel: the same confidences, separated into correct vs incorrect
split_ax.hist([correct_probs, incorrect_probs], bins=20,
              label=['Correct', 'Incorrect'], alpha=0.7, edgecolor='black')
split_ax.set_xlabel('Maximum Probability')
split_ax.set_ylabel('Count')
split_ax.set_title('Prediction Confidence: Correct vs Incorrect')
split_ax.legend()
split_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Average confidence for correct predictions: {correct_probs.mean():.4f}")
if incorrect_probs.size == 0:
    print("No incorrect predictions!")
else:
    print(f"Average confidence for incorrect predictions: {incorrect_probs.mean():.4f}")

## 10. Key Insights and Conclusions

In [None]:
# Summarise the headline findings using values computed in earlier cells.

# `lda.n_components` is the constructor argument (None by default), so the
# reduced dimensionality is taken from the fitted model instead.
n_discriminants = len(lda.explained_variance_ratio_)

# Re-run the assumption tests so the summary reports what they found rather
# than asserting a fixed conclusion. Index [1] is the p-value of each result.
n_normality_tests = len(iris.target_names) * len(iris.feature_names)
n_non_normal = sum(
    int(stats.shapiro(df.loc[df['species'] == species, col])[1] <= 0.05)
    for species in iris.target_names
    for col in iris.feature_names
)
n_unequal_variance = sum(
    int(stats.levene(
        *(df.loc[df['species'] == species, col].values
          for species in iris.target_names)
    )[1] <= 0.05)
    for col in iris.feature_names
)

print("KEY INSIGHTS FROM LDA ON IRIS DATASET")
print("=" * 70)
print(f"\n1. MODEL PERFORMANCE")
print(f"   - Test Accuracy: {test_accuracy:.2%}")
print(f"   - Cross-validation Score: {cv_scores.mean():.2%} (+/- {cv_scores.std() * 2:.2%})")
print(f"   - The model shows {'excellent' if test_accuracy > 0.95 else 'good'} performance")

print(f"\n2. DIMENSIONALITY REDUCTION")
print(f"   - Reduced from {X.shape[1]} to {n_discriminants} dimensions")
print(f"   - LD1 explains {lda.explained_variance_ratio_[0]:.2%} of variance")
print(f"   - LD2 explains {lda.explained_variance_ratio_[1]:.2%} of variance")
print(f"   - Total variance explained: {lda.explained_variance_ratio_.sum():.2%}")

print(f"\n3. FEATURE IMPORTANCE")
# Feature with the largest absolute weight on each discriminant axis
most_important_ld1 = scalings_df['LD1'].abs().idxmax()
most_important_ld2 = scalings_df['LD2'].abs().idxmax()
print(f"   - Most important for LD1: {most_important_ld1}")
print(f"   - Most important for LD2: {most_important_ld2}")

print(f"\n4. CLASS SEPARATION")
print(f"   - Classes are well-separated in LDA space")
print(f"   - Setosa is the most easily distinguished class")
if len(incorrect_probs) > 0:
    print(f"   - Some overlap between Versicolor and Virginica")

print(f"\n5. ASSUMPTIONS")
print(f"   - Shapiro-Wilk rejects normality (p <= 0.05) for "
      f"{n_non_normal} of {n_normality_tests} species/feature groups")
print(f"   - Levene's test rejects equal variances (p <= 0.05) for "
      f"{n_unequal_variance} of {len(iris.feature_names)} features")
if n_non_normal == 0 and n_unequal_variance == 0:
    print(f"   - No assumption check was rejected at the 5% level")
else:
    print(f"   - Some assumption checks were rejected; consider comparing with QDA")

print("\n" + "=" * 70)

## Summary

This notebook demonstrated:
1. **Data exploration** and understanding of the Iris dataset
2. **Assumption testing** for LDA (normality and homogeneity)
3. **Model training** and evaluation with LDA
4. **Dimensionality reduction** from 4D to 2D
5. **Visualization** of decision boundaries and class separation
6. **Interpretation** of LDA components and feature importance

The Iris dataset is ideal for learning LDA because:
- Features are approximately normally distributed
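- Class covariances are similar enough for LDA to work well in practice, even though formal tests (Section 3) may flag some differences
- Setosa is linearly separable from the other two species, and Versicolor and Virginica are nearly so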
- Small size allows for quick experimentation

**Next Steps:**
- Try QDA (Quadratic Discriminant Analysis) for comparison
- Experiment with feature selection
- Compare with other classifiers (Logistic Regression, SVM, etc.)