<a href="https://colab.research.google.com/github/rahitya-123/Classification-and-Regression-Trees---Statistics/blob/main/Linear_Discriminant_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Linear Discriminant Analysis for Auto-MPG Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")

#----------------------------------------------------------------
# Data Loading and Preprocessing
#----------------------------------------------------------------
print("=== DATA LOADING AND PREPROCESSING ===")

# Define column names based on the dataset description
columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name'
]

# Read the data directly from the auto-mpg.data file
try:
    df = pd.read_csv(
        'auto-mpg.data',
        delim_whitespace=True,
        names=columns,
        na_values='?',
        quotechar='"',
        comment='\t',
        skipinitialspace=True
    )
    print("Loaded auto-mpg.data file successfully")
except:
    print("Error: Could not find auto-mpg.data file")
    print("Please make sure the dataset file is in the current directory.")
    exit()

# Convert horsepower to numeric if needed
if df['horsepower'].dtype == object:
    df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Handle missing values
df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)

# Add region names based on origin codes
origin_names = {1: 'American', 2: 'European', 3: 'Japanese'}
df['region'] = df['origin'].map(origin_names)

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# Basic data exploration
print("\nSummary statistics:")
print(df.describe())

print("\nMissing values:")
print(df.isna().sum())

# Visualize class distribution (car origin)
plt.figure(figsize=(10, 6))
counts = df['origin'].value_counts().sort_index()
plt.bar(counts.index, counts.values, tick_label=['American', 'European', 'Japanese'])
plt.title('Distribution of Car Origins')
plt.xlabel('Region')
plt.ylabel('Count')
for i, v in enumerate(counts.values):
    plt.text(i+1, v+5, str(v), ha='center')
plt.savefig('origin_distribution.png')
plt.close()

# Correlation analysis
plt.figure(figsize=(10, 8))
numeric_cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
correlation = df[numeric_cols].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.tight_layout()
plt.savefig('correlation_matrix.png')
plt.close()

# Box plots of features by region
plt.figure(figsize=(15, 12))
features = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']
for i, feature in enumerate(features):
    plt.subplot(3, 2, i+1)
    sns.boxplot(x='region', y=feature, data=df)
    plt.title(f'{feature} by Region')
    plt.xlabel('Region')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('features_by_region.png')
plt.close()

#----------------------------------------------------------------
# LINEAR DISCRIMINANT ANALYSIS
#----------------------------------------------------------------
print("\n=== LINEAR DISCRIMINANT ANALYSIS ===")

# Prepare the data for LDA
X = df[['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']].copy()
y = df['origin']  # 1=American, 2=European, 3=Japanese

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the LDA model
print("Training LDA model...")
lda = LDA()
lda.fit(X_train_scaled, y_train)

# Analyze the explained variance ratio
print(f"Explained variance ratio for each discriminant function: {lda.explained_variance_ratio_}")

# Make predictions
y_pred = lda.predict(X_test_scaled)

# Evaluate the model
print("\nClassification Report for LDA:")
print(classification_report(y_test, y_pred, target_names=['American', 'European', 'Japanese']))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['American', 'European', 'Japanese'],
            yticklabels=['American', 'European', 'Japanese'])
plt.title('Confusion Matrix for LDA')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('lda_confusion_matrix.png')
plt.close()

# Transform the data to visualize the discriminant functions
X_train_lda = lda.transform(X_train_scaled)

# Plot the training data in the LDA space
plt.figure(figsize=(10, 6))
colors = {'1': 'red', '2': 'blue', '3': 'green'}
markers = {'1': 'o', '2': 's', '3': '^'}
labels = {'1': 'American', '2': 'European', '3': 'Japanese'}

# Since LDA for 3 classes gives 2 discriminant functions, we can visualize both
for origin, label in labels.items():
    mask = y_train.astype(str) == origin
    plt.scatter(
        X_train_lda[mask, 0],
        X_train_lda[mask, 1],
        c=colors[origin],
        marker=markers[origin],
        alpha=0.7,
        label=label
    )

plt.title('Linear Discriminant Analysis')
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('lda_visualization.png')
plt.close()

# Analyze coefficients to understand feature importance
# The shape of lda.coef_ depends on the number of classes
n_classes = len(np.unique(y_train))
n_features = X.shape[1]

print(f"\nShape of LDA coefficients: {lda.coef_.shape}")
print(f"Number of classes: {n_classes}")
print(f"Number of features: {n_features}")

# For multi-class LDA, coefficients are shaped differently
# Each row represents coefficients for discriminating one class vs. others
coef = pd.DataFrame(
    lda.coef_,
    columns=X.columns,
    index=[f'Class {i+1}' for i in range(lda.coef_.shape[0])]
)
print("\nLDA Coefficients (feature importance):")
print(coef)

# Plot the coefficients
plt.figure(figsize=(12, 8))
for i in range(coef.shape[0]):
    plt.subplot(coef.shape[0], 1, i+1)
    plt.bar(X.columns, coef.iloc[i])
    plt.title(f'Coefficients for {coef.index[i]}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('lda_coefficients.png')
plt.close()

# Plot the scaling factors for each class
scaling = lda.scalings_
scaling_df = pd.DataFrame(
    scaling,
    columns=['LD1', 'LD2'],
    index=X.columns
)
print("\nLDA Scaling factors:")
print(scaling_df)

plt.figure(figsize=(8, 6))
for i, feature in enumerate(X.columns):
    plt.arrow(0, 0, scaling_df.iloc[i, 0]*10, scaling_df.iloc[i, 1]*10,
              head_width=0.1, head_length=0.1, fc='k', ec='k')
    plt.text(scaling_df.iloc[i, 0]*10.1, scaling_df.iloc[i, 1]*10.1, feature)

plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.grid(True)
plt.axhline(y=0, color='k', linestyle='-', alpha=0.3)
plt.axvline(x=0, color='k', linestyle='-', alpha=0.3)
plt.title('LDA Feature Projections')
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.tight_layout()
plt.savefig('lda_feature_projections.png')
plt.close()

# Analyze the class means in the original feature space
class_means = {}
for cls in np.unique(y_train):
    class_means[cls] = X_train[y_train == cls].mean()

class_means_df = pd.DataFrame(class_means).T
class_means_df.index = ['American', 'European', 'Japanese']
print("\nClass means in original feature space:")
print(class_means_df)

# Plot the class means as a heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(class_means_df, annot=True, cmap='YlGnBu', fmt='.2f')
plt.title('Class Means in Original Feature Space')
plt.tight_layout()
plt.savefig('lda_class_means.png')
plt.close()

# Analyze posterior probabilities for test samples
# This shows the probability of each class for each sample
probs = lda.predict_proba(X_test_scaled)
prob_df = pd.DataFrame(probs, columns=['American', 'European', 'Japanese'])
print("\nPosterior probabilities for first 5 test samples:")
print(prob_df.head())

# Visualize posterior probabilities
plt.figure(figsize=(10, 6))
plt.hist(prob_df['American'], bins=30, alpha=0.5, label='American')
plt.hist(prob_df['European'], bins=30, alpha=0.5, label='European')
plt.hist(prob_df['Japanese'], bins=30, alpha=0.5, label='Japanese')
plt.title('Distribution of Class Probabilities')
plt.xlabel('Probability')
plt.ylabel('Count')
plt.legend()
plt.tight_layout()
plt.savefig('lda_posterior_probabilities.png')
plt.close()

# Cross-validation for more robust evaluation
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LDA(), X_train_scaled, y_train, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")
print(f"Standard deviation: {cv_scores.std():.4f}")

print("\nLinear Discriminant Analysis complete. Output files saved.")

=== DATA LOADING AND PREPROCESSING ===
Loaded auto-mpg.data file successfully
Dataset shape: (398, 10)

First few rows:
    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0  3504.0          12.0   
1  15.0          8         350.0       165.0  3693.0          11.5   
2  18.0          8         318.0       150.0  3436.0          11.0   
3  16.0          8         304.0       150.0  3433.0          12.0   
4  17.0          8         302.0       140.0  3449.0          10.5   

   model_year  origin                   car_name    region  
0          70       1  chevrolet chevelle malibu  American  
1          70       1          buick skylark 320  American  
2          70       1         plymouth satellite  American  
3          70       1              amc rebel sst  American  
4          70       1                ford torino  American  

Summary statistics:
              mpg   cylinders  displacement  horsepower       weight  \


  df = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)



=== LINEAR DISCRIMINANT ANALYSIS ===
Training LDA model...
Explained variance ratio for each discriminant function: [0.96519937 0.03480063]

Classification Report for LDA:
              precision    recall  f1-score   support

    American       0.92      0.81      0.86        73
    European       0.42      0.35      0.38        23
    Japanese       0.49      0.75      0.59        24

    accuracy                           0.71       120
   macro avg       0.61      0.64      0.61       120
weighted avg       0.74      0.71      0.72       120


Shape of LDA coefficients: (3, 6)
Number of classes: 3
Number of features: 6

LDA Coefficients (feature importance):
              mpg  cylinders  displacement  horsepower    weight  acceleration
Class 1 -0.431211  -0.459398      1.935460   -0.710410 -0.221211      0.014882
Class 2  0.436080   0.492528     -3.466363    1.054004  0.902927      0.103786
Class 3  1.007226   1.049186     -3.231308    1.372619 -0.063717     -0.136314

LDA Scaling

  plt.tight_layout()



Class means in original feature space:
                mpg  cylinders  displacement  horsepower       weight  \
American  19.896591   6.238636    245.355114  116.786410  3370.840909   
European  28.359574   4.148936    109.148936   80.785931  2453.042553   
Japanese  30.609091   4.127273    104.345455   81.018182  2253.836364   

          acceleration  
American     15.170455  
European     16.921277  
Japanese     16.052727  

Posterior probabilities for first 5 test samples:
   American  European  Japanese
0  0.280274  0.246047  0.473679
1  0.334720  0.411315  0.253965
2  0.978382  0.008410  0.013208
3  0.983042  0.009405  0.007553
4  0.974913  0.016233  0.008853

Cross-validation scores: [0.75       0.69642857 0.80357143 0.69090909 0.65454545]
Mean CV score: 0.7191
Standard deviation: 0.0521

Linear Discriminant Analysis complete. Output files saved.
