<a href="https://colab.research.google.com/github/rahitya-123/Classification-and-Regression-Trees---Statistics/blob/main/Support_Vector_Machines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Support Vector Machines for Auto-MPG Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For SVM
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")

#----------------------------------------------------------------
# Data Loading and Preprocessing
#----------------------------------------------------------------
print("=== DATA LOADING AND PREPROCESSING ===")

# Define column names based on the dataset description
columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name'
]

# Read the data directly from the auto-mpg.data file
try:
    df = pd.read_csv(
        'auto-mpg.data',
        delim_whitespace=True,
        names=columns,
        na_values='?',
        quotechar='"',
        comment='\t',
        skipinitialspace=True
    )
    print("Loaded auto-mpg.data file successfully")
except:
    print("Error: Could not find auto-mpg.data file")
    print("Please make sure the dataset file is in the current directory.")
    exit()

# Convert horsepower to numeric if needed
if df['horsepower'].dtype == object:
    df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Handle missing values
df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)

# Add region names based on origin codes
origin_names = {1: 'American', 2: 'European', 3: 'Japanese'}
df['region'] = df['origin'].map(origin_names)

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# Basic data exploration
print("\nSummary statistics:")
print(df.describe())

print("\nMissing values:")
print(df.isna().sum())

# Visualize key relationships in the data
plt.figure(figsize=(15, 10))

# Plot 1: MPG vs Horsepower by Origin
plt.subplot(2, 2, 1)
sns.scatterplot(x='horsepower', y='mpg', hue='region', data=df, alpha=0.7)
plt.title('MPG vs Horsepower by Region')
plt.xlabel('Horsepower')
plt.ylabel('MPG')

# Plot 2: MPG vs Weight by Origin
plt.subplot(2, 2, 2)
sns.scatterplot(x='weight', y='mpg', hue='region', data=df, alpha=0.7)
plt.title('MPG vs Weight by Region')
plt.xlabel('Weight')
plt.ylabel('MPG')

# Plot 3: Horsepower vs Displacement by Origin
plt.subplot(2, 2, 3)
sns.scatterplot(x='displacement', y='horsepower', hue='region', data=df, alpha=0.7)
plt.title('Horsepower vs Displacement by Region')
plt.xlabel('Displacement')
plt.ylabel('Horsepower')

# Plot 4: MPG vs Cylinders by Origin
plt.subplot(2, 2, 4)
sns.boxplot(x='cylinders', y='mpg', hue='region', data=df)
plt.title('MPG by Cylinders and Region')
plt.xlabel('Cylinders')
plt.ylabel('MPG')

plt.tight_layout()
plt.savefig('data_relationships.png')
plt.close()

# Examine correlations
plt.figure(figsize=(10, 8))
numeric_features = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
corr = df[numeric_features].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.savefig('correlation_matrix.png')
plt.close()

#----------------------------------------------------------------
# SUPPORT VECTOR MACHINES FOR CLASSIFICATION
#----------------------------------------------------------------
print("\n=== SVM FOR CLASSIFICATION (PREDICTING CAR ORIGIN) ===")

# Prepare the data for SVM classification
X = df[['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']].copy()
y = df['origin']  # 1=American, 2=European, 3=Japanese

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create SVM pipeline with preprocessing
svm_clf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42))
])

# Define the parameter grid for grid search
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.1, 0.01],
    'svm__kernel': ['rbf', 'linear', 'poly']
}

print("Performing grid search for optimal hyperparameters...")
# Perform grid search for optimal hyperparameters
grid_search = GridSearchCV(
    svm_clf_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)
grid_search.fit(X_train, y_train)

print(f"Best parameters for SVM classifier: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Test the best model
best_svm_clf = grid_search.best_estimator_
y_pred = best_svm_clf.predict(X_test)

# Evaluate the model
print("\nClassification Report for SVM:")
print(classification_report(y_test, y_pred, target_names=['American', 'European', 'Japanese']))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['American', 'European', 'Japanese'],
            yticklabels=['American', 'European', 'Japanese'])
plt.title('Confusion Matrix for SVM Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('svm_confusion_matrix.png')
plt.close()

# For visualization, project the data to 2D using PCA
print("Projecting data to 2D using PCA for visualization...")
# Create a PCA model to project the data to 2D
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(best_svm_clf.named_steps['scaler'].transform(X_train))
X_test_pca = pca.transform(best_svm_clf.named_steps['scaler'].transform(X_test))

# For visualization, train an SVM model on the 2D PCA data
svm_2d = SVC(C=best_svm_clf.named_steps['svm'].C,
            gamma=best_svm_clf.named_steps['svm'].gamma,
            kernel=best_svm_clf.named_steps['svm'].kernel,
            random_state=42)
svm_2d.fit(X_train_pca, y_train)

# Create a mesh grid to plot the decision boundary
def plot_decision_boundary(model, X, y, title, filename):
    # Define a function to plot decision boundaries
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))

    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')

    colors = {1: 'red', 2: 'blue', 3: 'green'}
    markers = {1: 'o', 2: 's', 3: '^'}
    labels = {1: 'American', 2: 'European', 3: 'Japanese'}

    for origin in np.unique(y):
        plt.scatter(X[y == origin, 0], X[y == origin, 1],
                    c=colors[origin], marker=markers[origin],
                    label=labels[origin], alpha=0.7)

    plt.title(title)
    plt.xlabel('PCA Feature 1')
    plt.ylabel('PCA Feature 2')
    plt.legend()
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

# Plot the decision boundary
print("Plotting SVM decision boundaries...")
plot_decision_boundary(svm_2d, X_train_pca, y_train,
                      'SVM Decision Boundary (PCA-reduced data)',
                      'svm_decision_boundary.png')

# Plot PCA-transformed test data with predictions
plt.figure(figsize=(10, 8))
colors = {1: 'red', 2: 'blue', 3: 'green'}
markers = {1: 'o', 2: 's', 3: '^'}
labels = {1: 'American', 2: 'European', 3: 'Japanese'}

for origin in np.unique(y_test):
    mask = y_test == origin
    plt.scatter(X_test_pca[mask, 0], X_test_pca[mask, 1],
                c=colors[origin], marker=markers[origin],
                label=f'True {labels[origin]}', alpha=0.7, s=100)

# Plot incorrect predictions
for i, (true, pred) in enumerate(zip(y_test, y_pred)):
    if true != pred:
        plt.scatter(X_test_pca[i, 0], X_test_pca[i, 1],
                   color='black', marker='x', s=100, linewidth=2,
                   label='Misclassified' if i == 0 else "")

plt.title('PCA-transformed Test Data with SVM Predictions')
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')
plt.legend()
plt.tight_layout()
plt.savefig('svm_test_predictions.png')
plt.close()

# Cross-validation for more robust evaluation
print("Performing cross-validation...")
cv_scores = cross_val_score(best_svm_clf, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")
print(f"Standard deviation: {cv_scores.std():.4f}")

#----------------------------------------------------------------
# SUPPORT VECTOR MACHINES FOR REGRESSION
#----------------------------------------------------------------
print("\n=== SVM FOR REGRESSION (PREDICTING MPG) ===")

# Prepare the data for SVM regression
X = df[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']].copy()
y = df['mpg']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create SVM regression pipeline with preprocessing
svm_regr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# Define the parameter grid for grid search
param_grid_reg = {
    'svr__C': [0.1, 1, 10, 100],
    'svr__gamma': ['scale', 'auto', 0.1, 0.01],
    'svr__kernel': ['rbf', 'linear'],
    'svr__epsilon': [0.1, 0.2, 0.5]
}

print("Performing grid search for optimal hyperparameters...")
# Perform grid search for optimal hyperparameters
grid_search_reg = GridSearchCV(
    svm_regr_pipeline,
    param_grid_reg,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1
)
grid_search_reg.fit(X_train, y_train)

print(f"Best parameters for SVM regressor: {grid_search_reg.best_params_}")
print(f"Best cross-validation score (negative MSE): {grid_search_reg.best_score_:.4f}")

# Test the best model
best_svm_regr = grid_search_reg.best_estimator_
y_pred_regr = best_svm_regr.predict(X_test)

# Evaluate the regression model
mse = mean_squared_error(y_test, y_pred_regr)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_regr)

print(f"Test set Mean Squared Error: {mse:.4f}")
print(f"Test set Root Mean Squared Error: {rmse:.4f}")
print(f"Test set R² Score: {r2:.4f}")

# Plot actual vs predicted MPG
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_regr, alpha=0.7)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.title('Actual vs Predicted MPG (SVM Regression)')
plt.xlabel('Actual MPG')
plt.ylabel('Predicted MPG')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('svm_regression_actual_vs_predicted.png')
plt.close()

# Plot residuals
residuals = y_test - y_pred_regr
plt.figure(figsize=(10, 6))
plt.scatter(y_pred_regr, residuals, alpha=0.7)
plt.axhline(y=0, color='k', linestyle='--', lw=2)
plt.title('Residual Plot for SVM Regression')
plt.xlabel('Predicted MPG')
plt.ylabel('Residuals')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('svm_regression_residuals.png')
plt.close()

# Plot residuals distribution
plt.figure(figsize=(10, 6))
plt.hist(residuals, bins=30, alpha=0.7, edgecolor='black')
plt.axvline(x=0, color='r', linestyle='--', linewidth=2)
plt.title('Distribution of Residuals')
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.savefig('svm_residuals_distribution.png')
plt.close()

# Evaluate feature importance for regression
# For SVM, we can't directly get feature importance,
# but we can use permutation importance
from sklearn.inspection import permutation_importance

result = permutation_importance(best_svm_regr, X_test, y_test, n_repeats=10, random_state=42)
sorted_idx = result.importances_mean.argsort()
feature_names = X.columns

plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), result.importances_mean[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
plt.title('Permutation Feature Importance (SVR)')
plt.xlabel('Mean Decrease in Accuracy')
plt.tight_layout()
plt.savefig('svr_feature_importance.png')
plt.close()

print("\nFeature importance ranking for MPG prediction:")
for i in reversed(sorted_idx):
    print(f"{feature_names[i]}: {result.importances_mean[i]:.4f} ± {result.importances_std[i]:.4f}")

# Compare actual and predicted MPG values across different car origins
X_test_with_origin = X_test.copy()
X_test_with_origin['origin'] = y_test.index.map(lambda i: df.iloc[i]['origin'])
X_test_with_origin['actual_mpg'] = y_test
X_test_with_origin['predicted_mpg'] = y_pred_regr
X_test_with_origin['residual'] = residuals
X_test_with_origin['region'] = X_test_with_origin['origin'].map(origin_names)

plt.figure(figsize=(12, 6))
sns.boxplot(x='region', y='residual', data=X_test_with_origin)
plt.title('Residuals by Region')
plt.xlabel('Region')
plt.ylabel('Residual (Actual - Predicted)')
plt.axhline(y=0, color='r', linestyle='--')
plt.tight_layout()
plt.savefig('svm_residuals_by_region.png')
plt.close()

# Create a scatterplot of actual vs predicted MPG colored by region
plt.figure(figsize=(10, 6))
for region in origin_names.values():
    mask = X_test_with_origin['region'] == region
    plt.scatter(
        X_test_with_origin.loc[mask, 'actual_mpg'],
        X_test_with_origin.loc[mask, 'predicted_mpg'],
        alpha=0.7,
        label=region
    )
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.title('Actual vs Predicted MPG by Region')
plt.xlabel('Actual MPG')
plt.ylabel('Predicted MPG')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('svm_actual_vs_predicted_by_region.png')
plt.close()

print("\nSVM analysis complete. Output files saved.")

=== DATA LOADING AND PREPROCESSING ===
Loaded auto-mpg.data file successfully
Dataset shape: (398, 10)

First few rows:
    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0  3504.0          12.0   
1  15.0          8         350.0       165.0  3693.0          11.5   
2  18.0          8         318.0       150.0  3436.0          11.0   
3  16.0          8         304.0       150.0  3433.0          12.0   
4  17.0          8         302.0       140.0  3449.0          10.5   

   model_year  origin                   car_name    region  
0          70       1  chevrolet chevelle malibu  American  
1          70       1          buick skylark 320  American  
2          70       1         plymouth satellite  American  
3          70       1              amc rebel sst  American  
4          70       1                ford torino  American  

Summary statistics:


  df = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)


              mpg   cylinders  displacement  horsepower       weight  \
count  398.000000  398.000000    398.000000  398.000000   398.000000   
mean    23.514573    5.454774    193.425879  104.469388  2970.424623   
std      7.815984    1.701004    104.269838   38.199187   846.841774   
min      9.000000    3.000000     68.000000   46.000000  1613.000000   
25%     17.500000    4.000000    104.250000   76.000000  2223.750000   
50%     23.000000    4.000000    148.500000   95.000000  2803.500000   
75%     29.000000    8.000000    262.000000  125.000000  3608.000000   
max     46.600000    8.000000    455.000000  230.000000  5140.000000   

       acceleration  model_year      origin  
count    398.000000  398.000000  398.000000  
mean      15.568090   76.010050    1.572864  
std        2.757689    3.697627    0.802055  
min        8.000000   70.000000    1.000000  
25%       13.825000   73.000000    1.000000  
50%       15.500000   76.000000    1.000000  
75%       17.175000   79.0000