In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
def load_lfw_dataset(min_faces_per_person=50, resize=0.5):
    """Fetch the LFW people dataset and print a short size summary.

    Parameters
    ----------
    min_faces_per_person : int
        Forwarded unchanged to ``fetch_lfw_people``.
    resize : float
        Forwarded unchanged to ``fetch_lfw_people``.

    Returns
    -------
    tuple
        ``(X, y, target_names, n_classes)`` where ``X`` is the dataset's
        ``data`` attribute, ``y`` its ``target`` attribute, ``target_names``
        its ``target_names`` attribute and ``n_classes`` the length of
        ``target_names``.
    """
    dataset = fetch_lfw_people(min_faces_per_person=min_faces_per_person, resize=resize)
    features, labels, names = dataset.data, dataset.target, dataset.target_names
    class_count = names.shape[0]

    # Quick sanity read-out so the reader sees what was loaded.
    print(f"Dataset dimensions: {features.shape}")
    print(f"Number of classes: {class_count}")

    return features, labels, names, class_count

# Load the faces once; the arguments are the loader's defaults, spelled out
# so the settings used by the rest of the notebook are visible here.
X, y, target_names, n_classes = load_lfw_dataset(min_faces_per_person=50, resize=0.5)


In [None]:
def preprocess_and_split_data(X, y, n_components=200, test_size=0.3, random_state=42):
    """Split the data, then standardize and PCA-project it without leakage.

    The train/test split happens first. The scaler and the PCA are fitted on
    the training rows only and then applied to the test rows, so no statistic
    of the held-out data leaks into the features the classifier is trained on.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Raw feature matrix.
    y : array-like of shape (n_samples,)
        Class labels; also used to stratify the split.
    n_components : int
        Passed to ``PCA(n_components=...)``.
    test_size : float
        Passed to ``train_test_split``.
    random_state : int
        Seed used for both the split and the PCA solver.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, pca)`` where the feature matrices
        are the whitened PCA projections and ``pca`` is the PCA object fitted
        on the (scaled) training rows.
    """
    # Split first: the row assignment depends only on the number of samples,
    # the labels and the seed, so it is the same partition the data would get
    # if it were split after the transforms.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    # Standardize using training statistics only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw)

    # Learn the principal components from the training rows only.
    pca = PCA(n_components=n_components, whiten=True, random_state=random_state)
    X_train = pca.fit_transform(X_train_scaled)
    X_test = pca.transform(X_test_scaled)

    return X_train, X_test, y_train, y_test, pca

# Build the PCA feature matrices; the keyword values are the function's
# defaults, written out so the split settings are visible at the call site.
X_train, X_test, y_train, y_test, pca = preprocess_and_split_data(
    X, y, n_components=200, test_size=0.3, random_state=42)

In [None]:
def train_and_evaluate_svm(X_train, X_test, y_train, y_test, kernel, param_grid):
    """Grid-search an SVC with the given kernel and report test performance.

    A 5-fold ``GridSearchCV`` over ``param_grid`` is fitted on the training
    data; the refitted best estimator is then used to predict the test set and
    a classification report is printed.

    Note: the report labels come from the notebook-level ``target_names``
    variable, which must exist before this function is called.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    kernel : str
        Kernel name passed to ``SVC(kernel=...)``.
    param_grid : dict
        Search space passed to ``GridSearchCV``.

    Returns
    -------
    The best estimator found by the grid search.
    """
    search = GridSearchCV(SVC(kernel=kernel), param_grid, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    predictions = winner.predict(X_test)

    print(f"\nBest parameters for {kernel} kernel:")
    print(search.best_params_)
    print("\nClassification Report:")
    print(classification_report(y_test, predictions, target_names=target_names))

    return winner

# Hyper-parameter search space, keyed by kernel name
param_grids = dict(
    linear={'C': [0.1, 1, 10, 100]},
    rbf={'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.1, 1]},
    poly={'C': [0.1, 1, 10], 'degree': [2, 3, 4], 'gamma': ['scale', 'auto']},
)

# Tune one SVM per kernel (in the order the grids are declared above) and
# keep the best estimator of each search
best_models = {}
for kernel, grid in param_grids.items():
    print(f"\nTraining SVM with {kernel} kernel")
    best_models[kernel] = train_and_evaluate_svm(X_train, X_test, y_train, y_test, kernel, grid)


In [None]:
def plot_decision_boundary(X, y, model, kernel):
    """Plot the model's decision regions over the first two columns of ``X``.

    The model is queried on a 100x100 grid spanning the first two features.
    Because the model was fitted on *all* columns of ``X``, each grid row is a
    full-width feature vector: the first two entries vary over the grid and
    every remaining entry is held at that feature's mean over ``X``. The
    picture is therefore a 2-D slice through the feature space; the scattered
    samples are projections onto that slice and may appear on the "wrong"
    side of a boundary.

    After the figure, the support-vector counts and the support vectors are
    printed, plus a margin width for the linear kernel.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix with at least two columns, in the same feature space
        the model was trained on.
    y : array-like of shape (n_samples,)
        Labels used to colour the scattered samples.
    model : fitted classifier
        Must provide ``predict``; ``support_vectors_``, ``n_support_`` and
        ``coef_`` are used when present.
    kernel : str
        Kernel name, used for the title and to decide whether to print the
        margin.
    """
    X = np.asarray(X)
    # We'll plot the decision boundary for the first two principal components
    X_plot = X[:, :2]

    x_min, x_max = X_plot[:, 0].min() - 1, X_plot[:, 0].max() + 1
    y_min, y_max = X_plot[:, 1].min() - 1, X_plot[:, 1].max() + 1
    # linspace gives exactly 100 points per axis (float-step arange can be off by one)
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    # predict() requires as many features as the model was fitted with, so
    # embed the 2-D grid in the full feature space instead of passing two
    # columns (which raises a feature-count ValueError).
    grid = np.tile(X.mean(axis=0), (xx.size, 1))
    grid[:, 0] = xx.ravel()
    grid[:, 1] = yy.ravel()

    Z = model.predict(grid)
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
    plt.scatter(X_plot[:, 0], X_plot[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolor='black')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title(f'Decision Boundary with {kernel.capitalize()} Kernel')

    has_support_vectors = hasattr(model, 'support_vectors_')

    # Plot support vectors (projected onto the first two features)
    if has_support_vectors:
        plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=100,
                    linewidth=1, facecolors='none', edgecolors='k')

    plt.show()

    # Print support vectors and margin size
    if has_support_vectors:
        print(f"\nNumber of support vectors: {model.n_support_}")
        print(f"Support vectors:\n{model.support_vectors_}")

        # Calculate margin size for linear kernel
        if kernel == 'linear':
            # coef_[0] is the first weight vector only; for a multi-class SVC
            # that is a single pairwise classifier, not the whole model.
            w = model.coef_[0]
            margin = 2 / np.sqrt(np.sum(w ** 2))
            print(f"\nMargin size: {margin}")
    else:
        print("\nThis model does not have support vectors.")

# Draw the decision regions and print the support-vector summary for every
# tuned model, in the order the kernels were trained
for kernel in best_models:
    model = best_models[kernel]
    plot_decision_boundary(X_train, y_train, model, kernel)

# Visualize some example predictions
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Draw a grid of grayscale images with a caption above each one.

    Parameters
    ----------
    images : sequence of array-like
        Each entry is reshaped to ``(h, w)`` before display, so it must hold
        exactly ``h * w`` values.
    titles : sequence of str
        Caption for the image at the same position.
    h, w : int
        Image height and width in pixels.
    n_row, n_col : int
        Grid layout; at most ``n_row * n_col`` images are drawn.
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Never index past the supplied images/titles when the grid has more
    # cells than there are samples; surplus cells are simply left empty.
    n_shown = min(n_row * n_col, len(images), len(titles))
    for i in range(n_shown):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())

def title(y_pred, y_test, target_names, i):
    """Return the two-line caption for sample ``i``.

    The predicted and true label indices of sample ``i`` are looked up in
    ``target_names`` and only the text after the last space of each name is
    kept (the whole name when it contains no space).
    """
    def _last_word(label_index):
        # rsplit with maxsplit=1 keeps everything after the final space
        return target_names[label_index].rsplit(' ', 1)[-1]

    predicted, actual = _last_word(y_pred[i]), _last_word(y_test[i])
    return f'predicted: {predicted}\ntrue: {actual}'

# Choose the best model (you can change this to any of the trained models)
best_model = best_models['rbf']

# Predict the whole test set once; calling predict inside the comprehension
# would repeat the full prediction for every single title.
y_pred_test = best_model.predict(X_test)
prediction_titles = [title(y_pred_test, y_test, target_names, i)
                     for i in range(y_test.shape[0])]

# X_test holds PCA features, not pixels, so it cannot be reshaped into faces.
# Recover the matching raw images by repeating the split on the pixel matrix
# with the same settings preprocess_and_split_data uses by default: the row
# assignment depends only on the sample count, the labels and the seed.
_, X_test_images, _, y_test_images = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
assert np.array_equal(y_test_images, y_test), \
    "re-split does not match the evaluated test set; check test_size/random_state"

# The image size depends on the `resize` factor used when loading, so read it
# from the dataset instead of hard-coding it (same arguments as the loader's
# defaults; fetch_lfw_people caches its result on disk, so this is cheap).
_, img_h, img_w = fetch_lfw_people(min_faces_per_person=50, resize=0.5).images.shape

plot_gallery(X_test_images, prediction_titles, h=img_h, w=img_w, n_row=3, n_col=4)
plt.show()