# SVM Classifier with PCA Dimensionality Reduction

## Importing Dependencies

In [None]:
import os
import pickle
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# SADR: PCA functionalities
from sklearn.decomposition import PCA, KernelPCA

# SADR: functionalities for model selection and cross-validation
from sklearn.model_selection import RandomizedSearchCV

# SADR: importing the confusion matrix functionality.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

## Loading Data

In [None]:
# SADR: path to the dataset.
dataset_path = os.path.join("preprocessed_datasets", "dataset_one_hot.pkl")

# SADR: loading the pickled dataset (a dict holding the train/val/test splits).
# NOTE: pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open(dataset_path, "rb") as f:
    dataset_one_hot = pickle.load(f)

# SADR: getting the training, validation, and testing data.
X_train, y_train = dataset_one_hot["X_train"], dataset_one_hot["y_train"]
X_val, y_val = dataset_one_hot["X_val"], dataset_one_hot["y_val"]
X_test, y_test = dataset_one_hot["X_test"], dataset_one_hot["y_test"]

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")

# SADR: rejoining the training and validation data.
# Important to do k-fold cross-validation: the folds are then drawn from
# the merged set instead of relying on a fixed validation split.
# np.concatenate is used rather than np.concat because np.concat only
# exists in NumPy >= 2.0, whereas np.concatenate works on every version.
X_train = np.concatenate((X_train, X_val))
y_train = np.concatenate((y_train, y_val))
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

## Pipeline of kernel PCA (inhomogeneous polynomial kernel) + Linear SVM

First, we carry out a dimensionality reduction with a kernelized PCA using an inhomogeneous polynomial kernel, and then train a linear SVC on the reduced data.

For reference, with `n_iter = 2`, the code below takes about 50s to run on my personal computer. Note that the cell below is configured with `n_iter=500`, so expect a much longer run time.

In [None]:
# SADR: specify the parameter values.
# Each entry is a discrete set of candidate values from which the
# randomized search draws.
param_distr = {
    "pca__n_components": np.arange(10, 410, 10),
    "pca__degree": np.arange(1, 40, 2),
    "svc__C": np.logspace(-1, 3, 100),
}

# SADR: classifier pipeline.
# - KernelPCA keeps its default coef0 (1 in scikit-learn), which is what
#   makes the polynomial kernel inhomogeneous.
# - The SVC kernel is set explicitly to "linear": SVC() defaults to the
#   RBF kernel, which would contradict the "kernel PCA + linear SVM"
#   design of this section.
clf = Pipeline([
    ("pca", KernelPCA(kernel="poly", random_state=42)),
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="linear")),
])

# SADR: model selection using randomized search.
# This samples n_iter random parameter combinations from the candidate
# values above and evaluates each one, by default over k-fold (k=5)
# cross-validation, using the F1 score.
# - n_iter: number of iterations (set it to a small number at the beginning
# to have an idea about the model selection plus training time).
rnd_search = RandomizedSearchCV(clf, param_distr,
    n_iter=500, scoring='f1', random_state=42)

# SADR: fitting the model (and carrying out the parameter search)
# This might be time consuming.
rnd_search.fit(X_train, y_train)

# SADR: reporting the best parameters
rnd_search.best_params_

In [None]:
# SADR: predict the test labels with the best estimator found by the search.
y_pred = rnd_search.predict(X_test)

# SADR: testing scores, reported one per line with four decimals.
for metric_name, metric_fn in (
    ("accuracy", accuracy_score),
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1_score", f1_score),
):
    print(f"{metric_name}: {metric_fn(y_test, y_pred):.4f}")

## Pipeline of kernel PCA (RBF kernel) + Linear SVM

In [None]:
# SADR: specify the parameter values.
# Each entry is a discrete set of candidate values from which the
# randomized search draws.
param_distr = {
    "pca__n_components": np.arange(10, 410, 10),
    "pca__gamma": np.logspace(-3, 3, 100),
    "svc__C": np.logspace(-1, 3, 100),
}

# SADR: classifier pipeline.
# The SVC kernel is set explicitly to "linear": SVC() defaults to the RBF
# kernel, which would contradict the "kernel PCA (RBF) + linear SVM"
# design of this section.
# NOTE(review): unlike the other pipelines in this notebook, there is no
# StandardScaler step between the PCA and the SVC here -- confirm that
# this is intentional.
clf = Pipeline([
    ("pca", KernelPCA(kernel="rbf", random_state=42)),
    ("svc", SVC(kernel="linear")),
])

# SADR: model selection using randomized search.
# This samples n_iter random parameter combinations from the candidate
# values above and evaluates each one, by default over k-fold (k=5)
# cross-validation, using the F1 score.
# - n_iter: number of iterations (set it to a small number at the beginning
# to have an idea about the model selection plus training time).
rnd_search = RandomizedSearchCV(clf, param_distr,
    n_iter=500, scoring='f1', random_state=42)

# SADR: fitting the model (and carrying out the parameter search)
# This might be time consuming.
rnd_search.fit(X_train, y_train)

# SADR: reporting the best parameters
rnd_search.best_params_

### Testing Performance

In [None]:
# SADR: predict the test labels with the best estimator found by the search.
y_pred = rnd_search.predict(X_test)

# SADR: testing scores, reported one per line with four decimals.
for metric_name, metric_fn in (
    ("accuracy", accuracy_score),
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1_score", f1_score),
):
    print(f"{metric_name}: {metric_fn(y_test, y_pred):.4f}")

## Pipeline of linear PCA + kernelized SVM (RBF)

In [None]:
# SADR: candidate values explored by the randomized search.
param_distr = dict(
    pca__n_components=np.arange(10, 410, 10),
    svc__gamma=np.logspace(-3, 3, num=100),
    svc__C=np.logspace(-1, 3, num=100),
)

# SADR: linear PCA -> standardization -> RBF-kernel SVM.
clf = Pipeline(steps=[
    ("pca", PCA(random_state=42)),
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="rbf")),
])

# SADR: randomized model selection.
# n_iter parameter combinations are drawn at random from the candidates
# above and each is scored with F1, by default over k-fold (k=5)
# cross-validation. Start with a small n_iter to estimate how long the
# search plus training takes.
rnd_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_distr,
    n_iter=500,
    scoring="f1",
    random_state=42,
)

# SADR: run the search (this fits the pipeline many times and may be slow).
rnd_search.fit(X_train, y_train)

# SADR: best parameter combination found.
rnd_search.best_params_

### Testing Performance

In [None]:
# SADR: predict the test labels with the best estimator found by the search.
y_pred = rnd_search.predict(X_test)

# SADR: testing scores, reported one per line with four decimals.
for metric_name, metric_fn in (
    ("accuracy", accuracy_score),
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1_score", f1_score),
):
    print(f"{metric_name}: {metric_fn(y_test, y_pred):.4f}")

## Pipeline of linear PCA + kernelized SVM (inhomogeneous polynomial kernel)

In [None]:
# SADR: candidate values explored by the randomized search.
param_distr = dict(
    pca__n_components=np.arange(10, 410, 10),
    svc__degree=np.arange(1, 21),
    svc__C=np.logspace(-1, 3, num=100),
)

# SADR: linear PCA -> standardization -> polynomial-kernel SVM.
# coef0=1 makes the polynomial kernel inhomogeneous.
clf = Pipeline(steps=[
    ("pca", PCA(random_state=42)),
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="poly", coef0=1)),
])

# SADR: randomized model selection.
# n_iter parameter combinations are drawn at random from the candidates
# above and each is scored with F1, by default over k-fold (k=5)
# cross-validation. Start with a small n_iter to estimate how long the
# search plus training takes.
rnd_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_distr,
    n_iter=500,
    scoring="f1",
    random_state=42,
)

# SADR: run the search (this fits the pipeline many times and may be slow).
rnd_search.fit(X_train, y_train)

# SADR: best parameter combination found.
rnd_search.best_params_

### Testing Performance

In [None]:
# SADR: predict the test labels with the best estimator found by the search.
y_pred = rnd_search.predict(X_test)

# SADR: testing scores, reported one per line with four decimals.
for metric_name, metric_fn in (
    ("accuracy", accuracy_score),
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1_score", f1_score),
):
    print(f"{metric_name}: {metric_fn(y_test, y_pred):.4f}")