# SVM Classifier with $k$-mer Encoding

## Importing Dependencies

In [4]:
import os
import pickle
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

# SADR: importing the confusion matrix functionality.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

## Loading Dataset

In [5]:
# SADR: location of the pickled k-mer encoded dataset, relative to the notebook.
dataset_path = os.path.join("preprocessed_datasets", "dataset_k_mer.pkl")

# SADR: deserializing the dataset dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only open files produced by
# this project's own preprocessing step.
with open(dataset_path, "rb") as dataset_file:
    dataset_k_mer = pickle.load(dataset_file)

# SADR: training split.
X_train = dataset_k_mer["X_train"]
y_train = dataset_k_mer["y_train"]

# SADR: validation split.
X_val = dataset_k_mer["X_val"]
y_val = dataset_k_mer["y_val"]

# SADR: testing split.
X_test = dataset_k_mer["X_test"]
y_test = dataset_k_mer["y_test"]

# SADR: reporting the feature-matrix shapes as a quick sanity check.
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")

X_train: (2560, 459)
X_test: (800, 459)
X_val: (640, 459)


## Polynomial Kernel

### Model Selection

In [None]:
# SADR: inhomogeneous polynomial kernel bias parameter (passed to SVC as `coef0`).
coef0 = 1

# SADR: candidate degrees of the polynomial kernel (2 through 14).
degree = np.arange(2, 15)

# SADR: candidate cost coefficients (50 log-spaced values from 1e-2 to 1e1).
C_vals = np.logspace(-2, 1, 50)

# SADR: arrays to store the validation F1 and precision scores.
# Rows are indexed by degree, columns by C.
f1_scores = np.zeros(shape=(degree.shape[0], C_vals.shape[0]))
precision_scores = np.zeros(shape=(degree.shape[0], C_vals.shape[0]))

# SADR: grid search for the best degree and C values.
for _n_d, _d in enumerate(degree):
    for _n_c, _c in enumerate(C_vals):
        # SADR: the scaler lives inside the pipeline, so it is fitted on the training
        # data only and then applied unchanged to the validation data.
        poly_kernel_svm_clf = Pipeline([
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel="poly", degree=_d, C=_c, coef0=coef0))
        ])

        # SADR: training the model.
        poly_kernel_svm_clf.fit(X_train, y_train)

        # SADR: predicting on the validation set once and reusing the predictions
        # for both metrics instead of running inference twice per grid point.
        y_val_pred = poly_kernel_svm_clf.predict(X_val)

        # SADR: evaluating the performance on the validation set.
        f1_score_val = f1_score(y_val, y_val_pred)
        f1_scores[_n_d, _n_c] = f1_score_val
        precision_score_val = precision_score(y_val, y_val_pred)
        precision_scores[_n_d, _n_c] = precision_score_val
        print(f"degree: {_d}, C: {_c}, precision: {precision_score_val}, f1: {f1_score_val}")

# SADR: determining the best degree and C values.
# np.argmax works on the flattened array and returns the first maximum, so ties are
# resolved in favor of the lowest degree and then the lowest C.
# SADR: precision
(degree_opt_idx_pr, C_opt_idx_pr) = np.unravel_index(np.argmax(precision_scores), precision_scores.shape)
degree_opt_pr, C_opt_pr = degree[degree_opt_idx_pr], C_vals[C_opt_idx_pr]
print(f"degree_opt_pr: {degree_opt_pr}, C_opt_pr: {C_opt_pr}")

# SADR: f1
(degree_opt_idx_f1, C_opt_idx_f1) = np.unravel_index(np.argmax(f1_scores), f1_scores.shape)
degree_opt_f1, C_opt_f1 = degree[degree_opt_idx_f1], C_vals[C_opt_idx_f1]
# SADR: the label previously read "degree_opt_pr", mislabeling the F1 optimum.
print(f"degree_opt_f1: {degree_opt_f1}, C_opt_f1: {C_opt_f1}")

### Training

In [7]:
# SADR: optimal hyperparameter values, hard-coded so this cell does not require
# re-running the grid search.
# Precision and F1 scores yield the same optimal hyperparameters for this case.
coef0 = 1
degree_opt, C_opt = 3, 7.543120063354615

# SADR: merging the training and validation splits for the final fit.
X_train_all = np.concatenate((X_train, X_val), axis=0)
y_train_all = np.concatenate((y_train, y_val), axis=0)

# SADR: training the model on the entire training set.
poly_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", degree=degree_opt, C=C_opt, coef0=coef0))
])
poly_kernel_svm_clf.fit(X_train_all, y_train_all)
y_pred = poly_kernel_svm_clf.predict(X_test)

# SADR: computing the performance metrics on the test set.
print(f"accuracy: {accuracy_score(y_test, y_pred)}")
print(f"precision: {precision_score(y_test, y_pred)}")
print(f"recall: {recall_score(y_test, y_pred)}")
print(f"f1_score: {f1_score(y_test, y_pred)}")

# SADR: printing the confusion matrix.
cm = confusion_matrix(y_test, y_pred)
print(f"confusion_matrix: {cm}")

# SADR: saving the confusion matrix.
cm_path = os.path.join("results", "k_mer", "svm_poly_kernel_cm.npy")
# SADR: np.save does not create missing parent directories, so make sure the output
# directory exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(cm_path), exist_ok=True)
np.save(cm_path, cm)

accuracy: 0.9825
precision: 0.9761904761904762
recall: 0.9903381642512077
f1_score: 0.9832134292565947
confusion_matrix: [[376  10]
 [  4 410]]


## Gaussian RBF Kernel

### Model Selection

In [None]:
# SADR: candidate cost parameters (10 log-spaced values from 1e-2 to 1e1).
C_vals = np.logspace(-2, 1, 10)

# SADR: candidate kernel scaling parameters (20 log-spaced values from 1e-1 to 1e1).
gamma_vals = np.logspace(-1, 1, 20)

# SADR: arrays to store the validation F1 and precision scores.
# Rows are indexed by gamma, columns by C.
f1_scores = np.zeros(shape=(gamma_vals.shape[0], C_vals.shape[0]))
precision_scores = np.zeros(shape=(gamma_vals.shape[0], C_vals.shape[0]))

# SADR: grid search for the best gamma and C values.
for _n_gamma, _gamma in enumerate(gamma_vals):
    for _n_c, _c in enumerate(C_vals):
        # SADR: the scaler lives inside the pipeline, so it is fitted on the training
        # data only and then applied unchanged to the validation data.
        gaussian_rbf_svm_clf = Pipeline([
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel="rbf", gamma=_gamma, C=_c))
        ])

        # SADR: training the model.
        gaussian_rbf_svm_clf.fit(X_train, y_train)

        # SADR: predicting on the validation set once and reusing the predictions
        # for both metrics instead of running inference twice per grid point.
        y_val_pred = gaussian_rbf_svm_clf.predict(X_val)

        # SADR: evaluating the performance on the validation set.
        f1_score_val = f1_score(y_val, y_val_pred)
        f1_scores[_n_gamma, _n_c] = f1_score_val

        precision_score_val = precision_score(y_val, y_val_pred)
        precision_scores[_n_gamma, _n_c] = precision_score_val
        print(f"gamma: {_gamma}, C: {_c}, precision: {precision_score_val}, f1: {f1_score_val}")

# SADR: determining the best gamma and C values.
# np.argmax works on the flattened array and returns the first maximum, so ties are
# resolved in favor of the lowest gamma and then the lowest C.
# SADR: precision
(gamma_opt_idx_pr, C_opt_idx_pr) = np.unravel_index(np.argmax(precision_scores), precision_scores.shape)
gamma_opt_pr, C_opt_pr = gamma_vals[gamma_opt_idx_pr], C_vals[C_opt_idx_pr]
print(f"gamma_opt_pr: {gamma_opt_pr}, C_opt_pr: {C_opt_pr}")

# SADR: f1
(gamma_opt_idx_f1, C_opt_idx_f1) = np.unravel_index(np.argmax(f1_scores), f1_scores.shape)
gamma_opt_f1, C_opt_f1 = gamma_vals[gamma_opt_idx_f1], C_vals[C_opt_idx_f1]
print(f"gamma_opt_f1: {gamma_opt_f1}, C_opt_f1: {C_opt_f1}")

### Training

In [8]:
# SADR: optimal hyperparameter values, hard-coded so this cell does not require
# re-running the grid search.
# Precision-optimal alternative (not used):
# gamma_opt, C_opt = 0.1, 0.01
# F1-optimal values (used):
# NOTE(review): gamma_opt equals the smallest value of the gamma grid searched in the
# model-selection cell (np.logspace(-1, 1, 20)), so the true optimum may lie below the
# searched range; consider extending the grid towards smaller gamma.
gamma_opt, C_opt = 0.1, 2.1544

# SADR: merging the training and validation splits for the final fit.
X_train_all = np.concatenate((X_train, X_val), axis=0)
y_train_all = np.concatenate((y_train, y_val), axis=0)

# SADR: training the model on the entire training set.
gaussian_rbf_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", gamma=gamma_opt, C=C_opt))
])
gaussian_rbf_svm_clf.fit(X_train_all, y_train_all)
y_pred = gaussian_rbf_svm_clf.predict(X_test)

# SADR: computing the performance metrics on the test set.
print(f"accuracy: {accuracy_score(y_test, y_pred)}")
print(f"precision: {precision_score(y_test, y_pred)}")
print(f"recall: {recall_score(y_test, y_pred)}")
print(f"f1_score: {f1_score(y_test, y_pred)}")

# SADR: printing the confusion matrix.
cm = confusion_matrix(y_test, y_pred)
print(f"confusion_matrix: {cm}")

# SADR: saving the confusion matrix.
cm_path = os.path.join("results", "k_mer", "svm_rbf_kernel_cm.npy")
# SADR: np.save does not create missing parent directories, so make sure the output
# directory exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(cm_path), exist_ok=True)
np.save(cm_path, cm)

accuracy: 0.925
precision: 0.988950276243094
recall: 0.8647342995169082
f1_score: 0.9226804123711341
confusion_matrix: [[382   4]
 [ 56 358]]
