Comparing the performance of the Radial Basis Function (RBF) and polynomial kernels in a kernelized Support Vector Machine on the Breast Cancer dataset, using recall as the evaluation metric. Hyperparameters were carefully tuned to optimize model performance.

In [87]:
# Load the relevant libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC

In [88]:
# Load the breast cancer dataset
# The returned object is kept as `data`; the following cell reads its
# `.data` and `.target` attributes to obtain the features and labels.
data = load_breast_cancer()

In [89]:
# Pull the feature matrix (X) and the class labels (y) out of the loaded dataset
X, y = data.data, data.target

In [90]:
# A little EDA
# Display the dimensions of the feature matrix as (rows, columns).
X.shape

(569, 30)

In [91]:
# Checking the value counts for each class
# np.unique with return_counts=True yields the distinct labels in `y`
# together with the number of times each one occurs.
values, counts = np.unique(y, return_counts=True)

# Pair every label with its count. The labels live in `values`; the previous
# version zipped over `arr`, a name that is never defined in this notebook, so
# the cell raised NameError on a fresh kernel (Restart & Run All).
# .tolist() converts NumPy scalars to plain Python ints so the printed dict
# reads {0: ..., 1: ...} regardless of the NumPy version's scalar repr.
value_counts = dict(zip(values.tolist(), counts.tolist()))
print(value_counts)

{0: 212, 1: 357}


In [92]:
# Hold out part of the data for testing; random_state is fixed at 0 so that
# re-running the notebook uses the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

##### Cross-validation with recall as the evaluation metric

In [93]:
# Cross-validate the RBF-kernel SVC on the training split, scoring each fold by recall
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import cross_val_score

# Scorer that reports recall with class 1 as the positive label.
scoring = make_scorer(recall_score, pos_label=1)

# RBF model under evaluation and its per-fold recall (cv=5 -> five folds).
clf0 = SVC(C=5, gamma=0.01, kernel='rbf')
cv_scores = cross_val_score(
    clf0,
    X_train,
    y_train,
    scoring=scoring,
    cv=5,
)

print("Cross validation scores: {}".format(cv_scores))
print("Mean score: {:.2f}".format(np.mean(cv_scores)))

Cross validation scores: [1.         1.         1.         1.         0.98113208]
Mean score: 1.00


In [94]:
# Cross-validate the degree-3 polynomial-kernel SVC on the training split, scoring each fold by recall

# Scorer that reports recall with class 1 as the positive label.
scoring = make_scorer(recall_score, pos_label=1)

# Polynomial model under evaluation and its per-fold recall (cv=5 -> five folds).
clfp10 = SVC(C=10, degree=3, kernel='poly')
cv_scores = cross_val_score(
    clfp10,
    X_train,
    y_train,
    scoring=scoring,
    cv=5,
)

print("Cross validation scores: {}".format(cv_scores))
print("Mean score: {:.2f}".format(np.mean(cv_scores)))

Cross validation scores: [1.         1.         0.98113208 0.96226415 0.98113208]
Mean score: 0.98


##### Model fitting and evaluation

In [95]:
# Train the RBF-kernel SVC on the training split
clf1_rbf = SVC(C=5, gamma=0.01, kernel='rbf')
clf1_rbf.fit(X_train, y_train);  # trailing ';' keeps the cell from echoing the estimator

In [96]:
# Evaluate the fitted RBF model: recall on the training split and on the held-out test split

# Training split
y_pred2 = clf1_rbf.predict(X_train)
recall_rbf1 = recall_score(y_train, y_pred2)

# Test split
y_pred = clf1_rbf.predict(X_test)
recall_rbf = recall_score(y_test, y_pred)

print("Recall score:, Training set {:.2f}".format(recall_rbf1))
print("Recall score:, Test set {:.2f}".format(recall_rbf))

Recall score:, Training set 1.00
Recall score:, Test set 0.98


In [97]:
# Train the degree-3 polynomial-kernel SVC on the training split
clf_poly1 = SVC(C=10, degree=3, kernel='poly')
clf_poly1.fit(X_train, y_train);  # trailing ';' keeps the cell from echoing the estimator

In [98]:
# Evaluate the fitted polynomial model: recall on the training split and on the held-out test split

# Training split
y_pred0 = clf_poly1.predict(X_train)
recall_poly0 = recall_score(y_train, y_pred0)

# Test split
y_pred1 = clf_poly1.predict(X_test)
recall_poly = recall_score(y_test, y_pred1)

print("Recall score , Training set: {:.2f}".format(recall_poly0))
print("Recall score , Test set: {:.2f}".format(recall_poly))

Recall score , Training set: 0.98
Recall score , Test set: 0.98


On this particular dataset, with the best hyperparameters chosen for each kernel, the RBF and polynomial kernels achieve similar performance: both reach a recall of 0.98 on the test set.