In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Breast cancer (Wisconsin, wdbc.data) dataset from the UCI repository. The file
# is read without a header row, so the columns carry integer labels.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
df = pd.read_csv(DATA_URL, header=None)

# Column 1 holds the diagnosis, which is the target variable; show the class balance.
diagnosis = df[1]
print(diagnosis.value_counts())

# Every column from position 2 onward is used as a feature (column 0 is left out).
X = df.iloc[:, 2:].values

# Map the string class labels to integer codes.
le = LabelEncoder()
y = le.fit_transform(diagnosis.values)

# Hold out 25% of the samples for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

1
B    357
M    212
Name: count, dtype: int64


In [3]:
from sklearn.model_selection import GridSearchCV

# A grid search is a brute-force search: every hyperparameter combination on the
# specified grid is evaluated (here with 10-fold cross-validation, scored by accuracy).

# Standardize the features, then fit a support vector classifier.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=42))

# Eight logarithmically spaced candidate values: 1e-4, 1e-3, ..., 1e3.
param_range = np.logspace(-4, 3, 8)

# Linear kernel: scan over the regularization parameter C only.
linear_grid = {
    'svc__C': param_range,
    'svc__kernel': ['linear'],
}
# RBF (Gaussian) kernel: scan over C as well as over gamma, which sets the
# width of the Gaussian.
rbf_grid = {
    'svc__C': param_range,
    'svc__gamma': param_range,
    'svc__kernel': ['rbf'],
}
param_grid = [linear_grid, rbf_grid]

gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,  # after the search, refit the best configuration on the whole training set
    n_jobs=-1,
)
gs.fit(X_train, y_train)

print(f"Grid Search best accuracy score: {gs.best_score_:.3f}.")
print(f"Grid Search best parameters: {gs.best_params_}")

Grid Search best accuracy score: 0.977.
Grid Search best parameters: {'svc__C': np.float64(100.0), 'svc__gamma': np.float64(0.01), 'svc__kernel': 'rbf'}


In [4]:
# Retrieve the best estimator found by the grid search. Because the search was
# run with refit=True, it has already been refit on the whole training set, so
# no further call to fit is needed here.
classifier = gs.best_estimator_

# Score the selected model on the held-out test set.
test_accuracy = classifier.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.3f}')

Test accuracy: 0.958


It is worth noting that grid search is sensitive to the choice of grid points: a finer grid will usually yield better results, at the cost of a longer search time. An alternative is a randomized search (for example, scikit-learn's `RandomizedSearchCV`), which samples hyperparameter combinations instead of trying every one. This is typically less time-intensive, at the cost of results that may vary more from run to run.