In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import scipy

In [2]:
# Breast cancer data, with diagnosis as target variable.
# The file has no header row, so pandas labels the columns by position.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

# Column 1 holds the diagnosis label; print the class balance before encoding.
diagnosis = df[1]
print(diagnosis.value_counts())

# Every column from 2 onward is used as a feature.
X = df.loc[:, 2:].to_numpy()

# Convert the string labels into integer class codes for the classifier.
le = LabelEncoder()
y = le.fit_transform(diagnosis.to_numpy())

# Hold out 25% of the rows for testing; stratify so both splits keep the
# class proportions printed above, and seed the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

1
B    357
M    212
Name: count, dtype: int64


In [3]:
# Successive halving lives behind scikit-learn's experimental flag: this
# enabling import must run before HalvingRandomSearchCV can be imported, and
# the API may change between releases.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV

# A grid search brute-forces every combination on a fixed grid. Here we instead
# draw random candidates from the distributions below and repeatedly discard
# the worst performers while giving the survivors more training data.
pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=42))

# Log-uniform over [1e-4, 1e3]: every order of magnitude is equally likely.
param_range = scipy.stats.loguniform(10. ** -4, 10. ** 3)
param_grid = [
    # Linear kernel: only the regularization strength C is tuned.
    {'svc__C': param_range,
     'svc__kernel': ['linear']},
    # RBF (Gaussian) kernel: tune the regularization strength C together with
    # the kernel width parameter gamma.
    {'svc__C': param_range,
     'svc__gamma': param_range,
     'svc__kernel': ['rbf']},
]

hs = HalvingRandomSearchCV(
    estimator=pipe_svc,
    param_distributions=param_grid,
    # The training-set size is the resource: fewer surviving candidates get
    # progressively more samples to train on.
    resource='n_samples',
    # Keep roughly the best 1/1.5 ~ 67% of the candidates each round.
    factor=1.5,
    # Sample as many initial candidates as possible such that the final round
    # still uses as much of the resource as possible.
    n_candidates='exhaust',
    # Refit the winning configuration on the whole training set afterwards.
    refit=True,
    # Seed the candidate sampling and data subsampling; without this the
    # selected parameters and score change on every run.
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the selected candidate.
print(f"Random Search best accuracy score: {hs.best_score_:.3f}.")
print(f"Random Search best parameters: {hs.best_params_}")

Random Search best accuracy score: 0.968.
Random Search best parameters: {'svc__C': np.float64(0.4793116053425186), 'svc__kernel': 'linear'}


In [4]:
# Retrieve the winning pipeline from the search. No extra call to
# classifier.fit(X_train, y_train) is needed: the search was built with
# refit=True, so the best estimator has already been refitted.
classifier = hs.best_estimator_

# Score the refitted model on the held-out test split.
test_accuracy = classifier.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.3f}')

Test accuracy: 0.965


Other hyperparameter optimization/tuning packages exist as well, such as hyperopt, which has scikit-learn support through [hyperopt-sklearn](https://github.com/hyperopt/hyperopt-sklearn). It includes several search methods, such as TPE (Tree-structured Parzen Estimator), a Bayesian optimizer that updates its priors with respect to past evaluations so that the hyperparameter space is sampled in a more informed way.