In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Breast cancer data, read without a header row so the columns are labelled by
# integer position; column 1 is the diagnosis and serves as the target variable.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)
print(df[1].value_counts())

# Everything from column 2 onward is used as the feature matrix.
X = df.loc[:, 2:].values

# Turn the string diagnosis labels into integer class codes.
le = LabelEncoder()
y = le.fit_transform(df[1].values)

# Hold out 25% of the rows for testing, keeping the class balance (stratify)
# and fixing the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

1
B    357
M    212
Name: count, dtype: int64


In [3]:
# Randomized search can sample each hyperparameter from a discrete list of values,
# but it can also sample from a continuous distribution.
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to make
# `scipy.stats` available on every SciPy version unless some other package has
# already imported it. `import scipy.stats` still binds the name `scipy`.
import scipy.stats

# Log-uniform distribution on [1e-4, 1e3]: the logarithm of the value is uniformly
# distributed, so every order of magnitude in the range is equally likely.
param_range = scipy.stats.loguniform(10. ** -4, 10. ** 3)

# Seed NumPy's global RNG so the sample printed below is repeatable,
# then draw 10 values from the distribution with rvs.
np.random.seed(42)
print(param_range.rvs(10))

[4.18582273e-02 4.51856095e+02 1.33032451e+01 1.55099140e+00
 1.23631883e-03 1.23583828e-03 2.55026485e-04 1.15673272e+02
 1.61363417e+00 9.04707196e+00]


In [4]:
from sklearn.model_selection import RandomizedSearchCV
# A grid search brute-forces every combination on a fixed grid of hyperparameter values.
# A randomized search instead draws a fixed number of candidate settings (n_iter)
# from the lists/distributions given below and cross-validates only those.

# Standardize the features, then fit a support vector classifier.
pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=42))

# NOTE: the name says "grid", but these are sampling specifications, not a grid.
param_grid = [
    # For the linear kernel in SVC, sample the regularization strength C
    {'svc__C': param_range,
     'svc__kernel': ['linear']},
    # For the RBF (Gaussian) kernel, sample the regularization strength C
    # as well as the Gaussian width gamma
    {'svc__C': param_range,
     'svc__gamma': param_range,
     'svc__kernel': ['rbf']},
]

rs = RandomizedSearchCV(estimator=pipe_svc,
                        param_distributions=param_grid,  # Samples distributions now, instead of a grid
                        scoring='accuracy',
                        cv=10,
                        n_iter=20,  # 20 sampled candidates
                        refit=True,  # Refits the best candidate on the whole training set automatically
                        # Give the search its own seed. Without it the candidates are drawn from
                        # NumPy's global RNG, so the result depends on what was executed before
                        # this cell and changes every time the cell is re-run.
                        random_state=42,
                        n_jobs=-1)
rs.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best sampled candidate.
print(f"Random Search best accuracy score: {rs.best_score_:.3f}.")
print(f"Random Search best parameters: {rs.best_params_}")

Random Search best accuracy score: 0.969.
Random Search best parameters: {'svc__C': np.float64(655.0713895392056), 'svc__gamma': np.float64(0.004259899791418376), 'svc__kernel': 'rbf'}


In [5]:
# Retrieve the winning pipeline from the search. Because the search was run with
# refit=True, it has already been refit on the training data, so no further
# call to fit is needed before scoring it on the held-out test set.
classifier = rs.best_estimator_
test_accuracy = classifier.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.3f}')

Test accuracy: 0.951


Now, a completely random search might not yield the best possible results in a fixed amount of time. A slightly more resource-efficient method is to throw out the parts of configuration space that are simply not yielding good results. Successive halving does exactly this: a large list of candidate hyperparameter configurations is first trained on only a few data points (so that each fit finishes quickly), and the bottom 50% of candidates are thrown out. After throwing out the bottom half, more resources (training examples) can be spent on searching among the top 50%. This is repeated recursively until only the best configuration survives. Successive halving is available in scikit-learn as an experimental feature.