Working through chapter 5 exercise 8

Train a SVM on the MNIST dataset.
* Use one-versus-all
* Tune hyperparameters using small validation sets

Using smaller training sets throughout to speed training for learning purposes. See results in chapter notebook for improvement on including the whole set.

### Setup

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import reciprocal, uniform
import seaborn as sns
from sklearn import datasets
from sklearn.datasets import fetch_mldata
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.utils import shuffle

In [2]:
pd.set_option('max_rows', 7)
pd.set_option('max_columns', 50)

%matplotlib inline
plt.style.use('fivethirtyeight')

### Data

In [3]:
%%capture --no-stdout

# Import MNIST data
mnist = fetch_mldata('MNIST original', )
X, y = mnist['data'].astype(float), mnist['target'].astype(float)

# test / train split specified by MNIST
split = 60000
X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]

# Shuffle training order
X_train, y_train = shuffle(X_train, y_train)

# scale X
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### ML

#### Linear SVC

In [4]:
lin_clf = LinearSVC(penalty='l2', C=1, loss="hinge", max_iter=10000, 
                    random_state=42)

skip = 100   # subset for faster (though weaker) gridsearch
cross_val_score(lin_clf, X_train_scaled[::skip], y_train[::skip], cv=5,
                scoring="accuracy")

array([0.76      , 0.68595041, 0.7394958 , 0.78991597, 0.80172414])

In [6]:
skip=50
param_grid = {'C': [0.001, 0.01, 1, 10]}

grid_search = GridSearchCV(lin_clf, param_grid, cv=5, iid=False, 
                           return_train_score=True)

grid_search.fit(X_train_scaled[::skip], y_train[::skip])

print('best parameters {0}'.format(grid_search.best_params_))

cvres = pd.DataFrame(grid_search.cv_results_)
cvres.rename({'param_C':'C', 
              'mean_test_score':'score'},
             axis=1, inplace=True)
cvres[['C', 'score']].sort_values(by='score', ascending=False)

best parameters {'C': 0.01}


Unnamed: 0,C,score
1,0.01,0.825917
0,0.001,0.808609
2,1.0,0.784234
3,10.0,0.784234


In [7]:
skip=50
param_grid = {'C': [0.005, 0.01, 0.015]}

grid_search = GridSearchCV(lin_clf, param_grid, cv=5, iid=False, 
                           return_train_score=True)

grid_search.fit(X_train_scaled[::skip], y_train[::skip])

print('best parameters {0}'.format(grid_search.best_params_))

cvres = pd.DataFrame(grid_search.cv_results_)
cvres.rename({'param_C':'C', 
              'mean_test_score':'score'},
             axis=1, inplace=True)
cvres[['C', 'score']].sort_values(by='score', ascending=False)

best parameters {'C': 0.01}


Unnamed: 0,C,score
1,0.01,0.825917
2,0.015,0.824267
0,0.005,0.816039


In [8]:
lin_clf = LinearSVC(penalty='l2', C=0.01, loss="hinge", max_iter=100000, 
                    random_state=42)

skip = 10   # subset for faster (though weaker) gridsearch
cross_val_score(lin_clf, X_train_scaled[::skip], y_train[::skip], cv=5,
                scoring="accuracy")

array([0.8513289 , 0.86522463, 0.87      , 0.85475793, 0.88294314])

#### SVC - RBF kernel 

In [14]:
skip = 50   # subset for faster (though weaker) gridsearch

svm_clf = SVC(C=1, kernel='rbf', decision_function_shape='ovr', gamma='scale')
svm_clf.fit(X_train_scaled[::skip], y_train[::skip])

cross_val_score(svm_clf, X_train_scaled[::skip], y_train[::skip], cv=5,
                scoring="accuracy")

array([0.86938776, 0.86831276, 0.87866109, 0.86075949, 0.8940678 ])

In [15]:
skip=50
param_grid = {'C': [0.001, 0.01, 1, 10]}

grid_search = GridSearchCV(svm_clf, param_grid, cv=5, iid=False, 
                           return_train_score=True)

grid_search.fit(X_train_scaled[::skip], y_train[::skip])

print('best parameters {0}'.format(grid_search.best_params_))

cvres = pd.DataFrame(grid_search.cv_results_)
cvres.rename({'param_C':'C', 
              'mean_test_score':'score'},
             axis=1, inplace=True)
cvres[['C', 'score']].sort_values(by='score', ascending=False)

best parameters {'C': 10}


Unnamed: 0,C,score
3,10.0,0.885065
2,1.0,0.874238
0,0.001,0.13001
1,0.01,0.13001


In [17]:
skip=50
param_grid = {'C': [5, 20, 100]}

grid_search = GridSearchCV(svm_clf, param_grid, cv=5, iid=False, 
                           return_train_score=True)

grid_search.fit(X_train_scaled[::skip], y_train[::skip])

print('best parameters {0}'.format(grid_search.best_params_))

cvres = pd.DataFrame(grid_search.cv_results_)
cvres.rename({'param_C':'C', 
              'mean_test_score':'score'},
             axis=1, inplace=True)
cvres[['C', 'score']].sort_values(by='score', ascending=False)

best parameters {'C': 5}


Unnamed: 0,C,score
0,5,0.886725
1,20,0.885065
2,100,0.885065


In [21]:
skip=50

param_dists = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(5, 20)}

rnd_search = RandomizedSearchCV(svm_clf, param_dists, n_iter=10, verbose=2, 
                                cv=5, return_train_score=True)

rnd_search.fit(X_train_scaled[::skip], y_train[::skip])


print('best parameters {0}'.format(rnd_search.best_params_))

cvres = pd.DataFrame(rnd_search.cv_results_)
cvres.rename({'param_C':'C', 
              'param_gamma':'gamma',
              'mean_test_score':'score'},
             axis=1, inplace=True)
cvres[['C', 'gamma', 'score']].sort_values(by='score', ascending=False)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] C=12.449176241426883, gamma=0.010767555829544997 ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . C=12.449176241426883, gamma=0.010767555829544997, total=   2.0s
[CV] C=12.449176241426883, gamma=0.010767555829544997 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV] . C=12.449176241426883, gamma=0.010767555829544997, total=   2.3s
[CV] C=12.449176241426883, gamma=0.010767555829544997 ................
[CV] . C=12.449176241426883, gamma=0.010767555829544997, total=   2.4s
[CV] C=12.449176241426883, gamma=0.010767555829544997 ................
[CV] . C=12.449176241426883, gamma=0.010767555829544997, total=   2.3s
[CV] C=12.449176241426883, gamma=0.010767555829544997 ................
[CV] . C=12.449176241426883, gamma=0.010767555829544997, total=   2.3s
[CV] C=17.46521074182516, gamma=0.004501323556083868 .................
[CV] .. C=17.46521074182516, gamma=0.004501323556083868, total=   2.1s
[CV] C=17.46521074182516, gamma=0.004501323556083868 .................
[CV] .. C=17.46521074182516, gamma=0.004501323556083868, total=   2.4s
[CV] C=17.46521074182516, gamma=0.004501323556083868 .................
[CV] .. C=17.46521074182516, gamma=0.004501323556083868, total=   2.1s
[CV] C=17.46521074182516, gamma=0.004501323556083868 .................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.8min finished


best parameters {'C': 24.738486327933032, 'gamma': 0.0013883773082811717}


Unnamed: 0,C,gamma,score
6,24.7385,0.00138838,0.885000
8,18.2631,0.00122208,0.884167
1,17.4652,0.00450132,0.803333
...,...,...,...
2,9.17644,0.0930879,0.130000
3,24.8852,0.0888569,0.130000
5,12.2074,0.0852465,0.130000


In [24]:
skip=20
rnd_search.best_estimator_.fit(X_train_scaled[::skip], y_train[::skip])

SVC(C=24.738486327933032, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0013883773082811717,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [26]:
y_pred = rnd_search.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.9277166666666666

In [27]:
y_pred = rnd_search.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.9297