# Tuning models

- How do we choose the model hyperparameters? (K for k-Means, KNN, C and gamma for SV*, etc.)
- How can we actually measure variance (e.g. is our model over-fitting?)

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [3]:
iris_bag = datasets.load_iris()
iris = pd.DataFrame(np.c_[iris_bag.data, iris_bag.target], columns=iris_bag.feature_names + ['target'])
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
X = iris_bag.data
y = iris_bag.target

In [6]:
X = StandardScaler().fit_transform(X)

In [7]:
# Naive train/test split
Xtrain, Xtest = X[:100], X[100:]
ytrain, ytest = y[:100], y[100:]

In [11]:
rfc = RandomForestClassifier(n_estimators=10)
rfc.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
rfc.score(Xtest, ytest)

0.0

In [12]:
rfc.score(Xtrain, ytrain)

1.0

Um... that's **really** bad

In [13]:
np.unique(ytrain)

array([0, 1])

In [14]:
np.unique(ytest)

array([2])

There will be ways to account for this....

# Measuring variance

Try splitting test/train in different ways and measure the **average** performance

Variance problems will show up as some of the splits being *really bad*

## K-fold 

Split the dataset into K equal parts

In [15]:
from sklearn.model_selection import KFold

In [16]:
kf = KFold(n_splits=3)
scores = []
for train, test in kf.split(X, y):
    Xtrain, Xtest = X[train], X[test]
    ytrain, ytest = y[train], y[test]
    rfc.fit(Xtrain, ytrain)
    scores.append(rfc.score(Xtest, ytest))
scores

[0.0, 0.0, 0.0]

## Shuffle-split

Shuffle our dataset randomly before a K-fold split

In [17]:
kf = KFold(n_splits=3, shuffle=True)
scores = []
for train, test in kf.split(X, y):
    Xtrain, Xtest = X[train], X[test]
    ytrain, ytest = y[train], y[test]
    rfc.fit(Xtrain, ytrain)
    scores.append(rfc.score(Xtest, ytest))
scores

[0.94, 0.94, 0.92]

# Stratified

Ensure that our test and training splits always have the same proportion of outputs as one another

(split _within_ each set of samples with a particular output)

In [22]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=3)

In [23]:
scores = []
for train, test in kf.split(X, y):
    Xtrain, Xtest = X[train], X[test]
    ytrain, ytest = y[train], y[test]
    rfc.fit(Xtrain, ytrain)
    scores.append(rfc.score(Xtest, ytest))
scores

[0.98, 0.94, 0.94]

# Leave-one-out

Train on all but a single sample and use that sample to test. Repeat for all samples.

In [None]:
from sklearn.model_selection import LeaveOneOut
rfc = RandomForestClassifier(n_estimators=10)

loo = LeaveOneOut()
scores = []
for train, test in loo.split(X, y):
    Xtrain, Xtest = X[train], X[test]
    ytrain, ytest = y[train], y[test]
    rfc.fit(Xtrain, ytrain)
    scores.append(rfc.score(Xtest, ytest))

In [27]:
np.mean(scores), np.std(scores)

(0.9533333333333334, 0.21092389359408498)

# Cross-validation

The above pattern (splitting multiple times and comparing the scores) is known as **cross-validation**, and sklearn provides support for it in the model_selection module:

In [29]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [41]:
cross_val_score(rfc, X, y, cv=KFold(n_splits=10, shuffle=True))

array([0.73333333, 1.        , 1.        , 1.        , 0.93333333,
       1.        , 0.93333333, 0.93333333, 1.        , 0.86666667])

In [42]:
cross_val_score(rfc, X, y, cv=StratifiedKFold(n_splits=10, shuffle=True))

array([0.93333333, 0.93333333, 0.93333333, 1.        , 1.        ,
       0.86666667, 0.86666667, 1.        , 1.        , 1.        ])

# Hyperparameter selection

"How do I choose these parameters?"


In [43]:
bcan = datasets.load_breast_cancer()
X = bcan.data
y = bcan.target

In [44]:
grid = {
    'C': [10**x for x in range(-4, 5)],
    'gamma': [10**x for x in range(-4, 5)]}

In [45]:
grid

{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

Remember we can't use our test data in training, so set it aside:

In [46]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

Exhaustively search all C/gamma combinations in our grid and save the one that performs best on our test set

In [47]:
best_params = {'C': None, 'gamma': None}
best_score = None

for C in grid['C']:
    for gamma in grid['gamma']:
        model = SVC(C=C, gamma=gamma)
        model.fit(Xtrain, ytrain)
        score = model.score(Xtest, ytest)
        if best_score is None or best_score < score:
            best_score = score
            best_params = {'C': C, 'gamma': gamma}
        

In [48]:
best_params

{'C': 10, 'gamma': 0.0001}

In [49]:
best_score

0.958041958041958

# What's wrong here?

(we're double-dipping in our test data)

In [50]:
SVC()

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Train-validate-test split!

In [52]:
Xtrainval, Xtest, ytrainval, ytest = train_test_split(X, y)
Xtrain, Xval, ytrain, yval = train_test_split(Xtrainval, ytrainval)

In [53]:
best_score = None

for C in grid['C']:
    for gamma in grid['gamma']:
        model = SVC(C=C, gamma=gamma)
        model.fit(Xtrain, ytrain)
        score = model.score(Xval, yval)
        if best_score is None or best_score < score:
            best_score = score
            best_params = {'C': C, 'gamma': gamma}
        

In [54]:
best_params

{'C': 100, 'gamma': 0.0001}

In [55]:
best_score

0.9626168224299065

# Re-train with train + validate for final score

Once you have the best parameters, use the combined training/validation samples to fit and test against the test data

In [56]:
SVC(**best_params).fit(Xtrainval, ytrainval).score(Xtest, ytest)

0.9370629370629371

# Train-validate-test with cross-validation

We can use cross-validation for our **validation** phase:

In [57]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

In [58]:
best_score = None
cv = cv=KFold(n_splits=10, shuffle=True)

for C in grid['C']:
    for gamma in grid['gamma']:
        model = SVC(C=C, gamma=gamma)
        scores = cross_val_score(model, Xtrain, ytrain, cv=cv)
        score = scores.mean()
        if best_score is None or best_score < score:
            best_score = score
            best_params = {'C': C, 'gamma': gamma}
        

In [59]:
best_params

{'C': 10, 'gamma': 0.0001}

In [60]:
best_score

0.9388704318936878

In [62]:
SVC(**best_params).fit(Xtrain, ytrain).score(Xtest, ytest)

0.972027972027972

# Grid search

Since this is a common thing to do, sklearn provides support for cross-validated grid search for hyperparameters via the GridSearchCV class:

In [63]:
from sklearn.model_selection import GridSearchCV

In [67]:
model = GridSearchCV(SVC(), grid, cv=cv, scoring='accuracy')
model.fit(Xtrainval, ytrainval)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                               10000],
                         'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                                   10000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [68]:
model.best_estimator_

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [69]:
model.score(Xtest, ytest)

0.9790209790209791

In [70]:
model.best_params_

{'C': 10, 'gamma': 0.0001}

# Lab

Open [Model Evaluation Lab](model-evaluation-lab.ipynb)

