# Grid Search
Tuning the hyperparameters of a classifier.  
`GridSearchCV` performs an exhaustive search over a specified parameter grid, evaluating each combination using cross-validation. 

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
import pandas as pd

In [3]:
# Fetch the Digits dataset and pull out the feature matrix (X) and the
# target labels (y).
data = load_digits()
X, y = data.data, data.target

In [4]:
# Hold out half of the samples for testing; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)
(X_train.shape, X_test.shape)

((898, 64), (899, 64))

## Basic *k*-NN Classifier
A basic *out-of-the-box* scaler and classifier combination. 

In [5]:
# Learn the scaling statistics from the training split only, then apply
# that same fitted scaler to the test split.
bScal = StandardScaler()
X_trainS = bScal.fit_transform(X_train)
X_testS = bScal.transform(X_test)

In [6]:
# Baseline: k-NN built with no arguments (library defaults), trained on
# the scaled training data and scored on the scaled test data.
knn = KNeighborsClassifier().fit(X_trainS, y_train)
y_pred = knn.predict(X_testS)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))
confusion_matrix(y_true=y_test, y_pred=y_pred)

Accuracy: 0.96


array([[82,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 89,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  3, 80,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  1, 84,  0,  0,  0,  3,  5,  0],
       [ 0,  0,  0,  0, 91,  0,  0,  2,  0,  0],
       [ 0,  0,  0,  0,  0, 94,  1,  0,  0,  4],
       [ 0,  0,  0,  0,  0,  0, 98,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0, 85,  1,  0],
       [ 0,  3,  0,  2,  0,  0,  0,  0, 78,  0],
       [ 0,  0,  0,  1,  1,  2,  1,  2,  2, 83]])

In [7]:
# List every hyperparameter of the baseline classifier with its current
# value; knn was constructed without arguments, so these are the defaults.
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

## Grid Search  
First define the combinations of parameters to be considered. 

In [8]:
# Candidate values for each k-NN hyperparameter. The search evaluates
# every combination: 4 x 2 x 2 = 16 candidates.
param_grid = {
    'n_neighbors': [1, 3, 5, 10],
    'metric': ['manhattan', 'euclidean'],
    'weights': ['uniform', 'distance'],
}

Run the grid search.

In [9]:
# Exhaustive search over param_grid using 10-fold cross-validation on the
# scaled training data; n_jobs=-1 runs the fits in parallel.
knn_gs = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=1,
).fit(X_trainS, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 145 out of 160 | elapsed:    2.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    2.0s finished


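Because `refit=True` by default, the grid search retrains the best parameter combination on the whole training set once the search finishes, so the grid search object will now work as a classifier with the 'optimal' parameters. 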

In [10]:
# Score the tuned model on the held-out test split by calling predict
# directly on the fitted search object.
y_pred_gs = knn_gs.predict(X_testS)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_true=y_test, y_pred=y_pred_gs)))
confusion_matrix(y_true=y_test, y_pred=y_pred_gs)

Accuracy: 0.97


array([[82,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 89,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 81,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0, 87,  0,  0,  1,  2,  2,  1],
       [ 0,  2,  0,  0, 91,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0, 95,  1,  0,  0,  2],
       [ 1,  0,  0,  0,  0,  0, 97,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 86,  0,  1],
       [ 1,  3,  1,  1,  0,  0,  0,  0, 77,  0],
       [ 0,  0,  0,  2,  1,  1,  0,  1,  1, 86]])

In [20]:
# Check what the classifier is: the repr shows the search configuration
# (base estimator, number of folds, parameter grid), not the parameters
# that were selected.
knn_gs

GridSearchCV(cv=10, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['manhattan', 'euclidean'],
                         'n_neighbors': [1, 3, 5, 10],
                         'weights': ['uniform', 'distance']},
             verbose=1)

In [12]:
# Parameter combination from param_grid that the search selected as best.
knn_gs.best_params_

{'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'uniform'}

We can 'manually' provide the best parameters to the *k*-NN object. 

In [18]:
# Rebuild the classifier by hand with the values reported by
# knn_gs.best_params_, so its test-set results can be compared with those
# of the grid-search object.
knn2 = KNeighborsClassifier(metric='manhattan',
                            n_neighbors=1, weights='uniform')
knn2.fit(X_trainS, y_train)
# Store these predictions under their own name: reusing y_pred_gs would
# silently overwrite the grid-search predictions with those of another model.
y_pred_knn2 = knn2.predict(X_testS)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test, y_pred_knn2)))
confusion_matrix(y_test, y_pred_knn2)


Accuracy: 0.97


array([[82,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 89,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 81,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0, 87,  0,  0,  1,  2,  2,  1],
       [ 0,  2,  0,  0, 91,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0, 95,  1,  0,  0,  2],
       [ 1,  0,  0,  0,  0,  0, 97,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 86,  0,  1],
       [ 1,  3,  1,  1,  0,  0,  0,  0, 77,  0],
       [ 0,  0,  0,  2,  1,  1,  0,  1,  1, 86]])

We can unpack the best parameters directly.

In [19]:
# Build the classifier by unpacking the best_params_ dict straight into
# the constructor as keyword arguments, instead of retyping the values.
knn3 = KNeighborsClassifier(**knn_gs.best_params_)
knn3.fit(X_trainS, y_train)
# Store these predictions under their own name: reusing y_pred_gs would
# silently overwrite the grid-search predictions with those of another model.
y_pred_knn3 = knn3.predict(X_testS)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test, y_pred_knn3)))
confusion_matrix(y_test, y_pred_knn3)


Accuracy: 0.97


array([[82,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 89,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 81,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0, 87,  0,  0,  1,  2,  2,  1],
       [ 0,  2,  0,  0, 91,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0, 95,  1,  0,  0,  2],
       [ 1,  0,  0,  0,  0,  0, 97,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 86,  0,  1],
       [ 1,  3,  1,  1,  0,  0,  0,  0, 77,  0],
       [ 0,  0,  0,  2,  1,  1,  0,  1,  1, 86]])

In [16]:
# The repr lists only the parameters that differ from their defaults, so
# weights='uniform' is omitted: it is the default value, as the
# knn.get_params() output for the default classifier shows.
knn3

KNeighborsClassifier(metric='manhattan', n_neighbors=1)