**Cross-validation** is a resampling procedure used to evaluate machine learning models on a limited data sample.

The procedure has a single parameter called k that refers to the number of groups that a given data sample is to be split into. As such, the procedure is often called k-fold cross-validation.

When a specific value for k is chosen, it may be used in place of k in the reference to the model, such as k=10 becoming 10-fold cross-validation.

<img src='images/K-fold.jpg' width='60%' height='60%'/>

Cross-validation is used to assess the predictive performance of models and to judge how they perform outside the training sample, on a new data set also known as test data.

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

# Load the bundled Iris dataset and show the shapes of the feature matrix
# and the label vector.
iris = datasets.load_iris()
print(f"{iris.data.shape} {iris.target.shape}")


(150, 4) (150,)


In [3]:
# Hold out 40% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)

# Shapes of the training and test partitions.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Fit a linear-kernel SVM on the training partition and score it on the held-out one.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

(90, 4) (90,)
(60, 4) (60,)


0.9666666666666667

In [4]:
from sklearn.model_selection import cross_val_score
# Unfitted linear SVM to be evaluated by cross-validation.
clf = svm.SVC(kernel='linear', C=1)

# cv=5 -> five folds over the full dataset, one score per fold.
scores = cross_val_score(estimator=clf, X=iris.data, y=iris.target, cv=5)
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [5]:
from sklearn.model_selection import KFold
# Linear SVM to be evaluated.
clf = svm.SVC(kernel='linear', C=1)

# Explicit 5-fold splitter; samples are shuffled with a fixed seed before splitting.
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# One accuracy value per fold, then the mean expressed as a percentage.
scores = cross_val_score(clf, iris.data, iris.target, scoring='accuracy', cv=kf)
print(scores)
scores.mean() * 100

[0.96666667 0.96666667 0.96666667 1.         1.        ]


98.00000000000001

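Preprocessing steps such as standardization should be learned from the training folds only and then applied to the held-out fold. A **Pipeline** makes it easier to compose estimators, providing this behavior under cross-validation: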

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing

# New hold-out split: 20% of the data is set aside as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=0,
)

# Five random train/validation splits of the training data, 30% held out each time.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

# Standardize the features, then classify with an SVC; the pipeline is what
# cross_val_score fits on each split.
clf = make_pipeline(
    preprocessing.StandardScaler(),
    svm.SVC(C=1),
)

rs = cross_val_score(clf, X_train, y_train, cv=cv)
print(rs)
rs.mean()

[0.91666667 0.91666667 0.94444444 1.         0.94444444]


0.9444444444444444

In [7]:

# Stand-alone scaler, used below to standardize the features by hand.
std = preprocessing.StandardScaler()

In [8]:
# Fit the scaler on the training features only, then apply that same
# fitted transform to the test features.
train_std = std.fit_transform(X_train)
test_std = std.transform(X_test)

In [9]:
# Train an SVC on the standardized training features; fit() returns the
# estimator itself, so `clf` is the fitted model and is displayed below.
clf = svm.SVC(C=1).fit(train_std, y_train)
clf

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# Score the fitted SVC on the test features that were transformed with the
# scaler fitted on the training features.
clf.score(test_std,y_test)

1.0

In [15]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
# Grid search over the k-NN neighbourhood size (2 through 10), scoring each
# candidate by 5-fold cross-validated accuracy.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={'n_neighbors': list(range(2, 11))},
    scoring='accuracy',
    cv=5,
)

In [16]:
# Run the grid search on the training split.
grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [17]:
# Best score found by the grid search (scoring='accuracy' was requested).
grid.best_score_

0.9583333333333334

In [18]:
# Parameter setting from the grid that achieved the best score.
grid.best_params_

{'n_neighbors': 8}

In [25]:
import pandas as pd
# Tabular view of the grid-search results, one row per candidate parameter setting.
pd.DataFrame(grid.cv_results_)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_neighbors,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.001005,0.001402,0.925,0.972956,2,{'n_neighbors': 2},7,0.92,0.968421,0.84,...,1.0,0.96875,0.916667,0.96875,0.954545,0.969388,0.000633,0.000802,0.053003,0.008265
1,0.0,0.001384,0.925,0.954248,3,{'n_neighbors': 3},7,0.92,0.968421,0.84,...,1.0,0.927083,0.958333,0.958333,0.909091,0.94898,0.0,0.000774,0.053889,0.015389
2,0.00039,0.000399,0.925,0.96052,4,{'n_neighbors': 4},7,0.92,0.978947,0.84,...,1.0,0.9375,0.958333,0.96875,0.909091,0.94898,0.000478,0.000488,0.053889,0.015052
3,0.0002,0.000399,0.933333,0.960498,5,{'n_neighbors': 5},6,0.96,0.968421,0.88,...,0.958333,0.958333,0.958333,0.958333,0.909091,0.94898,0.0004,0.000489,0.033143,0.007316
4,0.000201,0.000399,0.941667,0.960433,6,{'n_neighbors': 6},5,0.96,0.978947,0.88,...,1.0,0.947917,0.916667,0.96875,0.954545,0.959184,0.000402,0.000488,0.04121,0.012173
5,0.000607,0.000607,0.95,0.962474,7,{'n_neighbors': 7},3,0.96,0.957895,0.96,...,1.0,0.947917,0.916667,0.96875,0.909091,0.969388,0.000496,0.000814,0.032722,0.008431
6,0.000399,0.000399,0.958333,0.966685,8,{'n_neighbors': 8},1,0.96,0.968421,0.96,...,1.0,0.947917,0.916667,0.96875,0.954545,0.969388,0.000488,0.000489,0.026424,0.010171
7,0.000198,0.000597,0.958333,0.964557,9,{'n_neighbors': 9},1,0.96,0.968421,0.96,...,1.0,0.958333,0.958333,0.96875,0.909091,0.969388,0.000396,0.000487,0.028159,0.005272
8,0.000399,0.000398,0.95,0.962474,10,{'n_neighbors': 10},3,0.96,0.968421,0.92,...,1.0,0.947917,0.916667,0.96875,0.954545,0.969388,0.000489,0.000488,0.030567,0.008431


In [22]:
# Raw results dictionary from the grid search (the same data shown as a DataFrame elsewhere).
grid.cv_results_



{'mean_fit_time': array([0.00100474, 0.        , 0.00039034, 0.00019999, 0.00020089,
        0.00060678, 0.00039878, 0.00019813, 0.00039911]),
 'mean_score_time': array([0.00140181, 0.00138383, 0.00039873, 0.0003994 , 0.00039859,
        0.0006074 , 0.0003994 , 0.00059705, 0.00039849]),
 'mean_test_score': array([0.925     , 0.925     , 0.925     , 0.93333333, 0.94166667,
        0.95      , 0.95833333, 0.95833333, 0.95      ]),
 'mean_train_score': array([0.9729565 , 0.95424767, 0.9605196 , 0.96049767, 0.96043323,
        0.96247404, 0.96668457, 0.96455738, 0.96247404]),
 'param_n_neighbors': masked_array(data=[2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 2},
  {'n_neighbors': 3},
  {'n_neighbors': 4},
  {'n_neighbors': 5},
  {'n_neighbors': 6},
  {'n_neighbors': 7},
  {'n_neighbors': 8},
  {'n_neighbors': 9},
  {'n_nei

In [None]:
# Scale-then-classify pipeline to tune. It was previously referenced without
# ever being defined, which raised NameError. make_pipeline names each step
# after its lowercased class name, so the SVC step is 'svc' and its
# hyperparameters are addressed as 'svc__<param>' in the grid below.
pipeline = make_pipeline(preprocessing.StandardScaler(), svm.SVC())

# Candidate values for the SVC regularisation strength C and kernel coefficient gamma.
# NOTE(review): the gamma grid jumps from 0.01 to 1 (no 0.1) — confirm this is intentional.
param_grid = {'svc__C':[.001,.01,.1,1,10,100],
             'svc__gamma':[.0001,.001,.01,1,10,100]}

# 3-fold grid search over every (C, gamma) combination; call
# grid_pipeline.fit(...) to actually run it.
grid_pipeline = GridSearchCV(pipeline,param_grid=param_grid,cv=3)