In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action = "ignore")

In [25]:
from sklearn import datasets

In [26]:
df = datasets.load_wine()

In [27]:
df.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [28]:
data = pd.DataFrame(df.data, columns=df.feature_names)
data["class"] = df.target

In [29]:
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [31]:
knn = KNeighborsClassifier(n_neighbors=10)

In [32]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [33]:
X = data.drop(["class"], axis=1)
y = data["class"]

In [34]:
sc.fit_transform(X)

array([[ 1.51861254, -0.5622498 ,  0.23205254, ...,  0.36217728,
         1.84791957,  1.01300893],
       [ 0.24628963, -0.49941338, -0.82799632, ...,  0.40605066,
         1.1134493 ,  0.96524152],
       [ 0.19687903,  0.02123125,  1.10933436, ...,  0.31830389,
         0.78858745,  1.39514818],
       ...,
       [ 0.33275817,  1.74474449, -0.38935541, ..., -1.61212515,
        -1.48544548,  0.28057537],
       [ 0.20923168,  0.22769377,  0.01273209, ..., -1.56825176,
        -1.40069891,  0.29649784],
       [ 1.39508604,  1.58316512,  1.36520822, ..., -1.52437837,
        -1.42894777, -0.59516041]])

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state = 0, test_size = .2)

In [37]:
knn.fit(Xtrain, ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [38]:
y_pred = knn.predict(Xtest)

In [39]:
knn.score(Xtrain, ytrain)

0.7464788732394366

In [40]:
knn.score(Xtest, ytest)

0.7222222222222222

In [42]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 10,
 'p': 2,
 'weights': 'uniform'}

### K folds

##### Data is divided into k splits of train and test

In [43]:
from sklearn.model_selection import KFold

In [44]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [45]:
kf.split(data)

<generator object _BaseKFold.split at 0x0000022C9B1059C8>

In [47]:
for train_idx, test_idx in kf.split(data):
    print(train_idx)
    print(test_idx)
    print("\n")
    
    
#entire data is divided into train and test with k = 5
# ie. data/5 or say 100/5 = 200 test and 800 train with shuffling

[  0   1   2   3   6   9  10  11  12  13  14  15  16  17  19  20  21  22
  23  24  25  27  28  29  30  31  32  34  35  36  38  39  40  41  42  43
  46  47  48  49  50  52  53  57  58  59  62  64  65  67  68  69  70  71
  72  73  74  75  76  77  78  79  81  82  83  84  85  87  88  89  91  92
  93  94  95  96  97  99 100 101 102 103 105 107 108 109 110 113 114 115
 116 117 118 119 120 122 124 125 127 128 130 131 132 133 134 135 136 137
 138 139 140 142 143 144 145 147 148 149 150 152 153 154 155 156 157 158
 159 161 162 163 165 166 167 169 170 171 172 173 174 175 176 177]
[  4   5   7   8  18  26  33  37  44  45  51  54  55  56  60  61  63  66
  80  86  90  98 104 106 111 112 121 123 126 129 141 146 151 160 164 168]


[  0   1   3   4   5   6   7   8   9  10  11  12  13  14  15  17  18  20
  21  23  25  26  28  29  31  32  33  34  35  36  37  38  39  41  42  43
  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  60  61  63
  64  65  66  67  68  69  70  72  73  75  76  77  78  7

### Repeated K fold

RepeatedKFold repeats K-Fold n times. It can be used when one requires to run KFold n times, producing different splits in each repetition.

In [49]:
from sklearn.model_selection import RepeatedKFold
rkf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=0)

In [51]:
for xtrain_idx, xtest_idx in rkf.split(data):
    print(xtrain_idx)
    print(xtest_idx)
    print("\n")
    
    
# Basically it is just repeating the split into multiple time (n_repeat)

[  0   1   2   3   6   9  10  11  12  13  14  15  17  20  21  23  25  27
  28  29  31  32  34  35  36  38  39  41  42  43  46  47  48  49  50  52
  53  57  58  59  62  64  65  67  68  69  70  72  73  75  76  77  78  79
  81  82  83  84  85  87  88  89  91  95  96  97  99 100 102 103 105 107
 109 110 114 115 116 117 118 119 120 122 124 125 127 128 130 132 133 134
 136 137 139 140 142 143 147 148 149 150 152 153 154 155 156 157 159 163
 165 166 167 169 170 171 172 173 176 177]
[  4   5   7   8  16  18  19  22  24  26  30  33  37  40  44  45  51  54
  55  56  60  61  63  66  71  74  80  86  90  92  93  94  98 101 104 106
 108 111 112 113 121 123 126 129 131 135 138 141 144 145 146 151 158 160
 161 162 164 168 174 175]


[  1   4   5   7   8   9  11  16  17  18  19  21  22  24  25  26  28  29
  30  31  32  33  34  35  36  37  38  39  40  41  42  44  45  47  51  53
  54  55  56  57  58  60  61  63  65  66  67  70  71  72  74  77  79  80
  81  82  86  87  88  90  92  93  94  98  99 101 102 1

### Leave one out cross validation (LOOCV)

- LOOCV is a special case of k-fold CV, where k becomes equal to n (number of observations)
- In LOOCV one sample (row) is choosen for test and other all samples(rows) are choosen as train
- 

In [52]:
from sklearn.model_selection import LeaveOneOut

In [55]:
loocv = LeaveOneOut()

In [57]:
loocv.split(data)

<generator object BaseCrossValidator.split at 0x0000022C9BBB0D48>

In [59]:
i = 1

for xtrain_idx, xtest_idx in loocv.split(data):
    print(xtrain_idx)
    print(xtest_idx)
    print("\n")
    
    i = i+1
    if i == 5:
        break
        
# first 5 splits area displayed

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177]
[0]


[  0   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68

### Stratified K fold cross validation

- In this split each fold has equal number of classes.
- data is divided as a good representative of whole

In [77]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [78]:
# It takes x and y in seperate as to understand class in lable column
skf.split(data.drop("class", axis = 1), data["class"])

<generator object _BaseKFold.split at 0x0000022C9BC11DC8>

In [79]:
for train_id, test_id in skf.split(data.drop("class", axis = 1), data["class"]):
    print(train_id)
    print(test_id)
    print("\n")
    
# data is now divided into 5 folds, with proper ratio of classes in each fold

[  0   1   3   5   6   7   8   9  12  13  14  15  16  17  18  19  20  21
  23  24  25  27  29  30  31  32  33  36  37  38  39  41  42  44  45  47
  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65
  68  70  71  72  73  74  75  76  77  79  80  82  83  84  86  87  88  90
  94  95  97  98  99 100 101 102 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 119 120 121 123 124 125 126 127 129 130 131 132 133 136
 138 140 141 142 143 144 145 146 147 148 149 150 151 152 155 157 158 159
 160 161 162 163 164 165 166 168 169 171 172 173 174 176 177]
[  2   4  10  11  22  26  28  34  35  40  43  46  66  67  69  78  81  85
  89  91  92  93  96 103 118 122 128 134 135 137 139 153 154 156 167 170
 175]


[  0   1   2   3   4   5   6   8   9  10  11  12  13  15  16  17  18  19
  20  21  22  23  24  25  26  27  28  31  34  35  36  38  39  40  43  44
  45  46  47  50  51  52  53  54  55  57  58  59  60  62  63  64  65  66
  67  69  70  71  72  73  74  75  76  77  78  79  80  

### Cross validating our model

In [80]:
from sklearn.model_selection import cross_val_score

In [81]:
X = data.drop(["class"], axis=1)
y = data["class"]

kf, rkf, loocv, skf

In [82]:
cross_val_score(knn, X, y, scoring="accuracy", cv = kf )

array([0.72222222, 0.61111111, 0.63888889, 0.82857143, 0.71428571])

In [86]:
cross_val_score(knn, X, y, scoring="accuracy", cv = kf ).mean()

0.7030158730158731

In [83]:
cross_val_score(knn, X, y, scoring="accuracy", cv = loocv)

array([1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0.,
       1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 0.])

In [87]:
cross_val_score(knn, X, y, scoring="accuracy", cv = loocv).mean()

0.6685393258426966

In [84]:
cross_val_score(knn, X, y, scoring="accuracy", cv = rkf)

array([0.7       , 0.6440678 , 0.72881356, 0.7       , 0.6779661 ,
       0.62711864])

In [88]:
cross_val_score(knn, X, y, scoring="accuracy", cv = rkf).mean()

0.6796610169491525

In [85]:
cross_val_score(knn, X, y, scoring="accuracy", cv = skf)

array([0.62162162, 0.72222222, 0.80555556, 0.71428571, 0.58823529])

In [89]:
cross_val_score(knn, X, y, scoring="accuracy", cv = skf).mean()

0.6903840815605522

In [97]:
scoring = { "acc": "accuracy", 
           "prec": "precision_macro",
           "f1": "f1_micro"
          }

### We can check training and testing accuracies

In [95]:
from sklearn.model_selection import cross_validate

In [None]:
kf, rkf, loocv, skf

In [98]:
cross_validate(knn, X, y, cv =kf ,scoring=scoring )

{'fit_time': array([0.00199437, 0.00199604, 0.00299096, 0.00099516, 0.00199366]),
 'score_time': array([0.0149641 , 0.01096654, 0.01296854, 0.00997233, 0.01799273]),
 'test_acc': array([0.72222222, 0.61111111, 0.63888889, 0.82857143, 0.71428571]),
 'test_prec': array([0.64920635, 0.60393773, 0.61282051, 0.8       , 0.71111111]),
 'test_f1': array([0.72222222, 0.61111111, 0.63888889, 0.82857143, 0.71428571])}

In [99]:
cross_validate(knn, X, y, cv =rkf ,scoring=scoring )

{'fit_time': array([0.00299025, 0.00198841, 0.0019958 , 0.00199366, 0.00296807,
        0.00199485]),
 'score_time': array([0.01696277, 0.01496053, 0.01196814, 0.01400113, 0.01195288,
        0.01101375]),
 'test_acc': array([0.7       , 0.6440678 , 0.72881356, 0.7       , 0.6779661 ,
        0.62711864]),
 'test_prec': array([0.66666667, 0.60692641, 0.71398046, 0.72900246, 0.67355072,
        0.51538462]),
 'test_f1': array([0.7       , 0.6440678 , 0.72881356, 0.7       , 0.6779661 ,
        0.62711864])}

In [100]:
cross_validate(knn, X, y, cv =skf ,scoring=scoring )

{'fit_time': array([0.00299191, 0.00199389, 0.00199604, 0.00299311, 0.00399065]),
 'score_time': array([0.01196814, 0.01097131, 0.00997233, 0.01396251, 0.01001644]),
 'test_acc': array([0.62162162, 0.72222222, 0.80555556, 0.71428571, 0.58823529]),
 'test_prec': array([0.5952381 , 0.70185185, 0.8026418 , 0.68809524, 0.59444444]),
 'test_f1': array([0.62162162, 0.72222222, 0.80555556, 0.71428571, 0.58823529])}

In [101]:
cross_validate(knn, X, y, cv =loocv ,scoring=scoring )

{'fit_time': array([0.00299382, 0.00199771, 0.00299335, 0.00099707, 0.00298309,
        0.00199437, 0.00199509, 0.00199413, 0.00199723, 0.00199485,
        0.00099754, 0.0009973 , 0.0009973 , 0.00099707, 0.00199485,
        0.00099754, 0.00199604, 0.0009973 , 0.00100183, 0.00199556,
        0.00099802, 0.00199485, 0.00199938, 0.00199461, 0.00099707,
        0.00199437, 0.00199723, 0.00095057, 0.00199461, 0.00198865,
        0.00199509, 0.00099754, 0.00099778, 0.00199246, 0.00199461,
        0.00199389, 0.00099635, 0.00099778, 0.00099778, 0.00199342,
        0.00299191, 0.00199461, 0.00099874, 0.00199461, 0.00199556,
        0.00298929, 0.00199437, 0.00199389, 0.00199413, 0.0009973 ,
        0.00099754, 0.00099778, 0.00099206, 0.00199437, 0.00099826,
        0.00099707, 0.00199604, 0.0009973 , 0.00099707, 0.0019958 ,
        0.00199437, 0.00099778, 0.00199342, 0.00099778, 0.00099754,
        0.00199366, 0.0009973 , 0.0009973 , 0.00199509, 0.00099993,
        0.00099707, 0.00198817, 0.00

## GridSearchCV for hyperparameter tuning

### All types of cross validation can directly be used inside Gridsearch

In [91]:
from sklearn.model_selection import GridSearchCV

In [93]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 10,
 'p': 2,
 'weights': 'uniform'}

In [157]:
param_grid = {#'algorithm': 'auto',
             #'leaf_size': [30],
             #'metric': 'minkowski',
             #'metric_params': None,
             #'n_jobs': None,
             'n_neighbors': [10,12],
             'p': [1,2],
             }

In [158]:
grid = GridSearchCV(knn, param_grid=param_grid, cv = skf)

In [159]:
grid.fit(Xtrain, ytrain)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=10, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [10, 12], 'p': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [160]:
grid.best_params_

{'n_neighbors': 10, 'p': 1}

In [161]:
grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=1,
                     weights='uniform')

In [162]:
knnn = grid.best_estimator_

In [163]:
knnn.score(Xtest, ytest)

0.75

In [164]:
knnn.score(Xtrain, ytrain)

0.7816901408450704