<a href="https://colab.research.google.com/github/saitejameka/Machine-Learning/blob/master/cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [14]:
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import scale
data = load_breast_cancer()
X,y = data.data, data.target
X = scale(X)

from sklearn.model_selection import train_test_split

X_trainval, X_test, y_trainval, y_test = train_test_split(X,y)

X_train, X_val, y_train, y_val  = train_test_split(X_trainval, y_trainval)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

print(knn.score(X_val, y_val))
print(knn.score(X_test, y_test))


val = []
test = []

for i in range(1000):
  rng = np.random.RandomState(i)
  noise = rng.normal(scale=.1, size=X_train.shape)
  knn = KNeighborsClassifier(n_neighbors=5)
  knn.fit(X_train + noise, y_train)
  val.append(knn.score(X_val, y_val))
  test.append(knn.score(X_test, y_test))

print(np.max(np.max(val)))
print(test[np.argmax(val)])

0.9532710280373832
0.9440559440559441
0.9626168224299065
0.9370629370629371


In [8]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X,y)

X_train, X_val, y_train, y_val  = train_test_split(X_trainval, y_trainval)


val_scores = []

neighbors = np.arange(1, 15, 2)

for i in neighbors:
  knn = KNeighborsClassifier(n_neighbors=i)
  knn.fit(X_train, y_train)
  val_scores.append(knn.score(X_val, y_val))
print(np.max(val_scores))
best_n_neighbors = neighbors[np.argmax(val_scores)]#the neighbor that gave the best validation score
print(neighbors[np.argmax(val_scores)])
print(neighbors)
print(val_scores)
print(best_n_neighbors)
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn.fit(X_trainval, y_trainval)

print(knn.score(X_test, y_test))

#3 fold split has high variance - so using cross validation

#cv - more stable, less dependant on the split, more data to built the model
#we also get to know about uncertainity , how good this kind of model is on this kind of data set.

#cv - use training data to find the parameters, and cv on training dataset, compare different models 

#then use test data set for evaluation.



0.9532710280373832
5
[ 1  3  5  7  9 11 13]
[0.9252336448598131, 0.9439252336448598, 0.9532710280373832, 0.9532710280373832, 0.9532710280373832, 0.9532710280373832, 0.9345794392523364]
5
0.965034965034965


In [13]:
from sklearn.model_selection import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X,y)

cross_val_scores = []

for i in neighbors:
  knn = KNeighborsClassifier(n_neighbors=i)
  scores = cross_val_score(knn, X_train, y_train, cv=10)
  cross_val_scores.append(np.mean(scores))

print(np.max(cross_val_scores))
#agrmax tells which n setting has best validation score
best_n_neighbors = neighbors[np.argmax(cross_val_scores)]
print(best_n_neighbors)

knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))


0.9740310077519381
7
0.965034965034965


In [0]:
from sklearn.model_selection import GridSearchCV
#gridsearch because it search a grid of all possible combinations

In [18]:
#stratify -keeps same distribution for both train and test data sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

param_grid = {'n_neighbors': np.arange(1, 15, 2)}

grid = GridSearchCV(KNeighborsClassifier(), 
                    param_grid=param_grid, 
                    cv=10, 
                    return_train_score=True)

grid.fit(X_train, y_train)

print(grid.best_score_)

print(grid.best_params_)

print(grid.score(X_test, y_test))

print(grid.cv_results_)

0.9576411960132891
{'n_neighbors': 9}
0.9790209790209791
{'mean_fit_time': array([0.00125027, 0.00120716, 0.0011692 , 0.00117397, 0.00115371,
       0.00119102, 0.00124652]), 'std_fit_time': array([2.08536105e-04, 4.95390895e-05, 1.63347179e-05, 1.64104047e-05,
       1.66360824e-05, 3.32443484e-05, 9.95601873e-05]), 'mean_score_time': array([0.00338717, 0.00332406, 0.00350366, 0.00331638, 0.00331724,
       0.00351727, 0.00357499]), 'std_score_time': array([0.00061141, 0.00026136, 0.00086195, 0.00011446, 0.00013551,
       0.00018027, 0.00014893]), 'param_n_neighbors': masked_array(data=[1, 3, 5, 7, 9, 11, 13],
             mask=[False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 1}, {'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}, {'n_neighbors': 9}, {'n_neighbors': 11}, {'n_neighbors': 13}], 'split0_test_score': array([0.93023256, 0.97674419, 0.97674419, 0.97674419, 1.        ,
       1.        , 

In [25]:
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, RepeatedStratifiedKFold

kfold = KFold(n_splits=5)
skfold = StratifiedKFold(n_splits=5, shuffle=True)
ss = ShuffleSplit(n_splits=5, train_size=4, test_size=3)
rs = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

print(cross_val_score(KNeighborsClassifier(),X, y, cv=kfold))

print(cross_val_score(KNeighborsClassifier(), X, y, cv=skfold))

print(cross_val_score(KNeighborsClassifier(n_neighbors=4), X, y, cv=ss))
 
print(cross_val_score(KNeighborsClassifier(), X, y, cv=rs))
import pandas as pd
from sklearn.model_selection import cross_validate
res = cross_validate(KNeighborsClassifier(), 
                     X, y, 
                     return_train_score=True,
                     cv=5,
                     scoring=['accuracy', 'roc_auc']
                     )
res_df = pd.DataFrame(res)


[0.92982456 0.95614035 0.96491228 0.98245614 0.96460177]
[0.96491228 0.96491228 0.97368421 0.98245614 0.95575221]
[1.         0.66666667 0.66666667 0.         1.        ]
[0.96491228 0.96491228 0.93859649 0.98245614 0.96460177 0.95614035
 0.97368421 0.95614035 0.96491228 0.98230088 0.98245614 0.94736842
 0.96491228 0.96491228 0.98230088 0.96491228 0.94736842 0.97368421
 0.95614035 0.98230088 0.95614035 0.99122807 0.97368421 0.95614035
 0.94690265 0.97368421 0.97368421 0.97368421 0.95614035 0.95575221
 0.98245614 0.97368421 0.95614035 0.95614035 0.97345133 0.98245614
 0.98245614 0.92982456 0.96491228 0.97345133 0.95614035 0.96491228
 0.98245614 0.95614035 0.96460177 0.98245614 0.94736842 0.98245614
 0.95614035 0.96460177]
