#### K-Fold Cross Validation

<img src='./images/K_fold1.png' alt="K-fold cross-validation diagram" style="height: 450px; width: 900px; margin-left:250px">

In [1]:
import pandas as pd
from sklearn.datasets import load_digits

# Load the digits dataset once; `digits` itself is reused by later cells.
digits = load_digits()

# Feature matrix and label vector used by the hold-out and cross-validation cells.
X, Y = digits.data, digits.target

In [2]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split. The seed is fixed so the same rows land in the test set
# on every run; without it each kernel restart yields a different split and
# therefore different scores.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [3]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline: fit on the training split (fit returns the
# estimator itself), then display accuracy on the held-out split.
lr = LogisticRegression(max_iter=3000).fit(X_train, Y_train)
lr.score(X_test, Y_test)

0.9583333333333334

In [4]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, scored on the same hold-out split.
svc = SVC().fit(X_train, Y_train)
svc.score(X_test, Y_test)

0.9888888888888889

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The forest draws random
# bootstrap samples and feature subsets, so it is seeded here to make repeated
# fits on the same training data produce the same score.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, Y_train)
rfc.score(X_test, Y_test)

0.9722222222222222

In [6]:
from sklearn.model_selection import KFold

# Five consecutive folds in dataset order (no shuffling, which is also the default).
kf = KFold(n_splits=5, shuffle=False)
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [7]:
# Toy demonstration: show which positions of a 10-item sequence each fold
# assigns to training and which to testing.
toy_samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for train_index, test_index in kf.split(toy_samples):
    print(train_index, test_index)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter. NOTE(review): no cell visible here uses `skf`;
# the manual loop below splits with `kf` instead — confirm which was intended.
skf = StratifiedKFold(n_splits=5)

<img src='./images/K_fold2.png' alt="K-fold cross-validation illustration" style="height: 450px; width: 600px; margin-left:400px">

In [8]:
def get_score(model, X_train, X_test, Y_train, Y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place, so the caller's object is trained after this call.
    X_train, Y_train
        Features and labels passed to ``model.fit``.
    X_test, Y_test
        Features and labels passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(X_test, Y_test)`` returns.
    """
    model.fit(X_train, Y_train)
    held_out_score = model.score(X_test, Y_test)
    return held_out_score

In [20]:
from numpy import mean

# Per-fold scores for each model, filled in fold order.
scores_l, scores_svm, scores_rf = [], [], []

# Score all three models on the same folds from `kf`, building a fresh
# estimator for every fold so nothing carries over between folds.
for train_index, test_index in kf.split(digits.data):
    # Fold-local names on purpose: reusing X_train/X_test/Y_train/Y_test here
    # would overwrite the hold-out split created earlier with train_test_split,
    # so re-running the earlier cells would silently score on the last fold.
    X_fold_train, X_fold_test = digits.data[train_index], digits.data[test_index]
    Y_fold_train, Y_fold_test = digits.target[train_index], digits.target[test_index]

    scores_l.append(
        get_score(LogisticRegression(max_iter=5000),
                  X_fold_train, X_fold_test, Y_fold_train, Y_fold_test)
    )
    scores_svm.append(
        get_score(SVC(), X_fold_train, X_fold_test, Y_fold_train, Y_fold_test)
    )
    # Seeded so the forest's per-fold scores are repeatable between runs.
    scores_rf.append(
        get_score(RandomForestClassifier(random_state=42),
                  X_fold_train, X_fold_test, Y_fold_train, Y_fold_test)
    )

print('Logistic Regression:', scores_l)
print('Support Vector Machine:', scores_svm)
print('Random Forest:', scores_rf)

Logistic Regression: [0.9305555555555556, 0.875, 0.9415041782729805, 0.9387186629526463, 0.8997214484679665]
Support Vector Machine: [0.9694444444444444, 0.9472222222222222, 0.9832869080779945, 0.9888579387186629, 0.9415041782729805]
Random Forest: [0.9444444444444444, 0.9194444444444444, 0.9554317548746518, 0.9610027855153204, 0.9192200557103064]


In [23]:
from sklearn.model_selection import cross_val_score

# Same comparison via cross_val_score, which handles splitting, fitting and
# scoring itself; one score per fold is printed for each model (cv=6 folds).
print("Logistic Regression:", cross_val_score(LogisticRegression(solver='lbfgs', max_iter=5000), X, Y, cv=6))
print("Support Vector Machine:", cross_val_score(SVC(), X, Y, cv=6))
# The forest is seeded so its fold scores do not change from run to run.
print("Random Forest:", cross_val_score(RandomForestClassifier(random_state=42), X, Y, cv=6))

Logistic Regression: [0.91666667 0.92666667 0.92333333 0.96655518 0.92976589 0.90301003]
Support Vector Machine: [0.96333333 0.96666667 0.98666667 0.97993311 0.99331104 0.92976589]
Random Forest: [0.93       0.97       0.94666667 0.96989967 0.97324415 0.93979933]
