In [1]:
import numpy as np 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits

# Load scikit-learn's bundled digits dataset. The cells below read its
# `.data` attribute as the feature matrix and `.target` as the labels.
digits = load_digits()

In [2]:
from sklearn.model_selection import train_test_split as tts
# Hold out 30% of the samples for testing. Fixing random_state makes the split,
# and therefore every score computed from it, reproducible after
# Restart Kernel -> Run All; without it each run draws a different split.
X_train, X_test, y_train, y_test = tts(digits.data, digits.target, test_size=0.3, random_state=42)

In [3]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its score on the test split.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train: Training features, passed to ``model.fit``.
        X_test: Test features, passed to ``model.score``.
        y_train: Training labels, passed to ``model.fit``.
        y_test: Test labels, passed to ``model.score``.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)  # the return value of fit() is not used
    test_score = model.score(X_test, y_test)
    return test_score

In [4]:
# Random forest scored on the hold-out split. random_state pins the forest's
# internal randomness so re-running this cell on the same split gives the same score.
get_score(RandomForestClassifier(random_state=42), X_train, X_test, y_train, y_test)

0.9722222222222222

In [5]:
# Support-vector classifier with default settings, scored on the hold-out split.
get_score(
    model=SVC(),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

0.9851851851851852

In [6]:
# Logistic regression scored on the hold-out split; max_iter is raised to 10000
# to give the lbfgs solver a generous iteration budget.
get_score(
    model=LogisticRegression(
        solver='lbfgs',
        class_weight='balanced',
        max_iter=10000,
    ),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

0.9518518518518518

In [7]:
from sklearn.model_selection import KFold
# Splitter configured for 10 folds; it is reused by the cross-validation cells below.
kf = KFold(n_splits = 10)
# Bare expression so the notebook echoes the splitter's settings.
kf

KFold(n_splits=10, random_state=None, shuffle=False)

In [8]:
# Toy illustration on a 10-element list: each iteration yields a pair of index
# arrays (train indices, test indices) -- positions, not the list elements themselves.
for train_i, test_i in kf.split([1,2,3,4,5,6,7,8,9,10]):
    print(train_i, test_i)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


In [9]:
# Per-fold scores for each model under plain 10-fold cross-validation.
score_l = []  # LogisticRegression
score_s = []  # SVC
score_d = []  # RandomForestClassifier
# kf.split yields one (train indices, test indices) pair per fold. Unlike
# train_test_split, which returns a single split, the loop visits every one of
# the n_splits folds configured on `kf`.
for train_i, test_i in kf.split(digits.data):

    # Select this fold's rows with the index arrays.
    # NOTE: this rebinds the notebook-level X_train / X_test / y_train / y_test.
    X_train, X_test, y_train, y_test = digits.data[train_i], digits.data[test_i], digits.target[train_i], digits.target[test_i]

    score_l.append(get_score(LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000), X_train, X_test, y_train, y_test))

    score_s.append(get_score(SVC(), X_train, X_test, y_train, y_test))

    # random_state pins the forest's internal randomness so score_d is the same on every run.
    score_d.append(get_score(RandomForestClassifier(random_state=42), X_train, X_test, y_train, y_test))

In [10]:
# Per-fold LogisticRegression scores collected by the KFold loop.
print(score_l)

[0.9055555555555556, 0.9611111111111111, 0.8777777777777778, 0.9611111111111111, 0.9444444444444444, 0.9611111111111111, 0.95, 0.9273743016759777, 0.8938547486033519, 0.9385474860335196]


In [11]:
# Per-fold SVC scores collected by the KFold loop.
print(score_s)

[0.9444444444444444, 1.0, 0.9333333333333333, 0.9833333333333333, 0.9833333333333333, 0.9888888888888889, 0.9888888888888889, 0.994413407821229, 0.9664804469273743, 0.9497206703910615]


In [12]:
# Per-fold RandomForestClassifier scores collected by the KFold loop.
print(score_d)

[0.9055555555555556, 0.9777777777777777, 0.9277777777777778, 0.9666666666666667, 0.9611111111111111, 0.9833333333333333, 0.9722222222222222, 0.9608938547486033, 0.9553072625698324, 0.9385474860335196]


In [13]:
from sklearn.model_selection import StratifiedKFold # unlike KFold, its split() is also given the labels so the folds can account for the different classes
# Splitter configured for 10 folds, mirroring the KFold setup for comparison.
skf = StratifiedKFold(n_splits = 10)
# Bare expression so the notebook echoes the splitter's settings.
skf

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

In [14]:
# Per-fold scores for each model under stratified 10-fold cross-validation.
# These lists replace the ones filled by the plain KFold loop.
score_l = []  # LogisticRegression
score_s = []  # SVC (linear kernel)
score_d = []  # RandomForestClassifier
# The stratified splitter needs the labels as well as the features.
for train_i, test_i in skf.split(digits.data, digits.target):

    # Select this fold's rows with the index arrays.
    # NOTE: this rebinds the notebook-level X_train / X_test / y_train / y_test.
    X_train, X_test, y_train, y_test = digits.data[train_i], digits.data[test_i], digits.target[train_i], digits.target[test_i]

    score_l.append(get_score(LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000), X_train, X_test, y_train, y_test))

    # NOTE(review): kernel='linear' here, whereas the KFold loop used the default
    # SVC(); score_s is therefore not a like-for-like comparison between the two
    # splitters -- confirm this is intended.
    score_s.append(get_score(SVC(kernel = 'linear'), X_train, X_test, y_train, y_test))

    # random_state pins the forest's internal randomness so score_d is the same on every run.
    score_d.append(get_score(RandomForestClassifier(random_state=42), X_train, X_test, y_train, y_test))

In [15]:
# Per-fold LogisticRegression scores collected by the StratifiedKFold loop.
print(score_l)

[0.9055555555555556, 0.9555555555555556, 0.8833333333333333, 0.9333333333333333, 0.9388888888888889, 0.95, 0.95, 0.9273743016759777, 0.88268156424581, 0.9441340782122905]


In [16]:
# Per-fold linear-kernel SVC scores collected by the StratifiedKFold loop.
print(score_s)

[0.9388888888888889, 0.9944444444444445, 0.9333333333333333, 0.9444444444444444, 0.9611111111111111, 0.9888888888888889, 0.9666666666666667, 0.9776536312849162, 0.9329608938547486, 0.9664804469273743]


In [17]:
# Per-fold RandomForestClassifier scores collected by the StratifiedKFold loop.
print(score_d)

[0.9222222222222223, 0.9777777777777777, 0.9277777777777778, 0.9166666666666666, 0.9611111111111111, 0.9722222222222222, 0.9722222222222222, 0.9664804469273743, 0.9273743016759777, 0.9385474860335196]


In [18]:
# cross_val_score wraps the split / fit / score loop written out by hand above
# into a single call; with an integer cv and a classifier, scikit-learn is
# documented to use stratified folds.
# Note: this call scores the default SVC(), while the StratifiedKFold loop used
# SVC(kernel='linear'), so these numbers are not expected to match score_s.
from sklearn.model_selection import cross_val_score
cross_val_score(estimator=SVC(), X=digits.data, y=digits.target, cv=10)

array([0.94444444, 0.98888889, 0.92777778, 0.96666667, 0.98333333,
       0.98888889, 0.98888889, 0.99441341, 0.96089385, 0.95530726])