In [4]:
# Importing the necessary modules

import numpy as np
from sklearn import linear_model, svm, ensemble, datasets, model_selection

In [14]:
# Load the bundled Iris dataset and list the attributes the returned object exposes

dataset = datasets.load_iris()
dataset_attributes = dir(dataset)
print(dataset_attributes)

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']


In [39]:
# Splitting the data into 'k' folds (demonstrated on small toy lists)

kf = model_selection.KFold(n_splits = 5)

# split() returns a generator, so printing it directly shows the generator
# object rather than the folds themselves.
print(kf.split([1, 2, 3, 4, 5, 6]))

# Iterating yields one (train, test) tuple per fold; the arrays hold positions
# in the input sequence, not the input values.
for fold in kf.split([1, 3, 5, 7, 9, 11]):
    print(fold)

# The same tuples can be unpacked directly in the loop header.
for train_index, test_index in kf.split([2, 4, 6, 8, 10, 12]):
    print(train_index, test_index)

<generator object _BaseKFold.split at 0x00000179A2FDDD90>
(array([2, 3, 4, 5]), array([0, 1]))
(array([0, 1, 3, 4, 5]), array([2]))
(array([0, 1, 2, 4, 5]), array([3]))
(array([0, 1, 2, 3, 5]), array([4]))
(array([0, 1, 2, 3, 4]), array([5]))
[2 3 4 5] [0 1]
[0 1 3 4 5] [2]
[0 1 2 4 5] [3]
[0 1 2 3 5] [4]
[0 1 2 3 4] [5]


In [45]:
# Stratified 'k' fold -> each fold keeps approximately the same class proportions
# as the full dataset (the labels are passed to split() for that purpose)

kf = model_selection.StratifiedKFold(n_splits = 5)

# Nothing is done with the arrays inside the loop, so after it finishes
# X_train / X_test / y_train / y_test hold the split of the last fold only.
for train_index, test_index in kf.split(dataset.data, dataset.target):
    X_train, y_train = dataset.data[train_index], dataset.target[train_index]
    X_test, y_test = dataset.data[test_index], dataset.target[test_index]

In [53]:
# Evaluating the model performance

# Candidate classifiers to compare.
lg = linear_model.LogisticRegression(max_iter = 10000)
s = svm.SVC(kernel = "linear")
# The random forest is seeded so its per-fold scores are reproducible across
# re-runs; without a seed the forest is rebuilt differently on every execution.
rf = ensemble.RandomForestClassifier(n_estimators = 50, random_state = 42)

# One independent list of per-fold accuracies for each model.
lg_scores, s_scores, rf_scores = [], [], []

def train_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its score on the test split.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``; it is fitted
            in place.
        X_train: Training samples passed to ``fit``.
        X_test: Held-out samples passed to ``score``.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

kf = model_selection.StratifiedKFold(n_splits = 5)

for train_index, test_index in kf.split(dataset.data, dataset.target):
    X_train, y_train = dataset.data[train_index], dataset.target[train_index]
    X_test, y_test = dataset.data[test_index], dataset.target[test_index]
    # Every model is fitted and scored on the same fold, in the order
    # logistic regression -> SVM -> random forest.
    for model, fold_scores in ((lg, lg_scores), (s, s_scores), (rf, rf_scores)):
        fold_scores.append(train_model(model, X_train, X_test, y_train, y_test))

# Per-fold scores, one line per model.
for fold_scores in (lg_scores, s_scores, rf_scores):
    print(fold_scores)

[0.9666666666666667, 1.0, 0.9333333333333333, 0.9666666666666667, 1.0]
[0.9666666666666667, 1.0, 0.9666666666666667, 0.9666666666666667, 1.0]
[0.9666666666666667, 0.9666666666666667, 0.9333333333333333, 0.9333333333333333, 1.0]


In [54]:
# Mean score across folds for each model (logistic regression, SVM, random forest)

for fold_scores in (lg_scores, s_scores, rf_scores):
    print(sum(fold_scores) / len(fold_scores))

0.9733333333333334
0.9800000000000001
0.96


In [63]:
# Using cross_val_score to find the per-fold accuracy directly.
# cv = 5 is passed explicitly so the fold count does not depend on the
# installed scikit-learn default. max_iter matches the manual k-fold
# evaluation above, and the random forest is seeded so repeated runs print
# the same scores.

estimators = (
    linear_model.LogisticRegression(max_iter = 10000),
    svm.SVC(kernel = "linear"),
    ensemble.RandomForestClassifier(n_estimators = 50, random_state = 42),
)

for estimator in estimators:
    print(model_selection.cross_val_score(estimator, dataset.data, dataset.target, cv = 5))

[0.96666667 1.         0.93333333 0.96666667 1.        ]
[0.96666667 1.         0.96666667 0.96666667 1.        ]
[0.96666667 0.96666667 0.9        0.96666667 1.        ]


In [60]:
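# Conclusion: on the Iris dataset, the linear SVM achieved the highest mean cross-validated accuracy (about 0.98).
# The margin over the other models is small, so this does not show that SVMs are better classifiers in general.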