In [3]:
# Trying different cross-validation (CV) methods on the Iris data to better understand CV.

from sklearn import datasets
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Cross-validation methods:
# 1. K-Fold: the data is split into k smaller sets (folds). In each round the
#    model is trained on k-1 of the folds, and the remaining fold is used as
#    the validation set to evaluate it.

# The classifier is only instantiated here; the fitting happens inside
# cross_val_score below.
clf = DecisionTreeClassifier(random_state=42)

# Feature matrix and target labels of the Iris dataset.
X, y = datasets.load_iris(return_X_y=True)

# Five folds taken in dataset order. shuffle=False is the default and is
# spelled out only to make the absence of shuffling visible.
k_folds = KFold(n_splits=5, shuffle=False)

# Evaluate the model to see how it performs on each fold (one score per fold).
scores = cross_val_score(estimator=clf, X=X, y=y, cv=k_folds)

print("Cross Validation Scores: ", scores)
print("Average CV score: ", scores.mean())
print("Number of CV scores used in average: ", len(scores))

Cross Validation Scores:  [1.         1.         0.83333333 0.93333333 0.8       ]
Average CV score:  0.9133333333333333
Number of CV scores used in average:  5


In [4]:
# Stratified K-Fold
"""In cases where classes are imbalanced we need a way to account for the imbalance in both the train and validation sets. To do so we can stratify the target classes, meaning that both sets will have an equal proportion of all classes.
"""
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed; it is only instantiated here, and the
# fitting happens inside cross_val_score below.
cls = DecisionTreeClassifier(random_state=42)

# Feature matrix and target labels of the Iris dataset.
X, y = datasets.load_iris(return_X_y=True)

# Five stratified folds built from the labels in y. shuffle=False is the
# default and is spelled out only to make the absence of shuffling visible.
sk_folds = StratifiedKFold(n_splits=5, shuffle=False)

# One score per stratified fold.
scores = cross_val_score(estimator=cls, X=X, y=y, cv=sk_folds)

print("Cross Validation Scores: ", scores)
print("Average CV score: ", scores.mean())
print("Number of CV scores used in average: ", len(scores))

Cross Validation Scores:  [0.96666667 0.96666667 0.9        0.93333333 1.        ]
Average CV score:  0.9533333333333334
Number of CV scores used in average:  5


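While the number of folds is the same, the average CV score increases from 0.913 with basic K-Fold to 0.953 with stratified K-Fold. The Iris samples are ordered by class, so each unshuffled K-Fold validation fold contains only one or two of the three classes, whereas every stratified fold preserves the class proportions of the full dataset.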