In [None]:
# code for loading the format for the notebook
import os

# path : store the current path to convert back to it later
# (the following cell calls os.chdir(path) to restore it)
path = os.getcwd()
# move into the sibling `notebook_format` directory before importing;
# presumably this is what makes the local `formats` module importable
# (and lets it find its own resources) -- TODO confirm
os.chdir(os.path.join('..', 'notebook_format'))

# project-local helper; called here with plot styling switched off
from formats import load_style
load_style(plot_style = False)

In [None]:
# go back to the working directory that was saved in `path`
# before the style-loading cell changed it
os.chdir(path)

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

import numpy as np
import pandas as pd

# record the environment the notebook was run in: author name, date, time,
# python version and the versions of the packages listed after -p
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,scipy,joblib

#### K-Fold Cross Validation

In [1]:
class KFolds:
    """
    K-Folds cross-validation

    Provides train/test indices to split data in train/test sets. The
    dataset is split into k consecutive folds; each fold is then used once
    as the validation set while the k - 1 remaining folds form the
    training set.

    Parameters
    ----------
    n_splits : int
        number of folds. Must be at least 2 (the value is not validated
        by this class)

    shuffle : bool, default True
        whether to shuffle the data before splitting into batches

    seed : int, default 4321
        When shuffle = True, pseudo-random number generator state used for
        shuffling; this ensures reproducibility
    """
    def __init__(self, n_splits, shuffle = True, seed = 4321):
        self.seed = seed
        self.shuffle = shuffle
        self.n_splits = n_splits

    def split(self, X):
        """
        pass in the data to create train/test split for k fold

        Parameters
        ----------
        X : array-like
            data to split. Only its number of samples is used, read from
            ``X.shape[0]`` when X has a ``shape`` attribute and from
            ``len(X)`` otherwise (e.g. a plain list of rows)

        Yields
        ------
        train_index : 1d ndarray of int
            indices of the samples that form the training set of the fold

        test_index : 1d ndarray of int
            indices of the samples that form the test set of the fold
        """
        n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
        indices = np.arange(n_samples)
        if self.shuffle:
            # shuffle modifies indices inplace; a dedicated RandomState
            # seeded with self.seed makes every call give the same folds
            rstate = np.random.RandomState(self.seed)
            rstate.shuffle(indices)

        # NOTE: the boolean mask is applied positionally to `indices`, so
        # when shuffle = True the yielded test_index is the shuffled array
        # looked up at the masked positions (not indices[start:stop] itself).
        # The folds are still disjoint and together cover every sample once;
        # this is kept as is so that seeded results stay unchanged
        for test_mask in self._iter_test_masks(n_samples, indices):
            train_index = indices[np.logical_not(test_mask)]
            test_index = indices[test_mask]
            yield train_index, test_index

    def _iter_test_masks(self, n_samples, indices):
        """
        create the mask for the test set, then the indices that
        are not in the test set belongs in the training set

        Parameters
        ----------
        n_samples : int
            total number of samples, i.e. the length of every mask

        indices : 1d ndarray of int
            the (possibly shuffled) sample indices to cut into folds

        Yields
        ------
        test_mask : 1d ndarray of bool, shape [n_samples]
            True at the positions listed in the current fold's slice
            of `indices`, False everywhere else
        """
        # indicate the number of samples in each fold, and also
        # make sure the ones that are not evenly splitted also
        # gets assigned to a fold (e.g. if we do 2 fold on a
        # dataset that has 5 samples, then 1 will be left out,
        # and has to be assigned to one of the other fold).
        # the builtin int/bool are used as dtypes because the np.int
        # and np.bool aliases no longer exist in recent numpy releases
        fold_sizes = np.full(self.n_splits, n_samples // self.n_splits, dtype = int)
        fold_sizes[:n_samples % self.n_splits] += 1

        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_indices = indices[start:stop]
            test_mask = np.zeros(n_samples, dtype = bool)
            test_mask[test_indices] = True
            yield test_mask
            current = stop

In [2]:
# create some sample data
# 5 samples with 2 features each, plus one target value per sample
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [5, 6]])
y = np.array([1, 2, 3, 4, 5])

# run the hand-written KFolds; with shuffle = False the seed is never
# used (KFolds only builds its RandomState when shuffle is True)
kf = KFolds(n_splits = 2, shuffle = False, seed = 4312)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # shows how the yielded indices are used to slice the data;
    # the resulting arrays are not used any further in this cell
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    

# same split with scikit-learn's KFold, printed in the same format so
# that the two outputs can be compared line by line
from sklearn.model_selection import KFold
print('\nconfirm results with scikit-learn')
kf = KFold(n_splits = 2, shuffle = False)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TRAIN: [3 4] TEST: [0 1 2]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TRAIN: [0 1 2] TEST: [3 4]

confirm results with scikit-learn
TRAIN: [3 4] TEST: [0 1 2]
TRAIN: [0 1 2] TEST: [3 4]


In [5]:
# NOTE: KFold and metrics are imported but not used within this cell
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold 
from sklearn import metrics  

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
# Simulate splitting a dataset into 5 folds

# only the first 150 samples of the diabetes dataset are used;
# note that this rebinds the X and y of the earlier toy example
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]

lasso = linear_model.Lasso()

# Single metric evaluation using cross_validate
# no `scoring` is passed, so the result has a single 'test_score' entry
# (one value per fold) next to the 'fit_time' and 'score_time' entries

cv_results = cross_validate(lasso, X, y, cv=5)
res = sorted(cv_results.keys()) 
print(res)
ind_res = cv_results['test_score'] 
print(ind_res, ' CV res')

# Multiple metric evaluation: every name in `scoring` gets its own
# 'test_<name>' entry, and return_train_score=True adds the matching
# 'train_<name>' entries as well
scores = cross_validate(lasso, X, y, cv=5, scoring=('r2', 'neg_mean_squared_error'), return_train_score=True)
print(scores['test_neg_mean_squared_error']) 
print(scores['train_r2'])

['fit_time', 'score_time', 'test_score']
[0.29828759 0.22414975 0.15479913 0.25519691 0.17109036]  CV res
[-2807.16463682 -4890.37465206 -3360.65857776 -4663.03753949
 -5152.29967202]
[0.32756903 0.314596   0.33970757 0.32065399 0.27361795]


In [6]:
cv_results.keys()

dict_keys(['fit_time', 'score_time', 'test_score'])

In [7]:
cv_results

{'fit_time': array([0.00100636, 0.        , 0.        , 0.        , 0.        ]),
 'score_time': array([0.00098872, 0.00099778, 0.00099683, 0.0009973 , 0.        ]),
 'test_score': array([0.29828759, 0.22414975, 0.15479913, 0.25519691, 0.17109036])}

In [8]:
scores

{'fit_time': array([0.        , 0.00099826, 0.        , 0.        , 0.00099707]),
 'score_time': array([0.0009985 , 0.        , 0.00099754, 0.0009954 , 0.        ]),
 'test_r2': array([0.29828759, 0.22414975, 0.15479913, 0.25519691, 0.17109036]),
 'train_r2': array([0.32756903, 0.314596  , 0.33970757, 0.32065399, 0.27361795]),
 'test_neg_mean_squared_error': array([-2807.16463682, -4890.37465206, -3360.65857776, -4663.03753949,
        -5152.29967202]),
 'train_neg_mean_squared_error': array([-4081.58083693, -3748.32523377, -3896.40106456, -3700.91683136,
        -3869.32776223])}