# Introduction

I implemented new cross-validation splitters for scikit-learn: CPCV and related classes.  
They are:
- PurgedKFold
- CombinationalKFold
- CombinationalPurgedKFold

For CPCV, see https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/302445  
There are many sources, such as blogs and the author's book, from which to learn this concept,  
so I do not write the explanation here.  

## Install forked scikit-learn

In [None]:
from sklearn.model_selection import TimeSeriesSplit,KFold,GroupKFold,CombinationalKFold,CombinationalPurgedKFold,PurgedKFold

**CombinationalKFold, CombinationalPurgedKFold, PurgedKFold** are the new classes implemented by me.

In [None]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

In [None]:
# ref:https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html

def plot_cv_indices(cv, X, y, y_know_futures = np.empty(0),lw=10):
    """Plot, one row per CV split, which samples are train, test or overlap.

    Every split yielded by ``cv.split`` is drawn as a horizontal band of
    markers whose colour value is 0 for training samples, 1 for test
    samples and 0.5 for overlap samples; samples in none of these sets
    keep the value NaN.

    Parameters
    ----------
    cv : cross-validator
        Must expose ``n_splits`` and ``split``.  When ``y_know_futures`` is
        non-empty, ``split`` is called with an extra ``y_know_futures``
        keyword and must yield ``(train, test, overlap)`` index triples;
        otherwise it must yield ``(train, test)`` pairs.
    X : sized object
        Forwarded to ``cv.split``; ``len(X)`` fixes the number of columns.
    y : object
        Forwarded to ``cv.split``.
    y_know_futures : numpy.ndarray, default empty
        Forwarded to ``cv.split`` when its ``size`` is non-zero.
    lw : int, default 10
        Line width of the markers.

    Returns
    -------
    None
        The figure is created through ``plt.subplots()`` and modified in
        place.
    """
    _, ax = plt.subplots()
    n_splits = cv.n_splits
    cmap_cv = plt.cm.coolwarm

    # An empty ``y_know_futures`` selects the plain (train, test) protocol.
    purged = y_know_futures.size != 0
    if purged:
        splits = cv.split(X=X, y=y, y_know_futures=y_know_futures)
    else:
        splits = cv.split(X=X, y=y)

    # One past the highest sample index painted by any split.  It is kept
    # as a running maximum over *all* splits so that the x axis never cuts
    # off markers (the previous code reset it for every split and ignored
    # the overlap samples).
    xrange_max = 0

    # Generate the training/testing visualizations for each CV split
    for ii, fold in enumerate(splits):
        if purged:
            tr, tt, overlap_index_array = fold
        else:
            tr, tt = fold
            overlap_index_array = None

        # Fill in indices with the training/test groups
        indices = np.full(len(X), np.nan)
        indices[tt] = 1
        indices[tr] = 0
        if overlap_index_array is not None:
            indices[overlap_index_array] = 0.5

        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

        painted = np.flatnonzero(~np.isnan(indices))
        if painted.size:
            xrange_max = max(xrange_max, int(painted[-1]) + 1)

    yticklabels = list(range(n_splits))
    ax.set(
        yticks=np.arange(n_splits) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits, -0.2],  # inverted so that split 0 is the top row
        xlim=[0, xrange_max],
    )

    ax.set_title("{}".format(type(cv).__name__), fontsize=15)

## TimeSeriesSplit

In [None]:
# Ten toy samples: features 10..19, targets 0..9.
X = np.arange(10, 20)

y = np.arange(10)

# Time-series splitter with five splits.
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
print(tscv)

In [None]:
# Show the indices of every split and slice the data accordingly.
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

In [None]:
# Visualise the time-series splits.
plot_cv_indices(cv=tscv, X=X, y=y)

## KFold

In [None]:
# Plain K-fold splitter with five folds.
n_splits = 5
kfold = KFold(n_splits=n_splits)
print(kfold)

In [None]:
# Show the indices of every fold and slice the data accordingly.
for train_index, test_index in kfold.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

In [None]:
# Visualise the K-fold splits.
plot_cv_indices(cv=kfold, X=X, y=y)

With only 10 data points the plot looks sparse.  
So it is better to use 100 or more data points for visualization.  

In [None]:
# Fixed seed so that the synthetic data set is reproducible.
np.random.seed(1338)

n_splits = 4

# Generate the class/group data
n_points = 100
X = np.random.randn(n_points, 10)
percentiles_classes = [0.1, 0.3, 0.6]
# Class label k is repeated int(n_points * share_k) times, in label order.
y = np.repeat(
    np.arange(len(percentiles_classes)),
    [int(n_points * perc) for perc in percentiles_classes],
)

In [None]:
# K-fold on the larger synthetic data set.
cv = KFold(n_splits=n_splits)
print(cv)
plot_cv_indices(cv=cv, X=X, y=y)

## PurgedKFold

Suppose the target y depends on future values, i.e. it "knows" the future.  
Then a normal KFold results in leakage,  
so purging should be done.  

`y_know_futures` tells, for each target variable, how far into the future data it knows.  
For example, 0 means that the target only knows the current state, while 2 means that the target knows the future 2 steps later.  

In [None]:
import random

# Seed the stdlib generator so the horizons below are reproducible.
random.seed(2022)

# One random integer in 0..5 (both ends included) per target value.
y_know_futures = np.array([random.randint(0, 5) for _ in range(len(y))])
y_know_futures

For testing, `y_know_futures` is randomly created.  

In [None]:
purged_kfold = PurgedKFold(n_splits)
print(purged_kfold)

# Each split yields train, test and overlap indices; report them and
# slice the data with the train/test parts.
for train_index, test_index, overlap_index in purged_kfold.split(X, y, y_know_futures):
    print("TRAIN:", train_index, "TEST:", test_index, "OVERLAP:", overlap_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

In [None]:
# Visualise the purged splits, including the overlap samples.
plot_cv_indices(cv=purged_kfold, X=X, y=y, y_know_futures=y_know_futures)

The gray parts are the overlap data.

## GroupKFold

In [None]:
def plot_groupcv_indices(cv, X, y, y_know_futures = np.empty(0),groups = np.empty(0),lw=10):
    """Plot, one row per CV split, which samples are train or test.

    Every ``(train, test)`` pair yielded by
    ``cv.split(X=X, y=y, groups=groups)`` is drawn as a horizontal band of
    markers whose colour value is 0 for training samples and 1 for test
    samples; samples in neither set keep the value NaN.

    Parameters
    ----------
    cv : cross-validator
        Must expose ``n_splits`` and a ``split`` accepting ``X``, ``y`` and
        ``groups`` keywords.
    X : sized object
        Forwarded to ``cv.split``; ``len(X)`` fixes the number of columns.
    y : object
        Forwarded to ``cv.split``.
    y_know_futures : numpy.ndarray, default empty
        Accepted but not used by this function.
    groups : numpy.ndarray, default empty
        Forwarded to ``cv.split``.
    lw : int, default 10
        Line width of the markers.

    Returns
    -------
    None
        The figure is created through ``plt.subplots()`` and modified in
        place.
    """
    _, ax = plt.subplots()
    n_splits = cv.n_splits
    cmap_cv = plt.cm.coolwarm

    # One past the highest sample index painted by any split.  It is kept
    # as a running maximum over *all* splits so that the x axis never cuts
    # off markers (the previous code reset it for every split).
    xrange_max = 0

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=groups)):
        # Fill in indices with the training/test groups
        indices = np.full(len(X), np.nan)
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

        painted = np.flatnonzero(~np.isnan(indices))
        if painted.size:
            xrange_max = max(xrange_max, int(painted[-1]) + 1)

    yticklabels = list(range(n_splits))
    ax.set(
        yticks=np.arange(n_splits) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits, -0.2],  # inverted so that split 0 is the top row
        xlim=[0, xrange_max],
    )

    ax.set_title("{}".format(type(cv).__name__), fontsize=15)

In [None]:
# Evenly spaced groups repeated once
# Ten evenly sized groups (ids 0..9), ten consecutive samples each.
groups = np.repeat(np.arange(10), 10)

group_kfold = GroupKFold(n_splits=n_splits)

plot_groupcv_indices(cv=group_kfold, X=X, y=y, groups=groups)

In [None]:
# Same groups, now split into five folds.
n_splits = 5
group_kfold = GroupKFold(n_splits=n_splits)
plot_groupcv_indices(cv=group_kfold, X=X, y=y, groups=groups)

## CombinationalKFold

In [None]:
# Five contiguous groups (ids 0..4) of 20 samples each.
groups = np.repeat(np.arange(5), 20)
groups

In [None]:
# Sorted distinct group labels present in ``groups``.
group_ids = np.unique(groups)
# Bare expression so the notebook displays the array.
group_ids

In [None]:
def plot_combcv_indices(cv, X, y,lw=10):
    """Plot, one row per CV split, which samples are train or test.

    Every ``(train, test)`` pair yielded by ``cv.split(X=X, y=y)`` is drawn
    as a horizontal band of markers whose colour value is 0 for training
    samples and 1 for test samples; samples in neither set keep the value
    NaN.  The y axis gets one tick per entry of ``cv.test_comb_list``.

    Parameters
    ----------
    cv : cross-validator
        Must expose ``test_comb_list`` (only its length is used) and a
        ``split`` accepting ``X`` and ``y`` keywords.
    X : sized object
        Forwarded to ``cv.split``; ``len(X)`` fixes the number of columns.
    y : object
        Forwarded to ``cv.split``.
    lw : int, default 10
        Line width of the markers.

    Returns
    -------
    None
        The figure is created through ``plt.subplots()`` and modified in
        place.
    """
    _, ax = plt.subplots()
    n_combs = len(cv.test_comb_list)
    cmap_cv = plt.cm.coolwarm

    # One past the highest sample index painted by any split.  It is kept
    # as a running maximum over *all* splits so that the x axis never cuts
    # off markers (the previous code reset it for every split).
    xrange_max = 0

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # Fill in indices with the training/test groups
        indices = np.full(len(X), np.nan)
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

        painted = np.flatnonzero(~np.isnan(indices))
        if painted.size:
            xrange_max = max(xrange_max, int(painted[-1]) + 1)

    yticklabels = list(range(n_combs))
    ax.set(
        yticks=np.arange(n_combs) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_combs, -0.2],  # inverted so that split 0 is the top row
        xlim=[0, xrange_max],
    )

    ax.set_title("{}".format(type(cv).__name__), fontsize=15)

In [None]:
comb_kfold = CombinationalKFold(groups, n_splits=5, test_group_choice=2)

# Show the indices of every split and slice the data accordingly.
for train_index, test_index in comb_kfold.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]


In [None]:
  # Visualise the combinational splits.
  plot_combcv_indices(cv=comb_kfold, X=X, y=y)

## CombinationalPurgedKFold

For now only `y_know_futures` is available; I will add `X_know_pasts` later.

In [None]:
# One random integer in 0..5 (both ends included) per target value; there
# is no reseed here, so the values depend on the generator's current state.
y_know_futures = np.array([random.randint(0, 5) for _ in range(len(y))])
y_know_futures

In [None]:
comb_purge_kfold = CombinationalPurgedKFold(groups, n_splits=5, test_group_choice=2)

# Each split yields train, test and overlap indices; report them and
# slice the data with the train/test parts.
for train_index, test_index, overlap_index in comb_purge_kfold.split(X, y, y_know_futures):
    print("TRAIN:", train_index, "TEST:", test_index, "OVERLAP:", overlap_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

In [None]:
def plot_combpurgecv_indices(cv, X, y, y_know_futures = np.empty(0),lw=10):
    """Plot, one row per CV split, which samples are train, test or overlap.

    Every ``(train, test, overlap)`` triple yielded by
    ``cv.split(X=X, y=y, y_know_futures=y_know_futures)`` is drawn as a
    horizontal band of markers whose colour value is 0 for training
    samples, 1 for test samples and 0.5 for overlap samples; samples in
    none of these sets keep the value NaN.  The y axis gets one tick per
    entry of ``cv.test_comb_list``.

    Parameters
    ----------
    cv : cross-validator
        Must expose ``test_comb_list`` (only its length is used) and a
        ``split`` accepting ``X``, ``y`` and ``y_know_futures`` keywords.
    X : sized object
        Forwarded to ``cv.split``; ``len(X)`` fixes the number of columns.
    y : object
        Forwarded to ``cv.split``.
    y_know_futures : numpy.ndarray, default empty
        Forwarded to ``cv.split`` unconditionally.
    lw : int, default 10
        Line width of the markers.

    Returns
    -------
    None
        The figure is created through ``plt.subplots()`` and modified in
        place.
    """
    _, ax = plt.subplots()
    n_combs = len(cv.test_comb_list)
    cmap_cv = plt.cm.coolwarm

    # One past the highest sample index painted by any split.  It is kept
    # as a running maximum over *all* splits so that the x axis never cuts
    # off markers (the previous code reset it for every split and ignored
    # the overlap samples).
    xrange_max = 0

    # Generate the training/testing visualizations for each CV split
    splits = cv.split(X=X, y=y, y_know_futures=y_know_futures)
    for ii, (tr, tt, overlap_index_array) in enumerate(splits):
        # Fill in indices with the training/test groups
        indices = np.full(len(X), np.nan)
        indices[tt] = 1
        indices[tr] = 0
        indices[overlap_index_array] = 0.5

        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

        painted = np.flatnonzero(~np.isnan(indices))
        if painted.size:
            xrange_max = max(xrange_max, int(painted[-1]) + 1)

    yticklabels = list(range(n_combs))
    ax.set(
        yticks=np.arange(n_combs) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_combs, -0.2],  # inverted so that split 0 is the top row
        xlim=[0, xrange_max],
    )

    ax.set_title("{}".format(type(cv).__name__), fontsize=15)

In [None]:
# Visualise the combinational purged splits, including the overlap samples.
plot_combpurgecv_indices(cv=comb_purge_kfold, X=X, y=y, y_know_futures=y_know_futures)