# 01.01 Stacked generalization split

* Generate a 5-fold cross-validation split. After training, every base learner should make predictions on this split and evaluate them. These results are then used by the ensemble models, which allows the ensemble models to be trained faster.

In [1]:
import warnings
# Ignore warnings whose message starts with 'The least populated class'
# (the second argument is a regex matched against the start of the message).
# Presumably this targets scikit-learn's notice that a stratification class
# has fewer members than n_splits -- TODO confirm.
warnings.filterwarnings('ignore', 'The least populated class')

import numpy
import sklearn.model_selection
import tables

import utils

## Materials

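Mutation counts and fitness groups, combined into one group label per sample for stratification. A fixed seed is set in the Methods section to ensure reproducibility.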

In [2]:
# Load the two per-sample training annotations the split is stratified on.
fitness_group = utils.data.load('training', 'fitness_group')
mutation_count = utils.data.load('training', 'mutation_count')

# Fold both annotations into one stratification label: the mutation count is
# scaled by the number of fitness-group values so that (assuming fitness
# groups are non-negative integers) every (mutation_count, fitness_group)
# pair maps to its own label.
groups = fitness_group + mutation_count * (1 + fitness_group.max())

## Methods

A fixed random seed is defined for reproducibility. Note that the folds are not shuffled, so the split is deterministic regardless of the seed.

In [3]:
# Seed constant, kept for any cell that refers to it. It is deliberately not
# passed to StratifiedKFold: without shuffle=True the seed has no effect, and
# scikit-learn >= 0.24 raises ValueError when random_state is set while
# shuffle is False.
RANDOM_STATE = 42
SPLIT_COUNT = 5

# Unshuffled stratified folds are fully deterministic for a given input.
kfold = sklearn.model_selection.StratifiedKFold(n_splits=SPLIT_COUNT)

# label[i, j] is True when sample j belongs to the training part of fold i.
label = numpy.zeros((SPLIT_COUNT, mutation_count.size), dtype=bool)
# Only the labels drive the stratification; the first argument is just a
# placeholder feature matrix with one row per sample. The test indices are
# bound to a regular name so IPython's `__` output variable is not clobbered.
for i, (train_idx, _test_idx) in enumerate(
        kfold.split(groups[:, None], groups)):
    label[i, train_idx] = True

# The context manager closes the HDF5 file even if writing the array fails.
with tables.open_file(utils.data.path(1) / '01.stack.h5', 'w') as file:
    file.create_array('/', 'stack_train', obj=label)

Test reproducibility.

In [4]:
# Rebuild the split from scratch so it can be compared with `label`.
# random_state is not passed: without shuffle=True it has no effect, and
# scikit-learn >= 0.24 raises ValueError when it is set while shuffle is
# False.
kfold = sklearn.model_selection.StratifiedKFold(n_splits=SPLIT_COUNT)

# label1[i, j] is True when sample j belongs to the training part of fold i.
label1 = numpy.zeros((SPLIT_COUNT, mutation_count.size), dtype=bool)
# The test indices are bound to a regular name so IPython's `__` output
# variable is not clobbered.
for i, (train_idx, _test_idx) in enumerate(
        kfold.split(groups[:, None], groups)):
    label1[i, train_idx] = True

## Results

Reproducibility

In [5]:
# True only if both runs put every sample in the same training folds.
numpy.all(label == label1)

True