# SAMueL2 Create k-fold Data Sets

## Plain English summary
Split the data into five stratified k-fold train/test sets, and save each set as a CSV file.

The splits are stratified on the combination of hospital (stroke team) and disability at discharge (the target feature).

The input is the output file from notebook 02_reformat_data_ml.ipynb (02_reformatted_data_ml.csv).

## Load imports

In [1]:
import numpy as np
import pandas as pd

# sklearn for pre-processing
from sklearn.model_selection import StratifiedKFold

import os

from dataclasses import dataclass

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

## Set up paths and filenames

Filenames are created with `os.path.join()`, so define folders without a trailing forward slash, and include all characters in file names.

In [2]:
@dataclass(frozen=True)
class Paths:
    """Immutable collection of the folder and file names used in this notebook.

    The dataclass is frozen, so attributes cannot be reassigned once an
    instance has been created.
    """

    # Folder and file name of the data to read in
    data_read_path: str = './output'
    data_read_filename: str = '02_reformatted_data_ml.csv'
    # Folder that the k-fold train/test files are saved to
    data_save_path: str = './output/kfold_5fold'
    # Prefix added to the name of every file saved by this notebook
    notebook: str = '03_'


# Shared instance, using the default locations defined above
paths = Paths()

## Load data



In [3]:
# Build the path to the reformatted data file and read it into a DataFrame
filename = os.path.join(
    paths.data_read_path,
    paths.data_read_filename,
)
data = pd.read_csv(filename)

## Create stratification based on hospital and disability at discharge (target feature)

In [4]:
# One stratification label per row: "<stroke_team>-<discharge_disability>".
# Both columns are converted to strings so they can be concatenated.
team_label = data['stroke_team'].map(str)
disability_label = data['discharge_disability'].map(str)
strat = team_label + '-' + disability_label

## Set and check output folder exists

In [5]:
# Create the output folder (and any missing parent folders).
# exist_ok=True makes this a no-op when the folder is already present, and
# avoids a separate existence check that could race with another process
# creating the same folder.
os.makedirs(paths.data_save_path, exist_ok=True)

## Create and save five k-fold splits

In [6]:
# Set up the stratified splitter. Shuffling with a fixed random_state keeps
# the membership of each fold reproducible between runs.
number_of_splits = 5
skf = StratifiedKFold(
    n_splits=number_of_splits, shuffle=True, random_state=13)

# Loop through the k-fold splits. The splitter returns row positions, so the
# DataFrame is sliced directly with .iloc. This keeps each column's original
# dtype; converting to a single NumPy array with `data.values` and rebuilding
# the DataFrame would coerce every column to one common dtype (object for
# mixed data, or float when integer and float columns are combined).
for fold, (train_index, test_index) in enumerate(
        skf.split(data, strat.values)):

    subsets = (
        ('train_', data.iloc[train_index]),
        ('test_', data.iloc[test_index]),
    )

    # Save each subset as <notebook prefix><train_|test_><fold number>.csv
    for label, subset in subsets:
        filename = os.path.join(
            paths.data_save_path, f'{paths.notebook}{label}{fold}.csv')
        # The row index is not written; only the data columns are saved
        subset.to_csv(filename, index=False)