# SAMueL Create k-fold Data Sets

In [1]:
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

## Load modules

In [2]:
import numpy as np
import pandas as pd

# sklearn for pre-processing
from sklearn.model_selection import StratifiedKFold

import os

## Load data

Load data: read in the dataset without unit (stroke team) encoding and use it to create both versions of the k-fold files - without and with one-hot encoding of the stroke team. The one-hot encoding is performed again in this notebook.

Data has already been imputed and had these filters applied: 
* only include patients that attend a hospital with > 300 admissions and 10 thrombolysis
* only include patients with arrival time <4hours and onset out of hospital
* only include patients that have a scan
      
Shuffle the rows (using a fixed random seed) and remove the `Pathway` field.

In [3]:
# Read the imputed and filtered data, shuffle the rows reproducibly
# (fixed random_state), then discard the 'Pathway' field
data = (
    pd.read_csv('220401_national_data_imputed_filtered_no_unit_encoding.csv')
    .sample(frac=1.0, random_state=13)
    .drop('Pathway', axis=1)
)

In [4]:
# Display the shuffled data (pandas abbreviates long/wide frames in the rendered output)
data

Unnamed: 0,StrokeTeam,S1AgeOnArrival,S1OnsetToArrival_min,S2RankinBeforeStroke,Loc,LocQuestions,LocCommands,BestGaze,Visual,FacialPalsy,...,AFAnticoagulentHeparin_Yes,AFAnticoagulentHeparin_missing,S2NewAFDiagnosis_No,S2NewAFDiagnosis_Yes,S2NewAFDiagnosis_missing,S2StrokeType_Infarction,S2TIAInLastMonth_No,S2TIAInLastMonth_No but,S2TIAInLastMonth_Yes,S2TIAInLastMonth_missing
3186,APXEE8191H,92.5,105,0,1,2.0,2.0,2.0,2.0,0.0,...,0,0,0,0,1,0,0,0,0,1
12305,EQZZZ5658G,72.5,145,0,1,2.0,2.0,1.0,2.0,2.0,...,0,1,0,0,1,1,0,0,0,1
55848,QOAPO4699N,72.5,85,0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,1,1,0,0,0
36019,JINXD0311F,57.5,80,4,1,2.0,2.0,0.0,0.0,1.0,...,0,0,1,0,0,1,0,0,0,1
22910,HBFCN1575G,82.5,145,3,1,2.0,2.0,0.0,1.0,1.0,...,0,0,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25324,HREGJ0143U,82.5,70,0,0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,1,0,0,0,0,1
65689,TFSJP6914B,67.5,140,0,0,0.0,0.0,0.0,0.0,1.0,...,0,1,0,0,1,0,0,0,0,1
87796,ZHCLE1578P,92.5,165,1,0,2.0,0.0,0.0,0.0,1.0,...,0,0,1,0,0,1,0,0,0,1
33634,JHDQL1362V,77.5,135,0,0,0.0,0.0,0.0,2.0,0.0,...,0,1,0,0,1,0,0,0,0,1


## Create stratification based on hospital and thrombolysis use

In [5]:
# Build one stratification label per patient of the form
# '<StrokeTeam>-<S2Thrombolysis>' by joining both fields as strings
team_label = data['StrokeTeam'].map(str)
thrombolysis_label = data['S2Thrombolysis'].map(str)
strat = team_label + '-' + thrombolysis_label

## Set output folder and create it if it does not exist

In [6]:
# Folder used for the output CSV files
output_dir = './kfold_5fold'
# exist_ok=True creates the folder only when it is missing; this avoids a
# separate existence check that could race with another process creating it
os.makedirs(output_dir, exist_ok=True)

## Create and save five k-fold splits

In [7]:
# Set up splits (shuffled, with a fixed seed so the folds are reproducible)
number_of_splits = 5
skf = StratifiedKFold(n_splits=number_of_splits, shuffle=True, random_state=13)

# Put in NumPy arrays
X = data.values
y = strat.values
X_col_names = list(data)

# Fix the set (and order) of one-hot columns from the FULL data set, so every
# train/test file gets identical 'StrokeTeam_*' columns even if a team happens
# to be absent from one fold. Sorted order matches what get_dummies produces
# for a plain string column.
stroke_team_dtype = pd.CategoricalDtype(
    categories=sorted(data['StrokeTeam'].dropna().unique()))


def one_hot_encode_stroke_team(df):
    """Return `df` with 'StrokeTeam' replaced by one-hot 'StrokeTeam_*' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'StrokeTeam' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with 'StrokeTeam' removed and one indicator column per team
        known to `stroke_team_dtype` appended on the right.
    """
    one_hot_coded = pd.get_dummies(
        df['StrokeTeam'].astype(stroke_team_dtype),
        prefix='StrokeTeam',
        # Write 0/1 indicators regardless of pandas version (pandas >= 2.0
        # defaults to bool, which would be saved as True/False in the CSV)
        dtype='uint8')
    return pd.concat([df.drop(columns='StrokeTeam'), one_hot_coded], axis=1)


# Loop through the k-fold splits; `counter` numbers the output files from 0
for counter, (train_index, test_index) in enumerate(skf.split(X, y)):

    # Get train and test rows as Pandas DataFrames
    train = pd.DataFrame(X[train_index], columns=X_col_names)
    test = pd.DataFrame(X[test_index], columns=X_col_names)

    # Save no unit encoding dataset
    train.to_csv(f'{output_dir}/train_{counter}.csv', index=False)
    test.to_csv(f'{output_dir}/test_{counter}.csv', index=False)

    # One hot encode stroke team and save
    train = one_hot_encode_stroke_team(train)
    train.to_csv(f'{output_dir}/train_one_hot_unit_{counter}.csv', index=False)

    test = one_hot_encode_stroke_team(test)
    test.to_csv(f'{output_dir}/test_one_hot_unit_{counter}.csv', index=False)