# SAMueL Create Stratified Data Sets

In [1]:
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

## Load modules

In [2]:
import numpy as np
import pandas as pd

# sklearn for pre-processing
from sklearn.model_selection import train_test_split

import os

## Load data

Load data.

Data has already been imputed and had these filters applied: 
* only include patients that attend a hospital with > 300 admissions and 10 thrombolysis
* only include patients with arrival time <4hours and onset out of hospital
* only include patients that have a scan

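Shuffle the rows (with a fixed random seed, so the order is reproducible) and remove the `Pathway` field. No type conversion is applied here; values are used as loaded.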

In [3]:
# Read the imputed, filtered data, shuffle all rows with a fixed seed,
# and discard the 'Pathway' field -- done as one chain so that `data`
# is only ever bound to the finished frame.
data = (
    pd.read_csv('220401_national_data_imputed_filtered_no_unit_encoding.csv')
    .sample(frac=1.0, random_state=42)
    .drop('Pathway', axis=1)
)

## Set and check output folder exists

In [4]:
# Folder that the train/test CSV files are written to.
output_dir = './10k_training_test'
# `exist_ok=True` creates the folder when it is missing and does nothing when
# it is already present, so no separate (race-prone) existence check is needed
# and the cell can safely be re-run.
os.makedirs(output_dir, exist_ok=True)

## Create stratification based on hospital and thrombolysis use

In [5]:
# Build one label per patient of the form '<StrokeTeam>-<S2Thrombolysis>';
# this combined label is what the train/test split is stratified on.
team_label = data['StrokeTeam'].map(str)
thrombolysis_label = data['S2Thrombolysis'].map(str)
strat = team_label + '-' + thrombolysis_label

### Split X and y

In [6]:
# Target: the thrombolysis column. Features: every other column.
y = data['S2Thrombolysis']
X = data.drop(columns=['S2Thrombolysis'])

## Create and save train and test splits

In [7]:
# Hold out 10,000 rows as the test set (an integer `test_size` is an absolute
# row count); the split is stratified on `strat` and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=10000,
    random_state=42,
    stratify=strat,
)

# Re-attach the target as the last column of each split
train = pd.concat([X_train, y_train], axis='columns')
test = pd.concat([X_test, y_test], axis='columns')

In [8]:
# Write both splits (stroke team still a single column) without the index
train_path = f'{output_dir}/cohort_10000_train.csv'
test_path = f'{output_dir}/cohort_10000_test.csv'

train.to_csv(train_path, index=False)
test.to_csv(test_path, index=False)

## One-hot encode stroke team and save

In [9]:
# One hot encode stroke team
def one_hot_encode_unit(df, categories):
    """
    Return a new DataFrame in which the 'StrokeTeam' column of `df` is
    replaced by one-hot columns named 'StrokeTeam_<team>'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'StrokeTeam' column. It is not modified.
    categories : list
        Every team that should receive a column, in the desired column order.
        Teams absent from `df` still get a column (all zeros/False).

    Returns
    -------
    pandas.DataFrame
        `df` without 'StrokeTeam', followed by one column per category.
    """
    # A categorical dtype makes get_dummies emit a column for every listed
    # category, including those that do not occur in this particular frame.
    units = df['StrokeTeam'].astype(pd.CategoricalDtype(categories))
    one_hot_coded = pd.get_dummies(units, prefix='StrokeTeam')
    return pd.concat([df.drop('StrokeTeam', axis=1), one_hot_coded], axis=1)


# Use one shared, sorted list of teams for both splits so that the two output
# files always have identical columns. Sorted order matches the column order
# get_dummies produces by default.
all_units = sorted(
    pd.concat([train['StrokeTeam'], test['StrokeTeam']]).dropna().unique())

train = one_hot_encode_unit(train, all_units)
train.to_csv(
    f'{output_dir}/cohort_10000_train_one_hot_unit.csv', index=False)

# Encode the *test* rows. Previously `train` was concatenated here by mistake,
# so the saved test file held training rows joined to test one-hot columns.
test = one_hot_encode_unit(test, all_units)
test.to_csv(
    f'{output_dir}/cohort_10000_test_one_hot_unit.csv', index=False)