# SAMueL Create k-fold Data Sets

## Plain English summary
Create and save the data in 5 kfold splits.

## Load imports

In [1]:
import pandas as pd
import numpy as np
import yaml

from dataclasses import dataclass
from sklearn.model_selection import train_test_split

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

## Set up paths and filenames

In [2]:
@dataclass(frozen=True)
class Paths:
    """Immutable container for the file paths used in this notebook.

    ``frozen=True`` prevents the fields from being reassigned once an
    instance exists. This is an ordinary dataclass: nothing here enforces
    that only one instance is ever created.
    """

    # Directory holding the input data. It is joined to `data_read_filename`
    # by plain string concatenation, so the trailing slash is required.
    data_read_path: str = './stroke_utilities/data/'
    # Name of the input CSV file inside `data_read_path`.
    data_read_filename: str = 'reformatted_data_thrombolysis_decision.csv'
    # Output directory. It is used as f'{data_save_path}/<file>', so it has
    # no trailing slash.
    data_save_path: str = './stroke_utilities/data'
    # Not referenced by the visible cells of this notebook.
    notebook: str = ''

# Shared instance used by the cells below.
paths = Paths()

## Load data



In [3]:
# Build the full path of the input CSV, then read it into a DataFrame.
filename = f'{paths.data_read_path}{paths.data_read_filename}'
data = pd.read_csv(filename)


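Shuffle the data (rows are put in a random but reproducible order). Note: the cell below only shuffles; it does not convert values to float.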

In [4]:
# frac=1.0 samples every row, i.e. it returns all rows in random order.
# The fixed random_state makes that order reproducible between runs.
data = data.sample(frac=1.0, random_state=42)

## Limit to patients scanned with enough time left for thrombolysis

In [5]:
from stroke_utilities.scenario import create_masks

In [6]:
# Read the fixed parameters from YAML. The encoding is given explicitly so
# that the file is decoded the same way on every platform (the default used
# by open() depends on the locale).
with open('./stroke_utilities/fixed_params.yml', encoding='utf-8') as f:
    fixed_params = yaml.safe_load(f)

In [7]:
# allowed_onset_to_needle_time_mins = fixed_params['allowed_onset_to_needle_time_mins']
# minutes_left = fixed_params['minutes_left']
# Upper limit on onset-to-scan time, used as the cut-off in
# restrict_to_onset_to_scan_on_time().
# NOTE(review): units are presumably minutes - confirm against fixed_params.yml.
allowed_onset_to_scan_time = fixed_params['allowed_onset_to_scan_time']

In [8]:
def restrict_to_onset_to_scan_on_time(big_data, max_onset_to_scan_time=None):
    """Keep only the patients whose scan happened soon enough after onset.

    Onset-to-scan time is computed as ``onset_to_arrival_time`` plus
    ``arrival_to_scan_time`` and stored in a new ``onset_to_scan_time``
    column of the returned frame. Rows are kept when that value is less
    than or equal to the limit. Rows where the sum is missing (NaN) are
    dropped, because the comparison evaluates to False for them.

    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    big_data : pandas.DataFrame
        Must contain the columns ``onset_to_arrival_time`` and
        ``arrival_to_scan_time``.
    max_onset_to_scan_time : number, optional
        Inclusive upper limit on onset-to-scan time. When omitted, the
        notebook-level ``allowed_onset_to_scan_time`` is used.

    Returns
    -------
    pandas.DataFrame
        The rows within the limit, with the extra ``onset_to_scan_time``
        column.

    Raises
    ------
    KeyError
        If either of the two required columns is missing.
    """
    if max_onset_to_scan_time is None:
        # Fall back to the value loaded from the fixed parameters file.
        max_onset_to_scan_time = allowed_onset_to_scan_time

    # assign() returns a new DataFrame, so the caller's frame is left
    # untouched (in-place column assignment would add the column to it).
    with_scan_time = big_data.assign(
        onset_to_scan_time=(
            big_data['onset_to_arrival_time'] +
            big_data['arrival_to_scan_time']
        )
    )

    mask_to_include = (
        with_scan_time['onset_to_scan_time'] <= max_onset_to_scan_time
    )

    # Restrict the data to these patients:
    return with_scan_time[mask_to_include]

In [9]:
# Replace `data` with the subset of patients scanned within the allowed time.
data = restrict_to_onset_to_scan_on_time(data)

In [10]:
# mask = data['onset_to_arrival_time'] <= 240
# data = data[mask]

## Limit to 11 features and thrombolysis label

In [11]:
# Predictor columns kept for modelling
model_features = [
    'stroke_team_id',
    'stroke_severity',
    'prior_disability',
    'age',
    'infarction',
    'onset_to_arrival_time',
    'precise_onset_known',
    'onset_during_sleep',
    'arrival_to_scan_time',
    'afib_anticoagulant',
    'year',
]

# The outcome label goes last, after all predictors
features_to_use = [*model_features, 'thrombolysis']

# Drop every other column
data = data[features_to_use]

## Create stratification based on hospital and thrombolysis use

In [12]:
# One label per row of the form '<stroke_team_id>-<thrombolysis>'
team_label = data['stroke_team_id'].map(str)
outcome_label = data['thrombolysis'].map(str)
strat = team_label + '-' + outcome_label

## Create and save train and test sets (10k patients in the test set)

In [13]:
# Separate the outcome column from the predictors
y = data['thrombolysis']
X = data.drop(columns='thrombolysis')

# Hold out 10,000 rows for testing, stratified on the `strat` labels;
# the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=10000,
    stratify=strat,
    random_state=42,
)

# Put the outcome back alongside the predictors for each subset
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Write both subsets to the save directory (row index is not written)
train.to_csv(f'{paths.data_save_path}/cohort_10000_train.csv', index=False)
test.to_csv(f'{paths.data_save_path}/cohort_10000_test.csv', index=False)