# SAMueL Create k-fold Data Sets

## Plain English summary
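Create and save a stratified train/test split of the data, holding out 10,000 patients as the test set. (The code shown here saves this single split rather than 5 k-fold splits.)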

## Load imports

In [1]:
import pandas as pd
import numpy as np

from dataclasses import dataclass
from sklearn.model_selection import train_test_split

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

## Set up paths and filenames

In [2]:
@dataclass(frozen=True)
class Paths:
    """Immutable collection of the file locations used in this notebook."""

    # Folder (with trailing slash) and file name of the input data.
    data_read_path: str = "./stroke_utilities/data/"
    data_read_filename: str = "reformatted_data_thrombolysis_decision.csv"
    # Folder (no trailing slash) that the output files are written to.
    data_save_path: str = "./stroke_utilities/data"
    # Notebook label; left empty here.
    notebook: str = ""


paths = Paths()

## Load data



In [3]:
# Build the full path of the input CSV and read it into a DataFrame.
filename = f'{paths.data_read_path}{paths.data_read_filename}'
data = pd.read_csv(filename)


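## Shuffle the data

Shuffle the rows using a fixed random seed so that the order is reproducible. (No conversion to float is performed in this cell.)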

In [4]:
# Shuffle the rows; the fixed seed makes the order reproducible.
data = data.sample(
    frac=1.0,  # keep every row
    random_state=42,
)

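## Limit to patients scanned with enough time left for thrombolysis

Patients are kept only if at least `minutes_left` minutes remain for thrombolysis after their scan. The separate filter on onset-to-arrival time within 4 hours is currently commented out.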

In [5]:
from stroke_utilities.scenario import create_masks

In [6]:

# Thrombolysis (IVT): latest allowed onset-to-needle time, and the
# over-run allowed when scan-to-needle is slow.
allowed_onset_to_needle_time_mins = 4 * 60 + 30  # 270 minutes
allowed_overrun_for_slow_scan_to_needle_mins = 15

# Thrombectomy (MT): the equivalent onset-to-puncture settings.
# TODO: check that 8 hours is a reasonable number here.
allowed_onset_to_puncture_time_mins = 8 * 60
allowed_overrun_for_slow_scan_to_puncture_mins = 15

# Threshold (minutes) compared against the time left after scan.
minutes_left = 15.0

# Limits for comparing conditions (e.g. is onset to arrival within
# 4hrs?). IVT and MT have separate limits.
limit_ivt_mins = 4 * 60
# TODO: look up a sensible value for the MT limit.
limit_mt_mins = 6 * 60


In [7]:
def restrict_to_enough_time_for_thrombolysis(
        big_data,
        needle_limit_mins=None,
        puncture_limit_mins=None,
        min_minutes_left=None,
        ):
    """
    Keep only patients with enough time left for thrombolysis after scan.

    Three columns are added to a copy of the input data:

    - 'arrival_to_thrombolysis_time': arrival-to-scan time plus
      scan-to-thrombolysis time.
    - 'time_left_for_ivt_after_scan_mins': onset-to-needle limit minus
      (onset-to-arrival + arrival-to-scan), floored at zero.
    - 'time_left_for_mt_after_scan_mins': the same calculation using
      the onset-to-puncture limit.

    Rows are kept when 'time_left_for_ivt_after_scan_mins' is at least
    `min_minutes_left`. Rows where that value is missing (NaN) fail
    the comparison and are dropped.

    The DataFrame passed in is not modified.

    Parameters
    ----------
    big_data : pandas.DataFrame
        Must contain the columns 'onset_to_arrival_time',
        'arrival_to_scan_time' and 'scan_to_thrombolysis_time'.
    needle_limit_mins : float, optional
        Allowed onset-to-needle time. Defaults to the module-level
        `allowed_onset_to_needle_time_mins`.
    puncture_limit_mins : float, optional
        Allowed onset-to-puncture time. Defaults to the module-level
        `allowed_onset_to_puncture_time_mins`.
    min_minutes_left : float, optional
        Minimum time that must remain for thrombolysis after the scan.
        Defaults to the module-level `minutes_left`.

    Returns
    -------
    pandas.DataFrame
        The rows that pass the filter, including the three new columns.
    """
    # Fall back to the notebook-level settings when no explicit value
    # is given, so existing one-argument calls behave as before.
    if needle_limit_mins is None:
        needle_limit_mins = allowed_onset_to_needle_time_mins
    if puncture_limit_mins is None:
        puncture_limit_mins = allowed_onset_to_puncture_time_mins
    if min_minutes_left is None:
        min_minutes_left = minutes_left

    # Work on a copy so the caller's DataFrame does not gain columns.
    big_data = big_data.copy()

    big_data['arrival_to_thrombolysis_time'] = (
        big_data['arrival_to_scan_time'] +
        big_data['scan_to_thrombolysis_time']
        )

    onset_to_scan_time = (
        big_data['onset_to_arrival_time'] +
        big_data['arrival_to_scan_time']
        )

    # Time left after scan for thrombolysis...
    big_data['time_left_for_ivt_after_scan_mins'] = np.maximum(
        needle_limit_mins - onset_to_scan_time, 0.0)
    # ... and thrombectomy:
    big_data['time_left_for_mt_after_scan_mins'] = np.maximum(
        puncture_limit_mins - onset_to_scan_time, 0.0)

    # Don't use the create_masks function because its first step is
    # to exclude patients with unknown onset time.
    mask_to_include = (
        big_data['time_left_for_ivt_after_scan_mins'] >= min_minutes_left)

    return big_data[mask_to_include]

In [8]:
# Keep only the patients that pass the time-left-for-thrombolysis
# filter (this also adds the derived timing columns to `data`).
data = restrict_to_enough_time_for_thrombolysis(data)

In [9]:
# mask = data['onset_to_arrival_time'] <= 240
# data = data[mask]

## Limit to 11 features and the thrombolysis label

In [10]:
# Columns to keep: the model features followed by the label.
features_to_use = [
    # Features
    'stroke_team_id',
    'stroke_severity',
    'prior_disability',
    'age',
    'infarction',
    'onset_to_arrival_time',
    'precise_onset_known',
    'onset_during_sleep',
    'arrival_to_scan_time',
    'afib_anticoagulant',
    'year',
    # Label
    'thrombolysis',
]

# Drop every column that is not listed in `features_to_use`.
data = data.loc[:, features_to_use]

## Create stratification based on hospital and thrombolysis use

In [11]:
# Build one stratification label per row in the form
# "<stroke_team_id>-<thrombolysis>".
team_label = data['stroke_team_id'].map(str)
thrombolysis_label = data['thrombolysis'].map(str)
strat = team_label + '-' + thrombolysis_label

## Create and save 10k test and train sets

In [12]:
# Separate the thrombolysis label (y) from the remaining columns (X).
y = data['thrombolysis']
X = data.drop(columns='thrombolysis')

# Hold out 10,000 rows as the test set, stratified by `strat`, using a
# fixed seed so the split is reproducible.
split_parts = train_test_split(
    X, y, test_size=10000, stratify=strat, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Put the label back alongside the features for each set.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Write both sets to the save folder without the index column.
train_csv = f'{paths.data_save_path}/cohort_10000_train.csv'
test_csv = f'{paths.data_save_path}/cohort_10000_test.csv'
train.to_csv(train_csv, index=False)
test.to_csv(test_csv, index=False)