# Reformat data

This notebook reformats the cleaned up SSNAP data for use with machine learning.

## Import packages

In [1]:
# Import packages
import numpy as np
import os
import pandas as pd

from dataclasses import dataclass

# Show up to 100 columns and up to 100 rows when pandas displays a DataFrame
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

## Set up paths and filenames

In [2]:
@dataclass(frozen=True)
class Paths:
    """Immutable container for the file locations used by this notebook.

    Every field has a default, so ``Paths()`` takes no arguments.
    ``frozen=True`` makes an instance read-only once it has been created.
    """

    # Directory and filename of the cleaned data that is read in
    data_read_path: str = "./"
    data_read_filename: str = "01_cleaned_data.csv"

    # Directory and base filename of the reformatted data that is written out
    data_save_path: str = "./"
    data_save_filename: str = "reformatted_data_ml.csv"

    # Prefix placed in front of the saved filename
    notebook: str = "02_"


# Shared instance referenced by the load and save cells
paths = Paths()

## Set thresholds

In [3]:
# Placeholder scan_to_thrombolysis_time given to patients with thrombolysis == 0
set_duration_not_get_thrombolysis = 9999
# Minimum summed 'thrombolysis' per stroke team for the team to be kept.
# NOTE: the name is misspelled ('hosptial'); it is left unchanged because a
# later cell refers to it by this exact name.
min_hosptial_thrombolysis_threshold = 10
# Minimum number of records per stroke team for the team to be kept
min_hospital_admission_threshold = 250


## Load data

In [4]:
# Build the path to the cleaned data from the configured directory and filename
filename = os.path.join(paths.data_read_path, paths.data_read_filename)
# Read the whole CSV into a DataFrame
all_data = pd.read_csv(filename)
# Display (rows, columns) as the cell output
all_data.shape

(360379, 65)

## Limit data

In [5]:
# Keep admissions from 2017 onwards
data = all_data[all_data['year'] >= 2017]

# Keep infarction strokes only
data = data[data['infarction'] == 1]

# Keep patients who arrived by ambulance
data = data[data['arrive_by_ambulance'] == 1]

# Exclude patients who received thrombectomy
data = data[data['thrombectomy'] == 0]

# Keep records where prior_disability is >= 0
# (rows failing the comparison, e.g. missing values, are dropped)
data = data[data['prior_disability'] >= 0]

# Keep records where discharge_disability is >= 0
# (rows failing the comparison, e.g. missing values, are dropped)
data = data[data['discharge_disability'] >= 0]

# Drop records whose onset_to_arrival_time is zero or negative.
# The test is negated rather than written as "> 0" so that rows for which
# the comparison is False (e.g. missing values) are kept.
data = data[~(data['onset_to_arrival_time'] <= 0)]

# Drop 'unusual' durations (outside the range 0-1440 minutes) for each
# of these pathway steps
cols = [
    'onset_to_arrival_time',
    'call_to_ambulance_arrival_time',
    'ambulance_on_scene_time',
    'ambulance_travel_to_hospital_time',
    'ambulance_wait_time_at_hospital',
]

for col in cols:
    # A row is dropped only when its value is definitely below 0 or above
    # 1440; rows where neither comparison is True (e.g. missing values)
    # are kept.
    out_of_range = (data[col] < 0) | (data[col] > 1440)
    mask = ~out_of_range
    data = data[mask]
data.shape

(159980, 65)

## Include hospitals with at least 250 admissions that give thrombolysis at least 10 times

In [6]:
def filter_stroke_team(data, min_threshold, result):
    """
    Keep only the rows belonging to stroke teams that meet a minimum threshold.

    A team is kept when its entry in `result` is greater than or equal to
    `min_threshold`. The notebook uses this twice: once with summed
    thrombolysis per team and once with the number of records per team.

    data [dataframe]: patient-level data with a 'stroke_team' column
    min_threshold [float]: smallest value in `result` for a team to be kept
    result [series]: one value per stroke team, indexed by stroke team

    Returns [dataframe]: the rows of `data` whose 'stroke_team' is a kept team
    """

    # Teams whose value reaches the threshold
    passing = result.loc[result >= min_threshold]

    # Rows of `data` that belong to one of those teams
    in_passing_team = data['stroke_team'].isin(passing.index)

    return data.loc[in_passing_team]

In [7]:
# Sum of the 'thrombolysis' column for each stroke team
result = data.groupby('stroke_team')['thrombolysis'].sum()

# Keep only the teams whose sum reaches the thrombolysis threshold
data = filter_stroke_team(
    data=data,
    min_threshold=min_hosptial_thrombolysis_threshold,
    result=result,
)
data.shape

(159980, 65)

In [8]:
# Number of records for each stroke team
result = data.groupby('stroke_team')['stroke_team'].count()

# Keep only the teams whose record count reaches the admission threshold
data = filter_stroke_team(
    data=data,
    min_threshold=min_hospital_admission_threshold,
    result=result,
)
data.shape

(159832, 65)

Give scan-to-thrombolysis time a value of 9999 for those patients who do not receive thrombolysis. Doing so lets us remove thrombolysis as a feature, as the information will be captured in the duration feature (we cannot keep both, as that would introduce feature dependency).

In [9]:
# Give patients who did not receive thrombolysis (thrombolysis == 0) the
# placeholder scan_to_thrombolysis_time.
#
# `data` is the result of boolean filtering, so take an explicit copy before
# writing to it; this avoids pandas' SettingWithCopyWarning.
data = data.copy()
mask = data['thrombolysis'] == 0
# Assign with a single .loc call. The chained form
# data['col'][mask] = value writes through an intermediate object and is not
# guaranteed to modify `data` (it has no effect under pandas Copy-on-Write).
data.loc[mask, 'scan_to_thrombolysis_time'] = set_duration_not_get_thrombolysis

There's an assumption that patients with NK recorded for feature S1OnsetTimeType have their stroke onset at midnight, and so their OnsettoArrivalMinutes is taken as the duration from midnight.

Currently these patients have a 0 for 'onset_known' (a value of 1 is for precise and best estimate).

Keep patients with 'onset_known' = 1

In [10]:
# Keep only the rows where onset_known equals 1
mask = data['onset_known'].eq(1)
data = data.loc[mask]

## Removing features
Build up a list of features to remove ('onset_known' and others), then remove them all at the same time.

Remove 'onset\_known', as all patients have same value (only kept those with a value of 1 ).


In [11]:
# Start the list of columns to drop; later cells add to it and a final cell
# drops them all in one call
remove_features = ['onset_known']

Remove anticoagulant types.

In [12]:
# Queue the three anticoagulant-type columns for removal
remove_features.extend([
    'afib_vit_k_anticoagulant',
    'afib_doac_anticoagulant',
    'afib_heparin_anticoagulant',
])

A value in the scan\_to\_thrombolysis\_time will indicate the patient had thrombolysis. Keeping both in will mean dependencies in the features (SHAP assumes all features are independent). 

Remove thrombolysis, and keep the pathway durations.

In [13]:
# Queue the 'thrombolysis' column for removal
remove_features.append('thrombolysis')

Remove features that contain information later in the pathway, or contain information in the target feature (discharge_disability)

In [14]:
# Queue the later-pathway / outcome-related columns for removal
remove_features.extend([
    'discharge_destination',
    'death',
    'disability_6_month',
])

Remove features about ambulance times (not fully filled in)

In [15]:
# Queue the four ambulance timing columns for removal
remove_features.extend([
    'call_to_ambulance_arrival_time',
    'ambulance_on_scene_time',
    'ambulance_travel_to_hospital_time',
    'ambulance_wait_time_at_hospital',
])


Remove features about thrombectomy (patients who received thrombectomy are already excluded)

In [16]:
# Queue the two thrombectomy columns for removal
remove_features.extend([
    'thrombectomy',
    'arrival_to_thrombectomy_time',
])

Remove those features

In [17]:
# Drop every queued column in one call
data = data.drop(columns=remove_features)

## Save cleaned data ready for machine learning (predict the discharge outcome)

In [18]:
# Output file = notebook prefix + base filename, inside the save directory
filename = os.path.join(
    paths.data_save_path,
    f'{paths.notebook}{paths.data_save_filename}',
)
# Write the reformatted data without the DataFrame index
data.to_csv(filename, index=False)