# Reformat data

This notebook reformats the cleaned up SSNAP data for use with machine learning.

## Import packages

In [2]:
# Import packages
import numpy as np
import os
import pandas as pd
import random

from dataclasses import dataclass

# Show up to 100 columns and up to 100 rows when displaying DataFrames
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

## Set up paths and filenames

In [3]:
@dataclass(frozen=True)
class Paths:
    '''Immutable collection of the directories and filenames that this
    notebook reads from and writes to.'''

    # Directory holding the cleaned SSNAP extract that is read in
    data_read_path: str = "~/ssnap_data/"
    # Name of the cleaned extract inside the read directory
    data_read_filename: str = "./clean_samuel_ssnap_extract_v2.csv"
    # Directory that output files are written to
    data_save_path: str = "./stroke_utilities/data/"
    # Name of the reformatted output file
    data_save_filename: str = "reformatted_data_thrombolysis_decision.csv"
    # Not referenced by any cell shown in this notebook
    notebook: str = ""


# Single shared instance used by the cells below
paths = Paths()

## Set thresholds

In [4]:
# Minimum total thrombolysis a stroke team must have given to stay in the data
min_hospital_thrombolysis_threshold = 10
# Original (misspelt) name kept as an alias, as later cells refer to it
min_hosptial_thrombolysis_threshold = min_hospital_thrombolysis_threshold
# Minimum number of admissions a stroke team must have to stay in the data
min_hospital_admission_threshold = 250

## Load data

In [5]:
# Build the full path to the cleaned SSNAP extract from the configured
# directory and filename, then read it into a DataFrame
filename = os.path.join(paths.data_read_path, paths.data_read_filename)
data = pd.read_csv(filename)
# Show (number of rows, number of columns) as the cell output
data.shape

(358993, 70)

## Limit data

In [6]:
# Keep only patients whose prior_disability is zero or above (rows with a
# missing value fail the comparison and so are dropped as well)
has_prior_disability = data['prior_disability'] >= 0
data = data.loc[has_prior_disability]

# Drop records whose onset_to_arrival_time is zero or negative. Rows with a
# missing value are not matched by the comparison, so they are kept.
# NOTE(review): zero is removed as well as negative values - confirm intended
non_positive_arrival_time = data['onset_to_arrival_time'] <= 0
data = data.loc[~non_positive_arrival_time]

## Include hospitals with at least 250 admissions that give at least 10 thrombolysis

In [7]:
def filter_stroke_team(data, min_threshold, result):
    """
    Keep only the rows of data that belong to stroke teams whose value in
    result is greater than or equal to min_threshold.
    Used in this notebook to keep teams with at least 250 admissions, and
    teams that gave thrombolysis at least 10 times.

    data [dataframe]: hospital data, with a 'stroke_team' column
    min_threshold [float]: smallest value a stroke team may have and still
        be kept
    result [series]: one value per stroke team (indexed by stroke team) to
        compare against min_threshold

    Returns [dataframe]: the rows of data for the stroke teams that are kept
    """

    # Names of the teams whose value reaches the threshold
    teams_passing = result.loc[result >= min_threshold].index.tolist()

    # Rows recorded against one of those teams
    rows_to_keep = data['stroke_team'].isin(teams_passing)

    return data.loc[rows_to_keep]

In [8]:
# Sum the thrombolysis column for each stroke team, then keep only the teams
# whose total reaches the minimum thrombolysis threshold
thrombolysis_per_team = data.groupby(['stroke_team'])['thrombolysis'].sum()
data = filter_stroke_team(
    data, min_hosptial_thrombolysis_threshold, thrombolysis_per_team)
# Show the remaining (rows, columns)
data.shape

(358925, 70)

In [9]:
# Count the records for each stroke team, then keep only the teams whose
# count reaches the minimum admission threshold
admissions_per_team = data.groupby(['stroke_team'])['stroke_team'].count()
data = filter_stroke_team(
    data, min_hospital_admission_threshold, admissions_per_team)
# Show the remaining (rows, columns)
data.shape

(358925, 70)

There is an assumption that patients with NK recorded for the feature S1OnsetTimeType had their stroke onset at midnight, and so their OnsettoArrivalMinutes is taken as the duration from midnight.

Currently these patients have a 0 for 'onset_known' (a value of 1 is for precise and best estimate).

Keep patients with 'onset_known' = 1

In [10]:
# Keep only the patients whose 'onset_known' value is 1
data = data.loc[data['onset_known'].eq(1)]

## Removing features
Build up a list of features to remove (several are added in the cells below) and then drop them all at the same time.

Remove 'onset\_known', as all remaining patients have the same value (only those with a value of 1 were kept).


In [11]:
# Start the list of columns to drop; later cells add more names to it
remove_features = ['onset_known']

Remove anticoagulant types.

In [12]:
# Add the three AF anticoagulant-type columns to the list of columns to drop
remove_features.extend([
    'afib_vit_k_anticoagulant',
    'afib_doac_anticoagulant',
    'afib_heparin_anticoagulant',
])

In [13]:
# Add these three columns to the list of columns to drop
# NOTE(review): presumably removed because they are outcomes recorded after
# the thrombolysis decision - confirm
remove_features.extend([
    'discharge_destination',
    'death',
    'disability_6_month',
])

Remove features about ambulance times (not fully filled in)

In [14]:
# Add the four ambulance timing columns to the list of columns to drop
remove_features.extend([
    'call_to_ambulance_arrival_time',
    'ambulance_on_scene_time',
    'ambulance_travel_to_hospital_time',
    'ambulance_wait_time_at_hospital',
])


In [15]:
# Drop every column collected in remove_features in a single step
data = data.drop(columns=remove_features)

## Add anonymised stroke team code

In [16]:
# Get the unique team names in sorted order. The iteration order of a set of
# strings can differ from one Python session to the next (string hashing is
# randomised), so sorting first is what makes the seeded shuffle below give
# the same team codes on every run.
teams = sorted(set(data['stroke_team']))
# Shuffle into a random, but repeatable, order
random.seed(42)
random.shuffle(teams)
# Map each team name to an anonymous integer code, starting from 1
teams_code_dict = {team: code for code, team in enumerate(teams, start=1)}
# Save the team name to team code lookup table
col_names = ['stroke_team', 'team_code']
teams_code_df = pd.DataFrame(
    list(teams_code_dict.items()), columns=col_names)
filename = os.path.join(paths.data_save_path, 'team_code.csv')
teams_code_df.to_csv(filename, index=False)
# Apply coding to data
data['stroke_team_id'] = data['stroke_team'].map(teams_code_dict)


## Save reformatted data ready for machine learning (predict the thrombolysis decision)

In [17]:
# Build the output path with os.path.join (as the other cells do) so that it
# does not depend on data_save_path ending with a path separator
filename = os.path.join(paths.data_save_path, paths.data_save_filename)
# Write the reformatted data without the DataFrame index
data.to_csv(filename, index=False)

## Add stroke team code to cleaned data file

In [18]:
# Build the path with os.path.join (as the other cells do) so that it does
# not depend on data_save_path ending with a path separator.
# NOTE(review): this reads the cleaned extract from the save directory, not
# from paths.data_read_path used when the data was first loaded - confirm a
# copy of the file is expected here
filename = os.path.join(
    paths.data_save_path, 'clean_samuel_ssnap_extract_v2.csv')
clean_data = pd.read_csv(filename)

In [19]:
# Look up each record's team code from its stroke team name. Teams that are
# not in teams_code_dict get a missing value.
clean_data = clean_data.assign(
    stroke_team_id=clean_data['stroke_team'].map(teams_code_dict))

In [20]:
# Write back to the same file that clean_data was read from (overwriting it),
# now including the stroke_team_id column
clean_data.to_csv(filename, index=False)