# Reformat data

In [1]:
import numpy as np
import os
import pandas as pd

from dataclasses import dataclass
from datetime import datetime

# Show up to 100 columns and up to 100 rows when a DataFrame is displayed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

## Set up paths and filenames

In [2]:
@dataclass(frozen=True)
class Paths:
    """Immutable container for the file locations used by this notebook."""

    # Directory and file name of the raw extract that is read in
    data_path: str = "./data"
    data_filename: str = "MichaelAllen_HQIP425_300323.csv"
    # Directory and file name that the reformatted CSV is written to
    data_save_path: str = "./output"
    data_save_filename: str = "reformatted_data.csv"
    # Database file name; not referenced by any cell visible in this notebook
    database_filename: str = "samuel.db"


paths = Paths()

## Load data

In [None]:
# Read the raw extract; low_memory=False stops pandas parsing the file in
# chunks, which can otherwise lead to mixed-type column inference
raw_data_file: str = os.path.join(paths.data_path, paths.data_filename)
raw_data: pd.DataFrame = pd.read_csv(raw_data_file, low_memory=False)

# Empty frame that the following cells fill in, column by column
cleaned_data: pd.DataFrame = pd.DataFrame()

# Show the last 5 rows of the raw data
raw_data.tail()

In [4]:
# List every column name in the raw data
raw_data.columns

Index(['TeamName', 'AgeUnder40', 'Age40to44', 'Age45to49', 'Age50to54',
       'Age55to59', 'Age60to64', 'Age65to69', 'Age70to74', 'Age75to79',
       'Age80to84', 'Age85to89', 'AgeOver90', 'S1Gender',
       'OnsettoArrivalMinutes', 'S1OnsetDateType', 'S1OnsetTimeType',
       'S1ArriveByAmbulance', 'FirstArrivalMonthYear', 'FirstArrivalWeekday',
       'FirstArrivalTime', 'S2CoMCongestiveHeartFailure', 'S2CoMHypertension',
       'S2CoMAtrialFibrillation', 'S2CoMDiabetes', 'S2CoMStrokeTIA',
       'S2CoMAFAntiplatelet', 'S2CoMAFAnticoagulent',
       'S2CoMAFAnticoagulentVitK', 'S2CoMAFAnticoagulentDOAC',
       'S2CoMAFAnticoagulentHeparin', 'S2INR', 'S2INRHigh', 'S2INRNK',
       'S2NewAFDiagnosis', 'S2RankinBeforeStroke', 'S2NihssArrival',
       'S2NihssArrivalLoc', 'S2NihssArrivalLocQuestions',
       'S2NihssArrivalLocCommands', 'S2NihssArrivalBestGaze',
       'S2NihssArrivalVisual', 'S2NihssArrivalFacialPalsy',
       'S2NihssArrivalMotorArmLeft', 'S2NihssArrivalMotorArmRight

In [5]:
# Show the shape of the raw data as (number of rows, number of columns)
raw_data.shape

(360381, 81)

In [6]:
# Show the dtype pandas inferred for each raw column
print(raw_data.dtypes)


TeamName                                     object
AgeUnder40                                    int64
Age40to44                                     int64
Age45to49                                     int64
Age50to54                                     int64
Age55to59                                     int64
Age60to64                                     int64
Age65to69                                     int64
Age70to74                                     int64
Age75to79                                     int64
Age80to84                                     int64
Age85to89                                     int64
AgeOver90                                     int64
S1Gender                                     object
OnsettoArrivalMinutes                         int64
S1OnsetDateType                              object
S1OnsetTimeType                              object
S1ArriveByAmbulance                          object
FirstArrivalMonthYear                        object
FirstArrival

In [7]:
# Fraction of non-missing values in each column (1.0 means fully populated)
raw_data.notna().mean()

TeamName                                    1.000000
AgeUnder40                                  1.000000
Age40to44                                   1.000000
Age45to49                                   1.000000
Age50to54                                   1.000000
Age55to59                                   1.000000
Age60to64                                   1.000000
Age65to69                                   1.000000
Age70to74                                   1.000000
Age75to79                                   1.000000
Age80to84                                   1.000000
Age85to89                                   1.000000
AgeOver90                                   1.000000
S1Gender                                    1.000000
OnsettoArrivalMinutes                       1.000000
S1OnsetDateType                             1.000000
S1OnsetTimeType                             1.000000
S1ArriveByAmbulance                         1.000000
FirstArrivalMonthYear                       1.

## Get Stroke Team

In [8]:
# Carry the team name over unchanged under its cleaned column name
cleaned_data['stroke team'] = raw_data['TeamName']

## Extract ages and get gender

In [9]:
# Age (years) used to represent each age-band flag column: the midpoint of
# the band, with 37.5 and 92.5 standing in for the two open-ended bands
ages: dict = {
    'AgeUnder40': 37.5, 'Age40to44': 42.5, 'Age45to49': 47.5,
    'Age50to54': 52.5, 'Age55to59': 57.5, 'Age60to64': 62.5,
    'Age65to69': 67.5, 'Age70to74': 72.5, 'Age75to79': 77.5,
    'Age80to84': 82.5, 'Age85to89': 87.5, 'AgeOver90': 92.5}

# Select the age-band columns with an explicit list of names rather than a
# dict view
col_extract: pd.DataFrame = raw_data[list(ages)]

# The band a row belongs to is the column holding that row's largest value
age_band: pd.Series = col_extract.idxmax(axis=1)

# idxmax returns the first column when every band is 0, which would silently
# record such a row as 37.5; leave the age missing for those rows instead.
# Assumes the band columns are 0/1 flags - TODO confirm.
has_age_band: pd.Series = col_extract.max(axis=1) > 0
cleaned_data['age'] = age_band.map(ages).where(has_age_band)

In [10]:
# 1 where the recorded gender is 'M', 0 for any other (or missing) value
cleaned_data['male'] = raw_data['S1Gender'].eq('M').astype('int64')

## Get stroke type

In [11]:
# 1 where the stroke type code is 'I', 0 for any other (or missing) value
cleaned_data['infarction'] = raw_data['S2StrokeType'].eq('I').astype('int64')

## Get timings

In [12]:
# Onset to arrival (raw column copied across unchanged)
cleaned_data['onset-to-arrival time'] = raw_data['OnsettoArrivalMinutes']

# The flags below are 1/0 comparisons against a single code; a missing value
# never equals the code being tested
onset_time_type: pd.Series = raw_data['S1OnsetTimeType']

# Onset time known: every value except 'NK'
cleaned_data['onset known'] = onset_time_type.ne('NK').astype('int64')

# Precise onset time: 'P'
cleaned_data['precise onset known'] = onset_time_type.eq('P').astype('int64')

# Stroke during sleep: onset date type 'DS'
cleaned_data['onset during sleep'] = (
    raw_data['S1OnsetDateType'].eq('DS').astype('int64'))

# Arrive by ambulance: 'Y'
cleaned_data['arrive by ambulance'] = (
    raw_data['S1ArriveByAmbulance'].eq('Y').astype('int64'))

# Ambulance milestones, each recorded relative to hospital arrival.
# NOTE(review): the differences below are only positive durations if these
# columns are negative before arrival - confirm against the data dictionary.
call_connected: pd.Series = raw_data['CallConnectedtoArrivalMinutes']
at_patient: pd.Series = raw_data['ArrivalPatientLocationtoArrivalMinutes']
left_patient: pd.Series = raw_data['DeparturePatientLocationtoArrivalMinutes']
wheels_stop: pd.Series = raw_data['WheelsStoptoArrivalMinutes']

# Call to ambulance arrival
cleaned_data['call-to-ambulance-arrival time'] = at_patient - call_connected

# Ambulance on scene
cleaned_data['ambulance on-scene time'] = left_patient - at_patient

# Ambulance travel time to hospital
cleaned_data['ambulance travel-to-hospital time'] = wheels_stop - left_patient

# Ambulance wait time at hospital (written as 0 - x so a zero stays 0, not -0.0)
cleaned_data['ambulance wait time at hospital'] = 0 - wheels_stop

In [13]:
# Get month and year and convert to numbers

# Parse the 'Mon-yy' strings (format '%b-%y') for the whole column in one
# vectorised call; looping over every row with iterrows is far slower on a
# frame of this size
arrival_month_year: pd.Series = pd.to_datetime(
    raw_data['FirstArrivalMonthYear'], format='%b-%y')

# Store plain 64-bit integers, as the per-row version did
cleaned_data['month'] = arrival_month_year.dt.month.astype('int64')
cleaned_data['year'] = arrival_month_year.dt.year.astype('int64')

In [14]:
# Get arrival weekday (raw column copied across unchanged)

cleaned_data['weekday'] = raw_data['FirstArrivalWeekday']

In [15]:
# Get arrival period (3 hour period during day)

# Each 3-hour period is represented by the hour at which it starts
arrival_time_dict: dict = {
    '0000to0300': 0,
    # Spelling used by the original lookup; kept in case the extract really
    # contains it - TODO confirm which form the data uses
    '0000to3000': 0,
    '0300to0600': 3,
    '0600to0900': 6,
    '0900to1200': 9,
    '1200to1500': 12,
    '1500to1800': 15,
    '1800to2100': 18,
    # Was 24, which broke the "start hour" pattern of every other period
    '2100to2400': 21
}

arrival_period: pd.Series = raw_data['FirstArrivalTime']

# Fail loudly (as a direct dictionary lookup would) on any value, including a
# missing one, that has no entry in the lookup
unknown_periods = set(arrival_period.unique()) - set(arrival_time_dict)
if unknown_periods:
    raise KeyError(
        f"Unrecognised FirstArrivalTime value(s): {sorted(unknown_periods, key=str)}")

cleaned_data['arrival time 3 hour period'] = arrival_period.map(arrival_time_dict)

In [16]:
# Get arrival to scan time
arrival_to_scan: pd.Series = raw_data['ArrivaltoBrainImagingMinutes']
cleaned_data['arrival-to-scan time'] = arrival_to_scan

# Use of thrombolysis: 1 where the code is 'Y', 0 for anything else
cleaned_data['thrombolysis'] = raw_data['S2Thrombolysis'].eq('Y').astype('int64')

# Time from scan to thrombolysis: difference of the two arrival-based times
cleaned_data['scan-to-thrombolysis time'] = (
    raw_data['ArrivaltoThrombolysisMinutes'] - arrival_to_scan)

## Add comorbidities

In [17]:
# Raw comorbidity column -> name used in the cleaned data
rename_dict: dict = {
    'S2CoMCongestiveHeartFailure': 'congestive heart failure',
    'S2CoMHypertension': 'hypertension',
    'S2CoMAtrialFibrillation': 'atrial fibrillation',
    'S2CoMDiabetes': 'diabetes',
    'S2CoMStrokeTIA': 'prior stroke/TIA',
    'S2CoMAFAntiplatelet': 'antiplatelet for atrial fibrillation',
    'S2CoMAFAnticoagulent': 'use of AF anticoagulants',
    'S2CoMAFAnticoagulentVitK': 'vit k anticoagulant for atrial fibrillation',
    'S2CoMAFAnticoagulentDOAC': 'DOAC anticoagulant for atrial fibrillation',
    'S2CoMAFAnticoagulentHeparin': 'heparin anticoagulant for atrial fibrillation'}

# Raw column names, taken from the mapping so the two cannot drift apart
comorbidities: list = list(rename_dict)

# Write each flag straight to its final column name (1 where the raw value is
# 'Y', otherwise 0). Copying the raw columns and renaming them in place
# afterwards produced duplicate column names when the cell was run twice.
for raw_name, clean_name in rename_dict.items():
    cleaned_data[clean_name] = raw_data[raw_name].eq('Y').astype('int64')



## INR (blood clotting)

Insufficient data to be of use

## New AF diagnosis

Not used: All zero or empty

## Add Rankin score before stroke

In [18]:
# Pre-stroke Rankin value, copied across unchanged as 'prior disability'
cleaned_data['prior disability'] = raw_data['S2RankinBeforeStroke']

## Add NIHSS data

In [19]:
# Overall NIHSS value on arrival, copied across unchanged
cleaned_data['stroke severity'] = raw_data['S2NihssArrival']

# Individual NIHSS item columns
nihss: list = ['S2NihssArrivalLoc', 'S2NihssArrivalLocQuestions',
       'S2NihssArrivalLocCommands', 'S2NihssArrivalBestGaze',
       'S2NihssArrivalVisual', 'S2NihssArrivalFacialPalsy',
       'S2NihssArrivalMotorArmLeft', 'S2NihssArrivalMotorArmRight',
       'S2NihssArrivalMotorLegLeft', 'S2NihssArrivalMotorLegRight',
       'S2NihssArrivalLimbAtaxia', 'S2NihssArrivalSensory',
       'S2NihssArrivalBestLanguage', 'S2NihssArrivalDysarthria',
       'S2NihssArrivalExtinctionInattention']

# Presence of -1 in SSNAP indicates missing value: a row is complete (1)
# unless the smallest of its item values is -1.
# NOTE(review): NaN items are skipped by min(), so a row with NaN but no -1
# is still flagged complete - confirm NaN cannot occur in these columns.
cleaned_data['nihss complete'] = (
    raw_data[nihss].min(axis=1).ne(-1).astype('int64'))

# Add the item columns under their final names (leading 'S2' dropped).
# Writing directly to the final name keeps the cell safe to re-run; copying
# and then renaming in place created duplicate columns on a second run.
rename_dict: dict = {x : x[2:] for x in nihss}
for raw_name, clean_name in rename_dict.items():
    cleaned_data[clean_name] = raw_data[raw_name]

## Add death and outcome data

In [20]:
# Add discharge destination

# Raw discharge code -> label used in the cleaned data. Missing codes are
# handled separately below: the previous np.NaN dictionary key no longer
# exists in NumPy 2.0 and only matched by object identity.
rename_dict: dict = {
    'CH': 'care home', 'D': 'died', 'H': 'home', 'SE': 'somewhere else',
    'TC': 'community team or ESD', 'TCN': 'community team or ESD',
    'TN': 'non-ssnap hospital team', 'T': 'ssnap hospital team'}

discharge_code: pd.Series = raw_data['S7DischargeType']
discharge_label: pd.Series = discharge_code.map(rename_dict)

# A code that is present but has no label is an error, as it was with a
# direct dictionary lookup
unmapped: pd.Series = discharge_code.notna() & discharge_label.isna()
if unmapped.any():
    raise KeyError(
        f"Unrecognised S7DischargeType value(s): {sorted(discharge_code[unmapped].unique(), key=str)}")

# Rows with no discharge code at all are labelled 'missing'
cleaned_data['discharge destination'] = discharge_label.fillna('missing')

# Death and outcome

# 1.0 where an arrival-to-death interval of zero or more days is recorded,
# 0.0 otherwise (including where it is missing)
cleaned_data['death'] = (raw_data['ArrivalToDeathDays'] >= 0) * 1.0
cleaned_data['discharge disability'] = raw_data['S7RankinDischarge']
cleaned_data['6 month disability'] = raw_data['S8Rankin6Month']

## Add reasons for no thrombolysis

In [21]:
# Raw columns recording why thrombolysis was not given
no_thrombolysis: list = [
       'S2ThrombolysisNoButHaemorrhagic', 'S2ThrombolysisNoButTimeWindow',
       'S2ThrombolysisNoButComorbidity', 'S2ThrombolysisNoButMedication',
       'S2ThrombolysisNoButRefusal', 'S2ThrombolysisNoButAge',
       'S2ThrombolysisNoButImproving', 'S2ThrombolysisNoButTooMildSevere',
       'S2ThrombolysisNoButTimeUnknownWakeUp',
       'S2ThrombolysisNoButOtherMedical']

# Add the columns under their final names (leading 'S2' dropped). Writing
# directly to the final name keeps the cell safe to re-run; copying and then
# renaming in place created duplicate columns on a second run.
rename_dict: dict = {x : x[2:] for x in no_thrombolysis}
for raw_name, clean_name in rename_dict.items():
    cleaned_data[clean_name] = raw_data[raw_name]

In [None]:
# Preview the last 10 rows of the reformatted data
cleaned_data.tail(10)

## Save cleaned data

In [23]:
# Make sure the output folder exists; to_csv does not create missing folders
os.makedirs(paths.data_save_path, exist_ok=True)

filename = os.path.join(paths.data_save_path, paths.data_save_filename)

# Write the frame's index as the first CSV column, headed 'id'
cleaned_data.to_csv(filename, index_label='id')