# Reformat data

This notebook tidies SAMueL ssnap extract v2.

## Import packages

In [1]:
# Import packages
import numpy as np
import os
import pandas as pd

from dataclasses import dataclass

# Show up to 100 columns and 100 rows when a DataFrame is displayed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

## Set up paths and filenames

In [2]:
@dataclass(frozen=True)
class Paths:
    '''Immutable collection of the folder and file names used in this notebook.

    Every attribute has a default value, so ``Paths()`` needs no arguments.
    The dataclass is frozen: attributes cannot be reassigned after creation.
    '''

    # Folder holding the raw extract
    data_path: str = './data'
    # File name of the raw extract
    data_filename: str = 'SAMueL ssnap extract v2.csv'
    # Folder that outputs are written to
    data_save_path: str = './output'
    # File name for the reformatted data
    data_save_filename: str = 'reformatted_data.csv'
    # File name of the database
    database_filename: str = 'samuel.db'


# Instance shared by the cells below
paths = Paths()

## Load and view data

In [3]:
# Read the raw extract (low_memory=False so each column is typed from the
# whole file rather than chunk by chunk)
raw_data_file = os.path.join(paths.data_path, paths.data_filename)
raw_data: pd.DataFrame = pd.read_csv(raw_data_file, low_memory=False)

# Empty DataFrame that the cells below fill with the cleaned columns
cleaned_data: pd.DataFrame = pd.DataFrame()

In [4]:
# Show shape of the data as (number of rows, number of columns)
raw_data.shape

(360381, 83)

In [5]:
# Summarise every column: its dtype and the fraction of rows that are non-null
data_types = raw_data.dtypes
completion = raw_data.count().div(raw_data.shape[0])
pd.DataFrame({
    'Data type': data_types,
    'Proportion complete': completion,
})


Unnamed: 0,Data type,Proportion complete
PatientId,int64,1.0
ProClinV1Id,int64,1.0
TeamName,object,1.0
AgeUnder40,int64,1.0
Age40to44,int64,1.0
Age45to49,int64,1.0
Age50to54,int64,1.0
Age55to59,int64,1.0
Age60to64,int64,1.0
Age65to69,int64,1.0


## Get stroke team

Extract list of unique stroke teams - (1) add to cleaned dataset, and (2) save to a separate csv file.

In [6]:
# Add column with stroke team
cleaned_data['stroke_team'] = raw_data['TeamName']

# Extract the unique team names (in order of first appearance)
stroke_teams = pd.Series(
    cleaned_data['stroke_team'].unique(), name='stroke_team')

# Save to csv in the configured output folder (created if it does not exist),
# rather than a hard-coded 'output/' path
os.makedirs(paths.data_save_path, exist_ok=True)
stroke_teams.to_csv(
    os.path.join(paths.data_save_path, 'stroke_teams.csv'), index=False)

## Get age and gender

In [7]:
# Dictionary giving the numeric age assigned to each age-band column
# (the middle of the band where the band has two ends)
ages: dict = {'AgeUnder40': 37.5,
              'Age40to44': 42.5, 'Age45to49': 47.5,
              'Age50to54': 52.5, 'Age55to59': 57.5,
              'Age60to64': 62.5, 'Age65to69': 67.5,
              'Age70to74': 72.5, 'Age75to79': 77.5,
              'Age80to84': 82.5, 'Age85to89': 87.5,
              'AgeOver90': 92.5}

# Extract the age band indicator columns (explicit list of column names)
col_extract: pd.DataFrame = raw_data[list(ages)]

# idxmax returns the first column holding the row maximum, i.e. the flagged
# band. A row with no band flagged would otherwise fall back to the first
# column ('AgeUnder40'), so those rows are masked to NaN instead.
has_band: pd.Series = col_extract.max(axis=1) > 0
age_band: pd.Series = col_extract.idxmax(axis=1).where(has_band)

# Use that age band to find the numeric age (NaN when no band was flagged)
cleaned_data['age'] = age_band.map(ages)

In [8]:
# Encode gender as a binary 'male' flag; codes not in the dictionary
# (including missing values) become NaN
gender: dict = dict(M=1, F=0)
cleaned_data['male'] = raw_data['S1Gender'].map(gender)

## Get stroke type

Stroke type abbreviations: infarction (I), primary intracerebral haemorrhage (PIH), or unknown if not imaged (NaN).

In [9]:
# 1 for infarction, 0 for primary intracerebral haemorrhage; any other value
# (including missing) becomes NaN
infarction: dict = dict(I=1, PIH=0)
cleaned_data['infarction'] = raw_data['S2StrokeType'].map(infarction)

## Get timings

Onset to arrival time, precision, and whether stroke was during sleep.  
Abbreviations: precise (P), best estimate (BE), not known (NK), during sleep (DS).

In [10]:
# Lookup tables (codes not listed in a table map to NaN)
# - onset time known: precise or best estimate
onset_known: dict = {'P': 1, 'BE': 1, 'NK': 0}
# - onset time known precisely: best estimate and not known both count as 0
precise_onset_known: dict = {'P': 1, 'BE': 0, 'NK': 0}
# - onset during sleep
sleep: dict = {'DS': 1, 'P': 0, 'BE': 0}

onset_time_type = raw_data['S1OnsetTimeType']
onset_date_type = raw_data['S1OnsetDateType']

# Onset to arrival time in minutes, copied across unchanged
# NOTE(review): the recorded test run at the end of this notebook fails
# test_time_negative on this column - it contains values below zero
cleaned_data['onset_to_arrival_time'] = raw_data['OnsettoArrivalMinutes']

# Flags derived from the onset time type and onset date type
cleaned_data['onset_known'] = onset_time_type.map(onset_known)
cleaned_data['precise_onset_known'] = onset_time_type.map(precise_onset_known)
cleaned_data['onset_during_sleep'] = onset_date_type.map(sleep)

Ambulance timings.

In [11]:
# Arrive by ambulance ('Y' -> 1, 'N' -> 0, anything else -> NaN)
by_ambulance: dict = {'Y': 1, 'N': 0}
cleaned_data['arrive_by_ambulance'] = (
    raw_data['S1ArriveByAmbulance'].map(by_ambulance))

# Raw ambulance event columns, each expressed relative to hospital arrival
# NOTE(review): the subtraction order below yields positive durations only if
# these columns are negative for events before arrival - confirm against the
# SSNAP data dictionary
call_connected = raw_data['CallConnectedtoArrivalMinutes']
at_patient_location = raw_data['ArrivalPatientLocationtoArrivalMinutes']
left_patient_location = raw_data['DeparturePatientLocationtoArrivalMinutes']
wheels_stop = raw_data['WheelsStoptoArrivalMinutes']

# Time from call connected to ambulance arrival at patient location
cleaned_data['call_to_ambulance_arrival_time'] = (
    at_patient_location - call_connected)

# Time that ambulance is on scene at patient location
cleaned_data['ambulance_on_scene_time'] = (
    left_patient_location - at_patient_location)

# Ambulance travel time from patient location to hospital
cleaned_data['ambulance_travel_to_hospital_time'] = (
    wheels_stop - left_patient_location)

# Ambulance wait time at hospital
cleaned_data['ambulance_wait_time_at_hospital'] = 0 - wheels_stop

Day, month, year and time of arrival.

In [12]:
# Parse the 'Mon-yy' arrival string once, then take month and year from it
arrival_month_year = pd.to_datetime(
    raw_data['FirstArrivalMonthYear'], format='%b-%y')
cleaned_data['month'] = arrival_month_year.dt.month
cleaned_data['year'] = arrival_month_year.dt.year

# Weekday is copied across unchanged
cleaned_data['weekday'] = raw_data['FirstArrivalWeekday']

In [13]:
# Get arrival period (3 hour period during day), encoded as the hour at which
# the period starts: 0, 3, 6, ... 21. Labels not listed here map to NaN.
arrival_time_dict: dict = {
    # '0000to3000' is kept from the original mapping - presumably the spelling
    # used in the extract (TODO confirm). '0000to0300' is accepted as well so
    # that a correctly spelled label is not silently turned into NaN.
    '0000to3000': 0,
    '0000to0300': 0,
    '0300to0600': 3,
    '0600to0900': 6,
    '0900to1200': 9,
    '1200to1500': 12,
    '1500to1800': 15,
    '1800to2100': 18,
    # Start hour of the last period is 21 (was 24, which broke the pattern)
    '2100to2400': 21
}
cleaned_data['arrival_time_3_hour_period'] = (
    raw_data['FirstArrivalTime'].map(arrival_time_dict))

Whether thrombolysis or thrombectomy were performed, and then timings from arrival (to scan) to procedure. For thrombolysis, "No" includes when choice was "no" and when it was "no but was PIH" (primary intracerebral haemorrhage) - thrombolysis is used for ischaemic strokes but not haemorrhagic.

In [14]:
# Get arrival to scan time
cleaned_data['arrival_to_scan_time'] = raw_data['ArrivaltoBrainImagingMinutes']

# Get use of thrombolysis ('Y' -> 1; 'N' and 'NB' -> 0; anything else -> NaN)
thrombolysis: dict = {'Y': 1, 'N': 0, 'NB': 0}
cleaned_data['thrombolysis'] = raw_data['S2Thrombolysis'].map(thrombolysis)

# Get time from scan to thrombolysis
cleaned_data['scan_to_thrombolysis_time'] = (
    raw_data['ArrivaltoThrombolysisMinutes'] -
    raw_data['ArrivaltoBrainImagingMinutes'])

# Get use of thrombectomy: 1 if an arterial puncture time is recorded, 0 if
# it is NaN. Testing for presence (rather than `> 0`) means a recorded time of
# zero or below still counts as a thrombectomy.
arterial_puncture_time = raw_data['ArrivaltoArterialPunctureMinutes']
cleaned_data['thrombectomy'] = arterial_puncture_time.notna().astype(int)

# Get time from arrival to thrombectomy
cleaned_data['arrival_to_thrombectomy_time'] = arterial_puncture_time

## Add comorbidities

In [15]:
# Raw SSNAP comorbidity column -> column name used in the cleaned data
rename_dict: dict = {
    'S2CoMCongestiveHeartFailure': 'congestive_heart_failure',
    'S2CoMHypertension': 'hypertension',
    'S2CoMAtrialFibrillation': 'atrial_fibrillation',
    'S2CoMDiabetes': 'diabetes',
    'S2CoMStrokeTIA': 'prior_stroke_tia',
    'S2CoMAFAntiplatelet': 'antiplatelet_for_atrial_fibrillation',
    'S2CoMAFAnticoagulent': 'use_of_af_anticoagulants',
    'S2CoMAFAnticoagulentVitK': 'vit_k_anticoagulant_for_atrial_fibrillation',
    'S2CoMAFAnticoagulentDOAC': 'doac_anticoagulant_for_atrial_fibrillation',
    'S2CoMAFAnticoagulentHeparin': 'heparin_anticoagulant_for_atrial_fibrillation'}

# Raw column names, in the order they are added to the cleaned data
comorbidities: list = list(rename_dict)

# Encode each comorbidity as 1 when recorded as 'Y' and 0 otherwise (missing
# values included). Writing straight to the final column name means re-running
# this cell overwrites the columns instead of appending duplicates.
for raw_col, clean_col in rename_dict.items():
    cleaned_data[clean_col] = (raw_data[raw_col] == 'Y').astype(int)


## INR (blood clotting)

Insufficient data to be of use

## New AF diagnosis

Not used: All zero or empty

## Add rankin before stroke

In [16]:
# Disability before the stroke: the raw 'S2RankinBeforeStroke' value is
# copied across unchanged
cleaned_data['prior_disability'] = raw_data['S2RankinBeforeStroke']

## Add NIHSS data

In [17]:
def camel_to_snake(str):
    '''Convert a CamelCase string to snake_case.

    Every upper-case character is replaced by an underscore followed by its
    lower-case form; all other characters are kept. Leading underscores are
    then stripped from the result.

    Parameters
    ----------
    str : str
        Text to convert. (The name shadows the built-in ``str`` inside this
        function; it is kept so existing callers are unaffected.)

    Returns
    -------
    str
        The snake_case version, e.g. 'S2NihssArrival' -> 's2_nihss_arrival'.
    '''
    pieces = []
    for char in str:
        if char.isupper():
            pieces.append('_' + char.lower())
        else:
            pieces.append(char)
    return ''.join(pieces).lstrip('_')
               

In [18]:
# Overall stroke severity on arrival, copied across unchanged
cleaned_data['stroke_severity'] = raw_data['S2NihssArrival']

# List of NIHSS arrival measures
nihss: list = ['S2NihssArrivalLoc', 'S2NihssArrivalLocQuestions',
               'S2NihssArrivalLocCommands', 'S2NihssArrivalBestGaze',
               'S2NihssArrivalVisual', 'S2NihssArrivalFacialPalsy',
               'S2NihssArrivalMotorArmLeft', 'S2NihssArrivalMotorArmRight',
               'S2NihssArrivalMotorLegLeft', 'S2NihssArrivalMotorLegRight',
               'S2NihssArrivalLimbAtaxia', 'S2NihssArrivalSensory',
               'S2NihssArrivalBestLanguage', 'S2NihssArrivalDysarthria',
               'S2NihssArrivalExtinctionInattention']

# Presence of -1 in SSNAP indicates missing value, so a row is flagged
# complete (1) unless the smallest of its NIHSS items is -1.
# NOTE(review): NaN items are skipped by min(), so a row whose items are NaN
# rather than -1 is still flagged complete - confirm this is intended.
cleaned_data['nihss_complete'] = (
    raw_data[nihss].min(axis=1) != -1).astype(int)

# Add columns under their new names - snake case with the 's2_' prefix
# removed. Writing straight to the final name means re-running this cell
# overwrites the columns instead of appending duplicates.
rename_dict: dict = {x: camel_to_snake(x).split('s2_')[1] for x in nihss}
for raw_col, clean_col in rename_dict.items():
    cleaned_data[clean_col] = raw_data[raw_col]

## Add death and outcome data

In [19]:
# Add discharge destination

# Label for each discharge code; missing values are labelled 'missing'.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
rename_dict: dict = {
    'CH': 'care_home',
    'D': 'died',
    'H': 'home',
    'SE': 'somewhere_else',
    'TC': 'community_team_or_esd',
    'TCN': 'community_team_or_esd',
    'TN': 'non_ssnap_hospital_team',
    'T': 'ssnap_hospital_team',
    np.nan: 'missing'}

discharge_codes = raw_data['S7DischargeType']

# Fail loudly (KeyError, as before) if the data holds a code with no label
unknown_codes = set(discharge_codes.dropna().unique()) - set(rename_dict)
if unknown_codes:
    raise KeyError(
        f'Unrecognised S7DischargeType code(s): '
        f'{sorted(map(str, unknown_codes))}')

# Map codes to labels; fillna covers missing values without relying on NaN
# being matched as a dictionary key
cleaned_data['discharge_destination'] = (
    discharge_codes.map(rename_dict).fillna('missing'))

# Death and outcome

# 1.0 where a non-negative arrival-to-death interval is recorded, else 0.0
# (a missing interval compares as False)
cleaned_data['death'] = (raw_data['ArrivalToDeathDays'] >= 0) * 1.0
cleaned_data['discharge_disability'] = raw_data['S7RankinDischarge']
cleaned_data['disability_6_month'] = raw_data['S8Rankin6Month']

## Add reasons for no thrombolysis

In [20]:
# Raw columns giving the recorded reasons for not giving thrombolysis
no_thrombolysis: list = [
       'S2ThrombolysisNoButHaemorrhagic', 'S2ThrombolysisNoButTimeWindow',
       'S2ThrombolysisNoButComorbidity', 'S2ThrombolysisNoButMedication',
       'S2ThrombolysisNoButRefusal', 'S2ThrombolysisNoButAge',
       'S2ThrombolysisNoButImproving', 'S2ThrombolysisNoButTooMildSevere',
       'S2ThrombolysisNoButTimeUnknownWakeUp',
       'S2ThrombolysisNoButOtherMedical']

# Add columns under their new names - snake case with the 's2_' prefix
# removed. Writing straight to the final name means re-running this cell
# overwrites the columns instead of appending duplicates.
rename_dict: dict = {
    x: camel_to_snake(x).split('s2_')[1] for x in no_thrombolysis}
for raw_col, clean_col in rename_dict.items():
    cleaned_data[clean_col] = raw_data[raw_col]

## Save cleaned data

In [21]:
# Make sure the output folder exists so the save does not fail on a fresh
# checkout
os.makedirs(paths.data_save_path, exist_ok=True)

# Save the cleaned data, writing the DataFrame index as a column named 'id'
filename = os.path.join(paths.data_save_path, paths.data_save_filename)
cleaned_data.to_csv(filename, index_label='id')

## Run unit tests

These perform checks on the cleaned data, such as to check that:
* Counts are as expected (not erroneously including NaN when not desired)
* Shape of dataframe is consistent (no additional rows or columns added)
* Times are not negative

In [24]:
# Run the unit tests for the cleaned data.
# NOTE(review): the recorded output shows test_time_negative failing for
# 'onset_to_arrival_time' (the column contains values below zero) - decide
# whether those values should be handled above or the test expectation changed.
%run -i './tests/test_01_reformat_data.py'

.........F
FAIL: test_time_negative (__main__.DataTests)
Test that times are not negative when expected to be positive
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/amy/Documents/samuel_2_data_prep/tests/test_01_reformat_data.py", line 108, in test_time_negative
    self.time_neg('onset_to_arrival_time')
  File "/home/amy/Documents/samuel_2_data_prep/tests/test_01_reformat_data.py", line 60, in time_neg
    self.assertTrue(all(time_not_null >= 0))
AssertionError: False is not true

----------------------------------------------------------------------
Ran 10 tests in 6.816s

FAILED (failures=1)


SystemExit: True