# Reformat data

This notebook reformats SAMueL ssnap extract v2.

## Import packages

In [1]:
# Linting: load the pycodestyle_magic extension and switch its
# pycodestyle checking on for the notebook
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
# Import packages
import numpy as np
import os
import pandas as pd

from dataclasses import dataclass

# Set the maximum number of columns and rows shown when displaying a
# DataFrame to 100 each
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

## Set up paths and filenames

In [3]:
@dataclass(frozen=True)
class Paths:
    '''Immutable container for the folder and file names used here.

    The dataclass is frozen, so attributes cannot be reassigned once an
    instance has been created. A single shared instance (paths) is created
    immediately below the class.
    '''

    # Folder and file name of the raw data extract that is read in
    data_path: str = './data'
    data_filename: str = 'SAMueL ssnap extract v2.csv'

    # Folder and file name used when saving the reformatted data
    data_save_path: str = './output'
    data_save_filename: str = 'reformatted_data.csv'

    # File name of the database
    database_filename: str = 'samuel.db'


paths = Paths()

## Load and view data

In [4]:
# Build the location of the raw extract from the shared paths object
raw_data_file: str = os.path.join(paths.data_path, paths.data_filename)

# Read the whole extract (low_memory=False is passed through to read_csv)
raw_data: pd.DataFrame = pd.read_csv(raw_data_file, low_memory=False)

# Empty DataFrame that the reformatted columns are added to
cleaned_data: pd.DataFrame = pd.DataFrame()

In [5]:
# Show shape of the data as (number of rows, number of columns)
raw_data.shape

(360381, 83)

In [6]:
# For every column, record its data type and the proportion of rows that
# hold a (non-missing) value
n_rows = len(raw_data)
data_types = raw_data.dtypes
completion = raw_data.count().div(n_rows)

# Combine into a single summary table and display it
data_info = pd.DataFrame({
    'Data type': data_types,
    'Proportion complete': completion})
data_info

Unnamed: 0,Data type,Proportion complete
PatientId,int64,1.0
ProClinV1Id,int64,1.0
TeamName,object,1.0
AgeUnder40,int64,1.0
Age40to44,int64,1.0
Age45to49,int64,1.0
Age50to54,int64,1.0
Age55to59,int64,1.0
Age60to64,int64,1.0
Age65to69,int64,1.0


## Get stroke team

Extract list of unique stroke teams - (1) add to cleaned dataset, and (2) save to a separate csv file.

In [7]:
# Add column with stroke team
cleaned_data['stroke_team'] = raw_data['TeamName']

# Extract the unique teams (in order of first appearance) and save to csv.
# The folder comes from paths so that this file is written to the same
# output location as the reformatted data
stroke_teams = pd.Series(
    cleaned_data['stroke_team'].unique(), name='stroke_team')
stroke_teams.to_csv(
    os.path.join(paths.data_save_path, 'stroke_teams.csv'), index=False)

## Get age and gender

In [8]:
# Dictionary defining numeric age as middle of age band
# NOTE(review): the open-ended bands (AgeUnder40, AgeOver90) have no true
# midpoint, so 37.5 and 92.5 are nominal values
ages: dict = {'AgeUnder40': 37.5,
              'Age40to44': 42.5, 'Age45to49': 47.5,
              'Age50to54': 52.5, 'Age55to59': 57.5,
              'Age60to64': 62.5, 'Age65to69': 67.5,
              'Age70to74': 72.5, 'Age75to79': 77.5,
              'Age80to84': 82.5, 'Age85to89': 87.5,
              'AgeOver90': 92.5}

# Extract the age band indicator columns (selected with an explicit list
# of column names)
col_extract: pd.DataFrame = raw_data[list(ages)]

# Find the age band flagged for each patient. idxmax returns the first
# column holding the row maximum, so a row with no band flagged would
# otherwise be labelled 'AgeUnder40' - such rows are left as missing
has_age_band: pd.Series = col_extract.max(axis=1) > 0
age_band: pd.Series = col_extract.idxmax(axis=1).where(has_age_band)

# Use that age band to find appropriate numeric age from ages dictionary
# (a missing age band gives a missing age)
cleaned_data['age'] = age_band.map(ages)

In [9]:
# Binary marker for male: 'M' -> 1 and 'F' -> 0 (values that are not in
# the dictionary become NaN)
gender: dict = dict(M=1, F=0)
recorded_gender: pd.Series = raw_data['S1Gender']
cleaned_data['male'] = recorded_gender.map(gender)

## Get stroke type

Stroke type abbreviations: infarction (I), primary intracerebral haemorrhage (PIH), or unknown if not imaged (NaN).

In [10]:
# Binary marker for infarction: 'I' -> 1 and 'PIH' -> 0 (values that are
# not in the dictionary become NaN)
infarction: dict = {'PIH': 0, 'I': 1}
stroke_type: pd.Series = raw_data['S2StrokeType']
cleaned_data['infarction'] = stroke_type.map(infarction)

## Get timings

Onset to arrival time, precision, and whether stroke was during sleep.  
Abbreviations: precise (P), best estimate (BE), not known (NK), during sleep (DS).

In [11]:
# Onset to arrival time in minutes
cleaned_data['onset_to_arrival_time'] = raw_data['OnsettoArrivalMinutes']

# Onset time type, which both of the onset time markers are derived from
onset_time_type: pd.Series = raw_data['S1OnsetTimeType']

# Onset time is treated as known if precise or a best estimate
onset_known: dict = {'P': 1, 'BE': 1, 'NK': 0}
cleaned_data['onset_known'] = onset_time_type.map(onset_known)

# Onset time is treated as precise only if recorded as precise (so not
# when it is a best estimate or not known)
precise_onset_known: dict = {'P': 1, 'NK': 0, 'BE': 0}
cleaned_data['precise_onset_known'] = onset_time_type.map(
    precise_onset_known)

# Stroke during sleep (taken from the onset date type, not the time type)
sleep: dict = {'DS': 1, 'BE': 0, 'P': 0}
onset_date_type: pd.Series = raw_data['S1OnsetDateType']
cleaned_data['onset_during_sleep'] = onset_date_type.map(sleep)

Ambulance timings.

In [12]:
# Arrive by ambulance (Y -> 1, N -> 0)
by_ambulance: dict = {'Y': 1, 'N': 0}
arrive_by_ambulance: pd.Series = raw_data['S1ArriveByAmbulance']
cleaned_data['arrive_by_ambulance'] = arrive_by_ambulance.map(by_ambulance)

# Raw ambulance timings, each expressed relative to arrival
# NOTE(review): the differences below are only positive durations if these
# values are negative for events before arrival - TODO confirm
call_connected: pd.Series = raw_data['CallConnectedtoArrivalMinutes']
at_patient: pd.Series = raw_data['ArrivalPatientLocationtoArrivalMinutes']
left_patient: pd.Series = (
    raw_data['DeparturePatientLocationtoArrivalMinutes'])
wheels_stop: pd.Series = raw_data['WheelsStoptoArrivalMinutes']

# Time from call connected to ambulance arrival at patient location
cleaned_data['call_to_ambulance_arrival_time'] = at_patient - call_connected

# Time that ambulance is on scene at patient location
cleaned_data['ambulance_on_scene_time'] = left_patient - at_patient

# Ambulance travel time from patient location to hospital
cleaned_data['ambulance_travel_to_hospital_time'] = (
    wheels_stop - left_patient)

# Ambulance wait time at hospital (from wheels stop until arrival)
cleaned_data['ambulance_wait_time_at_hospital'] = 0 - wheels_stop

Day, month, year and time of arrival.

In [13]:
# Parse the arrival month-year once (format '%b-%y': abbreviated month
# name and two-digit year), then take the month and year from it
arrival_month_year: pd.Series = pd.to_datetime(
    raw_data['FirstArrivalMonthYear'], format='%b-%y')
cleaned_data['month'] = arrival_month_year.dt.month
cleaned_data['year'] = arrival_month_year.dt.year

# Weekday is copied across as recorded in the raw data
cleaned_data['weekday'] = raw_data['FirstArrivalWeekday']

In [14]:
# Get arrival period (3 hour period during day), coded as the hour at
# which the period starts (0, 3, 6, ..., 21)
arrival_time_dict: dict = {
    '0000to0300': 0,
    # NOTE(review): '0000to3000' looks like a typo for '0000to0300'. It is
    # kept in case it matches how the raw data labels this period - TODO
    # confirm against raw_data['FirstArrivalTime'].unique()
    '0000to3000': 0,
    '0300to0600': 3,
    '0600to0900': 6,
    '0900to1200': 9,
    '1200to1500': 12,
    '1500to1800': 15,
    '1800to2100': 18,
    # Start hour (21), consistent with the other periods (not end hour 24)
    '2100to2400': 21
}
cleaned_data['arrival_time_3_hour_period'] = (
    raw_data['FirstArrivalTime'].map(arrival_time_dict))

Whether thrombolysis or thrombectomy were performed, and then timings from arrival (to scan) to procedure. For thrombolysis, "No" includes when the choice was "no" and when it was "no but" (NB), which is automatically selected when the stroke type is PIH (primary intracerebral haemorrhage) - thrombolysis is used for ischaemic strokes but not haemorrhagic strokes.

In [15]:
# Get arrival to scan time
cleaned_data['arrival_to_scan_time'] = raw_data['ArrivaltoBrainImagingMinutes']

# Get use of thrombolysis
# NB is the answer automatically selected if type of stroke is PIH
thrombolysis: dict = {'Y': 1, 'N': 0, 'NB': 0}
cleaned_data['thrombolysis'] = raw_data['S2Thrombolysis'].map(thrombolysis)

# Get time from scan to thrombolysis (difference between the two
# arrival-based times, so NaN if either is missing)
cleaned_data['scan_to_thrombolysis_time'] = (
    raw_data['ArrivaltoThrombolysisMinutes'] -
    raw_data['ArrivaltoBrainImagingMinutes'])

# Get use of thrombectomy (0 if arterial puncture time is NaN, 1 if it is
# a number). Uses the vectorised notna() rather than a row-by-row lambda
cleaned_data['thrombectomy'] = (
    raw_data['ArrivaltoArterialPunctureMinutes'].notna().astype('int64'))

# Get time from arrival to thrombectomy
cleaned_data['arrival_to_thrombectomy_time'] = (
    raw_data['ArrivaltoArterialPunctureMinutes'])

## Add comorbidities

Co-morbidities that were present prior to this admission, and medication that patient was on prior to this admission.

In [16]:
# Raw comorbidity and prior medication columns, with their new names
comorbidities: dict = {
    'S2CoMCongestiveHeartFailure': 'congestive_heart_failure',
    'S2CoMHypertension': 'hypertension',
    'S2CoMAtrialFibrillation': 'atrial_fibrillation',
    'S2CoMDiabetes': 'diabetes',
    'S2CoMStrokeTIA': 'prior_stroke_tia',
    'S2CoMAFAntiplatelet': 'afib_antiplatelet',
    'S2CoMAFAnticoagulent': 'afib_anticoagulant',
    'S2CoMAFAnticoagulentVitK': 'afib_vit_k_anticoagulant',
    'S2CoMAFAnticoagulentDOAC': 'afib_doac_anticoagulant',
    'S2CoMAFAnticoagulentHeparin': 'afib_heparin_anticoagulant'}

# Marker to change from Y/N to 1/0 (NB and missing values are set to 0)
comorbid_marker = {'Y': 1, 1: 1,
                   'N': 0, 'NB': 0, 0: 0, np.nan: 0}

# Add each column under its new name, converted to 1/0 as it is added
for raw_col, clean_col in comorbidities.items():
    cleaned_data[clean_col] = raw_data[raw_col].map(comorbid_marker)

## Add rankin before stroke

In [17]:
# Rankin score before stroke, copied across unchanged from the raw data
cleaned_data['prior_disability'] = raw_data['S2RankinBeforeStroke']

## Add NIHSS data

In [18]:
def camel_to_snake(str):
    '''Convert a CamelCase string to snake_case.

    Every uppercase character is replaced by an underscore followed by its
    lowercase form; all other characters are kept unchanged. Any leading
    underscores in the result are then removed.

    NB: the parameter name shadows the built-in str. It is kept so that
    existing calls are unaffected.

    Parameters
    ----------
    str : str
        Text to convert, e.g. 'S2NihssArrivalLoc'.

    Returns
    -------
    str
        Converted text, e.g. 's2_nihss_arrival_loc'.
    '''
    pieces = []
    for char in str:
        if char.isupper():
            pieces.append('_' + char.lower())
        else:
            pieces.append(char)
    return ''.join(pieces).lstrip('_')

In [19]:
# Stroke severity is NIHSS score on arrival
cleaned_data['stroke_severity'] = raw_data['S2NihssArrival']

# List of NIHSS arrival measures
nihss: list = ['S2NihssArrivalLoc', 'S2NihssArrivalLocQuestions',
               'S2NihssArrivalLocCommands', 'S2NihssArrivalBestGaze',
               'S2NihssArrivalVisual', 'S2NihssArrivalFacialPalsy',
               'S2NihssArrivalMotorArmLeft', 'S2NihssArrivalMotorArmRight',
               'S2NihssArrivalMotorLegLeft', 'S2NihssArrivalMotorLegRight',
               'S2NihssArrivalLimbAtaxia', 'S2NihssArrivalSensory',
               'S2NihssArrivalBestLanguage', 'S2NihssArrivalDysarthria',
               'S2NihssArrivalExtinctionInattention']

# Missing values are indicated by -1, so a row minimum of -1 across these
# columns means at least one measure is missing (marker 0); any other
# minimum gives marker 1
lowest_nihss_item: pd.Series = raw_data[nihss].min(axis=1)
cleaned_data['nihss_complete'] = (lowest_nihss_item != -1).astype('int64')

# New names - converted to snake case with the 's2_' removed
rename_dict: dict = {
    col: camel_to_snake(col).split('s2_')[1] for col in nihss}

# Add columns (exactly as are in raw data), then apply the new names
cleaned_data[nihss] = raw_data[nihss]
cleaned_data.rename(columns=rename_dict, inplace=True)

## Add death and outcome data

In [20]:
# Discharge destination (TC and TCN are given the same label; missing
# values are labelled 'missing'). Uses np.nan, as elsewhere in this
# notebook - the np.NaN alias is not available in NumPy 2.0 onwards
discharge: dict = {
    'CH': 'care_home',
    'D': 'died',
    'H': 'home',
    'SE': 'somewhere_else',
    'TC': 'community_team_or_esd',
    'TCN': 'community_team_or_esd',
    'TN': 'non_ssnap_hospital_team',
    'T': 'ssnap_hospital_team',
    np.nan: 'missing'}
cleaned_data['discharge_destination'] = (
    raw_data['S7DischargeType'].map(discharge))

# Death - if NaN then 0, if 0+ days (so if died) then 1
# (a comparison with NaN is False, and multiplying by 1 turns the
# True/False result into 1/0)
cleaned_data['death'] = (raw_data['ArrivalToDeathDays'] >= 0) * 1

# Outcome
cleaned_data['discharge_disability'] = raw_data['S7RankinDischarge']
cleaned_data['disability_6_month'] = raw_data['S8Rankin6Month']
# S8Rankin6MonthNK not included as implicit that not known
# if NaN in S8Rankin6Month

## Add reasons for no thrombolysis

In [21]:
# Raw columns giving the reasons for no thrombolysis
no_thrombolysis: list = [
       'S2ThrombolysisNoButHaemorrhagic', 'S2ThrombolysisNoButTimeWindow',
       'S2ThrombolysisNoButComorbidity', 'S2ThrombolysisNoButMedication',
       'S2ThrombolysisNoButRefusal', 'S2ThrombolysisNoButAge',
       'S2ThrombolysisNoButImproving', 'S2ThrombolysisNoButTooMildSevere',
       'S2ThrombolysisNoButTimeUnknownWakeUp',
       'S2ThrombolysisNoButOtherMedical']

# New names - converted to snake case with the 's2_' removed
rename_dict: dict = {
    col: camel_to_snake(col).split('s2_')[1] for col in no_thrombolysis}

# Add columns (as they are in the raw data), then apply the new names
cleaned_data[no_thrombolysis] = raw_data[no_thrombolysis]
cleaned_data.rename(columns=rename_dict, inplace=True)

## Temporary - add patient id and original variables

Adding patient ID so that there is a consistent identifier between the raw and cleaned datasets. Adding original variables to support exploring the presence of NaNs in 01_descriptive_analysis.ipynb.

In [22]:
# Identifier shared between the raw and cleaned data
cleaned_data['patient_id'] = raw_data['PatientId']

# Original variables used to calculate ambulance times, copied across
# under their raw names
original_ambulance_columns: list = [
    'CallConnectedtoArrivalMinutes',
    'ArrivalPatientLocationtoArrivalMinutes',
    'DeparturePatientLocationtoArrivalMinutes',
    'WheelsStoptoArrivalMinutes']
for col in original_ambulance_columns:
    cleaned_data[col] = raw_data[col]

## Save cleaned data

In [23]:
# Save the cleaned data to csv, at the save location set in paths. The
# DataFrame index is written as a column labelled 'id'
filename = os.path.join(paths.data_save_path, paths.data_save_filename)
cleaned_data.to_csv(filename, index_label='id')

## Variables that were not included

**INR (blood clotting)** - insufficient data to be of use

In [24]:
# S2INR: number of missing and number of available values
inr: pd.Series = raw_data['S2INR']
print('S2INR')
print('NaN: {0}'.format(inr.isnull().sum()))
print('Data available: {0}'.format(inr.count()))

# S2INRHigh and S2INRNK: counts of each value, including missing
for inr_col in ['S2INRHigh', 'S2INRNK']:
    print('')
    print(inr_col)
    print(raw_data[inr_col].value_counts(dropna=False).to_string())

S2INR
NaN: 351875
Data available: 8506

S2INRHigh
0.0    247974
NaN    112376
1.0        31

S2INRNK
0.0    247382
NaN    112376
1.0       623


**New AF diagnosis** - missing data
* Data dictionary says should only be answered if atrial_fibrillation is marked as no (i.e. 0). In all cases where that is 1, new AF diagnosis is missing, which is as anticipated
* However, it is also missing for 95809 cases where atrial_fibrillation is marked as 0.

In [25]:
# Count each combination of prior AF and new AF diagnosis (including
# missing values), ordered by new AF diagnosis
(raw_data[['S2CoMAtrialFibrillation', 'S2NewAFDiagnosis']]
 .value_counts(dropna=False)
 .reset_index(name='count')
 .sort_values(by='S2NewAFDiagnosis'))

Unnamed: 0,S2CoMAtrialFibrillation,S2NewAFDiagnosis,count
0,N,N,185842
3,N,Y,13133
1,N,,95809
2,Y,,65597


**Mobile data to arrival** - <mark>to explain</mark>

In [26]:
# Count rows where mobile data to arrival time is missing (True) and
# where it is present (False)
raw_data['MobileDatatoArrivalMinutes'].isnull().value_counts()

True     344310
False     16071
Name: MobileDatatoArrivalMinutes, dtype: int64

**TIA in the last month** - <mark>to explain</mark>

Should only be completed if 2.1.5 (had stroke/TIA) is marked as yes, but this is not the case.

In [27]:
# Count each combination of prior stroke/TIA and TIA in last month
# (including missing values), ordered by prior stroke/TIA
(raw_data[['S2CoMStrokeTIA', 'S2TIAInLastMonth']]
 .value_counts(dropna=False)
 .reset_index(name='count')
 .sort_values(by='S2CoMStrokeTIA'))

Unnamed: 0,S2CoMStrokeTIA,S2TIAInLastMonth,count
0,N,,268486
5,N,N,342
6,N,NK,42
1,Y,,64115
2,Y,N,22205
3,Y,NK,3845
4,Y,Y,1346


**Stroke unit death** - <mark>to explain</mark>

In [28]:
# Pair stroke unit death with a death marker (1 if arrival to death days
# is 0 or more, otherwise 0)
death_check = raw_data[['S7StrokeUnitDeath']].assign(
    death=(raw_data['ArrivalToDeathDays'] >= 0) * 1)

# Count each combination (including missing values), ordered by death
(death_check
 .value_counts(dropna=False)
 .reset_index(name='count')
 .sort_values(by='death'))

Unnamed: 0,S7StrokeUnitDeath,death,count
0,,0,310320
4,Y,0,4
1,Y,1,43292
2,,1,4489
3,N,1,2276


**Pre hospital impression** - <mark>to explain</mark>

In [29]:
# Show the 20 most frequent pre hospital impression values (including
# missing values)
raw_data['S1PreHospitalImpression'].value_counts(dropna=False).head(20)

NaN    252031
FP      76194
O       20090
FN      11711
NR        109
St         95
N          57
Ge         18
Ne         10
Ot          8
o           5
Co          5
Di          5
Fa          5
He          4
Un          4
Hy          3
Ab          3
Y           3
Sh          3
Name: S1PreHospitalImpression, dtype: int64

## Run unit tests

These perform checks on the cleaned data, such as to check that:
* Counts are as expected (i.e. the intended values have been mapped to 1 and to 0)
* Shape of dataframe is consistent (no additional rows or columns added)
* Times are not negative

In [30]:
# Run the unit test script. The -i flag runs it in this notebook's
# namespace, presumably so that the tests can use the variables above
%run -i './tests/test_01_reformat_data.py'

.........F........
FAIL: test_no_ambulance (__main__.DataTests)
Test that people who do not arrive by ambulance therefore have
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/amy/Documents/samuel_2_data_prep/tests/test_01_reformat_data.py", line 142, in test_no_ambulance
    self.assertEqual(len(amb_neg.index), 0)
AssertionError: 9 != 0

FAIL: test_time_negative (__main__.DataTests) [onset_to_arrival_time]
Test that times are not negative when expected to be positive
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/amy/Documents/samuel_2_data_prep/tests/test_01_reformat_data.py", line 130, in test_time_negative
    self.time_neg(col)
  File "/home/amy/Documents/samuel_2_data_prep/tests/test_01_reformat_data.py", line 56, in time_neg
    self.assertEqual(sum(self.clean[time_column] < 0), 0)
AssertionError: 2 != 0

FAIL: test_time_negative (__main

SystemExit: True