# Set up SQLite database for data

In [7]:
import os
import pandas as pd
import sqlite3

from dataclasses import dataclass

# Let pandas display up to 100 columns before it truncates wide DataFrames
pd.options.display.max_columns = 100

## Set up paths and filenames

In [2]:
@dataclass(frozen=True)
class Paths:
    """Immutable container for the data and database locations used in this notebook.

    The default directories are resolved once, when the class body is executed,
    relative to the parent of the current working directory.

    Attributes:
        data_path: Directory holding the raw CSV file.
        database_path: Directory holding the SQLite database file.
        data_filename: File name of the raw CSV file.
        database_filename: File name of the SQLite database.
    """

    # Parent of the working directory; not annotated, so it is not a dataclass field
    _root = os.path.dirname(os.getcwd())

    data_path: str = os.path.join(_root, "data")
    database_path: str = os.path.join(_root, "sqlite")
    data_filename: str = "MichaelAllen_HQIP425_300323.csv"
    database_filename: str = "samuel.db"


# Shared instance read by the cells below
paths = Paths()

## Set up database

In [3]:
# Full path of the SQLite file, built once so the delete and create steps
# always refer to the same location
database_file = os.path.join(paths.database_path, paths.database_filename)

# Make sure the target directory exists: sqlite3.connect creates the database
# file but not missing parent directories
os.makedirs(paths.database_path, exist_ok=True)

# Delete old database if present so each run starts from an empty database
if os.path.exists(database_file):
    os.remove(database_file)
    print("Old database deleted")

# Create new database by opening a connection and closing it straight away
conn = sqlite3.connect(database_file)
conn.close()

Old database deleted


## Load data

In [25]:
# Location of the raw CSV file
csv_file = os.path.join(paths.data_path, paths.data_filename)

# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, so column dtypes are inferred from the whole column
raw_data: pd.DataFrame = pd.read_csv(csv_file, low_memory=False)

# Empty DataFrame that will receive the cleaned columns
cleaned_data: pd.DataFrame = pd.DataFrame()

# Preview the first 5 rows of the raw data
raw_data.head(5)

Unnamed: 0,TeamName,AgeUnder40,Age40to44,Age45to49,Age50to54,Age55to59,Age60to64,Age65to69,Age70to74,Age75to79,Age80to84,Age85to89,AgeOver90,S1Gender,OnsettoArrivalMinutes,S1OnsetDateType,S1OnsetTimeType,S1ArriveByAmbulance,FirstArrivalMonthYear,FirstArrivalWeekday,FirstArrivalTime,S2CoMCongestiveHeartFailure,S2CoMHypertension,S2CoMAtrialFibrillation,S2CoMDiabetes,S2CoMStrokeTIA,S2CoMAFAntiplatelet,S2CoMAFAnticoagulent,S2CoMAFAnticoagulentVitK,S2CoMAFAnticoagulentDOAC,S2CoMAFAnticoagulentHeparin,S2INR,S2INRHigh,S2INRNK,S2NewAFDiagnosis,S2RankinBeforeStroke,S2NihssArrival,S2NihssArrivalLoc,S2NihssArrivalLocQuestions,S2NihssArrivalLocCommands,S2NihssArrivalBestGaze,S2NihssArrivalVisual,S2NihssArrivalFacialPalsy,S2NihssArrivalMotorArmLeft,S2NihssArrivalMotorArmRight,S2NihssArrivalMotorLegLeft,S2NihssArrivalMotorLegRight,S2NihssArrivalLimbAtaxia,S2NihssArrivalSensory,S2NihssArrivalBestLanguage,S2NihssArrivalDysarthria,S2NihssArrivalExtinctionInattention,ArrivaltoBrainImagingMinutes,S2StrokeType,S2Thrombolysis,S2ThrombolysisNoReason,S2ThrombolysisNoButHaemorrhagic,S2ThrombolysisNoButTimeWindow,S2ThrombolysisNoButComorbidity,S2ThrombolysisNoButMedication,S2ThrombolysisNoButRefusal,S2ThrombolysisNoButAge,S2ThrombolysisNoButImproving,S2ThrombolysisNoButTooMildSevere,S2ThrombolysisNoButTimeUnknownWakeUp,S2ThrombolysisNoButOtherMedical,ArrivaltoThrombolysisMinutes,S2TIAInLastMonth,DoortoThrombolysisMinutes,S7DischargeType,ArrivalToDeathDays,S7StrokeUnitDeath,S7RankinDischarge,S8Rankin6Month,S8Rankin6MonthNK,CallConnectedtoArrivalMinutes,ArrivalPatientLocationtoArrivalMinutes,DeparturePatientLocationtoArrivalMinutes,WheelsStoptoArrivalMinutes,MobileDatatoArrivalMinutes,S1PreHospitalImpression
0,Northumbria Specialist Emergency Care Hospital...,0,0,0,0,0,0,0,0,1,0,0,0,M,3033,DS,BE,N,Jan-16,Friday,0900to1200,N,N,N,N,Y,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,183.0,I,NB,,0,0,0,0,0,0,1,1,0,0,,N,,H,,,1.0,,,,,,,,
1,Northumbria Specialist Emergency Care Hospital...,0,0,0,0,0,0,0,1,0,0,0,0,F,5395,BE,NK,N,Jan-16,Saturday,1500to1800,N,Y,N,N,N,,,,,,,,,,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,109.0,I,NB,,0,1,0,0,0,0,0,0,0,0,,,,TC,,,1.0,,,,,,,,
2,West Suffolk Hospital,0,0,0,0,0,0,0,0,0,0,1,0,F,713,P,NK,Y,Jan-16,Tuesday,0900to1200,N,Y,Y,Y,N,N,Y,,,,,,,,2,28,2,2,1,2,0,1,4,2,4,3,0,2,3,2,0,6.0,PIH,NB,,1,0,0,0,0,0,0,0,0,0,,,,D,4.0,Y,6.0,,0.0,,,,,,
3,West Suffolk Hospital,0,0,0,0,0,0,0,1,0,0,0,0,F,278,P,P,Y,Jan-16,Sunday,2100to2400,N,Y,N,N,N,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55.0,I,NB,,0,1,0,0,0,0,1,1,0,0,,,,H,,,1.0,0.0,0.0,,,,,,
4,Royal Victoria Infirmary,0,0,0,0,0,0,0,0,1,0,0,0,M,170,P,P,Y,Jan-16,Tuesday,0000to3000,N,Y,N,N,Y,,,,,,,,,,1,12,0,0,0,0,0,2,0,4,0,4,0,1,0,1,0,7.0,I,Y,,0,0,0,0,0,0,0,0,0,0,102.0,NK,,CH,,,2.0,4.0,0.0,,,,,,


In [15]:
# List the column labels of the raw data (returned as a pandas Index)
raw_data.columns

Index(['TeamName', 'AgeUnder40', 'Age40to44', 'Age45to49', 'Age50to54',
       'Age55to59', 'Age60to64', 'Age65to69', 'Age70to74', 'Age75to79',
       'Age80to84', 'Age85to89', 'AgeOver90', 'S1Gender',
       'OnsettoArrivalMinutes', 'S1OnsetDateType', 'S1OnsetTimeType',
       'S1ArriveByAmbulance', 'FirstArrivalMonthYear', 'FirstArrivalWeekday',
       'FirstArrivalTime', 'S2CoMCongestiveHeartFailure', 'S2CoMHypertension',
       'S2CoMAtrialFibrillation', 'S2CoMDiabetes', 'S2CoMStrokeTIA',
       'S2CoMAFAntiplatelet', 'S2CoMAFAnticoagulent',
       'S2CoMAFAnticoagulentVitK', 'S2CoMAFAnticoagulentDOAC',
       'S2CoMAFAnticoagulentHeparin', 'S2INR', 'S2INRHigh', 'S2INRNK',
       'S2NewAFDiagnosis', 'S2RankinBeforeStroke', 'S2NihssArrival',
       'S2NihssArrivalLoc', 'S2NihssArrivalLocQuestions',
       'S2NihssArrivalLocCommands', 'S2NihssArrivalBestGaze',
       'S2NihssArrivalVisual', 'S2NihssArrivalFacialPalsy',
       'S2NihssArrivalMotorArmLeft', 'S2NihssArrivalMotorArmRight

In [12]:
# Show the shape of the raw data as (number of rows, number of columns)
raw_data.shape

(360381, 81)

In [6]:
# Show the dtype pandas assigned to each column of the raw data
raw_data.dtypes


TeamName                                     object
AgeUnder40                                    int64
Age40to44                                     int64
Age45to49                                     int64
Age50to54                                     int64
Age55to59                                     int64
Age60to64                                     int64
Age65to69                                     int64
Age70to74                                     int64
Age75to79                                     int64
Age80to84                                     int64
Age85to89                                     int64
AgeOver90                                     int64
S1Gender                                     object
OnsettoArrivalMinutes                         int64
S1OnsetDateType                              object
S1OnsetTimeType                              object
S1ArriveByAmbulance                          object
FirstArrivalMonthYear                        object
FirstArrival

In [14]:
# Fraction of non-missing values in each column (1.0 means the column is complete)
completeness = raw_data.count().div(len(raw_data))
completeness

TeamName                                    1.000000
AgeUnder40                                  1.000000
Age40to44                                   1.000000
Age45to49                                   1.000000
Age50to54                                   1.000000
Age55to59                                   1.000000
Age60to64                                   1.000000
Age65to69                                   1.000000
Age70to74                                   1.000000
Age75to79                                   1.000000
Age80to84                                   1.000000
Age85to89                                   1.000000
AgeOver90                                   1.000000
S1Gender                                    1.000000
OnsettoArrivalMinutes                       1.000000
S1OnsetDateType                             1.000000
S1OnsetTimeType                             1.000000
S1ArriveByAmbulance                         1.000000
FirstArrivalMonthYear                       1.

## Extract ages

In [26]:
# Representative age (in years) recorded for each age-band indicator column.
# The closed bands use the band midpoint; 37.5 and 92.5 for the open-ended
# 'AgeUnder40' and 'AgeOver90' bands follow the same spacing and are an
# assumption rather than a true midpoint - TODO confirm.
ages: dict = {
    'AgeUnder40': 37.5,
    'Age40to44': 42.5,
    'Age45to49': 47.5,
    'Age50to54': 52.5,
    'Age55to59': 57.5,
    'Age60to64': 62.5,
    'Age65to69': 67.5,
    'Age70to74': 72.5,
    'Age75to79': 77.5,
    'Age80to84': 82.5,
    'Age85to89': 87.5,
    'AgeOver90': 92.5,
}

# Select the age-band indicator columns (explicit list of labels, in dict order)
col_extract: pd.DataFrame = raw_data[list(ages)]

# Label of the column holding the highest value in each row, i.e. the flagged band
age_band: pd.Series = col_extract.idxmax(axis=1)

# idxmax returns the first column when every indicator in a row is 0, which
# would silently record 37.5; rows with no flagged band are left missing instead
has_band: pd.Series = col_extract.max(axis=1) > 0

# Translate band labels to representative ages and store them in the cleaned data
cleaned_data['age'] = age_band.map(ages).where(has_band)

0