# Clean stroke hospitals

## Summary

This notebook cleans `stroke_hospitals_2022.csv` by matching each stroke team in our SSNAP dataset to its hospital record, adding a `data_team` column that holds the team name as it appears in the SSNAP data.

## Set up

In [1]:
# Import packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

# Linting
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
# File locations used by this notebook
@dataclass(frozen=True)
class Paths:
    '''Immutable collection of the folders and filenames used here.'''

    # Raw input: folder and the hospital list to be cleaned
    raw_path: str = './data'
    data_filename: str = 'stroke_hospitals_2022.csv'

    # Cleaned data: folder, the file this notebook saves, and the SSNAP
    # extract it loads
    clean_path: str = './output'
    save_filename: str = 'clean_stroke_hospitals_2022.csv'
    ssnap_filename: str = 'clean_samuel_ssnap_extract_v2.csv'

    # Database filename
    database_filename: str = 'samuel.db'


paths = Paths()

In [3]:
# Build the file locations, then read the raw hospital list and the
# cleaned SSNAP extract
hospitals_file = os.path.join(paths.raw_path, paths.data_filename)
ssnap_file = os.path.join(paths.clean_path, paths.ssnap_filename)

hospitals: pd.DataFrame = pd.read_csv(hospitals_file, low_memory=False)
ssnap: pd.DataFrame = pd.read_csv(ssnap_file)

## Preview hospitals dataframe

In [4]:
# Show the column names, followed by the first rows of the dataframe
display(hospitals.columns, hospitals.head())

Index(['Postcode', 'Hospital_name', 'Use_IVT', 'Use_MT', 'Use_MSU', 'Country',
       'Strategic Clinical Network', 'Health Board / Trust', 'Stroke Team',
       'SSNAP name', 'Admissions 21/22', 'Thrombolysis', 'ivt_rate', 'Easting',
       'Northing', 'long', 'lat', 'Neuroscience',
       '30 England Thrombectomy Example', 'hospital_city', 'Notes'],
      dtype='object')

Unnamed: 0,Postcode,Hospital_name,Use_IVT,Use_MT,Use_MSU,Country,Strategic Clinical Network,Health Board / Trust,Stroke Team,SSNAP name,...,Thrombolysis,ivt_rate,Easting,Northing,long,lat,Neuroscience,30 England Thrombectomy Example,hospital_city,Notes
0,RM70AG,RM70AG,1,1,1,England,London SCN,Barking,Havering and Redbridge University Hospitals N...,Queens Hospital Romford HASU,...,117.0,11.9,551118,187780,0.179031,51.568647,1,0,Romford,
1,E11BB,E11BB,1,1,1,England,London SCN,Barts Health NHS Trust,The Royal London Hospital,Royal London Hospital HASU,...,115.0,13.4,534829,181798,-0.058133,51.519018,1,1,Royal London,
2,SW66SX,SW66SX,1,1,1,England,London SCN,Imperial College Healthcare NHS Trust,"Charing Cross Hospital, London",Charing Cross Hospital HASU,...,113.0,9.9,524226,176487,-0.212736,51.473717,1,1,Charing Cross,
3,SE59RW,SE59RW,1,1,1,England,London SCN,King's College Hospital NHS Foundation Trust,"King's College Hospital, London",King's College Hospital HASU,...,124.0,15.0,532536,176228,-0.093251,51.469505,1,0,Kings College,
4,BR68ND,BR68ND,1,0,0,England,London SCN,King's College Hospital NHS Foundation Trust,Princess Royal University Hospital,Princess Royal University Hospital HASU,...,113.0,13.3,543443,165032,0.059146,51.366243,0,0,Princess Royal,


## Add teams column to match data

In [5]:
# Merge hospitals with the stroke teams in the SSNAP data.
# Missing team names are dropped first: pandas merge matches null keys to
# each other, so a NaN team would otherwise be joined to every hospital
# that has no 'SSNAP name'.
data_names = pd.DataFrame(
    {'data_team': ssnap['stroke_team'].dropna().unique()})
# Left join keeps one or more rows per SSNAP team; hospital columns are
# left empty when no hospital has a matching 'SSNAP name'
raw_teams = data_names.merge(
    hospitals, left_on='data_team', right_on='SSNAP name', how='left')
raw_teams.shape

(121, 22)

In [6]:
# Identify duplicates: SSNAP teams that matched more than one hospital
is_duplicate = raw_teams.duplicated(subset=['data_team'], keep=False)
display(raw_teams.loc[
    is_duplicate,
    ['data_team', 'Postcode', 'Country', 'Strategic Clinical Network',
     'Health Board / Trust', 'Stroke Team', 'SSNAP name']])

# Remove incorrect matches, identified by their 'Stroke Team' name
# NOTE(review): this filter applies to every row, not only the duplicates
# displayed above - confirm neither hospital is a team's only match
incorrect_matches = [
    'Princess Royal Hospital, Haywards Heath',
    'Pinderfields Hospital, Wakefield']
teams = raw_teams[~raw_teams['Stroke Team'].isin(incorrect_matches)]

# Check the removal left one row per SSNAP team
assert teams['data_team'].is_unique, (
    'Some SSNAP teams still match more than one hospital')
teams.shape

Unnamed: 0,data_team,Postcode,Country,Strategic Clinical Network,Health Board / Trust,Stroke Team,SSNAP name


(119, 22)

In [7]:
# Identify teams not merged (no 'SSNAP name' after the merge)
not_merged = teams['SSNAP name'].isnull()
display(teams.loc[not_merged, 'data_team'])

Series([], Name: data_team, dtype: object)

## Save data

In [10]:
# Write the cleaned table to the output folder, leaving out the index
save_file = os.path.join(paths.clean_path, paths.save_filename)
teams.to_csv(save_file, index=False)