# Collate demographic data for each LSOA

This notebook collates admissions, travel time and demographic data for each LSOA and saves the result as `collated_data.csv` in the data folder.

In [1]:
import numpy as np
import pandas as pd

## Unzip required travel matrix file

In [2]:
import zipfile

# Unpack the zipped travel time matrix archive into the data folder
archive_path = "./data/lsoa_travel_time_matrix_calibrated.zip"
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path="./data/")

## Import admissions data

In [3]:
# Load admissions, indexed by the 'area' column, and lower-case the
# 'Admissions' column name
data = (
    pd.read_csv('./data/admissions_2017-2019.csv', index_col='area')
    .rename(columns={'Admissions': 'admissions'})
)

data.head(2)

Unnamed: 0_level_0,admissions
area,Unnamed: 1_level_1
Welwyn Hatfield 010F,0.666667
Welwyn Hatfield 012A,4.0


## Import units

In [4]:
# Load the table of stroke hospitals
units = pd.read_csv(filepath_or_buffer='./data/stroke_hospitals_2022.csv')

units.head(n=2)

Unnamed: 0,Postcode,Hospital_name,Use,Country,Strategic Clinical Network,Health Board / Trust,Stroke Team,SSNAP name,Admissions 21/22,Thrombolysis,ivt_rate,Easting,Northing,Neuroscience,30 England Thrombectomy Example,Notes
0,RM70AG,RM70AG,1,England,London SCN,Barking; Havering and Redbridge University Hos...,"Queen's Hospital, Romford",Queens Hospital Romford HASU,981,117,11.9,551118,187780,1,0,
1,E11BB,E11BB,1,England,London SCN,Barts Health NHS Trust,The Royal London Hospital,Royal London Hospital HASU,861,115,13.4,534829,181798,1,1,


Get postcodes of units in use (for IVT).

In [5]:
# Postcodes of the units whose 'Use' flag equals 1
mask = units['Use'].eq(1)
units_in_use = units.loc[mask, 'Postcode'].tolist()
units_in_use[:5]

['RM70AG', 'E11BB', 'SW66SX', 'SE59RW', 'BR68ND']

Get postcodes of units providing thrombectomy.

In [6]:
# Postcodes of the units whose 'Neuroscience' flag equals 1
mask = units['Neuroscience'].eq(1)
thromectomy_units = units.loc[mask, 'Postcode'].tolist()
thromectomy_units[:5]

['RM70AG', 'E11BB', 'SW66SX', 'SE59RW', 'SW170QT']

## Import travel time to stroke units

In [7]:
# Read the travel time matrix (rows indexed by the 'LSOA' column) and keep
# only the columns for units in use
travel_time = pd.read_csv(
    './data/lsoa_travel_time_matrix_calibrated.csv', index_col='LSOA'
).loc[:, units_in_use]
travel_time.head(2)

Unnamed: 0_level_0,RM70AG,E11BB,SW66SX,SE59RW,BR68ND,HA13UJ,SW170QT,NW12BU,DE223NE,NN15BD,...,LL137TD,LL572PW,CF144XW,CF479DT,CF311RQ,SY231ER,SA148QF,SA312AF,SA612PZ,SA66NL
LSOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
City of London 001A,34.3,11.7,26.7,18.1,39.6,32.1,27.8,13.8,142.9,82.6,...,202.0,276.2,165.4,190.2,180.5,256.8,210.6,227.8,259.0,203.1
City of London 001B,33.2,10.6,28.9,18.1,38.6,33.2,26.7,13.8,142.9,82.6,...,203.1,276.2,166.5,191.2,181.6,256.8,212.7,228.9,261.1,204.1


Get the closest IVT unit and the travel time to it for each LSOA (converting the resulting series to dataframes).

In [None]:
# Column label (unit postcode) holding the smallest travel time in each row,
# as a one-column dataframe
closest_unit = travel_time.idxmin(axis=1).to_frame(name='closest_ivt_unit')

In [None]:
# Smallest travel time in each row, as a one-column dataframe
closest_unit_time = travel_time.min(axis=1).to_frame(name='closest_ivt_time')

Merge into admissions

In [None]:
# Left-join the closest IVT unit and its travel time onto admissions by index
for frame in (closest_unit, closest_unit_time):
    data = data.merge(frame, how='left', left_index=True, right_index=True)
data.head(2)

Add travel times to thrombectomy units 

In [None]:
# Postcodes of units providing thrombectomy ('Neuroscience' flag equals 1).
# The travel time matrix has already been limited to the columns of units in
# use, so a thrombectomy unit that is not in use has no column there and
# selecting it would raise a KeyError. Only keep thrombectomy units that are
# also in use.
mask = (units['Neuroscience'] == 1) & (units['Use'] == 1)
thromectomy_units = list(units[mask]['Postcode'])

In [None]:
# Travel times restricted to the thrombectomy unit columns
mt_travel_time = travel_time[thromectomy_units]

# Closest thrombectomy unit (column label of the row minimum) and its time,
# each as a one-column dataframe
closest_thrombectomy_unit = mt_travel_time.idxmin(axis=1).to_frame(name='closest_mt_unit')
closest_thrombectomy_time = mt_travel_time.min(axis=1).to_frame(name='closest_mt_time')

In [None]:
# Left-join the closest thrombectomy unit and its travel time by index
for frame in (closest_thrombectomy_unit, closest_thrombectomy_time):
    data = data.merge(frame, how='left', left_index=True, right_index=True)
data.head(2)

Add the transfer time between each LSOA's closest IVT unit and its closest thrombectomy (MT) unit.

In [None]:
# Inter-hospital travel time table; rows are indexed by 'from_postcode'
inter_hospital_times = pd.read_csv(
    './data/inter_hospital_time_calibrated.csv',
    index_col='from_postcode',
)

In [None]:
# For every LSOA, look up the inter-hospital time from its closest IVT unit
# (row label) to its closest thrombectomy unit (column label).
# Iterating over the two columns directly avoids building a Series per row
# with iterrows, and a single .loc[row, col] lookup avoids the chained
# .loc[row][col] form, which materialises a whole matrix row per lookup.
transfer_times = [
    inter_hospital_times.loc[ivt_unit, mt_unit]
    for ivt_unit, mt_unit in zip(
        data['closest_ivt_unit'], data['closest_mt_unit'])
]

In [None]:
# Append the transfer times as a new column (one value per row, in row order)
data = data.assign(mt_transfer_time=transfer_times)
data.head(2)

In [None]:
# Keep a reference to the current index of the collated table; later cells
# pass it to set_index after merging in further data
lsoa_index = data.index
lsoa_index

## Add thrombolysis rate of closest IVT unit

In [None]:
# Thrombolysis rate ('ivt_rate') of every unit in use, with its postcode
mask = units['Use'] == 1
ivt_rate = units[mask][['Postcode', 'ivt_rate']]

# Look up the rate of each LSOA's closest IVT unit by postcode.
# A column-on-column merge would discard the LSOA index, which then has to be
# restored positionally. Mapping keeps the index untouched, cannot change the
# number or order of rows, and simply overwrites 'ivt_rate' if the cell is
# re-run. Postcodes with no match produce NaN, as a left merge would.
rate_by_postcode = ivt_rate.set_index('Postcode')['ivt_rate']
data['ivt_rate'] = data['closest_ivt_unit'].map(rate_by_postcode)

In [None]:
# Preview the first two rows of the collated table
data.head(2)

## Import deprivation demographics

In [None]:
# Identifier columns that are not wanted in the collated table
cols_to_drop = ['LSOA code (2011)','Local Authority District code (2019)']

# Load the IoD2019 file indexed by LSOA name, without the identifier columns
deprivation = pd.read_csv(
    './data/demographic/File_7_-_All_IoD2019_Scores__Ranks__Deciles_and_Population_Denominators_3.csv',
    index_col='LSOA name (2011)',
).drop(columns=cols_to_drop)

In [None]:
# Preview the first two rows of the deprivation table
deprivation.head(2)

In [None]:
# Left-join deprivation onto the collated table by index, then re-apply the
# stored LSOA index
data = data.merge(
    deprivation, how='left', left_index=True, right_index=True
).set_index(lsoa_index)

data.head(2)

## Import ethnicity

In [None]:
ethnicity = pd.read_csv('./data/demographic/ethnicity.csv', index_col='geography')

# Keep only the ethnicity columns for all people (no age/gender breakdown)
cols_to_include = [
    col for col in ethnicity.columns
    if 'Sex: All persons; Age: All categories' in col
]
extract = ethnicity[cols_to_include]

# Shorten each column name by dropping its first 44 characters
# (presumably the shared 'Sex: ...; Age: ...' prefix - TODO confirm)
rename_dict = {field: field[44:] for field in extract.columns}
extract = extract.rename(columns=rename_dict)

# Left-join onto the collated table by index, then re-apply the stored
# LSOA index
data = data.merge(
    extract, how='left', left_index=True, right_index=True
).set_index(lsoa_index)

data.head(2)

## Import health

In [None]:
health = pd.read_csv('./data/demographic/general_health.csv', index_col='geography')

# Keep only the health columns for all people (no age/gender breakdown)
cols_to_include = [
    col for col in health.columns
    if 'Sex: All persons; Age: All categories' in col
]
extract = health[cols_to_include]

# Shorten each column name by dropping its first 44 characters
# (presumably the shared 'Sex: ...; Age: ...' prefix - TODO confirm)
rename_dict = {field: field[44:] for field in extract.columns}
extract = extract.rename(columns=rename_dict)

# Left-join onto the collated table by index, then re-apply the stored
# LSOA index
data = data.merge(
    extract, how='left', left_index=True, right_index=True
).set_index(lsoa_index)

data.head(2)

## Import rural vs urban

In [None]:
# Code/identifier columns that are not wanted in the collated table
cols_to_drop = ['LSOA11CD','RUC11CD', 'FID']

# Load the rural/urban classification indexed by LSOA name ('LSOA11NM')
rural_urban = pd.read_csv(
    './data/demographic/Rural_Urban_Classification_2011_of_Lower_Layer_Super_Output_Areas_in_England_and_Wales.csv',
    index_col='LSOA11NM',
).drop(columns=cols_to_drop)

# Left-join onto the collated table by index, then re-apply the stored
# LSOA index
data = data.merge(
    rural_urban, how='left', left_index=True, right_index=True
).set_index(lsoa_index)

data.head(2)

## Import age demographics

In [None]:
ages = pd.read_csv('./data/demographic/mid_2018_persons.csv', index_col='LSOA')

# Take the 'All Ages' total out of the age table and store it (aligned on
# index) in the collated table
all_ages = ages.pop('All Ages')
data['All persons'] = all_ages

# Relabel the '90+' column as '91' so every column name parses as an integer
ages = ages.rename(columns={'90+': '91'})

# Sum the age columns into 5-year bands labelled by their lower bound.
# NOTE(review): bands run up to 95, but if '91' is the highest column after
# the rename then 'age band 95' sums no columns - confirm this is intended.
age_bands = pd.DataFrame()
for band in range(0, 96, 5):
    cols_to_get = [
        field for field in ages.columns if int(field) // 5 * 5 == band]
    age_bands[f'age band {band}'] = ages[cols_to_get].sum(axis=1)

# Left-join onto the collated table by index, then re-apply the stored
# LSOA index
data = data.merge(
    age_bands, how='left', left_index=True, right_index=True
).set_index(lsoa_index)
data.head(2)

In [None]:
ages = pd.read_csv('./data/demographic/mid_2018_females.csv', index_col='LSOA')

# Take the 'All Ages' total out of the age table and store it (aligned on
# index) in the collated table
all_ages = ages.pop('All Ages')
data['All females'] = all_ages

# Relabel the '90+' column as '91' so every column name parses as an integer
ages = ages.rename(columns={'90+': '91'})

# Sum the age columns into 5-year bands labelled by their lower bound.
# NOTE(review): bands run up to 95, but if '91' is the highest column after
# the rename then 'age band females 95' sums no columns - confirm intended.
age_bands = pd.DataFrame()
for band in range(0, 96, 5):
    cols_to_get = [
        field for field in ages.columns if int(field) // 5 * 5 == band]
    age_bands[f'age band females {band}'] = ages[cols_to_get].sum(axis=1)

# Left-join onto the collated table by index, then re-apply the stored
# LSOA index
data = data.merge(
    age_bands, how='left', left_index=True, right_index=True
).set_index(lsoa_index)
data.head(2)

In [None]:
ages = pd.read_csv('./data/demographic/mid_2018_males.csv', index_col='LSOA')

# Take the 'All Ages' total out of the age table and store it (aligned on
# index) in the collated table
all_ages = ages.pop('All Ages')
data['All males'] = all_ages

# Relabel the '90+' column as '91' so every column name parses as an integer
ages = ages.rename(columns={'90+': '91'})

# Sum the age columns into 5-year bands labelled by their lower bound.
# NOTE(review): bands run up to 95, but if '91' is the highest column after
# the rename then 'age band males 95' sums no columns - confirm intended.
age_bands = pd.DataFrame()
for band in range(0, 96, 5):
    cols_to_get = [
        field for field in ages.columns if int(field) // 5 * 5 == band]
    age_bands[f'age band males {band}'] = ages[cols_to_get].sum(axis=1)

# Left-join onto the collated table by index, then re-apply the stored
# LSOA index
data = data.merge(
    age_bands, how='left', left_index=True, right_index=True
).set_index(lsoa_index)
data.head(2)

## Save output

In [None]:
# Write the collated table to CSV, labelling the index column 'LSOA'
output_path = './data/collated_data.csv'
data.to_csv(output_path, index_label='LSOA')