# Match data to SAMueL-2 teams

Only keep the data that matches teams used in the SAMueL-2 project. This will mean keeping only the acute stroke units.

In [1]:
import os
import pandas as pd
import numpy as np  # for NaN

### Import data

In [2]:
# Location of the processed 2019 organisational audit key indicators.
# NOTE: despite its name, `file_excel` holds the name of a CSV file.
dir_files = '../data/organisational_audit/processed'
file_excel = 'processed_2019_portfolio_key_indicators_summary_numeric.csv'

In [3]:
# Read the audit key-indicator table into `df`.
path_to_file = os.path.join(dir_files, file_excel)
df = pd.read_csv(path_to_file)

In [4]:
# Preview the first three rows of the audit data.
df.head(3)

Unnamed: 0,Hospital names,acute_unit,bed1,bed3,ki_total,ki1,ki1_nurses6_wte,ki1_nurses7_wte,ki2,ki2_psych_wte,...,ki9_tia_outpatient_timescale_nextweekday,ki9_tia_outpatient_timescale_sameday,ki9_tia_outpatient_timescale_samedayexcludingweekends,ki9_tia_outpatient_timescale_withinmonth,ki9_tia_outpatient_timescale_withinweek,ki9_tia_outpatients_seen,ki10,ki10_management_chairmanofclinicalgovernance,ki10_management_executiveonboard,ki10_management_nonexecutiveonboard
0,Queens Hospital Romford HASU,1.0,1.0,0.0,6.0,1,2.73,1.14,0,0.48,...,0,1,0,0,0,1,1,1,1,1
1,Newham General Hospital,0.0,1.0,0.0,6.0,1,1.54,1.54,0,0.0,...,0,0,1,0,0,1,0,0,0,0
2,Royal London Hospital HASU,1.0,1.0,0.0,5.0,1,2.85,0.38,0,0.23,...,0,0,1,0,0,1,0,0,0,0


Hospital name lookup

In [5]:
# Location of the table that lists the names of each hospital.
# NOTE: despite its name, `file_excel` holds the name of a CSV file.
dir_files = '../data/organisational_audit/processed'
file_excel = 'hospital_names_trusts.csv'

In [6]:
# Read the hospital name lookup table into `df_team_names`.
path_to_file = os.path.join(dir_files, file_excel)
df_team_names = pd.read_csv(path_to_file)

In [7]:
# Show the hospital name lookup table.
df_team_names

Unnamed: 0,hospital_name_1,hospital_name_2,hospital_name_3,scn,trust,site_name
0,Queens Hospital Romford HASU,Queens Hospital Romford SU,,London,"Barking, Havering and Redbridge University Hos...","Barking, Havering and Redbridge University Hos..."
1,Newham General Hospital,,,London,Barts Health NHS Trust,Barts Health NHS Trust (Newham University Hosp...
2,Royal London Hospital HASU,Royal London Hospital SU,,London,Barts Health NHS Trust,Barts Health NHS Trust (Royal London Hospital)
3,Whipps Cross University Hospital,,,London,Barts Health NHS Trust,Barts Health NHS Trust (Whipps Cross Universit...
4,Charing Cross Hospital HASU,Charing Cross Hospital SU,,London,Imperial College Healthcare NHS Trust,Imperial College Healthcare NHS Trust
...,...,...,...,...,...,...
166,Altnagelvin Hospital,,,Northern Ireland,Western Health and Social Care Trust,Western Health and Social Care Trust (Altnagel...
167,South West Acute Hospital,,,Northern Ireland,Western Health and Social Care Trust,Western Health and Social Care Trust (South We...
168,Noble's Hospital,,,Islands,Isle of Man Department of Health and Social Care,Isle of Man Department of Health and Social Care
169,Walton Centre Stroke Team,,,North West Coast,Walton Centre NHS Foundation Trust,Walton Centre NHS Foundation Trust


SAMueL-2 results - anonymised SHAP values

In [8]:
# Location of the SAMueL-2 results, keyed by anonymised stroke team ID.
# NOTE: despite its name, `file_excel` holds the name of a CSV file.
dir_files = '../data_samuel2/'
file_excel = 'benchmark_codes.csv'

In [9]:
# Read the anonymised SHAP results into `df_shap`.
path_to_file = os.path.join(dir_files, file_excel)
df_shap = pd.read_csv(path_to_file)

In [10]:
# Show the SHAP results table (stroke team ID, mean SHAP value, rank).
df_shap

Unnamed: 0,stroke_team_id,shap_mean,Rank
0,102,1.341623,1
1,72,1.111925,2
2,8,0.997945,3
3,71,0.949360,4
4,37,0.905034,5
...,...,...,...
114,117,-0.688748,115
115,58,-0.820926,116
116,65,-0.832516,117
117,26,-1.062729,118


SAMueL-2 results - link anonymised SHAP names to real names

In [11]:
# Location of the table linking stroke team names to their codes.
# NOTE: despite its name, `file_excel` holds the name of a CSV file.
dir_files = '../data_samuel2/'
file_excel = 'team_code.csv'

In [12]:
# Read the stroke team name-to-code table into `df_codes`.
path_to_file = os.path.join(dir_files, file_excel)
df_codes = pd.read_csv(path_to_file)

In [13]:
# Check the column names of the team code table.
df_codes.columns

Index(['stroke_team', 'team_code'], dtype='object')

In [14]:
# Check the index (and so the number of rows) of the team code table.
df_codes.index

RangeIndex(start=0, stop=119, step=1)

In [15]:
# Give the code column the same name as the ID column in the SHAP
# results so that the two tables can later be merged on it.
df_codes = df_codes.rename({'team_code': 'stroke_team_id'}, axis='columns')

## Remove unwanted data

Several rows under Key Indicator 3 are not mentioned in the matching report and don't seem to be important. They were used as a sanity check that Key Indicator 3 was actually met and that people haven't just answered "yes" to having out-of-hours nurses when the rest of the numbers don't back that up. They don't seem to belong with the final data.

In [16]:
# Key Indicator 3 sanity-check columns that are not wanted in the
# final data.
cols_to_remove = [
    'ki3_strokenurse_bed1_weekdays10pm',
    'ki3_strokenurse_bed1_saturdays',
    'ki3_strokenurse_bed1_saturdays10pm',
    'ki3_strokenurse_bed1_sundays',
    'ki3_strokenurse_bed1_sundays10pm',
    'ki3_strokenurse_bed3_weekdays10pm',
    'ki3_strokenurse_bed3_saturdays',
    'ki3_strokenurse_bed3_saturdays10pm',
    'ki3_strokenurse_bed3_sundays',
    'ki3_strokenurse_bed3_sundays10pm',
]

# `df` is overwritten here, so on a second run of this cell the columns
# are already gone. errors='ignore' makes the drop a no-op in that case
# instead of raising KeyError, which keeps the cell safe to re-run.
df = df.drop(columns=cols_to_remove, errors='ignore')

## Keep only wanted teams

In [17]:
# Sorted list of every stroke team name in the SAMueL-2 team code table.
teams_to_keep = sorted(df_codes['stroke_team'].values)

Find each team's name as used in the audit data (some hospitals have multiple names, so not every SAMueL-2 name will match directly):

In [18]:
def find_main_hospital_name(
        df_hospital_names: pd.DataFrame,
        name_to_look_up: str,
        column_main_name: str = 'hospital_name_1',
        missing_name: str = '?'
        ) -> str:
    """
    Look up the main name of a hospital from any of its known names.

    Every column of `df_hospital_names` is compared with
    `name_to_look_up` using an exact, case-sensitive match. If more
    than one row contains the name, the first matching row is used.

    Inputs
    ------
    df_hospital_names - pd.DataFrame. One row per hospital. Any column
                        may contain the name being looked up.
    name_to_look_up   - str. The name to search for.
    column_main_name  - str. Column whose value is returned for the
                        matching row.
    missing_name      - str. Value returned when no row contains
                        the name. Defaults to '?'.

    Returns
    -------
    main_hospital_name - The value of `column_main_name` in the first
                         matching row, or `missing_name` if there is
                         no matching row.
    """
    # One True/False per row: True when any cell in that row
    # exactly equals the name being looked up.
    row_matches = df_hospital_names.eq(name_to_look_up).any(axis='columns')
    # Main names of all matching rows (usually zero or one):
    main_names = df_hospital_names.loc[row_matches, column_main_name]
    if main_names.empty:
        # The name does not appear anywhere in the table.
        return missing_name
    return main_names.iloc[0]

In [19]:
# Audit name for each SAMueL-2 team, in the same order as
# `teams_to_keep`. Teams with no match get the placeholder
# returned by find_main_hospital_name.
teams_to_keep_current_names = [
    find_main_hospital_name(df_team_names, team_name)
    for team_name in teams_to_keep
]

In [20]:
# Two-column lookup table: the SAMueL-2 name beside the audit name.
names_side_by_side = np.column_stack(
    (teams_to_keep, teams_to_keep_current_names))
df_team_lookup = pd.DataFrame(
    names_side_by_side,
    columns=['team_samuel', 'team_audit'],
)

In [21]:
# Show the teams whose audit name differs from their SAMueL-2 name.
names_differ = df_team_lookup['team_audit'].ne(df_team_lookup['team_samuel'])
df_team_lookup.loc[names_differ]

Unnamed: 0,team_samuel,team_audit
24,Grange University Hospital,?
28,Invicta Ward Kent and Canterbury Hospital,?
102,University Hospitals Dorset Stroke Service,?
111,Wirral Arrowe Park Hospital,?


Manually update the missing team names

In [22]:
# Hand-picked audit names for the SAMueL-2 teams that were not matched
# automatically. '?' is kept for teams with no match in the audit data.
manual_names = {
    'Wirral Arrowe Park Hospital': 'Arrowe Park Hospital',
    'Invicta Ward Kent and Canterbury Hospital': '?',  # This doesn't seem to be in the 2019 audit data.
    'Grange University Hospital': '?',  # No match found for this team either.
    'University Hospitals Dorset Stroke Service': 'Poole Hospital',
}

In [23]:
# Overwrite the audit name of each team listed in `manual_names`.
for samuel_name, audit_name in manual_names.items():
    is_this_team = df_team_lookup['team_samuel'].eq(samuel_name)
    df_team_lookup.loc[is_this_team, 'team_audit'] = audit_name

In [24]:
# Show again the teams whose audit name differs from their SAMueL-2
# name, now that the manual names have been applied.
names_differ = df_team_lookup['team_audit'].ne(df_team_lookup['team_samuel'])
df_team_lookup.loc[names_differ]

Unnamed: 0,team_samuel,team_audit
24,Grange University Hospital,?
28,Invicta Ward Kent and Canterbury Hospital,?
102,University Hospitals Dorset Stroke Service,Poole Hospital
111,Wirral Arrowe Park Hospital,Arrowe Park Hospital


Two of the teams don't seem to exist in the audit data so remove them.

In [25]:
# Keep only the teams that have an audit name (i.e. not the '?' placeholder).
has_audit_name = df_team_lookup['team_audit'].ne('?')
df_team_lookup = df_team_lookup.loc[has_audit_name].copy()

## Link stroke team names to their IDs

In [26]:
# Attach each team's ID by matching on the SAMueL-2 name, then discard
# the duplicate name column that the merge brings in from `df_codes`.
df_team_lookup = (
    df_team_lookup
    .merge(df_codes, how='left',
           left_on='team_samuel', right_on='stroke_team')
    .drop(columns='stroke_team')
)

## Merge in SHAP values

In [27]:
# Attach the SHAP results to each team via the shared ID column.
# 'left' keeps every team in the lookup table.
df_teams_shap = df_team_lookup.merge(
    df_shap, how='left', on='stroke_team_id')

## Merge audit data and SHAP values

In [28]:
# Join the audit rows to the team/SHAP table on the audit team name.
# 'inner' to only keep names in the SAMueL-2 data.
df_audit_shap = df.merge(
    df_teams_shap,
    how='inner',
    left_on='Hospital names',
    right_on='team_audit',
)

In [29]:
# 'team_audit' duplicates 'Hospital names' after the merge, so remove it.
# `df_audit_shap` is overwritten here, so on a second run of this cell
# the column is already gone. errors='ignore' makes the drop a no-op in
# that case instead of raising KeyError.
df_audit_shap = df_audit_shap.drop(columns='team_audit', errors='ignore')

In [30]:
# List the columns of the merged table.
df_audit_shap.columns

Index(['Hospital names', 'acute_unit', 'bed1', 'bed3', 'ki_total', 'ki1',
       'ki1_nurses6_wte', 'ki1_nurses7_wte', 'ki2', 'ki2_psych_wte', 'ki3',
       'ki3_strokenurse_outofhours', 'ki4',
       'ki4_minimum_nurse_bed1_saturdays10am',
       'ki4_minimum_nurse_bed1_sundays10am',
       'ki4_minimum_nurse_bed3_saturdays10am',
       'ki4_minimum_nurse_bed3_sundays10am', 'ki5',
       'ki5_occupationaltherapy_7days', 'ki5_physiotherapy_7days',
       'ki5_speechlanguagetherapy_7days', 'ki6', 'ki6_prealert_consultant',
       'ki6_prealert_fastpositive_sometimes', 'ki6_prealert_fastpositive_yes',
       'ki6_prealert_ivtcandidates_sometimes',
       'ki6_prealert_ivtcandidates_yes', 'ki6_prealert_juniordoctor',
       'ki6_prealert_othersuspectedstroke_sometimes',
       'ki6_prealert_othersuspectedstroke_yes', 'ki6_prealert_strokenurse',
       'ki7', 'ki7_percent_access_to_stroke_esd_team', 'ki7_stroke_esd_team',
       'ki8', 'ki8_patientcarersurvey_frequency_1to2peryear',
      

## Save merged file

In [31]:
# Write the merged audit + SHAP table next to the other processed
# audit files, without the row index.
dir_files = '../data/organisational_audit/processed'
file_output = 'processed_2019_portfolio_key_indicators_summary_numeric_shap.csv'
df_audit_shap.to_csv(os.path.join(dir_files, file_output), index=False)