# Recreate national results from the new reduced summary data

In [1]:
import os
import pandas as pd
import numpy as np  # for NaN

### Import data

In [2]:
# Relative path to the folder holding the processed organisational-audit files.
dir_files = '../data/organisational_audit/processed'
# Per-hospital numeric summary. NOTE: despite the variable name this is a CSV
# file; it is loaded with pd.read_csv in the next cell.
file_excel = 'processed_2019_portfolio_key_indicators_summary_numeric.csv'

In [3]:
df = pd.read_csv(os.path.join(dir_files, file_excel), index_col=0)

In [4]:
df

Unnamed: 0_level_0,acute_unit,bed1,bed3,ki_total,ki1,ki1_nurses6_wte,ki1_nurses7_wte,ki2,ki2_psych_wte,ki3,...,ki9_tia_outpatient_timescale_nextweekday,ki9_tia_outpatient_timescale_sameday,ki9_tia_outpatient_timescale_samedayexcludingweekends,ki9_tia_outpatient_timescale_withinmonth,ki9_tia_outpatient_timescale_withinweek,ki9_tia_outpatients_seen,ki10,ki10_management_chairmanofclinicalgovernance,ki10_management_executiveonboard,ki10_management_nonexecutiveonboard
Hospital names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Queens Hospital Romford HASU,1.0,1.0,0.0,6.0,1,2.73,1.14,0,0.48,1,...,0,1,0,0,0,1,1,1,1,1
Newham General Hospital,0.0,1.0,0.0,6.0,1,1.54,1.54,0,0.00,1,...,0,0,1,0,0,1,0,0,0,0
Royal London Hospital HASU,1.0,1.0,0.0,5.0,1,2.85,0.38,0,0.23,1,...,0,0,1,0,0,1,0,0,0,0
Whipps Cross University Hospital,1.0,0.0,1.0,3.0,1,2.63,0.53,0,0.00,0,...,0,0,1,0,0,1,0,0,0,0
Charing Cross Hospital HASU,1.0,1.0,1.0,5.0,1,3.67,0.44,0,0.27,1,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Craigavon Area Hospital,1.0,0.0,1.0,2.0,1,2.63,0.26,0,0.00,0,...,0,0,0,0,1,1,0,0,0,0
Daisy Hill Hospital,1.0,0.0,1.0,2.0,0,1.50,0.33,0,0.00,0,...,0,0,0,0,0,0,0,0,0,0
Altnagelvin Hospital,1.0,1.0,0.0,3.0,0,1.20,0.40,0,0.00,0,...,0,0,1,0,0,1,0,0,0,0
South West Acute Hospital,1.0,1.0,0.0,3.0,0,1.56,0.56,0,0.00,0,...,0,0,0,0,1,1,0,0,0,0


In [5]:
# Same folder as for the numeric summary above (re-assigned to the same value).
dir_files = '../data/organisational_audit/processed'
# `file_excel` is re-pointed at the national summary (also a CSV), so from here
# on it no longer names the per-hospital file.
file_excel = 'processed_2019_portfolio_key_indicators_summary_national.csv'

In [6]:
df_national = pd.read_csv(os.path.join(dir_files, file_excel), index_col=0)

In [7]:
df_national

Unnamed: 0,ki_total,ki1,ki1_nurses6_wte,ki1_nurses7_wte,ki2,ki2_psych_wte,ki3,ki3_strokenurse_outofhours,ki3_strokenurse_bed1_weekdays10pm,ki3_strokenurse_bed1_saturdays,...,ki8,ki8_patientcarersurvey_frequency,ki9,ki9_tia_brainimaging_mostused,ki9_tia_outpatients_seen,ki9_tia_outpatient_timescale,ki10,ki10_management_executiveonboard,ki10_management_nonexecutiveonboard,ki10_management_chairmanofclinicalgovernance
National Results,1: 5% (8/169)_x000D_\n2: 7% (12/169)_x000D_\n3...,58% (98/169),1.9 (1.4-2.9)_x000D_\nMedian (IQR),0.5 (0.4-0.8)_x000D_\nMedian (IQR),7% (12/169),0.1 (0-0.3)_x000D_\nMedian (IQR),71% (101/142),71% (101/142),215,243,...,56% (95/169),Never: 21% (35/169)\nLess than once a year: 23...,33% (56/169),Computed Tomography: 50% (85/169)\nMagnetic Re...,99% (160/162),The same day (7 days a week): 31% (50/160)_x00...,63% (106/169),58% (98/169),17% (28/169),25% (43/169)


## Generic functions for checks

In [8]:
def check_total(df, df_national, ki_str):
    """
    Print the published national figure next to the one rebuilt from `df`.

    The rebuilt figure is the sum of column `ki_str` over the rows of `df`,
    shown as a whole-number percentage of the number of rows followed by
    "(sum/rows)". Any masking (e.g. acute units only) must be applied to
    `df` by the caller before calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the column `ki_str`.
    df_national : pandas.DataFrame
        National summary; only the first entry of column `ki_str` is read.
    ki_str : str
        Name of the column to compare.

    Returns
    -------
    None
        The two lines are printed, nothing is returned.
    """
    # Published value first, exactly as stored in the national table.
    print('National results: {}'.format(df_national[ki_str].values[0]))

    # Same "PP% (count/total)" layout as the national strings.
    n_met = df[ki_str].sum()
    n_rows = len(df)
    print('From full data  : {:.0f}% ({}/{})'.format(
        (n_met / n_rows) * 100.0, n_met, n_rows))

def check_iqr(df, df_national, ki_str='', ki_cols=None):
    """
    Display the national entries beside quantiles computed from `df`.

    For each selected column the displayed table has one row, holding the
    entry from `df_national` followed by the 0.5, 0.25 and 0.75 quantiles
    of that column in `df` (i.e. median, lower and upper quartile).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to compute the quantiles from. Any masking must already have
        been applied by the caller.
    df_national : pandas.DataFrame
        National summary. It must contain every selected column, otherwise
        a KeyError is raised.
    ki_str : str, default ''
        Column-name prefix. Only used when `ki_cols` is empty or None: every
        column of `df` starting with this prefix is selected, except a
        column named exactly `ki_str`.
    ki_cols : list of str, optional
        Explicit columns to check. Takes precedence over `ki_str`.

    Returns
    -------
    None
        The table is shown with `display` (provided by the notebook).
    """
    if ki_cols is None or len(ki_cols) == 0:
        # No explicit list: select by prefix, leaving out the indicator
        # column itself (it is a met/not-met flag, not a sub-measure).
        ki_cols = [
            col for col in df.columns
            if col.startswith(ki_str) and col != ki_str
        ]
    else:
        # Copy into a list so that a tuple is not mistaken for a single
        # column label when indexing the DataFrames below.
        ki_cols = list(ki_cols)

    # Median first, then the quartiles, matching "Median (IQR)" in the
    # national strings.
    quantiles = df[ki_cols].quantile([0.5, 0.25, 0.75])

    # axis=0 stacks the quantile rows under the national row.
    df_to_display = pd.concat([df_national[ki_cols], quantiles], axis=0)
    display(df_to_display.T)

def check_count(df, df_national, ki_str='', ki_cols=None):
    """
    Display the national entries beside column sums computed from `df`.

    For each selected column the displayed table has one row, holding the
    entry from `df_national` (when available) and the sum of that column
    in `df`, labelled 'From full data'.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to sum. Any masking must already have been applied by the
        caller.
    df_national : pandas.DataFrame
        National summary. If it lacks any of the selected columns, only the
        sums from `df` are displayed.
    ki_str : str, default ''
        Column-name prefix. Only used when `ki_cols` is empty or None: every
        column of `df` starting with this prefix is selected, except a
        column named exactly `ki_str`.
    ki_cols : list of str, optional
        Explicit columns to check. Takes precedence over `ki_str`.

    Returns
    -------
    None
        The table is shown with `display` (provided by the notebook).
    """
    if ki_cols is None or len(ki_cols) == 0:
        # No explicit list: select by prefix, leaving out the indicator
        # column itself.
        ki_cols = [
            col for col in df.columns
            if col.startswith(ki_str) and col != ki_str
        ]
    else:
        # Copy into a list so that a tuple is not mistaken for a single
        # column label when indexing the DataFrames below.
        ki_cols = list(ki_cols)

    # One row labelled 'From full data', one column per selected column.
    counts = df[ki_cols].sum(axis=0).to_frame(name='From full data').T

    try:
        # axis=0 stacks the counts row under the national row.
        df_to_display = pd.concat([df_national[ki_cols], counts], axis=0)
    except KeyError:
        # National data doesn't have the same columns.
        df_to_display = counts
    display(df_to_display.T)

## Total key indicators met counts

In [9]:
cols_ki = [f'ki{i}' for i in range(1, 11)]

In [10]:
df[cols_ki]

Unnamed: 0_level_0,ki1,ki2,ki3,ki4,ki5,ki6,ki7,ki8,ki9,ki10
Hospital names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Queens Hospital Romford HASU,1,0,1,1,0,0,1,1,0,1
Newham General Hospital,1,0,1,1,0,0,1,1,1,0
Royal London Hospital HASU,1,0,1,1,0,0,1,0,1,0
Whipps Cross University Hospital,1,0,0,1,0,0,1,0,0,0
Charing Cross Hospital HASU,1,0,1,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
Craigavon Area Hospital,1,0,0,0,0,0,1,0,0,0
Daisy Hill Hospital,0,0,0,1,0,0,1,0,0,0
Altnagelvin Hospital,0,0,0,1,0,0,0,1,1,0
South West Acute Hospital,0,0,0,1,0,1,0,1,0,0


If the following cell returns True, then the big DataFrame is consistent with itself. The separate KI values do sum to the value in the total KI column.

In [11]:
# Simple check of summing row contents, does it match total
(df['ki_total'] == df[cols_ki].sum(axis=1)).all()

True

The national data is given as counts for each total.

In [12]:
ki_total_national = df_national['ki_total'].values[0].split('_x000D_\n')

ki_total_national

['1: 5% (8/169)',
 '2: 7% (12/169)',
 '3: 18% (31/169)',
 '4: 20% (33/169)',
 '5: 18% (30/169)',
 '6: 17% (28/169)',
 '7: 7% (11/169)',
 '8: 7% (11/169)',
 '9: 2% (4/169)',
 '10: 1% (1/169)']

Recreate this from the full data:

In [13]:
ki_total_values = sorted(df['ki_total'].unique())

In [14]:
# Number of rows at each distinct 'ki_total' value, in the same (sorted)
# order as ki_total_values.
ki_total_counts = [
    len(df.loc[df['ki_total'] == total])
    for total in ki_total_values
]

In [15]:
ki_total_counts

[8, 12, 31, 33, 30, 28, 11, 11, 4, 1]

In [16]:
sum(ki_total_counts)

169

The counts match the national data.

## Key Indicator 1: nurses per bed

In [17]:
ki_str = 'ki1'

In [18]:
check_total(df, df_national, ki_str)

National results: 58% (98/169)
From full data  : 58% (98/169)


In [19]:
check_iqr(df, df_national, ki_str + '_')

Unnamed: 0,National Results,0.5,0.25,0.75
ki1_nurses6_wte,1.9 (1.4-2.9)_x000D_\nMedian (IQR),1.94,1.36,2.92
ki1_nurses7_wte,0.5 (0.4-0.8)_x000D_\nMedian (IQR),0.5,0.42,0.83


## Key Indicator 2: psychologist

In [20]:
ki_str = 'ki2'

In [21]:
check_total(df, df_national, ki_str)

National results: 7% (12/169)
From full data  : 7% (12/169)


In [22]:
check_iqr(df, df_national, ki_str + '_')

Unnamed: 0,National Results,0.5,0.25,0.75
ki2_psych_wte,0.1 (0-0.3)_x000D_\nMedian (IQR),0.07,0.0,0.34


## Key indicator 3: stroke nurses

Have to mask this one to only include acute stroke units.

In [23]:
df_acute = df.loc[df['acute_unit'] == 1]

In [24]:
len(df_acute)

142

In [25]:
ki_str = 'ki3'

In [26]:
check_total(df_acute, df_national, ki_str)

National results: 71% (101/142)
From full data  : 71% (101/142)


In [27]:
check_count(df_acute, df_national, ki_str + '_')

Unnamed: 0,National Results,From full data
ki3_strokenurse_bed1_saturdays,243.0,243.0
ki3_strokenurse_bed1_saturdays10pm,215.0,215.0
ki3_strokenurse_bed1_sundays,243.0,243.0
ki3_strokenurse_bed1_sundays10pm,214.0,214.0
ki3_strokenurse_bed1_weekdays10pm,215.0,215.0
ki3_strokenurse_bed3_saturdays,316.0,316.0
ki3_strokenurse_bed3_saturdays10pm,234.0,234.0
ki3_strokenurse_bed3_sundays,317.0,317.0
ki3_strokenurse_bed3_sundays10pm,234.0,234.0
ki3_strokenurse_bed3_weekdays10pm,234.0,234.0


## Key indicator 4: Minimum number of nurses

Have to mask this one to only include acute stroke units.

In [28]:
ki_str = 'ki4'

In [29]:
check_total(df_acute, df_national, ki_str)

National results: 30% (42/142)
From full data  : 30% (42/142)


For IQRs, have to mask by only units with bed types 1 or 3:

In [30]:
df_bed1 = df.loc[(df['bed1'] == 1) & (df['acute_unit'] == 1)]

In [31]:
df_bed3 = df.loc[df['bed3'] == 1]

In [32]:
# ki4 sub-measure columns, split by the bed type named in the column.
cols_bed1 = [
    name for name in df.columns
    if name.startswith('ki4_') and 'bed1' in name
]
cols_bed3 = [
    name for name in df.columns
    if name.startswith('ki4_') and 'bed3' in name
]

In [33]:
check_iqr(df_bed1, df_national, ki_cols=cols_bed1)

Unnamed: 0,National Results,0.5,0.25,0.75
ki4_minimum_nurse_bed1_saturdays10am,3.3 (2.5-4.8)_x000D_\nMedian (IQR),3.33,2.5,4.845
ki4_minimum_nurse_bed1_sundays10am,3.3 (2.5-4.8)_x000D_\nMedian (IQR),3.33,2.5,4.845


In [34]:
check_iqr(df_bed3, df_national, ki_cols=cols_bed3)

Unnamed: 0,National Results,0.5,0.25,0.75
ki4_minimum_nurse_bed3_saturdays10am,1.7 (1.4-2.1)_x000D_\nMedian (IQR),1.75,1.43,2.08
ki4_minimum_nurse_bed3_sundays10am,1.8 (1.5-2.1)_x000D_\nMedian (IQR),1.775,1.4825,2.08


## Key indicator 5: therapy availability

In [35]:
ki_str = 'ki5'

In [36]:
check_total(df, df_national, ki_str)

National results: 38% (65/169)
From full data  : 38% (65/169)


In [37]:
def find_count_matching_value(df, col, value_list, exclude_unlisted=False):
    """
    Print how many rows of `df[col]` equal each value in `value_list`.

    One line is printed per value, in the form "value: PP% (count/total)".

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the column `col`.
    col : str
        Name of the column to inspect.
    value_list : iterable
        Values to count, printed in the given order.
    exclude_unlisted : bool, default False
        Chooses the denominator `total`. If False, every row of `df` is
        counted, so rows holding a value that is not in `value_list` still
        contribute. If True, only rows whose value is in `value_list` are
        counted, so the printed percentages add up to 100%.

    Returns
    -------
    None
        The lines are printed, nothing is returned.

    Notes
    -----
    When the denominator is zero the percentage is printed as "nan".
    """
    # Materialise once so a generator can be used for both the denominator
    # and the loop below.
    values = list(value_list)
    column = df[col]

    if exclude_unlisted:
        total = int(column.isin(values).sum())
    else:
        total = len(df)

    for val in values:
        count = int((column == val).sum())
        # Avoid ZeroDivisionError on an empty selection.
        perc = (count / total) * 100.0 if total else float('nan')
        print(f'{val}: {perc:.0f}% ({count}/{total})')

In [38]:
# Columns compared below against the values 5, 6 and 7.
cols_ki5 = [
    'ki5_occupationaltherapy_7days',
    'ki5_physiotherapy_7days',
    'ki5_speechlanguagetherapy_7days',
]

for col in cols_ki5:
    print(col, 'National Results:', sep='\n')
    # The national entry packs one "value: PP% (count/total)" item per line,
    # separated by '_x000D_\n'.
    print(df_national[col].values[0].split('_x000D_\n'))
    print('From full data:')
    find_count_matching_value(df, col, [5, 6, 7])
    print()

ki5_occupationaltherapy_7days
National Results:
['5: 42% (71/169)', '6: 20% (34/169)', '7: 38% (64/169)']
From full data:
5: 42% (71/169)
6: 20% (34/169)
7: 38% (64/169)

ki5_physiotherapy_7days
National Results:
['5: 38% (64/169)', '6: 16% (27/169)', '7: 46% (78/169)']
From full data:
5: 38% (64/169)
6: 16% (27/169)
7: 46% (78/169)

ki5_speechlanguagetherapy_7days
National Results:
['5: 78% (129/165)', '6: 12% (19/165)', '7: 10% (17/165)']
From full data:
5: 76% (129/169)
6: 11% (19/169)
7: 10% (17/169)



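^ For the last indicator (speech and language therapy), the counts are right but the percentages are wrong: the national results use a total of 165, because the four teams with the value 0 days (instead of 5, 6 or 7) are excluded from the total count, whereas the full-data percentages above are calculated out of all 169 teams.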

## Key indicator 6: pre-alerts

Have to mask this one to only include acute stroke units.

In [39]:
ki_str = 'ki6'

In [40]:
check_total(df_acute, df_national, ki_str)

National results: 38% (54/142)
From full data  : 38% (54/142)


In [41]:
# One "_yes" and one "_sometimes" column per patient group, grouped by
# patient group.
cols_ki6_patients = [
    f'ki6_prealert_{group}_{answer}'
    for group in ('ivtcandidates', 'fastpositive', 'othersuspectedstroke')
    for answer in ('yes', 'sometimes')
]

# The remaining ki6 pre-alert columns, checked with check_count further down.
cols_ki6_answers = [
    f'ki6_prealert_{role}'
    for role in ('strokenurse', 'juniordoctor', 'consultant')
]

In [42]:
def find_count_yes_no_sometimes(df, col_yes, col_sometimes):
    """
    Count the "yes", "sometimes" and "no" rows of a pair of flag columns.

    A row counts as "yes" when `col_yes` equals 1, as "sometimes" when
    `col_sometimes` equals 1, and as "no" when both columns equal 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns. Any masking must already have been
        applied by the caller.
    col_yes : str
        Name of the column flagging "yes".
    col_sometimes : str
        Name of the column flagging "sometimes".

    Returns
    -------
    tuple of int
        (count_yes, count_sometimes, count_no) -- note that "sometimes"
        comes before "no".
    """
    is_yes = df[col_yes] == 1
    is_sometimes = df[col_sometimes] == 1
    # Both masks come from the `df` argument. Previously the second half of
    # the "no" mask read the notebook-level `df_acute`, which is only
    # correct when `df` happens to be that very DataFrame.
    is_no = (df[col_yes] == 0) & (df[col_sometimes] == 0)

    count_yes = int(is_yes.sum())
    count_sometimes = int(is_sometimes.sum())
    count_no = int(is_no.sum())
    return count_yes, count_sometimes, count_no

def print_name_perc_count(val, count, total):
    """
    Print one line in the form "val: PP% (count/total)".

    PP is `count` as a percentage of `total`, formatted with no decimal
    places.
    """
    fraction = count / total
    print('{}: {:.0f}% ({}/{})'.format(val, fraction * 100.0, count, total))

In [43]:
for patients in ['ivtcandidates', 'fastpositive', 'othersuspectedstroke']:
    print(patients)

    # The national column has the bare name; the full data splits it into
    # a "_yes" and a "_sometimes" flag column.
    col_prefix = f'ki6_prealert_{patients}'
    count_yes, count_sometimes, count_no = find_count_yes_no_sometimes(
        df_acute, col_prefix + '_yes', col_prefix + '_sometimes')

    # Printed in the same order as the national entry: Yes, No, Sometimes.
    n_acute = len(df_acute)
    print_name_perc_count('Yes', count_yes, n_acute)
    print_name_perc_count('No', count_no, n_acute)
    print_name_perc_count('Sometimes', count_sometimes, n_acute)
    print('')
    print('National Results: ', df_national[col_prefix].values[0].split('_x000D_\n'))
    print('')

ivtcandidates
Yes: 80% (113/142)
No: 11% (16/142)
Sometimes: 9% (13/142)

National Results:  ['Yes: 80% (113/142)', 'No: 11% (16/142)', 'Sometimes: 9% (13/142)']

fastpositive
Yes: 69% (98/142)
No: 11% (15/142)
Sometimes: 20% (29/142)

National Results:  ['Yes: 69% (98/142)', 'No: 11% (15/142)', 'Sometimes: 20% (29/142)']

othersuspectedstroke
Yes: 37% (52/142)
No: 15% (22/142)
Sometimes: 48% (68/142)

National Results:  ['Yes: 37% (52/142)', 'No: 15% (22/142)', 'Sometimes: 48% (68/142)']



In [44]:
check_count(df_acute, df_national, ki_cols=cols_ki6_answers)

Unnamed: 0,National Results,From full data
ki6_prealert_strokenurse,44% (63/142),63
ki6_prealert_juniordoctor,13% (19/142),19
ki6_prealert_consultant,12% (17/142),17


## Key indicator 7: early supported discharge

In [45]:
ki_str = 'ki7'

In [46]:
check_total(df, df_national, ki_str)

National results: 63% (107/169)
From full data  : 63% (107/169)


In [47]:
check_count(df, df_national, ki_cols=['ki7_stroke_esd_team'])

Unnamed: 0,National Results,From full data
ki7_stroke_esd_team,85% (143/169),143


To check the next one, need to mask by only teams that allow access to an ESD team:

In [48]:
df_esd = df.loc[df['ki7_stroke_esd_team'] == 1]

In [49]:
check_iqr(df_esd, df_national, ki_cols=['ki7_percent_access_to_stroke_esd_team'])

Unnamed: 0,National Results,0.5,0.25,0.75
ki7_percent_access_to_stroke_esd_team,100 (67-100)_x000D_\nMedian (IQR),1.0,0.67,1.0


^ The numbers are right; the only difference is that the national results are given in % whereas the full data results are scaled to between 0 and 1.

## Key indicator 8: patient and carer survey

In [50]:
ki_str = 'ki8'

In [51]:
check_total(df, df_national, ki_str)

National results: 56% (95/169)
From full data  : 56% (95/169)


In [52]:
for a in df_national['ki8_patientcarersurvey_frequency'].values:
    print(a)

Never: 21% (35/169)
Less than once a year: 23% (39/169)
1-2 times a year: 20% (33/169)
3-4 times a year: 4% (6/169)
More than 4 a year: 8% (14/169)
Continuous (every patient): 25% (42/169)


In [53]:
ki8_cols = [
    'ki8_patientcarersurvey_frequency_never',
    'ki8_patientcarersurvey_frequency_less1peryear',
    'ki8_patientcarersurvey_frequency_1to2peryear',
    'ki8_patientcarersurvey_frequency_3to4peryear',
    'ki8_patientcarersurvey_frequency_more4peryear',
    'ki8_patientcarersurvey_frequency_everypatient'
]

check_count(df, df_national, ki_cols=ki8_cols)

Unnamed: 0,From full data
ki8_patientcarersurvey_frequency_never,35
ki8_patientcarersurvey_frequency_less1peryear,39
ki8_patientcarersurvey_frequency_1to2peryear,33
ki8_patientcarersurvey_frequency_3to4peryear,6
ki8_patientcarersurvey_frequency_more4peryear,14
ki8_patientcarersurvey_frequency_everypatient,42


## Key indicator 9: brain imaging type

In [54]:
ki_str = 'ki9'

In [55]:
check_total(df, df_national, ki_str)

National results: 33% (56/169)
From full data  : 33% (56/169)


In [56]:
df_national['ki9_tia_brainimaging_mostused'].values[0].split('\n')

['Computed Tomography: 50% (85/169)',
 'Magnetic Resonance Imaging: 45% (76/169)',
 'Rarely image TIAs: 5% (8/169)']

In [57]:
ki9_cols = [
    'ki9_tia_brainimaging_mostused_ct', 
    'ki9_tia_brainimaging_mostused_mri',
    'ki9_tia_brainimaging_mostused_rare',
]

check_count(df, df_national, ki_cols=ki9_cols)

Unnamed: 0,From full data
ki9_tia_brainimaging_mostused_ct,85
ki9_tia_brainimaging_mostused_mri,76
ki9_tia_brainimaging_mostused_rare,8


Mask by units with TIA clinics:

In [58]:
df_tia = df.loc[df['ki9_tia_clinic'] == 1]

In [59]:
check_count(df_tia, df_national, ki_cols=['ki9_tia_outpatients_seen'])

Unnamed: 0,National Results,From full data
ki9_tia_outpatients_seen,99% (160/162),160


In [60]:
df_national['ki9_tia_outpatient_timescale'].values[0].split('_x000D_\n')

['The same day (7 days a week): 31% (50/160)',
 'The same day (5 days a week): 26% (41/160)',
 'The next day: 14% (23/160)',
 'The next weekday: 8% (13/160)',
 'Within a week: 19% (31/160)',
 'Within a month: 1% (2/160)',
 'Longer than a month: 0% (0/160)']

In [61]:
ki9_cols = [
   'ki9_tia_outpatient_timescale_sameday',
   'ki9_tia_outpatient_timescale_samedayexcludingweekends',
   'ki9_tia_outpatient_timescale_nextday',
   'ki9_tia_outpatient_timescale_nextweekday',
   'ki9_tia_outpatient_timescale_withinweek', 
   'ki9_tia_outpatient_timescale_withinmonth',
   'ki9_tia_outpatient_timescale_aftermonth',
]

check_count(df, df_national, ki_cols=ki9_cols)

Unnamed: 0,From full data
ki9_tia_outpatient_timescale_sameday,50
ki9_tia_outpatient_timescale_samedayexcludingweekends,41
ki9_tia_outpatient_timescale_nextday,23
ki9_tia_outpatient_timescale_nextweekday,13
ki9_tia_outpatient_timescale_withinweek,31
ki9_tia_outpatient_timescale_withinmonth,2
ki9_tia_outpatient_timescale_aftermonth,0


## Key indicator 10: leadership

In [62]:
ki_str = 'ki10'

In [63]:
check_total(df, df_national, ki_str)

National results: 63% (106/169)
From full data  : 63% (106/169)


In [64]:
ki10_cols = [
    'ki10_management_executiveonboard',
    'ki10_management_nonexecutiveonboard',
    'ki10_management_chairmanofclinicalgovernance',
]

check_count(df, df_national, ki_cols=ki10_cols)

Unnamed: 0,National Results,From full data
ki10_management_executiveonboard,58% (98/169),98
ki10_management_nonexecutiveonboard,17% (28/169),28
ki10_management_chairmanofclinicalgovernance,25% (43/169),43
