# Extract hospital performance for pathway model

## Aims

* Extract and save hospital performance for pathway simulation model
* Create breakdowns by weekend/weekday/day/night

## Import libraries

In [1]:
import numpy as np
import pandas as pd

## Load data

* Load data
* Restrict data to fields necessary for pathway extraction
* Remove in-hospital admissions (currently skipped: the `S1OnsetInHospital` field is not present in this extract)

In [2]:
# The extract runs from January 2016 to December 2021, i.e. six years.
# This is used further down to convert total counts into per-year figures.
data_years = 6.0

# Read the full SSNAP extract into memory.
data_loaded = pd.read_csv(
    filepath_or_buffer='./data/SAMueL ssnap extract v2.csv',
    low_memory=False,
)

In [3]:
# Columns needed for the pathway extraction. Entries that are commented
# out are kept as a record of fields used by earlier versions.
used_fields = [
    'TeamName',
    # 'MoreEqual80y',  # Doesn't exist in 2023 dataset
    'S1Gender',
    # 'S1OnsetInHospital',  Doesn't exist in 2023 dataset
    'OnsettoArrivalMinutes',
    # 'S1AdmissionHour',
    # 'S1AdmissionDay',
    'S1OnsetTimeType',
    'ArrivaltoBrainImagingMinutes',
    'S2StrokeType',
    'S2Thrombolysis',
    'ArrivaltoThrombolysisMinutes',
    # Thrombectomy:
    'ArrivaltoArterialPunctureMinutes',
]

# Drop every other column from the loaded data.
data_loaded = data_loaded.loc[:, used_fields]

In [4]:
# # Remove in hospital admissions
# mask = data_loaded['S1OnsetInHospital'] == 'No'
# data_loaded = data_loaded[mask]

# This info doesn't exist in the new dataset.

## Extract hospital performance

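For each stroke team, calculate the proportion of patients passing each step of the emergency stroke pathway (onset known, arrival in time, scan in time, treatment given) and the log-normal parameters of the pathway timings, for both thrombolysis and thrombectomy.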

In [5]:
def _lognormal_params(minutes):
    """Return (mu, sigma): the mean and standard deviation of ln(minutes)."""
    ln_minutes = np.log(minutes)
    return ln_minutes.mean(), ln_minutes.std()


def _apply_time_limits(onset_known_df, limit_mins):
    """Run Steps 2-4 of the pathway for one team with one time limit.

    Returns a tuple (measures, remaining). `measures` is a dict holding
    the three step proportions and the two pairs of log-normal
    parameters. `remaining` is the DataFrame of patients who answered
    "Yes" at every step up to and including Step 4.
    """
    onset_to_arrival = 'OnsettoArrivalMinutes'
    arrival_to_scan = 'ArrivaltoBrainImagingMinutes'
    measures = {}

    # Step 2: arrival within the time limit of onset.
    in_time = onset_known_df[onset_to_arrival] <= limit_mins
    measures['arrival_in_time'] = in_time.mean()
    remaining = onset_known_df[in_time]
    (measures['onset_arrival_mu'],
     measures['onset_arrival_sigma']) = _lognormal_params(
        remaining[onset_to_arrival])

    # Step 3: scan within the time limit of arrival.
    in_time = remaining[arrival_to_scan] <= limit_mins
    measures['scan_in_time'] = in_time.mean()
    remaining = remaining[in_time]
    (measures['arrival_scan_mu'],
     measures['arrival_scan_sigma']) = _lognormal_params(
        remaining[arrival_to_scan])

    # Step 4: scan within the time limit of onset.
    in_time = (
        remaining[onset_to_arrival] + remaining[arrival_to_scan]
    ) <= limit_mins
    measures['onset_scan_in_time'] = in_time.mean()
    remaining = remaining[in_time]

    return measures, remaining


def _scan_to_treatment(treated_df, arrival_to_treatment, limit_mins):
    """Summarise the scan-to-treatment delay of treated patients.

    Returns (proportion within limit_mins, mu, sigma). The log-normal
    parameters are taken over ALL patients in `treated_df`, not only
    those within the limit.
    """
    delay = (treated_df[arrival_to_treatment] -
             treated_df['ArrivaltoBrainImagingMinutes'])
    # A delay of zero has no finite logarithm, so count it as one minute.
    delay = delay.mask(delay == 0, 1)
    within_limit = (delay <= limit_mins).mean()
    mu, sigma = _lognormal_params(delay)
    return within_limit, mu, sigma


def analyse_by_team(input_data):
    """
    Calculate pathway performance measures for each stroke team.

    With each step, whittle down the full group of patients in the
    following way. In the example, the sizes of blocks are arbitrary. 
    
    Key:
    ░ - patients still in the subgroup
    ▒ - patients rejected from the subgroup at this step
    █ - patients rejected from the subgroup in previous steps
    
    ▏Start: Full group                                                ▕
    ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
    ▏-------------------------All patients----------------------------▕
    ▏                                                                 ▕
    ▏Step 1: Is onset time known?                                     ▕
    ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒
    ▏--------------------Yes----------------------▏---------No--------▕
    ▏                                             ▏                   ▕
    ▏Step 2: Is arrival within x hours?           ▏                   ▕
    ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░▒▒▒▒▒▒▒▒▒▒▒█████████████████████
    ▏---------------Yes----------------▏----No----▏------Rejected-----▕
    ▏                                  ▏          ▏                   ▕
    ▏Step 3: Is scan within x hours of arrival?   ▏                   ▕
    ░░░░░░░░░░░░░░░░░░░░░░░░░░░░▒▒▒▒▒▒▒████████████████████████████████
    ▏------------Yes------------▏--No--▏-----------Rejected-----------▕
    ▏                           ▏      ▏                              ▕
    ▏Step 4: Is scan within x hours of onset?                         ▕
    ░░░░░░░░░░░░░░░░░░░░░░░▒▒▒▒▒███████████████████████████████████████
    ▏----------Yes---------▏-No-▏---------------Rejected--------------▕
    ▏                      ▏    ▏                                     ▕
    ▏Step 5: Did the patient receive thrombolysis/thrombectomy?       ▕
    ░░░░░░░░░░░░▒▒▒▒▒▒▒▒▒▒▒████████████████████████████████████████████
    ▏----Yes----▏----No----▏------------------Rejected----------------▕
    

    Patient proportions measured:    
    +--------------------------------+--------------------------------+
    | Proportion                     | Measure                        |
    +--------------------------------+--------------------------------+
    | Thrombolysis rate or           | Total number treated           |
    | thrombectomy rate              | divided by all patients.       |
    | Onset known                    | "Yes" to Step 1 divided by     |
    |                                | all patients.                  |
    | Known arrival within x hours   | "Yes" to Step 2 divided by     |
    |                                | "Yes" to Step 1.               |
    | Scan within x hours of arrival | "Yes" to Step 3 divided by     |
    |                                | "Yes" to Step 2.               |
    | Scan within x hours of onset   | "Yes" to Step 4 divided by     |
    |                                | "Yes" to Step 3.               |
    | "Chosen" for thrombolysis      | "Yes" to Step 5 divided by     |
    | or for thrombectomy            | "Yes" to Step 4.               |
    +--------------------------------+--------------------------------+
    The "proportion chosen for thrombolysis" is a different measure
    from the "thrombolysis rate", which is the proportion of all of the
    patients at the start who were given thrombolysis. It is possible
    some patients received thrombolysis in real life but that by this
    process they were rejected before Step 5.

    The log-normal mean and standard deviation (mu and sigma) are taken
    for the groups of patients who answer "Yes" to everything up to and
    including particular steps.
    +---------------------------------+-------------------------------+
    | Subgroup who answer "yes" at... | Log-normal distribution       |
    +---------------------------------+-------------------------------+
    |                          Step 2 | Onset to arrival time         |
    |                          Step 3 | Arrival to scan time          |
    |                          Step 4 | Scan to needle or             |
    |                                 | scan to puncture time         |
    +---------------------------------+-------------------------------+
    The scan to needle / scan to puncture distribution only uses the
    Step 4 patients with a recorded arrival-to-treatment time (more
    than zero minutes for thrombolysis, zero or more for thrombectomy).
    Scan-to-treatment times of exactly zero are counted as one minute
    so that the logarithm is finite.

    The same process is used to determine stats about thrombectomy,
    with the following changes:
    - the time limit is 8 hours instead of 4 hours.
    - Step 4 determines the scan to puncture time.
    - if fewer than 10 patients receive thrombectomy per year,
      the mu and sigma for scan to puncture time should be discarded.
      NOTE: this discard is NOT implemented in this function.

    Parameters
    ----------
    input_data : pandas.DataFrame
        One row per patient. The columns read are 'TeamName',
        'S1OnsetTimeType', 'OnsettoArrivalMinutes',
        'ArrivaltoBrainImagingMinutes', 'S2Thrombolysis',
        'ArrivaltoThrombolysisMinutes' and
        'ArrivaltoArterialPunctureMinutes'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per stroke team. 'admissions' is the team's patient
        count divided by the module-level `data_years`.
    """
    columns = [
        'stroke_team',
        'thrombolysis_rate',
        'thrombectomy_rate',
        'admissions',
        'onset_known',
        # ----- Thrombolysis -----
        'known_arrival_within_4hrs',
        'onset_arrival_mins_mu',
        'onset_arrival_mins_sigma',
        'scan_within_4_hrs',
        'arrival_scan_arrival_mins_mu',
        'arrival_scan_arrival_mins_sigma',
        'onset_scan_4_hrs',
        'scan_needle_4_hrs',
        'scan_needle_mins_mu',
        'scan_needle_mins_sigma',
        'proportion_chosen_for_thrombolysis',
        # ----- Thrombectomy -----
        'known_arrival_within_8hrs',
        'onset_arrival_thrombectomy_mins_mu',
        'onset_arrival_thrombectomy_mins_sigma',
        'scan_within_8_hrs',
        'arrival_scan_arrival_thrombectomy_mins_mu',
        'arrival_scan_arrival_thrombectomy_mins_sigma',
        'onset_scan_8_hrs',
        'scan_puncture_8_hrs',
        'scan_puncture_mins_mu',
        'scan_puncture_mins_sigma',
        'proportion_chosen_for_thrombectomy',
    ]
    thrombolysis_limit = 4 * 60   # minutes
    thrombectomy_limit = 8 * 60   # minutes

    # Work on a copy so the caller's DataFrame is never touched
    data = input_data.copy()

    rows = []
    for team, team_df in data.groupby('TeamName'):
        # Proportions of all of the team's patients
        thrombolysed = team_df['S2Thrombolysis'] == 'Y'
        thrombectomised = team_df['ArrivaltoArterialPunctureMinutes'] >= 0

        # Step 1: onset time type is precise ('P') or best estimate ('BE')
        onset_is_known = team_df['S1OnsetTimeType'].isin(['P', 'BE'])
        onset_known_df = team_df[onset_is_known]

        # ----- Thrombolysis (4 hour limit) -----
        ivt, ivt_df = _apply_time_limits(onset_known_df, thrombolysis_limit)
        # Step 5: thrombolysis given (to remaining patients)
        chosen_for_ivt = (ivt_df['S2Thrombolysis'] == 'Y').mean()
        has_needle_time = ivt_df['ArrivaltoThrombolysisMinutes'] > 0
        needle_in_time, needle_mu, needle_sigma = _scan_to_treatment(
            ivt_df[has_needle_time], 'ArrivaltoThrombolysisMinutes',
            thrombolysis_limit)

        # ----- Thrombectomy (8 hour limit) -----
        # Starts again from the onset-known patients, not from the
        # patients left over at the end of the thrombolysis pathway.
        mt, mt_df = _apply_time_limits(onset_known_df, thrombectomy_limit)
        # Step 5: thrombectomy given (to remaining patients)
        has_puncture_time = mt_df['ArrivaltoArterialPunctureMinutes'] >= 0
        chosen_for_mt = has_puncture_time.mean()
        puncture_in_time, puncture_mu, puncture_sigma = _scan_to_treatment(
            mt_df[has_puncture_time], 'ArrivaltoArterialPunctureMinutes',
            thrombectomy_limit)

        rows.append({
            'stroke_team': team,
            'thrombolysis_rate': thrombolysed.mean(),
            'thrombectomy_rate': thrombectomised.mean(),
            'admissions': len(team_df),
            'onset_known': onset_is_known.mean(),
            # ----- Thrombolysis -----
            'known_arrival_within_4hrs': ivt['arrival_in_time'],
            'onset_arrival_mins_mu': ivt['onset_arrival_mu'],
            'onset_arrival_mins_sigma': ivt['onset_arrival_sigma'],
            'scan_within_4_hrs': ivt['scan_in_time'],
            'arrival_scan_arrival_mins_mu': ivt['arrival_scan_mu'],
            'arrival_scan_arrival_mins_sigma': ivt['arrival_scan_sigma'],
            'onset_scan_4_hrs': ivt['onset_scan_in_time'],
            'scan_needle_4_hrs': needle_in_time,
            'scan_needle_mins_mu': needle_mu,
            'scan_needle_mins_sigma': needle_sigma,
            'proportion_chosen_for_thrombolysis': chosen_for_ivt,
            # ----- Thrombectomy -----
            'known_arrival_within_8hrs': mt['arrival_in_time'],
            'onset_arrival_thrombectomy_mins_mu': mt['onset_arrival_mu'],
            'onset_arrival_thrombectomy_mins_sigma':
                mt['onset_arrival_sigma'],
            'scan_within_8_hrs': mt['scan_in_time'],
            'arrival_scan_arrival_thrombectomy_mins_mu':
                mt['arrival_scan_mu'],
            'arrival_scan_arrival_thrombectomy_mins_sigma':
                mt['arrival_scan_sigma'],
            'onset_scan_8_hrs': mt['onset_scan_in_time'],
            'scan_puncture_8_hrs': puncture_in_time,
            'scan_puncture_mins_mu': puncture_mu,
            'scan_puncture_mins_sigma': puncture_sigma,
            'proportion_chosen_for_thrombectomy': chosen_for_mt,
        })

    # Passing `columns` fixes the column order and keeps the columns
    # present even when there are no teams.
    df = pd.DataFrame(rows, columns=columns)
    # Convert total admissions to admissions per year
    df['admissions'] = df['admissions'] / data_years
    return df
    

In [6]:
# Pathway statistics for every stroke team in the extract
df_all = analyse_by_team(data_loaded)

# 'admissions' is already a per-year figure, so multiplying it by a
# treatment rate gives the number of patients treated per year.
admissions = df_all['admissions']
thrombolysed = df_all['thrombolysis_rate'] * admissions
thrombectomy = df_all['thrombectomy_rate'] * admissions

# Keep teams with, over the whole data period, at least 100 admissions,
# at least 10 thrombolysis and at least 10 thrombectomy. The thresholds
# are totals, so each is divided by data_years before it is compared
# with the per-year figures above.
mask = (
    admissions.ge(100 / data_years)
    & thrombolysed.ge(10.0 / data_years)
    & thrombectomy.ge(10.0 / data_years)
)
df_all = df_all.loc[mask]

# Save
df_all.to_csv(
    'data/hospital_performance_thrombectomy.csv', index=False)

# Show data for five hospitals
df_all.head().T

Unnamed: 0,0,1,2,5,6
stroke_team,Addenbrooke's Hospital,Basildon University Hospital,Blackpool Victoria Hospital,Broomfield Hospital,Calderdale Royal Hospital
thrombolysis_rate,0.149184,0.132237,0.091938,0.104681,0.135504
thrombectomy_rate,0.026571,0.01199,0.01235,0.006635,0.00604
admissions,602.166667,486.5,485.833333,452.166667,634.666667
onset_known,0.590645,0.648167,0.44837,0.558791,0.546218
known_arrival_within_4hrs,0.679007,0.57241,0.693191,0.605541,0.645673
onset_arrival_mins_mu,4.636923,4.598469,4.477038,4.643075,4.500197
onset_arrival_mins_sigma,0.551945,0.53132,0.59518,0.543022,0.608022
scan_within_4_hrs,0.94893,0.958449,0.941501,0.991285,0.946389
arrival_scan_arrival_mins_mu,3.661057,2.467406,3.576595,2.859362,2.708081


In [7]:
# NOTE(review): 'stop', 'here' and 'please' are not defined anywhere, so
# this line raises NameError. Presumably that is deliberate, to halt a
# "run all" before the unexecuted cells below - confirm before removing.
print(stop, here, please)

NameError: name 'stop' is not defined

### Limit full data to units that passed the admissions and treatment-count filter above

In [None]:
# Stroke teams that survived the filtering of df_all. The variable name
# is historical: the admissions threshold applied to df_all in this
# notebook is not 300.
units_with_300_admissions = df_all['stroke_team'].unique().tolist()

# Keep only the patients of those teams. The team column in data_loaded
# is 'TeamName' (the name kept in used_fields and grouped on by
# analyse_by_team); there is no 'StrokeTeam' column.
mask = data_loaded['TeamName'].isin(units_with_300_admissions)
data_restricted = data_loaded[mask]

### Produce results for day/night and weekday/weekend

In [None]:
# Admission-hour bands that count as "day time"
day_time_values = [
    '09:00 to 11:59',
    '12:00 to 14:59',
    '15:00 to 17:59',
]

# NOTE(review): 'S1AdmissionHour' is commented out of used_fields, so
# this lookup will raise KeyError unless that column is restored.
values = data_restricted['S1AdmissionHour'].isin(day_time_values)

# Add a boolean 'day_time' column (True for a day-time admission)
data_restricted = data_restricted.assign(day_time=values)

In [None]:
# Admission days that count as weekdays
weekdays = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
]

# NOTE(review): 'S1AdmissionDay' is commented out of used_fields, so
# this lookup will raise KeyError unless that column is restored.
values = data_restricted['S1AdmissionDay'].isin(weekdays)

# Add a boolean 'mon_fri' column (True for a Monday-Friday admission)
data_restricted = data_restricted.assign(mon_fri=values)

Weekday

In [None]:
# Monday-Friday admissions, any time of day
mask = data_restricted['mon_fri']
df = analyse_by_team(data_restricted[mask])
df.to_csv('hosp_performance_output/hospital_performance_weekday.csv',
          index=False)

Weekday day

In [None]:
# Monday-Friday admissions during the day
mask = data_restricted['mon_fri'] & data_restricted['day_time']
df = analyse_by_team(data_restricted[mask])
df.to_csv('hosp_performance_output/hospital_performance_weekday_day.csv',
          index=False)

Weekday night

In [None]:
# Monday-Friday admissions outside the day-time bands.
# `&` binds more tightly than `==`, so the unparenthesised form
# `day_time == False & mon_fri` is read as `day_time == (False & mon_fri)`
# and selects every night admission regardless of the day of the week.
# Negate the day-time flag explicitly instead.
mask = (~data_restricted['day_time']) & data_restricted['mon_fri']
df = data_restricted[mask]
df = analyse_by_team(df)
df.to_csv(
    'hosp_performance_output/hospital_performance_weekday_night.csv', index=False)

Weekend

In [None]:
# Saturday/Sunday admissions, any time of day
mask = ~data_restricted['mon_fri']
df = analyse_by_team(data_restricted[mask])
df.to_csv('hosp_performance_output/hospital_performance_weekend.csv',
          index=False)

Weekend day

In [None]:
# Saturday/Sunday admissions during the day.
# `&` binds more tightly than `==`, so the unparenthesised form
# `day_time & mon_fri == False` is read as `(day_time & mon_fri) == False`
# and selects everything except weekday day-time admissions.
# Negate the weekday flag explicitly instead.
mask = data_restricted['day_time'] & (~data_restricted['mon_fri'])
df = data_restricted[mask]
df = analyse_by_team(df)
df.to_csv(
    'hosp_performance_output/hospital_performance_weekend_day.csv', index=False)

Weekend night

In [None]:
# Saturday/Sunday admissions outside the day-time bands
mask = (~data_restricted['day_time']) & (~data_restricted['mon_fri'])
df = analyse_by_team(data_restricted[mask])
df.to_csv('hosp_performance_output/hospital_performance_weekend_night.csv',
          index=False)