To Do:

Model of death

# Descriptive stats for South West hospitals

In [1]:
import pandas as pd


In [2]:
sw = pd.read_csv('stroke_teams.csv', index_col=False)

In [3]:
# List of stroke teams
sw = list(sw.squeeze().values)

In [4]:
# Read in the data
data_all = pd.read_csv('~/ssnap_data/clean_samuel_ssnap_extract_v2.csv')

key_results = pd.DataFrame()

In [5]:
years_covered = sorted(list(set(data_all['year'])))
all_years_str = f'{years_covered[0]} to {years_covered[-1]}'

In [6]:
# # Limit data to years 2021 to 2021
# data_all = data_all[(data_all['year'] >= 2021)
#                     & (data_all['year'] <= 2021)]

# Limit data to out of hopsital onset arriving by ambulance only
# data_all = data_all[data_all['onset_to_arrival_time'] > 0]
# ^ can't limit by this as the new cleaned data (2nd Oct 2023)
# has no onset to arrival time given whenever onset known is 0.
data_all = data_all[data_all['arrive_by_ambulance'] == True]

# Add new fields
data_all['count'] = 1
data_all['prestroke_mrs_0-2'] = data_all['prior_disability'] <= 2
data_all['mrs_5-6'] = data_all['discharge_disability'] >= 5
data_all['mrs_0-2'] = data_all['discharge_disability'] <= 2


Restrict fields

In [7]:
required_fields: list = [
    'count', 'stroke_team', 'age', 'male', 'infarction', 'stroke_severity',
    'onset_to_arrival_time', 'onset_known', 'precise_onset_known',
    'onset_during_sleep', 'arrive_by_ambulance', 'year',
    'afib_anticoagulant', 'prior_disability', 'prestroke_mrs_0-2',
    'arrival_to_scan_time', 'thrombolysis',
    'scan_to_thrombolysis_time', 'death', 'discharge_disability', 'mrs_5-6', 'mrs_0-2'
]

data_all = data_all[required_fields]
mask = (
    (data_all['onset_known'] == True) & 
    (data_all['onset_to_arrival_time'] <= 240)
)
data_all['arrive_in_4_hours'] = mask

# Add change in disability
data_all['increased_disability_due_to_stroke'] = data_all['discharge_disability'] - data_all['prior_disability']

In [8]:
summary_stats_dict = {
    'count': 'sum',
    'stroke_team': 'none',
    'age': 'mean',
    'male': 'mean',
    'infarction': 'mean',
    'stroke_severity': 'mean',
    'onset_to_arrival_time': 'median',
    'onset_known': 'mean',
    'arrive_in_4_hours': 'mean',
    'precise_onset_known': 'mean',
    'onset_during_sleep': 'mean',
    'year': 'none',
    'afib_anticoagulant': 'mean',
    'prior_disability': 'mean',
    'prestroke_mrs_0-2': 'mean',
    'arrival_to_scan_time':     'median',
    'thrombolysis':     'mean',
    'scan_to_thrombolysis_time': 'median',
    'death': 'mean',
    'discharge_disability': 'mean',
    'increased_disability_due_to_stroke': 'mean',
    'mrs_5-6': 'mean',
    'mrs_0-2': 'mean'
}

## Group stroke teams by region

Import a dataframe that contains a list of the stroke team names and useful location information such as their longitude and latitude and which regions they are in. Most of the column names are Office for National Statistics labels.

In [9]:
df_stroke_team = pd.read_csv(
    './hospitals_and_lsoas_descriptive_stats.csv',
    index_col=False
    )

In [10]:
df_stroke_team.head(5).T

Unnamed: 0,0,1,2,3,4
Postcode,B152TH,B714HJ,BA13NG,BA214AT,BB23HH
Stroke Team,Queen Elizabeth Hospital Edgbaston,Sandwell District Hospital,Royal United Hospital Bath,Yeovil District Hospital,Royal Blackburn Hospital
long,-1.936284,-1.987884,-2.390566,-2.633907,-2.466806
lat,52.453272,52.527357,51.392764,50.945297,53.736622
LSOA11CD,E01033562,E01010104,E01014428,E01029231,E01012632
LSOA11NM,Birmingham 087F,Sandwell 017E,Bath and North East Somerset 008B,South Somerset 014A,Blackburn with Darwen 011F
LSOA11NMW,Birmingham 087F,Sandwell 017E,Bath and North East Somerset 008B,South Somerset 014A,Blackburn with Darwen 011F
LSOA11LONG,-1.94449,-1.98593,-2.38922,-2.6334,-2.45667
LSOA11LAT,52.45242,52.5221,51.39029,50.95035,53.73091
CCG19CD,E38000220,E38000144,E38000009,E38000150,E38000014


The following regions are contained in this file:

In [11]:
regions = sorted(list(set(df_stroke_team['RGN11NM'])))

regions

['East Midlands',
 'East of England',
 'London',
 'North East',
 'North West',
 'South East',
 'South West',
 'Wales',
 'West Midlands',
 'Yorkshire and The Humber']

Add a "region" column to the full dataset:

In [12]:
# Print the length and column headings of the dataframe
# as a check that the merge worked correctly.
print(len(data_all), data_all.columns)

data_all = pd.merge(data_all, df_stroke_team, left_on='stroke_team', right_on='Stroke Team')

print(len(data_all), data_all.columns)

283285 Index(['count', 'stroke_team', 'age', 'male', 'infarction', 'stroke_severity',
       'onset_to_arrival_time', 'onset_known', 'precise_onset_known',
       'onset_during_sleep', 'arrive_by_ambulance', 'year',
       'afib_anticoagulant', 'prior_disability', 'prestroke_mrs_0-2',
       'arrival_to_scan_time', 'thrombolysis', 'scan_to_thrombolysis_time',
       'death', 'discharge_disability', 'mrs_5-6', 'mrs_0-2',
       'arrive_in_4_hours', 'increased_disability_due_to_stroke'],
      dtype='object')
282302 Index(['count', 'stroke_team', 'age', 'male', 'infarction', 'stroke_severity',
       'onset_to_arrival_time', 'onset_known', 'precise_onset_known',
       'onset_during_sleep', 'arrive_by_ambulance', 'year',
       'afib_anticoagulant', 'prior_disability', 'prestroke_mrs_0-2',
       'arrival_to_scan_time', 'thrombolysis', 'scan_to_thrombolysis_time',
       'death', 'discharge_disability', 'mrs_5-6', 'mrs_0-2',
       'arrive_in_4_hours', 'increased_disability_due_to_stroke

## Descriptive stats for all patients

All teams:

In [13]:
summary_stats = dict()
for year in [all_years_str] + years_covered:
    if year == all_years_str:
        data = data_all
    else:
        data = data_all[(
            (data_all['year'] == year)
            )]
    results = dict()
    for k, v in summary_stats_dict.items():
        if v == 'mean':
            results[k] = data[k].mean()
        elif v == 'median':
            results[k] = data[k].median()
        elif v == 'sum':
            results[k] = data[k].sum()
    results['stroke_team'] = 'All England & Wales'
    results['year'] = year
    summary_stats[f'All England & Wales ({year})'] = results


Split by region:

In [14]:
for region in regions:
    for year in [all_years_str] + years_covered:
        if year == all_years_str:
            data = data_all[data_all['RGN11NM'] == region]
        else:
            data = data_all[(
            (data_all['RGN11NM'] == region) &
            (data_all['year'] == year)
            )]
        results = dict()
        for k, v in summary_stats_dict.items():
            if v == 'mean':
                results[k] = data[k].mean()
            elif v == 'median':
                results[k] = data[k].median()
            elif v == 'sum':
                results[k] = data[k].sum()
        results['year'] = year
        results['stroke_team'] = f'All {region}'
        summary_stats[f'All {region} ({year})'] = results


Split by individual team:

In [15]:
for hospital in sw:
    for year in [all_years_str] + years_covered:
        if year == all_years_str:
            data = data_all[data_all['stroke_team'] == hospital]
        else:
            data = data_all[(
            (data_all['stroke_team'] == hospital) &
            (data_all['year'] == year)
            )]
        results = dict()
        for k, v in summary_stats_dict.items():
            if v == 'mean':
                results[k] = data[k].mean()
            elif v == 'median':
                results[k] = data[k].median()
            elif v == 'sum':
                results[k] = data[k].sum()
        results['stroke_team'] = hospital
        results['year'] = year
        summary_stats[f'{hospital} ({year})'] = results


In [16]:
summary_stats_df = pd.DataFrame(summary_stats)

# Only keep hospitals with more than 100 admissions:
mask_count = summary_stats_df.loc['count'] > 100
summary_stats_df = summary_stats_df.T[mask_count].T

In [17]:

key_results['onset_known'] = summary_stats_df.loc['onset_known']
key_results['arrive in 4 hours'] = summary_stats_df.loc['arrive_in_4_hours']
key_results['thrombolysis_all_arrivals'] = summary_stats_df.loc['thrombolysis']

# Round floats to 3 decimal places:
summary_stats_df = summary_stats_df.applymap(lambda x: round(x, 3) if isinstance(x, (float)) else x)

# Convert "time" rows to float:
time_rows = ['onset_to_arrival_time', 'scan_to_thrombolysis_time', 'arrival_to_scan_time']
# summary_stats_4hr_df.loc[time_rows] = 
for r in time_rows:
    summary_stats_df.loc[r] = pd.to_numeric(summary_stats_df.loc[r])

summary_stats_df

Unnamed: 0,All England & Wales (2016 to 2021),All England & Wales (2016),All England & Wales (2017),All England & Wales (2018),All England & Wales (2019),All England & Wales (2020),All England & Wales (2021),All East Midlands (2016 to 2021),All East Midlands (2016),All East Midlands (2017),...,Royal Derby Hospital (2019),Royal Derby Hospital (2020),Royal Derby Hospital (2021),Queen's Medical Centre - Nottingham (2016 to 2021),Queen's Medical Centre - Nottingham (2020),Queen's Medical Centre - Nottingham (2021),University Hospitals Dorset Stroke Service (2016 to 2021),University Hospitals Dorset Stroke Service (2021),Grange University Hospital (2016 to 2021),Grange University Hospital (2021)
count,282302,45807,46994,46011,46696,46771,50023,20569,2939,2878,...,677,815,896,1213,253,948,519,420,233,193
age,75.572,75.794,75.858,75.716,75.807,75.153,75.138,75.28,75.851,75.796,...,75.964,74.831,74.492,73.893,74.457,73.919,75.679,75.524,73.68,73.666
male,0.515,0.505,0.505,0.515,0.514,0.52,0.528,0.516,0.503,0.497,...,0.517,0.486,0.554,0.488,0.482,0.485,0.555,0.54,0.528,0.503
infarction,0.867,0.867,0.87,0.868,0.864,0.868,0.866,0.866,0.865,0.881,...,0.846,0.87,0.857,0.862,0.897,0.855,0.877,0.864,0.807,0.808
stroke_severity,8.088,8.104,8.262,8.251,8.178,7.929,7.825,8.39,8.298,8.501,...,10.809,9.42,9.943,9.036,9.312,8.944,6.728,6.693,7.258,7.275
onset_to_arrival_time,164.0,146.0,150.0,160.0,166.0,171.0,189.0,159.0,134.0,146.0,...,145.0,175.0,193.0,155.5,147.0,163.0,196.0,195.5,260.0,243.0
onset_known,0.705,0.695,0.696,0.707,0.719,0.707,0.707,0.643,0.616,0.656,...,0.548,0.58,0.622,0.577,0.711,0.537,0.802,0.819,0.691,0.699
arrive_in_4_hours,0.437,0.451,0.448,0.447,0.443,0.425,0.411,0.413,0.426,0.446,...,0.381,0.339,0.348,0.376,0.522,0.331,0.472,0.476,0.326,0.347
precise_onset_known,0.362,0.366,0.368,0.37,0.364,0.357,0.351,0.313,0.33,0.329,...,0.458,0.474,0.517,0.017,0.016,0.007,0.277,0.252,0.425,0.42
onset_during_sleep,0.137,0.134,0.14,0.137,0.139,0.14,0.134,0.144,0.149,0.162,...,0.161,0.16,0.154,0.005,0.004,0.004,0.139,0.133,0.185,0.181


In [18]:
summary_stats_df.to_csv('summary_stats.csv')

## Limit analysis to arrival in 4 hours

In [19]:
mask = data_all['arrive_in_4_hours'] == True
data_4hr = data_all[mask]

Add onset to thrombolysis



In [20]:
data_4hr['onset_to_thrombolysis'] = (
    data_4hr['onset_to_arrival_time'] + 
    data_4hr['arrival_to_scan_time'] + 
    data_4hr['scan_to_thrombolysis_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_4hr['onset_to_thrombolysis'] = (


In [21]:
# Repeat analysis for patients arriving in 4 hours
summary_stats_4hr = dict()
for year in [all_years_str] + years_covered:
    if year == all_years_str:
        data = data_4hr
    else:
        data = data_4hr[(
            (data_4hr['year'] == year)
            )]
    results = dict()
    for k, v in summary_stats_dict.items():
        if v == 'mean':
            results[k] = data[k].mean()
        elif v == 'median':
            results[k] = data[k].median()
        elif v == 'sum':
            results[k] = data[k].sum()
    results['stroke_team'] = 'All England & Wales'
    results['year'] = year
    summary_stats_4hr[f'All England & Wales ({year})'] = results

for region in regions:
    for year in [all_years_str] + years_covered:
        if year == all_years_str:
            data = data_4hr[data_4hr['RGN11NM'] == region]
        else:
            data = data_4hr[(
                (data_4hr['RGN11NM'] == region) &
                (data_4hr['year'] == year)
                )]
        results = dict()
        for k, v in summary_stats_dict.items():
            if v == 'mean':
                results[k] = data[k].mean()
            elif v == 'median':
                results[k] = data[k].median()
            elif v == 'sum':
                results[k] = data[k].sum()
        results['year'] = year
        results['stroke_team'] = f'All {region}'
        summary_stats_4hr[f'All {region} ({year})'] = results

for hospital in sw:
    for year in [all_years_str] + years_covered:
        if year == all_years_str:
            data = data_4hr[data_4hr['stroke_team'] == hospital]
        else:
            data = data_4hr[(
                (data_4hr['stroke_team'] == hospital) &
                (data_4hr['year'] == year)
                )]
        # data = data_4hr[data_4hr['stroke_team'] == hospital]
        results = dict()
        for k, v in summary_stats_dict.items():
            if v == 'mean':
                results[k] = data[k].mean()
            elif v == 'median':
                results[k] = data[k].median()
            elif v == 'sum':
                results[k] = data[k].sum()
        results['stroke_team'] = hospital
        results['year'] = year
        summary_stats_4hr[f'{hospital} ({year})'] = results

summary_stats_4hr_df = pd.DataFrame(summary_stats_4hr)

# Only keep hospitals with more than 100 admissions
# in the full data (not the 4hr data) - same mask as earlier.
summary_stats_4hr_df = summary_stats_4hr_df.T[mask_count].T

# Round floats to 3 decimal places:
summary_stats_4hr_df = summary_stats_4hr_df.applymap(lambda x: round(x, 3) if isinstance(x, (float)) else x)


In [22]:
# Convert "time" rows to float:
time_rows = ['onset_to_arrival_time', 'scan_to_thrombolysis_time', 'arrival_to_scan_time']
# summary_stats_4hr_df.loc[time_rows] = 
for r in time_rows:
    summary_stats_4hr_df.loc[r] = pd.to_numeric(summary_stats_4hr_df.loc[r])

In [23]:
summary_stats_4hr_df.to_csv('summary_stats_4hr.csv')

In [24]:
rows = [
    'age', 'infarction', 'precise_onset_known', 'onset_during_sleep',
    'afib_anticoagulant', 'prior_disability', 'prestroke_mrs_0-2', 
    'stroke_severity',
    'onset_to_arrival_time', 'arrival_to_scan_time', 'thrombolysis',
    'scan_to_thrombolysis_time', 'discharge_disability', 'death',
    'increased_disability_due_to_stroke', 'mrs_5-6', 'mrs_0-2'
]

for row in rows:
    key_results[row] = summary_stats_4hr_df.loc[row]


In [25]:
key_results = key_results.round(3).T

In [26]:
key_results

Unnamed: 0,All England & Wales (2016 to 2021),All England & Wales (2016),All England & Wales (2017),All England & Wales (2018),All England & Wales (2019),All England & Wales (2020),All England & Wales (2021),All East Midlands (2016 to 2021),All East Midlands (2016),All East Midlands (2017),...,Royal Derby Hospital (2019),Royal Derby Hospital (2020),Royal Derby Hospital (2021),Queen's Medical Centre - Nottingham (2016 to 2021),Queen's Medical Centre - Nottingham (2020),Queen's Medical Centre - Nottingham (2021),University Hospitals Dorset Stroke Service (2016 to 2021),University Hospitals Dorset Stroke Service (2021),Grange University Hospital (2016 to 2021),Grange University Hospital (2021)
onset_known,0.705121,0.694632,0.695621,0.7072,0.719462,0.70687,0.706715,0.643201,0.615856,0.655664,...,0.548006,0.580368,0.621652,0.577082,0.711462,0.53692,0.801541,0.819048,0.690987,0.699482
arrive in 4 hours,0.437028,0.451154,0.448142,0.446698,0.442586,0.425328,0.410511,0.413195,0.425655,0.445796,...,0.381093,0.33865,0.348214,0.375927,0.521739,0.331224,0.472062,0.47619,0.32618,0.34715
thrombolysis_all_arrivals,0.134193,0.136049,0.137166,0.138402,0.140033,0.128755,0.125462,0.122126,0.129636,0.131341,...,0.098966,0.094479,0.077009,0.158285,0.201581,0.142405,0.132948,0.119048,0.145923,0.160622
age,75.118,75.412,75.282,75.28,75.265,74.646,74.799,74.698,75.386,75.329,...,75.717,72.971,74.599,73.52,72.955,74.156,75.908,75.95,72.5,72.873
infarction,0.849,0.851,0.854,0.851,0.846,0.847,0.847,0.852,0.851,0.866,...,0.837,0.83,0.837,0.875,0.909,0.866,0.849,0.835,0.776,0.761
precise_onset_known,0.632,0.63,0.641,0.638,0.633,0.626,0.622,0.582,0.621,0.581,...,0.884,0.884,0.901,0.044,0.03,0.022,0.498,0.46,0.763,0.761
onset_during_sleep,0.047,0.045,0.046,0.048,0.046,0.05,0.046,0.055,0.034,0.062,...,0.058,0.054,0.048,0.004,0.0,0.006,0.073,0.08,0.0,0.0
afib_anticoagulant,0.19,0.495,0.42,0.147,0.156,0.158,0.166,0.175,0.424,0.419,...,0.128,0.17,0.205,0.112,0.106,0.118,0.139,0.12,0.053,0.06
prior_disability,1.1,1.091,1.089,1.128,1.123,1.07,1.099,1.029,1.087,1.089,...,1.295,0.793,0.798,0.513,0.591,0.497,1.016,0.99,0.487,0.448
prestroke_mrs_0-2,0.788,0.784,0.791,0.782,0.782,0.797,0.794,0.8,0.782,0.787,...,0.709,0.851,0.84,0.906,0.894,0.908,0.792,0.8,0.934,0.94
