# Save data for thrombolysis model

In [9]:
import pandas as pd

## Read in data and restrict fields

In [10]:

# Load the source CSV into `data`; the path is relative to the directory the
# notebook is run from. The cells below filter and reassign this same name.
data: pd.DataFrame = pd.read_csv('../data/data.csv')

Restrict years

In [11]:
# First and last year to keep (both inclusive)
year_start = 2017
year_end = 2019

# Keep rows whose `year` lies in [year_start, year_end]; `between` is
# inclusive at both ends by default
data = data[data['year'].between(year_start, year_end)]

Restrict stroke teams by minimum admissions and thrombolysis per year, then restrict patients by onset and arrival criteria

In [12]:
# Minimum yearly averages a stroke team must reach to be kept
minimum_admissions_per_year = 100
minimum_thrombolysis_per_year = 3
number_of_years = year_end - year_start + 1

# Per-team yearly averages over the selected years: admissions are counted as
# rows, thrombolysis as the sum of the `thrombolysis` column
team_groups = data.groupby('stroke team')
admissions_per_year = team_groups.size() / number_of_years
thrombolysis_per_year = team_groups['thrombolysis'].sum() / number_of_years

# A team is included only if it reaches both minimums
meets_minimums = (
    (admissions_per_year >= minimum_admissions_per_year)
    & (thrombolysis_per_year >= minimum_thrombolysis_per_year))
included_teams = meets_minimums[meets_minimums].index.tolist()

# Row-level inclusion criteria, combined into a single mask. Each condition
# is evaluated per row, so this keeps the same rows as applying the filters
# one after another.
rows_to_keep = (
    # Row belongs to an included stroke team
    data['stroke team'].isin(included_teams)
    # Drop rows with onset known = 0
    & (data['onset known'] == 1)
    # Drop rows with onset-to-arrival time of <= 0 mins
    & (data['onset-to-arrival time'] > 0)
    # Drop rows with onset-to-arrival time of > 240 mins
    & (data['onset-to-arrival time'] <= 240)
    # Drop those not arriving by ambulance (`== True` is kept rather than a
    # truthiness test because the column dtype is not visible here)
    & (data['arrive by ambulance'] == True)
)

# Take an explicit copy so the column assignment below acts on an independent
# frame rather than on the result of chained boolean indexing, which is what
# triggers pandas' SettingWithCopyWarning
data = data[rows_to_keep].copy()

# Censor arrival-to-scan time to 360 minutes
data['arrival-to-scan time'] = data['arrival-to-scan time'].clip(upper=360)

In [13]:
# Columns retained in the saved thrombolysis dataset
xgb_thrombolysis_fields: list = [
    'stroke team',
    'age',
    'infarction',
    'stroke severity',
    'onset-to-arrival time',
    'precise onset known',
    'onset during sleep',
    'use of AF anticoagulants',
    'prior disability',
    'arrival-to-scan time',
    'thrombolysis',
]

# Keep only the listed columns, in the listed order
data = data.loc[:, xgb_thrombolysis_fields]

In [14]:
# Summary statistics for the numeric columns, transposed so that each field
# is a row; the `count` column reveals any fields with missing values
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,62234.0,75.282322,13.237182,37.5,67.5,77.5,87.5,92.5
infarction,62234.0,0.849295,0.357764,0.0,1.0,1.0,1.0,1.0
stroke severity,62234.0,9.435598,8.406524,0.0,3.0,6.0,15.0,42.0
onset-to-arrival time,62234.0,113.227914,51.831917,1.0,73.0,102.0,147.0,240.0
precise onset known,62234.0,0.637079,0.480846,0.0,0.0,1.0,1.0,1.0
onset during sleep,62234.0,0.046743,0.211089,0.0,0.0,0.0,0.0,1.0
use of AF anticoagulants,62234.0,0.139586,0.34656,0.0,0.0,0.0,0.0,1.0
prior disability,62234.0,1.114535,1.445364,0.0,0.0,0.0,2.0,5.0
arrival-to-scan time,62158.0,52.038499,73.227005,1.0,14.0,26.0,51.0,360.0
thrombolysis,62234.0,0.30268,0.459422,0.0,0.0,0.0,1.0,1.0


## Save data

In [15]:
# Build the output name from the selected year range so the filename cannot
# drift from the data it holds; for 2017-2019 this is the same path as before
data.to_csv(
    f'../data/thrombolysis_xgb_data_{year_start}_{year_end}.csv',
    index=False)