# Save data for thrombolysis model

In [8]:
import pandas as pd

## Read in data and restrict fields

In [9]:

# Load the source CSV; the cells below progressively filter this frame
data: pd.DataFrame = pd.read_csv(filepath_or_buffer='../data/data.csv')

Restrict the data to admissions in the years `year_start` to `year_end` (inclusive).

In [10]:
# Inclusive bounds of the period to keep
year_start = 2017
year_end = 2019

# Keep only rows whose 'year' lies within [year_start, year_end]
data = data.loc[data['year'].between(year_start, year_end)]

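Restrict the data to stroke teams that meet a minimum average number of admissions and thrombolysis cases per year, censor arrival-to-scan time at 360 minutes, and keep only patients with an onset-to-arrival time of 0 to 240 minutes.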

In [11]:
# Thresholds a stroke team must meet, expressed as averages per year over
# the retained period
minimum_admissions_per_year = 100
minimum_thrombolysis_per_year = 3
number_of_years = year_end - year_start + 1

# Create groupby object, by stroke team
groupby = data.groupby('stroke team')

# Per-team yearly averages: number of rows (admissions) and sum of the
# 'thrombolysis' column
admissions_per_year = groupby.size() / number_of_years
thrombolysis_per_year = groupby['thrombolysis'].sum() / number_of_years

# A team is included only if it meets both thresholds
meets_thresholds = (
    (admissions_per_year >= minimum_admissions_per_year)
    & (thrombolysis_per_year >= minimum_thrombolysis_per_year))
included_teams = meets_thresholds[meets_thresholds].index.tolist()

# Keep only rows from included teams with an onset-to-arrival time between
# 0 and 240 mins (inclusive); rows with a missing time fail the comparison
# and are dropped
keep_row = (
    data['stroke team'].isin(included_teams)
    & data['onset-to-arrival time'].between(0, 240))

# Take an explicit copy so the column assignment below writes to an
# independent frame rather than to a slice of an earlier frame (which
# raises pandas' SettingWithCopyWarning)
data = data.loc[keep_row].copy()

# Censor arrival-to-scan time to 360 minutes
data['arrival-to-scan time'] = data['arrival-to-scan time'].clip(upper=360)

In [12]:
# Columns kept for the thrombolysis XGBoost data set, in output order
xgb_thrombolysis_fields: list = [
    'stroke team',
    'age',
    'infarction',
    'stroke severity',
    'onset-to-arrival time',
    'precise onset known',
    'onset during sleep',
    'use of AF anticoagulants',
    'prior disability',
    'arrival-to-scan time',
    'thrombolysis',
]

# Reduce the frame to the selected columns only
data = data.loc[:, xgb_thrombolysis_fields]

In [13]:
# Summary statistics, transposed so that each field is shown as a row
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,69510.0,74.666451,13.428511,37.5,67.5,77.5,87.5,92.5
infarction,69510.0,0.852539,0.354567,0.0,1.0,1.0,1.0,1.0
stroke severity,69510.0,9.004402,8.326078,0.0,3.0,6.0,14.0,42.0
onset-to-arrival time,69510.0,112.968105,53.235907,0.0,72.0,102.0,149.0,240.0
precise onset known,69510.0,0.630873,0.482572,0.0,0.0,1.0,1.0,1.0
onset during sleep,69510.0,0.050309,0.218584,0.0,0.0,0.0,0.0,1.0
use of AF anticoagulants,69510.0,0.135448,0.342204,0.0,0.0,0.0,0.0,1.0
prior disability,69510.0,1.062034,1.420023,0.0,0.0,0.0,2.0,5.0
arrival-to-scan time,69420.0,56.234342,77.025687,1.0,15.0,28.0,56.0,360.0
thrombolysis,69510.0,0.292346,0.454844,0.0,0.0,0.0,1.0,1.0


## Save data

In [14]:
# Write the processed data without the index column. The year range in the
# file name is built from year_start/year_end so that it always matches the
# period the data was restricted to.
data.to_csv(
    f'../data/thrombolysis_xgb_data_{year_start}_{year_end}.csv',
    index=False)