# Creating a control non-thrombolysed patient data set with similar characteristics to thrombolysed patients

The aim of this notebook is to create 'control' non-thrombolysed patient data that has similar overall patient characteristics to the thrombolysed group of patients, emulating a clinical trial for thrombolysis.

Non-thrombolysed patients will be selected based using a nearest-neighbour method based on key patient characteristics. 

## Import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Turn off warnings
import warnings
warnings.filterwarnings("ignore")

## Import data

In [2]:
all_data = pd.read_csv('../output/reformatted_data.csv')

# Limit to years 2017 to 2019
mask = (all_data['year'] >= 2017) & (all_data['year'] <= 2019)
data = all_data[mask]

# Limit to infarction stroke
mask = (data['infarction'] == 1)
data = data[mask]

# Limit to arrivals by ambulace
mask = (data['arrive_by_ambulance'] == 1)
data = data[mask]

# Remove patiens who have received thrombectomy
mask = (data['thrombectomy'] == 0)

Limit to patient clinical characteristics (not arrival timings).

In [3]:
required_cols = [
    'age',
    'male',
    'infarction',
    'thrombolysis',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant',
    'prior_disability',
    'stroke_severity',
    'nihss_complete',
    'nihss_arrival_loc',
    'nihss_arrival_loc_questions',
    'nihss_arrival_loc_commands',
    'nihss_arrival_best_gaze',
    'nihss_arrival_visual',
    'nihss_arrival_facial_palsy',
    'nihss_arrival_motor_arm_left',
    'nihss_arrival_motor_arm_right',
    'nihss_arrival_motor_leg_left',
    'nihss_arrival_motor_leg_right',
    'nihss_arrival_limb_ataxia',
    'nihss_arrival_sensory',
    'nihss_arrival_best_language',
    'nihss_arrival_dysarthria',
    'nihss_arrival_extinction_inattention',
    'discharge_destination',
    'death',
    'discharge_disability',
    'disability_6_month'
    ]

In [4]:
data = data[required_cols]

# Keep only complete cases
data = data.dropna()
data.shape

(35860, 32)

## Split data by use of thrombolysis

In [5]:
# Split the data into two groups
data_thrombolysis = data[data['thrombolysis'] == 1]
data_no_thrombolysis = data[data['thrombolysis'] == 0]

# Shuffle the data
data_thrombolysis = data_thrombolysis.sample(frac=1, random_state=42)
data_no_thrombolysis = data_no_thrombolysis.sample(frac=1, random_state=42)

## Select the columns to be used for nearest neighbour

In [6]:
nn_cols = [
    'prior_disability',
    'stroke_severity',
    'age',
    'male',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant'
]

## Standardise the data

In [7]:
# Concatenate the data
concatenated_data = pd.concat([data_thrombolysis, data_no_thrombolysis])
# Scale the data based on concatenated data
scaler = StandardScaler()
scaler.fit(concatenated_data[nn_cols])
data_thrombolysis_standardised = scaler.transform(data_thrombolysis[nn_cols])
data_no_thrombolysis_standardised = scaler.transform(data_no_thrombolysis[nn_cols])

## Find nearest neighbours to each thrombolysed patients

We will find the three nearest neighbours 

In [8]:
# Set up nearest neighbour engine
nn = NearestNeighbors(n_neighbors=3, algorithm='auto').fit(
    data_no_thrombolysis_standardised)

# Loop through the data_no_thrombolysis_standardised data and find the nearest neighbour in data_thrombolysis_standardised
distances, indices = nn.kneighbors(data_thrombolysis_standardised)

# Get the indices of the nearest neighbours
indices = pd.Series(indices.reshape(len(indices.flatten())))

# Drop duplicate rows
indices = indices.drop_duplicates()

In [9]:
nearest_neighbour_size = len(indices)
sample_size = data_thrombolysis.shape[0]

print (f'Nearest neighbours found: {nearest_neighbour_size}')
print (f'Required sample size: {sample_size}')

# Sample the data
replacement = True if nearest_neighbour_size < sample_size else False
indicies = indices.sample(sample_size, replace=replacement, random_state=42).values
sampled_no_thrombolysis = data_no_thrombolysis.iloc[indicies]

Nearest neighbours found: 7201
Required sample size: 5822


## Show data statistics

### Summary statistics

In [10]:
results = pd.DataFrame()
results['all_data'] = data.mean()
results['all no thrombolysis'] = data_no_thrombolysis.mean()
results['sampled no thrombolysis'] = sampled_no_thrombolysis.mean()
results['thrombolysis'] = data_thrombolysis.mean()
results


Unnamed: 0,all_data,all no thrombolysis,sampled no thrombolysis,thrombolysis
age,75.07097,75.590585,74.215046,72.390072
male,0.525432,0.517445,0.5146,0.566644
infarction,1.0,1.0,1.0,1.0
thrombolysis,0.162354,0.0,0.0,1.0
congestive_heart_failure,0.051645,0.053665,0.06527,0.041223
hypertension,0.557306,0.565584,0.539677,0.5146
atrial_fibrillation,0.177635,0.191457,0.156819,0.106321
diabetes,0.216899,0.22598,0.233768,0.170045
prior_stroke_tia,0.264752,0.276583,0.283236,0.20371
afib_anticoagulant,0.120245,0.138225,0.057025,0.027482


### Full statisics

In [11]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,35860.0,75.07097,12.597642,37.5,67.5,77.5,82.5,92.5
male,35860.0,0.525432,0.49936,0.0,0.0,1.0,1.0,1.0
infarction,35860.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
thrombolysis,35860.0,0.162354,0.36878,0.0,0.0,0.0,0.0,1.0
congestive_heart_failure,35860.0,0.051645,0.221313,0.0,0.0,0.0,0.0,1.0
hypertension,35860.0,0.557306,0.496712,0.0,0.0,1.0,1.0,1.0
atrial_fibrillation,35860.0,0.177635,0.382211,0.0,0.0,0.0,0.0,1.0
diabetes,35860.0,0.216899,0.412139,0.0,0.0,0.0,0.0,1.0
prior_stroke_tia,35860.0,0.264752,0.441207,0.0,0.0,0.0,1.0,1.0
afib_anticoagulant,35860.0,0.120245,0.325253,0.0,0.0,0.0,0.0,1.0


In [12]:
data_no_thrombolysis.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,30038.0,75.590585,12.483915,37.5,67.5,77.5,87.5,92.5
male,30038.0,0.517445,0.499704,0.0,0.0,1.0,1.0,1.0
infarction,30038.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
thrombolysis,30038.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
congestive_heart_failure,30038.0,0.053665,0.22536,0.0,0.0,0.0,0.0,1.0
hypertension,30038.0,0.565584,0.495688,0.0,0.0,1.0,1.0,1.0
atrial_fibrillation,30038.0,0.191457,0.393455,0.0,0.0,0.0,0.0,1.0
diabetes,30038.0,0.22598,0.418233,0.0,0.0,0.0,0.0,1.0
prior_stroke_tia,30038.0,0.276583,0.447316,0.0,0.0,0.0,1.0,1.0
afib_anticoagulant,30038.0,0.138225,0.345142,0.0,0.0,0.0,0.0,1.0


In [13]:
sampled_no_thrombolysis.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,5822.0,74.215046,13.138357,37.5,67.5,77.5,82.5,92.5
male,5822.0,0.5146,0.49983,0.0,0.0,1.0,1.0,1.0
infarction,5822.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
thrombolysis,5822.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
congestive_heart_failure,5822.0,0.06527,0.247022,0.0,0.0,0.0,0.0,1.0
hypertension,5822.0,0.539677,0.498466,0.0,0.0,1.0,1.0,1.0
atrial_fibrillation,5822.0,0.156819,0.363661,0.0,0.0,0.0,0.0,1.0
diabetes,5822.0,0.233768,0.423263,0.0,0.0,0.0,0.0,1.0
prior_stroke_tia,5822.0,0.283236,0.450609,0.0,0.0,0.0,1.0,1.0
afib_anticoagulant,5822.0,0.057025,0.23191,0.0,0.0,0.0,0.0,1.0


In [14]:
data_thrombolysis.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,5822.0,72.390072,12.83998,37.5,62.5,72.5,82.5,92.5
male,5822.0,0.566644,0.495581,0.0,0.0,1.0,1.0,1.0
infarction,5822.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
thrombolysis,5822.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
congestive_heart_failure,5822.0,0.041223,0.198823,0.0,0.0,0.0,0.0,1.0
hypertension,5822.0,0.5146,0.49983,0.0,0.0,1.0,1.0,1.0
atrial_fibrillation,5822.0,0.106321,0.308274,0.0,0.0,0.0,0.0,1.0
diabetes,5822.0,0.170045,0.375704,0.0,0.0,0.0,0.0,1.0
prior_stroke_tia,5822.0,0.20371,0.40279,0.0,0.0,0.0,0.0,1.0
afib_anticoagulant,5822.0,0.027482,0.163497,0.0,0.0,0.0,0.0,1.0


## Save data

In [15]:
sampled_data = pd.concat([data_thrombolysis, sampled_no_thrombolysis])
sampled_data = sampled_data.sample(frac=1, random_state=42)    
sampled_data.to_csv('../output/nearest_neighbour_sampled_data.csv', index=False)

## Test model

Good outcome (mRS 0-2)

In [16]:
cols_for_X = [
    'age',
    'male',
    'infarction',
    'thrombolysis',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant',
    'prior_disability',
    'stroke_severity'
    ]

X = sampled_data[cols_for_X]
y = sampled_data['discharge_disability'] <= 2

# Build logistic regression model
logreg = linear_model.LogisticRegression(random_state=42)
logreg.fit(X, y)

# Get the coefficients
model_coefficients = pd.Series(logreg.coef_[0], index=X.columns)
model_coefficients

age                        -0.022933
male                        0.052471
infarction                  1.527013
thrombolysis                0.763280
congestive_heart_failure    0.189702
hypertension                0.037355
atrial_fibrillation        -0.054698
diabetes                   -0.052366
prior_stroke_tia            0.126180
afib_anticoagulant          0.244685
prior_disability           -0.634119
stroke_severity            -0.121995
dtype: float64

In [17]:
thrombolysis_odds = np.exp(model_coefficients['thrombolysis'])
print (f'Thrombolysis odds ratio {thrombolysis_odds:.2f}')

Thrombolysis odds ratio 2.15


Bad outcome (mRS 5-6)

In [18]:
y = sampled_data['discharge_disability'] >= 5

# Build logistic regression model
logreg = linear_model.LogisticRegression(random_state=42)
logreg.fit(X, y)

# Get the coefficients
model_coefficients = pd.Series(logreg.coef_[0], index=X.columns)
model_coefficients

age                         0.043692
male                        0.059291
infarction                 -3.777237
thrombolysis               -0.656666
congestive_heart_failure   -0.183524
hypertension               -0.176702
atrial_fibrillation         0.161942
diabetes                   -0.081930
prior_stroke_tia           -0.191174
afib_anticoagulant         -0.544605
prior_disability            0.477900
stroke_severity             0.135582
dtype: float64

In [19]:
thrombolysis_odds = np.exp(model_coefficients['thrombolysis'])
print (f'Thrombolysis odds ratio {thrombolysis_odds:.2f}')

Thrombolysis odds ratio 0.52
