# Creating a control non-thrombolysed patient data set with similar characteristics to thrombolysed patients

The aim of this notebook is to create 'control' non-thrombolysed patient data that has similar overall patient characteristics to the thrombolysed group of patients, emulating a clinical trial for thrombolysis.

Non-thrombolysed patients will be selected based using a nearest-neighbour method based on key patient characteristics. 

## Import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Turn off warnings
import warnings
warnings.filterwarnings("ignore")

# Set maximum number of rows to display
pd.options.display.max_rows = 100

## Import data

In [2]:
all_data = pd.read_csv('../output/reformatted_data.csv')

# Limit to years 2017 to 2019
mask = (all_data['year'] >= 2017) & (all_data['year'] <= 2019)
data = all_data[mask]

# Limit to infarction stroke
mask = (data['infarction'] == 1)
data = data[mask]

# Limit to arrivals by ambulace
mask = (data['arrive_by_ambulance'] == 1)
data = data[mask]

# Remove patients who have received thrombectomy
mask = (data['thrombectomy'] == 0)

# Remove patients with no recorded prior disability
mask = data['prior_disability'] >= 0
data = data[mask]

# Remove records with no recorded discharge_disability
mask = data['discharge_disability'] >= 0
data = data[mask]

# Remove records with negative onset_to_arrival_time
mask = data['onset_to_arrival_time'] <= 0
mask =  mask == False
data = data[mask]

# Remove 'unusual' travel times (outside range 0-1440 minutes for each step)
cols = [
    'onset_to_arrival_time',
    'call_to_ambulance_arrival_time',
    'ambulance_on_scene_time',
    'ambulance_travel_to_hospital_time',
    'ambulance_wait_time_at_hospital'
]

for col in cols:
    # Remove rows with negative values
    mask = data[col] < 0
    mask =  mask == False
    data = data[mask]
    # Remove rows greater than 1440 minutes
    mask = data[col] > 1440
    mask =  mask == False
    data = data[mask]

## Split data by use of thrombolysis

In [3]:
# Split the data into two groups
data_thrombolysis = data[data['thrombolysis'] == 1]
data_no_thrombolysis = data[data['thrombolysis'] == 0]

# Shuffle the data
data_thrombolysis = data_thrombolysis.sample(frac=1, random_state=42)
data_no_thrombolysis = data_no_thrombolysis.sample(frac=1, random_state=42)

## Select the columns to be used for nearest neighbour

In [4]:
nn_cols = [
    'prior_disability',
    'stroke_severity',
    'age',
    'male',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant'
]

## Standardise the data

In [5]:
# Concatenate the data
concatenated_data = pd.concat([data_thrombolysis, data_no_thrombolysis])
# Scale the data based on concatenated data
scaler = StandardScaler()
scaler.fit(concatenated_data[nn_cols])
data_thrombolysis_standardised = scaler.transform(data_thrombolysis[nn_cols])
data_no_thrombolysis_standardised = scaler.transform(data_no_thrombolysis[nn_cols])

## Find nearest neighbours to each thrombolysed patients

We will find the required number of nearest neighbour points (matching the size of the thrombolysis data set) by increasing the circle of nearest neighbours until the required number of nearest neighbour points have been found.

In [6]:
required_sample_size = data_thrombolysis.shape[0]
selected_sample_size = 0
selected_ids = []
nearest_neighbour_limit = 0

# Loop until the required sample size is reached
while selected_sample_size < required_sample_size:

    # Increment the nearest neighbour limit
    nearest_neighbour_limit += 1

    # Set up nearest neighbour engine based on no-thrombolysis
    nn = NearestNeighbors(n_neighbors=nearest_neighbour_limit, algorithm='auto').fit(
        data_no_thrombolysis_standardised)
    
    # Get the indices of the nearest neighbours to thrombolysis patients
    distances, indices = nn.kneighbors(data_thrombolysis_standardised)
    indices = pd.Series(indices.reshape(len(indices.flatten())))

    # Get the set of unique indices, combined with previously selected indices
    combined_ids = list(set(indices).union(set(selected_ids)))

    # If the combined indices are less than the required sample size, then
    # select all the combined indices
    if len(combined_ids) < required_sample_size:
        selected_ids = combined_ids
    # Otherwise, sample the required number of new indices
    else:
        # Find unique indices that are not in previously selected indices
        unique_new_ids = list(set(combined_ids) - set(selected_ids))
        number_of_new_ids = required_sample_size - len(selected_ids)
        # Sample the required number of new indices
        new_ids = np.random.choice(unique_new_ids, number_of_new_ids, replace=False)
        # Combine the new indices with previously selected indices
        selected_ids = list(set(selected_ids).union(set(new_ids)))

    # Update the selected sample size
    selected_sample_size = len(selected_ids)

# Select the required data
sampled_no_thrombolysis = data_no_thrombolysis.iloc[selected_ids]
print (f'Last nearest neighbour limit: {nearest_neighbour_limit}')

Last nearest neighbour limit: 3


## Show data statistics

### Summary statistics

In [7]:
results = pd.DataFrame()
results['all_data'] = data.mean()
results['all no thrombolysis'] = data_no_thrombolysis.mean()
results['sampled no thrombolysis'] = sampled_no_thrombolysis.mean()
results['thrombolysis'] = data_thrombolysis.mean()
results

Unnamed: 0,all_data,all no thrombolysis,sampled no thrombolysis,thrombolysis
id,140077.420055,139745.673837,139171.668306,141477.401375
age,75.825248,76.537734,76.286942,72.81853
male,0.511561,0.502568,0.488554,0.549511
infarction,1.0,1.0,1.0,1.0
onset_to_arrival_time,384.614876,449.85712,457.716416,109.290193
onset_known,0.797249,0.751735,0.715623,0.989321
precise_onset_known,0.429029,0.338127,0.286915,0.812635
onset_during_sleep,0.145103,0.17747,0.194607,0.008512
arrive_by_ambulance,1.0,1.0,1.0,1.0
call_to_ambulance_arrival_time,28.882604,30.444573,28.46156,22.393842


## Save data

In [8]:
sampled_data = pd.concat([data_thrombolysis, sampled_no_thrombolysis])
sampled_data = sampled_data.sample(frac=1, random_state=42)    
sampled_data.to_csv('../output/nearest_neighbour_sampled_data.csv', index=False)

## Build logistic regresssion models to test the models

Build binary classification models for two outcomes:

1. Good outcome (mRS 0-2)
2. Bad outcome (mRS 5-6)

### Predict good outcome (mRS 0-2)

In [9]:
X_cols = [
    'age',
    'male',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant',
    'prior_disability',
    'stroke_severity',
    'thrombolysis'
    ]

# Split the data into X and y
X = sampled_data[X_cols]
y = sampled_data['discharge_disability'] <= 2

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

# Predict the test set
y_pred = lr.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print (f'Accuracy: {accuracy:0.2f}')

# Get the model coefficients
model_coefficients = pd.DataFrame(index=X_cols)
model_coefficients['coefficient'] = lr.coef_[0]
model_coefficients['odds ratio'] = np.exp(lr.coef_[0])

# Sort model coefficients by absolute value of coefficient
model_coefficients = model_coefficients.reindex(
    model_coefficients['coefficient'].abs().sort_values(ascending=False).index)

# Print the model coefficients
print ('\nModel coefficients sorted by absolute value of coefficient:\n')
print (model_coefficients)


Accuracy: 0.77

Model coefficients sorted by absolute value of coefficient:

                          coefficient  odds ratio
thrombolysis                 0.678469    1.970858
prior_disability            -0.633408    0.530780
stroke_severity             -0.152286    0.858743
atrial_fibrillation         -0.131559    0.876727
afib_anticoagulant           0.123528    1.131481
diabetes                    -0.105814    0.899592
prior_stroke_tia             0.073230    1.075978
male                         0.057128    1.058791
congestive_heart_failure    -0.041858    0.959006
age                         -0.031491    0.969000
hypertension                -0.005218    0.994795


### Predict bad outcome (mRS 5-6)

In [10]:
X_cols = [
    'age',
    'male',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant',
    'prior_disability',
    'stroke_severity',
    'thrombolysis'
    ]

# Split the data into X and y
X = sampled_data[X_cols]
y = sampled_data['discharge_disability'] >= 6

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

# Predict the test set
y_pred = lr.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print (f'Accuracy: {accuracy:0.2f}')

# Get the model coefficients
model_coefficients = pd.DataFrame(index=X_cols)
model_coefficients['coefficient'] = lr.coef_[0]
model_coefficients['odds ratio'] = np.exp(lr.coef_[0])

# Sort model coefficients by absolute value of coefficient
model_coefficients = model_coefficients.reindex(
    model_coefficients['coefficient'].abs().sort_values(ascending=False).index)

# Print the model coefficients
print ('\nModel coefficients sorted by absolute value of coefficient:\n')
print (model_coefficients)

Accuracy: 0.83

Model coefficients sorted by absolute value of coefficient:

                          coefficient  odds ratio
thrombolysis                -0.410661    0.663212
congestive_heart_failure     0.277498    1.319823
atrial_fibrillation          0.247215    1.280455
diabetes                     0.227613    1.255600
male                         0.195270    1.215639
prior_stroke_tia            -0.175120    0.839356
stroke_severity              0.143347    1.154130
afib_anticoagulant           0.137967    1.147938
prior_disability             0.084731    1.088425
age                          0.043134    1.044078
hypertension                 0.009162    1.009204
