# Creating a control non-thrombolysed patient data set with similar characteristics to thrombolysed patients

The aim of this notebook is to create 'control' non-thrombolysed patient data that has similar overall patient characteristics to the thrombolysed group of patients, emulating a clinical trial for thrombolysis.

Non-thrombolysed patients will be selected based using a nearest-neighbour method based on key patient characteristics. 

## Import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Turn off warnings
import warnings
warnings.filterwarnings("ignore")

## Import data

In [2]:
all_data = pd.read_csv('../output/reformatted_data.csv')

# Limit to years 2017 to 2019
mask = (all_data['year'] >= 2017) & (all_data['year'] <= 2019)
data = all_data[mask]

# Limit to infarction stroke
mask = (data['infarction'] == 1)
data = data[mask]

# Limit to arrivals by ambulace
mask = (data['arrive_by_ambulance'] == 1)
data = data[mask]

# Remove patiens who have received thrombectomy
mask = (data['thrombectomy'] == 0)

# Remove records with empty prior disability
mask = data['prior_disability'] >= 0
data = data[mask]

# Remove records with empty discharge_disability
mask = data['discharge_disability'] >= 0
data = data[mask]

Limit to patient clinical characteristics (not arrival timings).

In [3]:
required_cols = [
    'age',
    'male',
    'infarction',
    'thrombolysis',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant',
    'prior_disability',
    'stroke_severity',
    'nihss_complete',
    'nihss_arrival_loc',
    'nihss_arrival_loc_questions',
    'nihss_arrival_loc_commands',
    'nihss_arrival_best_gaze',
    'nihss_arrival_visual',
    'nihss_arrival_facial_palsy',
    'nihss_arrival_motor_arm_left',
    'nihss_arrival_motor_arm_right',
    'nihss_arrival_motor_leg_left',
    'nihss_arrival_motor_leg_right',
    'nihss_arrival_limb_ataxia',
    'nihss_arrival_sensory',
    'nihss_arrival_best_language',
    'nihss_arrival_dysarthria',
    'nihss_arrival_extinction_inattention',
    'discharge_destination',
    'death',
    'discharge_disability',
    'disability_6_month'
    ]

In [4]:
data = data[required_cols]

## Split data by use of thrombolysis

In [5]:
# Split the data into two groups
data_thrombolysis = data[data['thrombolysis'] == 1]
data_no_thrombolysis = data[data['thrombolysis'] == 0]

# Shuffle the data
data_thrombolysis = data_thrombolysis.sample(frac=1, random_state=42)
data_no_thrombolysis = data_no_thrombolysis.sample(frac=1, random_state=42)

## Select the columns to be used for nearest neighbour

In [6]:
nn_cols = [
    'prior_disability',
    'stroke_severity',
    'age',
    'male',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant'
]

## Standardise the data

In [7]:
# Concatenate the data
concatenated_data = pd.concat([data_thrombolysis, data_no_thrombolysis])
# Scale the data based on concatenated data
scaler = StandardScaler()
scaler.fit(concatenated_data[nn_cols])
data_thrombolysis_standardised = scaler.transform(data_thrombolysis[nn_cols])
data_no_thrombolysis_standardised = scaler.transform(data_no_thrombolysis[nn_cols])

## Find nearest neighbours to each thrombolysed patients

We will find the three nearest neighbours 

In [8]:
required_sample_size = data_thrombolysis.shape[0]
selected_sample_size = 0
selected_ids = []
nearest_neighbour_limit = 0

while selected_sample_size < required_sample_size:
    # Increment the nearest neighbour limit
    nearest_neighbour_limit += 1

    # Set up nearest neighbour engine
    nn = NearestNeighbors(n_neighbors=nearest_neighbour_limit, algorithm='auto').fit(
        data_no_thrombolysis_standardised)
    
    # Get the indices of the nearest neighbours
    distances, indices = nn.kneighbors(data_thrombolysis_standardised)
    indices = pd.Series(indices.reshape(len(indices.flatten())))

    # Get the set of unique indices combined with selected indices
    combined_ids = list(set(indices).union(set(selected_ids)))

    # If the combined indices are less than the required sample size, then
    # select all the combined indices
    if len(combined_ids) < required_sample_size:
        selected_ids = combined_ids
    # Otherwise, sample the required number of new indices
    else:
        # Find unique indices that are not in previously selected indices
        unique_new_ids = list(set(combined_ids) - set(selected_ids))
        number_of_new_ids = required_sample_size - len(selected_ids)
        # Sample the required number of new indices
        new_ids = np.random.choice(unique_new_ids, number_of_new_ids, replace=False)
        # Combine the new indices with previously selected indices
        selected_ids = list(set(selected_ids).union(set(new_ids)))

    selected_sample_size = len(selected_ids)

sampled_no_thrombolysis = data_no_thrombolysis.iloc[selected_ids]
print (f'Last nearest neighbour limit: {nearest_neighbour_limit}')

Last nearest neighbour limit: 3


## Show data statistics

### Summary statistics

In [9]:
results = pd.DataFrame()
results['all_data'] = data.mean()
results['all no thrombolysis'] = data_no_thrombolysis.mean()
results['sampled no thrombolysis'] = sampled_no_thrombolysis.mean()
results['thrombolysis'] = data_thrombolysis.mean()
results


Unnamed: 0,all_data,all no thrombolysis,sampled no thrombolysis,thrombolysis
age,75.687547,76.235706,76.093083,72.801073
male,0.514274,0.507518,0.490051,0.549849
infarction,1.0,1.0,1.0,1.0
thrombolysis,0.159598,0.0,0.0,1.0
congestive_heart_failure,0.054734,0.057304,0.07605,0.041202
hypertension,0.548632,0.554871,0.548859,0.515783
atrial_fibrillation,0.201779,0.21616,0.221429,0.126055
diabetes,0.218464,0.226704,0.260079,0.17507
prior_stroke_tia,0.272018,0.283633,0.313574,0.210855
afib_anticoagulant,0.12903,0.146639,0.082613,0.036306


### Full statisics

In [10]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,120290.0,75.687547,13.071333,37.5,67.5,77.5,87.5,92.5
male,120290.0,0.514274,0.499798,0.0,0.0,1.0,1.0,1.0
infarction,120290.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
thrombolysis,120290.0,0.159598,0.366234,0.0,0.0,0.0,0.0,1.0
congestive_heart_failure,120290.0,0.054734,0.227462,0.0,0.0,0.0,0.0,1.0
hypertension,120290.0,0.548632,0.497631,0.0,0.0,1.0,1.0,1.0
atrial_fibrillation,120290.0,0.201779,0.40133,0.0,0.0,0.0,0.0,1.0
diabetes,120290.0,0.218464,0.413205,0.0,0.0,0.0,0.0,1.0
prior_stroke_tia,120290.0,0.272018,0.445001,0.0,0.0,0.0,1.0,1.0
afib_anticoagulant,120290.0,0.12903,0.335234,0.0,0.0,0.0,0.0,1.0


In [11]:
data_no_thrombolysis.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,101092.0,76.235706,12.909125,37.5,67.5,77.5,87.5,92.5
male,101092.0,0.507518,0.499946,0.0,0.0,1.0,1.0,1.0
infarction,101092.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
thrombolysis,101092.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
congestive_heart_failure,101092.0,0.057304,0.232424,0.0,0.0,0.0,0.0,1.0
hypertension,101092.0,0.554871,0.496983,0.0,0.0,1.0,1.0,1.0
atrial_fibrillation,101092.0,0.21616,0.411626,0.0,0.0,0.0,0.0,1.0
diabetes,101092.0,0.226704,0.418702,0.0,0.0,0.0,0.0,1.0
prior_stroke_tia,101092.0,0.283633,0.450763,0.0,0.0,0.0,1.0,1.0
afib_anticoagulant,101092.0,0.146639,0.353747,0.0,0.0,0.0,0.0,1.0


In [12]:
sampled_no_thrombolysis.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,19198.0,76.093083,13.239727,37.5,67.5,77.5,87.5,92.5
male,19198.0,0.490051,0.499914,0.0,0.0,0.0,1.0,1.0
infarction,19198.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
thrombolysis,19198.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
congestive_heart_failure,19198.0,0.07605,0.265084,0.0,0.0,0.0,0.0,1.0
hypertension,19198.0,0.548859,0.49762,0.0,0.0,1.0,1.0,1.0
atrial_fibrillation,19198.0,0.221429,0.41522,0.0,0.0,0.0,0.0,1.0
diabetes,19198.0,0.260079,0.438689,0.0,0.0,0.0,1.0,1.0
prior_stroke_tia,19198.0,0.313574,0.463958,0.0,0.0,0.0,1.0,1.0
afib_anticoagulant,19198.0,0.082613,0.275303,0.0,0.0,0.0,0.0,1.0


In [13]:
data_thrombolysis.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,19198.0,72.801073,13.533156,37.5,62.5,72.5,82.5,92.5
male,19198.0,0.549849,0.497522,0.0,0.0,1.0,1.0,1.0
infarction,19198.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
thrombolysis,19198.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
congestive_heart_failure,19198.0,0.041202,0.198763,0.0,0.0,0.0,0.0,1.0
hypertension,19198.0,0.515783,0.499764,0.0,0.0,1.0,1.0,1.0
atrial_fibrillation,19198.0,0.126055,0.33192,0.0,0.0,0.0,0.0,1.0
diabetes,19198.0,0.17507,0.380037,0.0,0.0,0.0,0.0,1.0
prior_stroke_tia,19198.0,0.210855,0.407926,0.0,0.0,0.0,0.0,1.0
afib_anticoagulant,19198.0,0.036306,0.187055,0.0,0.0,0.0,0.0,1.0


## Save data

In [14]:
sampled_data = pd.concat([data_thrombolysis, sampled_no_thrombolysis])
sampled_data = sampled_data.sample(frac=1, random_state=42)    
sampled_data.to_csv('../output/nearest_neighbour_sampled_data.csv', index=False)

## Build test logistic regresssion models

### Predict good outcome (mRS <= 2)

In [15]:
X_cols = [
    'age',
    'male',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant',
    'prior_disability',
    'stroke_severity',
    'thrombolysis'
    ]

# Split the data into X and y
X = sampled_data[X_cols]
y = sampled_data['discharge_disability'] <= 2

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

# Predict the test set
y_pred = lr.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print (f'Accuracy: {accuracy:0.2f}')

# Get the model coefficients
model_coefficients = pd.DataFrame(index=X_cols)
model_coefficients['coefficient'] = lr.coef_[0]
model_coefficients['odds ratio'] = np.exp(lr.coef_[0])

# Sort model coefficients by absolute value of coefficient
model_coefficients = model_coefficients.reindex(
    model_coefficients['coefficient'].abs().sort_values(ascending=False).index)

# Print the model coefficients
print ('Model coefficients sorted by absolute value of coefficient')
print(model_coefficients)


Accuracy: 0.78
Model coefficients sorted by absolute value of coefficient
                          coefficient  odds ratio
thrombolysis                 0.719841    2.054107
prior_disability            -0.619582    0.538170
diabetes                    -0.161391    0.850959
stroke_severity             -0.154135    0.857156
atrial_fibrillation         -0.140397    0.869013
prior_stroke_tia             0.105123    1.110848
male                         0.056432    1.058055
age                         -0.031384    0.969103
congestive_heart_failure     0.024473    1.024775
afib_anticoagulant           0.013849    1.013945
hypertension                -0.006064    0.993954


### Predict bad outcome (mRS >=5)

In [16]:
X_cols = [
    'age',
    'male',
    'congestive_heart_failure',
    'hypertension',
    'atrial_fibrillation',
    'diabetes',
    'prior_stroke_tia',
    'afib_anticoagulant',
    'prior_disability',
    'stroke_severity',
    'thrombolysis'
    ]

# Split the data into X and y
X = sampled_data[X_cols]
y = sampled_data['discharge_disability'] >= 6

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

# Predict the test set
y_pred = lr.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print (f'Accuracy: {accuracy:0.2f}')

# Get the model coefficients
model_coefficients = pd.DataFrame(index=X_cols)
model_coefficients['coefficient'] = lr.coef_[0]
model_coefficients['odds ratio'] = np.exp(lr.coef_[0])

# Sort model coefficients by absolute value of coefficient
model_coefficients = model_coefficients.reindex(
    model_coefficients['coefficient'].abs().sort_values(ascending=False).index)

# Print the model coefficients
print ('Model coefficients sorted by absolute value of coefficient')
print(model_coefficients)

Accuracy: 0.84
Model coefficients sorted by absolute value of coefficient
                          coefficient  odds ratio
thrombolysis                -0.413464    0.661356
congestive_heart_failure     0.305498    1.357300
male                         0.263105    1.300964
diabetes                     0.236626    1.266967
atrial_fibrillation          0.234321    1.264051
afib_anticoagulant           0.187339    1.206037
stroke_severity              0.146912    1.158252
prior_stroke_tia            -0.144514    0.865443
prior_disability             0.078528    1.081693
age                          0.043829    1.044804
hypertension                -0.010902    0.989157
