# Creation of synthetic data

Synthetic data created here is based on simple sampling from patient features. No correlations are maintained between features (except that all patients marked as being on anticoagulants for atrial fibrillation are also marked as having atrial fibrillation). The data is used simply for demonstration purposes. Data is sampled for each hospital, but the hospital ID is replaced by a code, and all hospitals have the same amount of data generated (500 patients).

The patients are labelled for thrombolysis use and outcome by using the output of pretrained models.

In [1]:
import pickle
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

from utils.artificial_data import ArtificialPatientData
from utils.artificial_pathway_data import ArtificialPathwayData

In [2]:
# Helper object (from utils.artificial_data) used below to generate the artificial patient data.
apd = ArtificialPatientData()

In [3]:
# Set to False to skip regeneration; the following cells then read whatever file already exists.
# NOTE(review): presumably create_artificial_data writes the CSV loaded in the
# "Compare data" section - confirm in utils.artificial_data.
create_new_data = True
if create_new_data:
    apd.create_artificial_data(patients_per_hospital=500)    

## Compare data

In [4]:
# Load the artificial data, then the real data restricted to the same columns
artificial_data = pd.read_csv(
    './data/artificial_ml_data/artificial_ml_data.csv', low_memory=False)
fields = artificial_data.columns.tolist()
original_data = pd.read_csv(
    './data/data_for_ml/complete_ml_data.csv', low_memory=False)[fields]
# Sorted array of the distinct stroke team names found in the real data
stroke_teams = np.sort(original_data['stroke_team'].unique())

In [5]:
# Column means of the real and artificial data side by side (stroke_team is not numeric)
comparison = pd.DataFrame({
    'original': original_data.drop(columns='stroke_team').mean(),
    'artificial': artificial_data.drop(columns='stroke_team').mean(),
})
comparison.round(2)

Unnamed: 0,original,artificial
age,75.06,75.15
onset_to_arrival_time,115.13,115.63
onset_during_sleep,0.05,0.04
precise_onset_known,0.63,0.6
arrival_to_scan_time,43.35,42.64
infarction,0.85,0.85
stroke_severity,9.4,9.5
prior_disability,1.1,1.1
afib_anticoagulant,0.15,0.15
any_afib_diagnosis,0.26,0.15


## Test artificial data for training models

### Thrombolysis choice

Split real data into train and test

In [6]:
# Features used by the thrombolysis-choice model
X_thrombolysis_fields = [
    'stroke_team',
    'onset_to_arrival_time',
    'onset_during_sleep',
    'arrival_to_scan_time',
    'infarction',
    'stroke_severity',
    'precise_onset_known',
    'prior_disability',
    'afib_anticoagulant',
    'age',
]

# Hold out 25% of the real data for testing; fixed random_state keeps the split repeatable
thrombolysis_features = original_data[X_thrombolysis_fields]
thrombolysis_labels = original_data['thrombolysis']
X_train, X_test, y_train, y_test = train_test_split(
    thrombolysis_features, thrombolysis_labels, test_size=0.25, random_state=42)

Train model on artificial data

In [7]:
# Train the thrombolysis-choice model on the artificial data
X = artificial_data[X_thrombolysis_fields]
y = artificial_data['thrombolysis']
# One-hot encode stroke team, with the column order fixed by `stroke_teams`.
# The encoder's default sparse output is densified with `.toarray()` instead of
# passing `sparse=False`: that argument was removed in scikit-learn 1.4, and its
# replacement (`sparse_output`) does not exist before 1.2.
encoder = OneHotEncoder(categories=[stroke_teams])
one_hot_encoded = encoder.fit_transform(X[['stroke_team']]).toarray()
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=stroke_teams, index=X.index)
# Replace the text stroke_team column with its one-hot columns
X_one_hot = pd.concat([X, one_hot_encoded_df], axis=1).drop(columns='stroke_team')
# Define and fit model
choice_model_apd = XGBClassifier(verbosity=0, seed=42, learning_rate=0.5)
choice_model_apd.fit(X_one_hot, y)

Train model on real data.

In [8]:
# Train the thrombolysis-choice model on the real training split
X = X_train
y = y_train
# One-hot encode stroke team, with the column order fixed by `stroke_teams`.
# The encoder's default sparse output is densified with `.toarray()` instead of
# passing `sparse=False`: that argument was removed in scikit-learn 1.4, and its
# replacement (`sparse_output`) does not exist before 1.2.
encoder = OneHotEncoder(categories=[stroke_teams])
one_hot_encoded = encoder.fit_transform(X[['stroke_team']]).toarray()
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=stroke_teams, index=X.index)
# Replace the text stroke_team column with its one-hot columns
X_one_hot = pd.concat([X, one_hot_encoded_df], axis=1).drop(columns='stroke_team')
# Define and fit model
choice_model_original = XGBClassifier(verbosity=0, seed=42, learning_rate=0.5)
choice_model_original.fit(X_one_hot, y)

In [9]:
# Compare both thrombolysis-choice models on the held-out real test split
X = X_test
y = y_test
# One-hot encode stroke team, with the column order fixed by `stroke_teams`.
# The encoder's default sparse output is densified with `.toarray()` instead of
# passing `sparse=False`: that argument was removed in scikit-learn 1.4, and its
# replacement (`sparse_output`) does not exist before 1.2.
encoder = OneHotEncoder(categories=[stroke_teams])
one_hot_encoded = encoder.fit_transform(X[['stroke_team']]).toarray()
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=stroke_teams, index=X.index)
# Replace the text stroke_team column with its one-hot columns
X_one_hot = pd.concat([X, one_hot_encoded_df], axis=1).drop(columns='stroke_team')
# Positive-class probability from the model trained on real data
y_pred_original = choice_model_original.predict_proba(X_one_hot)[:, 1]
# Positive-class probability from the model trained on artificial data
y_pred_apd = choice_model_apd.predict_proba(X_one_hot)[:, 1]
# Get ROC AUC score
roc_auc_original = roc_auc_score(y, y_pred_original)
roc_auc_apd = roc_auc_score(y, y_pred_apd)
print(f'ROC AUC score for original data: {roc_auc_original:0.3f}')
print(f'ROC AUC score for artificial data: {roc_auc_apd:0.3f}')

ROC AUC score for original data: 0.917
ROC AUC score for artificial data: 0.919


### Outcome model

Split real data into train and test

In [10]:
# Features used by the outcome (discharge disability) model
X_outcome_fields = [
    'prior_disability',
    'stroke_severity',
    'stroke_team',
    'onset_to_thrombolysis',
    'age',
    'precise_onset_known',
    'any_afib_diagnosis',
    'infarction',
]

# Hold out 25% of the real data for testing; fixed random_state keeps the split repeatable
outcome_features = original_data[X_outcome_fields]
outcome_labels = original_data['discharge_disability']
X_train, X_test, y_train, y_test = train_test_split(
    outcome_features, outcome_labels, test_size=0.25, random_state=42)

Train model on artificial data

In [11]:
# Train the outcome model on the artificial data
X = artificial_data[X_outcome_fields]
y = artificial_data['discharge_disability']
# One-hot encode stroke team, with the column order fixed by `stroke_teams`.
# The encoder's default sparse output is densified with `.toarray()` instead of
# passing `sparse=False`: that argument was removed in scikit-learn 1.4, and its
# replacement (`sparse_output`) does not exist before 1.2.
encoder = OneHotEncoder(categories=[stroke_teams])
one_hot_encoded = encoder.fit_transform(X[['stroke_team']]).toarray()
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=stroke_teams, index=X.index)
# Replace the text stroke_team column with its one-hot columns
X_one_hot = pd.concat([X, one_hot_encoded_df], axis=1).drop(columns='stroke_team')
# Define and fit model
outcome_model_apd = XGBClassifier(verbosity=0, seed=42, learning_rate=0.5)
outcome_model_apd.fit(X_one_hot, y)

Train model on real data.

In [12]:
# Train the outcome model on the real training split
X = X_train
y = y_train
# One-hot encode stroke team, with the column order fixed by `stroke_teams`.
# The encoder's default sparse output is densified with `.toarray()` instead of
# passing `sparse=False`: that argument was removed in scikit-learn 1.4, and its
# replacement (`sparse_output`) does not exist before 1.2.
encoder = OneHotEncoder(categories=[stroke_teams])
one_hot_encoded = encoder.fit_transform(X[['stroke_team']]).toarray()
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=stroke_teams, index=X.index)
# Replace the text stroke_team column with its one-hot columns
X_one_hot = pd.concat([X, one_hot_encoded_df], axis=1).drop(columns='stroke_team')
# Define and fit model
outcome_model_original = XGBClassifier(verbosity=0, seed=42, learning_rate=0.5)
outcome_model_original.fit(X_one_hot, y)

In [13]:
# Compare both outcome models on the held-out real test split
X = X_test
y = y_test
# One-hot encode stroke team, with the column order fixed by `stroke_teams`.
# The encoder's default sparse output is densified with `.toarray()` instead of
# passing `sparse=False`: that argument was removed in scikit-learn 1.4, and its
# replacement (`sparse_output`) does not exist before 1.2.
encoder = OneHotEncoder(categories=[stroke_teams])
one_hot_encoded = encoder.fit_transform(X[['stroke_team']]).toarray()
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=stroke_teams, index=X.index)
# Replace the text stroke_team column with its one-hot columns
X_one_hot = pd.concat([X, one_hot_encoded_df], axis=1).drop(columns='stroke_team')
# Per-class probabilities from outcome_model_original (trained on real data)
y_pred_original = outcome_model_original.predict_proba(X_one_hot)
# Per-class probabilities from outcome_model_apd (trained on artificial data)
y_pred_apd = outcome_model_apd.predict_proba(X_one_hot)
# Multi-class ROC AUC: one-vs-one, macro-averaged
roc_auc_original = roc_auc_score(y, y_pred_original, multi_class='ovo', average='macro')
roc_auc_apd = roc_auc_score(y, y_pred_apd, multi_class='ovo', average='macro')
print(f'ROC AUC score for original data: {roc_auc_original:0.3f}')
print(f'ROC AUC score for artificial data: {roc_auc_apd:0.3f}')

ROC AUC score for original data: 0.808
ROC AUC score for artificial data: 0.812


## Anonymise stroke teams in artificial data and resave

In [14]:
# Guard against re-running this cell (or stale input): if the stroke_team column no
# longer holds the original names, .map() would silently turn every row into NaN
# and the mapping file would be overwritten with one that matches nothing.
unknown_team = ~artificial_data['stroke_team'].isin(stroke_teams)
if unknown_team.any():
    raise ValueError(
        f'{int(unknown_team.sum())} rows have a stroke_team that is not in stroke_teams; '
        'has the artificial data already been anonymised?')

# Shuffle into a separate array so `stroke_teams` keeps its sorted order for the
# one-hot encoding cells. The shuffle is unseeded, so the mapping differs on every
# run and the saved rename_dict is the only record of it.
shuffled_teams = np.random.permutation(stroke_teams)
rename_dict = {team: f'team_{i+1}' for i, team in enumerate(shuffled_teams)}
# Apply rename dict to artificial data stroke team column
artificial_data['stroke_team'] = artificial_data['stroke_team'].map(rename_dict)

# Save rename_dict as pickle (pickle is imported in the imports cell at the top)
with open('./data/artificial_ml_data/rename_dict.pkl', 'wb') as f:
    pickle.dump(rename_dict, f)

# Last expression in the cell, so the anonymised data is actually displayed
artificial_data.head()

In [15]:
# Save the artificial data with the anonymised stroke team codes.
# Note: this is the same path the artificial data was read from in the "Compare data"
# section, so the file on disk now holds 'team_N' codes in place of stroke team names.
artificial_data.to_csv('./data/artificial_ml_data/artificial_ml_data.csv', index=False)

In [16]:
# Helper object (from utils.artificial_pathway_data) used to generate artificial pathway data.
# NOTE(review): where the output is written, and whether it uses the anonymised team
# codes, is not visible from this notebook - confirm in utils.artificial_pathway_data.
pathway = ArtificialPathwayData()

# Set to False to skip regenerating the pathway data.
create_new_data = True
if create_new_data:
    pathway.create_artificial_pathway_data(patients_per_hospital=500)