# Creation of artificial (synthetic) patient data

Note: This artificial data is intended only for use in exploring the methods, using up to 10 features. The method of synthethis does not maintain any covariance between features (as feature values are created independetly of each other, to eliminate any risk of identifying original data), though average feature values for patients at each hopsital are approximately maintained. These data may be used to train models with minimal loss of accuracy.

The key methodology is:

* Remove thrombolysis label
* Group original data by hopsital
    * For each of 10 features take bootstrap samples of that feature
* Combine data across hospitals
* Remove any duplicate rows, or rows that are identical to original data
* Train an XGBoost model on original data to predict use of thrombolysis
* Use the XGBoost model to leabl the synthetic data

## Load packages

In [1]:
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

## Load data

In [2]:
# Load data
data_loc = '../data/samuel_1/10k_training_test/'
original_data = pd.read_csv(data_loc + 'cohort_10000_train.csv')

# Get stroke teams = 
stroke_teams = list(set(original_data['StrokeTeam']))
stroke_teams.sort()


## Create unlabelled synthetic data by bootstrap sampling from individual feature values

In [3]:
cases = 1000
synthetic_data_list = []

# Sample data for each stroke team
for stroke_team in stroke_teams:
    # Set up data frame for synthetic team data
    synthetic_data = pd.DataFrame()
    
    # Get original team data
    mask = original_data['StrokeTeam'] == stroke_team
    team_data = original_data[mask]
    team_data_length = len(team_data)
    
    # Set team
    synthetic_data['StrokeTeam'] = np.repeat(stroke_team, cases)
    # Sample individual items from orioginal data with replacement

    synthetic_data['S2BrainImagingTime_min'] = np.random.choice(
        team_data['S2BrainImagingTime_min'], replace=True, size=cases)

    synthetic_data['S2StrokeType_Infarction'] = np.random.choice(
        team_data['S2StrokeType_Infarction'], replace=True, size=cases)

    synthetic_data['S2NihssArrival'] = np.random.choice(
        team_data['S2NihssArrival'], replace=True, size=cases)
    
    synthetic_data['S2RankinBeforeStroke'] = np.random.choice(
        team_data['S2RankinBeforeStroke'], replace=True, size=cases)
    
    synthetic_data['AFAnticoagulent_Yes'] = np.random.choice(
        team_data['AFAnticoagulent_Yes'], replace=True, size=cases)

    synthetic_data['S1OnsetToArrival_min'] = np.random.choice(
        team_data['S1OnsetToArrival_min'], replace=True, size=cases)
    
    synthetic_data['S1AgeOnArrival'] = np.random.choice(
        team_data['S1AgeOnArrival'], replace=True, size=cases)
    
    # Use the same random index for S1OnsetTimeType_Precise and S1OnsetDateType_Stroke
    random_index = np.random.randint(0, team_data_length, size=cases)

    synthetic_data['S1OnsetTimeType_Precise'] = \
            [team_data['S1OnsetTimeType_Precise'].iloc[i] for i in random_index]
    
    synthetic_data['S1OnsetDateType_Stroke during sleep'] = \
            [team_data['S1OnsetDateType_Stroke during sleep'].iloc[i] for i in random_index]    

    synthetic_data_list.append(synthetic_data)

# Concatenate lists
synthetic_data_df = pd.concat(synthetic_data_list)

# Shuffle data
synthetic_data_df = synthetic_data_df.sample(frac=1)

## Train a model on original data, to use to label synthetic data

In [4]:
# Load data
train = pd.read_csv(data_loc + 'cohort_10000_train.csv')
test = pd.read_csv(data_loc + 'cohort_10000_test.csv')

# Read in the names of the selected features for the model
number_of_features_to_use = 10
key_features = pd.read_csv('../data/samuel_1/feature_selection.csv')
key_features = list(key_features['feature'])[:number_of_features_to_use]
# And add the target feature name: S2Thrombolysis
key_features.append("S2Thrombolysis")

# Select features
train = train[key_features]
test = test[key_features]

# Get X and y
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix = 'team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix = 'team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)    

# Define model
model = XGBClassifier(verbosity=0, seed=42, learning_rate=0.5)

# Fit model
model.fit(X_train, y_train)

# Get predicted probabilities and class
y_probs = model.predict_proba(X_test)[:,1]
y_pred = y_probs > 0.5

# Show accuracy
accuracy = np.mean(y_pred == y_test)
print (f'Accuracy: {accuracy:.3f}')

Accuracy: 0.848


## Predict label for synthetic data set

In [5]:
# One hot encode hopsitals
X_features = key_features.copy(); X_features.remove('S2Thrombolysis')
X_synthetic = synthetic_data_df[X_features]
X_synthetic_hosp = pd.get_dummies(X_synthetic['StrokeTeam'], prefix = 'team')
X_synthetic = pd.concat([X_synthetic, X_synthetic_hosp], axis=1)
X_synthetic.drop('StrokeTeam', axis=1, inplace=True)

In [6]:
# Get predicted probabilities and class
y_probs = model.predict_proba(X_synthetic)[:,1]
y_pred = np.array([np.random.binomial(1, p) for p in y_probs])

# Ensure non-iscaemica stroke have no thrombolysis
mask = synthetic_data_df['S2StrokeType_Infarction'] == 0
y_pred[mask] = 0

synthetic_data_df['S2Thrombolysis'] = y_pred
# Save
synthetic_data_df.to_csv('./output/synthetic_10K_train.csv', index=False)

In [7]:
synthetic_data_df

Unnamed: 0,StrokeTeam,S2BrainImagingTime_min,S2StrokeType_Infarction,S2NihssArrival,S2RankinBeforeStroke,AFAnticoagulent_Yes,S1OnsetToArrival_min,S1AgeOnArrival,S1OnsetTimeType_Precise,S1OnsetDateType_Stroke during sleep,S2Thrombolysis
910,WKDIW7014B,221.0,1,0.0,2,1,135.0,82.5,1,0,0
563,KZKEZ2257Z,21.0,1,2.0,5,0,83.0,82.5,1,0,0
387,UJETD9177J,13.0,1,19.0,4,0,79.0,77.5,0,0,0
225,TQQYU0036V,13.0,1,2.0,0,0,127.0,67.5,1,0,0
843,ROOIZ9592Y,23.0,1,10.0,3,0,195.0,92.5,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
842,FLVXS2956M,5.0,1,10.0,4,0,44.0,72.5,0,0,0
316,KZKEZ2257Z,42.0,1,5.0,3,0,163.0,67.5,1,0,1
427,YQMZV4284N,48.0,1,4.0,0,0,92.0,72.5,0,1,0
228,DLNBB9786K,17.0,0,17.0,4,0,109.0,92.5,0,0,0


## Test synthetic data to train model

In [8]:
# Load data
train = synthetic_data_df
test = pd.read_csv(data_loc + 'cohort_10000_test.csv')

# Select features
train = train[key_features]
test = test[key_features]

# Get X and y
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix = 'team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix = 'team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)    

# Define model
model = XGBClassifier(verbosity=0, seed=42, learning_rate=0.5)

# Fit model
model.fit(X_train, y_train)

# Get predicted probabilities and class
y_probs = model.predict_proba(X_test)[:,1]
y_pred = y_probs > 0.5

# Show accuracy
accuracy = np.mean(y_pred == y_test)
print (f'Accuracy: {accuracy:.3f}')

Accuracy: 0.847
