# Creation of artificial (synthetic) patient data

Note: This artificial data is intended only for use in exploring the methods, using up to 10 features. The method of synthesis does not maintain any covariance between features, though average feature values for patients at each hospital are approximately maintained.



In [1]:
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [2]:
# Read the training cohort that the synthetic data will be sampled from
data_loc = '../data/10k_training_test/'
original_data = pd.read_csv(data_loc + 'cohort_10000_train.csv')

# Sorted list of the unique stroke team identifiers present in the cohort
stroke_teams = sorted(set(original_data['StrokeTeam']))


In [3]:
# Number of synthetic patients to generate for every stroke team
cases = 1000

# Features copied into the synthetic data. The order here fixes both the
# column order of the output and the order in which random draws are made.
sampled_features = [
    'S2BrainImagingTime_min',
    'S2StrokeType_Infarction',
    'S2NihssArrival',
    'S1OnsetTimeType_Precise',
    'S2RankinBeforeStroke',
    'AFAnticoagulent_Yes',
    'S1OnsetToArrival_min',
    'S1OnsetDateType_Stroke during sleep',
    'S1AgeOnArrival',
]

synthetic_data_list = []

# Build one synthetic data frame per stroke team
for stroke_team in stroke_teams:
    # Restrict the original data to patients attending this team
    mask = original_data['StrokeTeam'] == stroke_team
    team_data = original_data[mask]

    # Start the team's synthetic frame with the team label on every row
    synthetic_data = pd.DataFrame()
    synthetic_data['StrokeTeam'] = np.repeat(stroke_team, cases)

    # Sample each feature independently, with replacement, from the team's
    # original values. Independent draws keep each feature's per-team
    # distribution but deliberately break any covariance between features.
    # NOTE(review): an earlier comment here stated that times are rounded to
    # 5 minutes; no rounding is applied in this cell, so any rounding must
    # already be present in the source data - confirm.
    for feature in sampled_features:
        synthetic_data[feature] = np.random.choice(
            team_data[feature], replace=True, size=cases)

    synthetic_data_list.append(synthetic_data)

# Stack the per-team frames into a single data frame
synthetic_data_df = pd.concat(synthetic_data_list)

# Shuffle rows so that teams are no longer in contiguous blocks
synthetic_data_df = synthetic_data_df.sample(frac=1)

## Train a model on training data, to use to label synthetic data

In [4]:
# Load the real training and test cohorts
train = pd.read_csv(data_loc + 'cohort_10000_train.csv')
test = pd.read_csv(data_loc + 'cohort_10000_test.csv')

# Read in the names of the selected features for the model
number_of_features_to_use = 10
key_features = pd.read_csv('./output/feature_selection.csv')
key_features = list(key_features['feature'])[:number_of_features_to_use]
# And add the target feature name: S2Thrombolysis
key_features.append('S2Thrombolysis')

# Restrict both cohorts to the selected features plus the target
train = train[key_features]
test = test[key_features]

# Split into features (X) and target (y)
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals, replacing the StrokeTeam column with one
# 'team_<name>' indicator column per team
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix = 'team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix = 'team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# The two frames are encoded independently, so a team that appears in only
# one of them would leave the columns mismatched. Align the test columns to
# the training columns: a team absent from the test set becomes an all-zero
# indicator, and a team unseen in training (which the model has no column
# for) is dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Define model
model = XGBClassifier(verbosity=0, seed=42, learning_rate=0.5)

# Fit model
model.fit(X_train, y_train)

# Get predicted probability of the positive class, and classify at 0.5
y_probs = model.predict_proba(X_test)[:,1]
y_pred = y_probs > 0.5

# Show accuracy (proportion of test patients classified correctly)
accuracy = np.mean(y_pred == y_test)
print (f'Accuracy: {accuracy:.3f}')

Accuracy: 0.848


## Predict label for synthetic data set

In [5]:
# One hot encode hospitals, replacing the StrokeTeam column with one
# 'team_<name>' indicator column per team
X_synthetic = synthetic_data_df
X_synthetic_hosp = pd.get_dummies(X_synthetic['StrokeTeam'], prefix = 'team')
X_synthetic = pd.concat([X_synthetic, X_synthetic_hosp], axis=1)
X_synthetic.drop('StrokeTeam', axis=1, inplace=True)

# Present the columns in exactly the order the model was fitted on, rather
# than relying on the synthetic frame happening to share that order.
# Selecting by label raises a KeyError if a required column is missing.
X_synthetic = X_synthetic[X_train.columns]

# Get predicted probability of the positive class, and classify at 0.5
y_probs = model.predict_proba(X_synthetic)[:,1]
y_pred = y_probs > 0.5
# Store the predicted class as the synthetic label (0.0 or 1.0)
synthetic_data_df['S2Thrombolysis'] = y_pred * 1.0

# Save
synthetic_data_df.to_csv('./output/synthetic_10K_train.csv', index=False)

## Test synthetic data to train model

In [6]:
# Train on the labelled synthetic data; test on the real test cohort
train = synthetic_data_df
test = pd.read_csv(data_loc + 'cohort_10000_test.csv')

# Read in the names of the selected features for the model
number_of_features_to_use = 10
key_features = pd.read_csv('./output/feature_selection.csv')
key_features = list(key_features['feature'])[:number_of_features_to_use]
# And add the target feature name: S2Thrombolysis
key_features.append('S2Thrombolysis')

# Restrict both data sets to the selected features plus the target
train = train[key_features]
test = test[key_features]

# Split into features (X) and target (y)
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals, replacing the StrokeTeam column with one
# 'team_<name>' indicator column per team
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix = 'team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix = 'team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# The two frames are encoded independently, so a team that appears in only
# one of them would leave the columns mismatched. Align the test columns to
# the training columns: a team absent from the test set becomes an all-zero
# indicator, and a team unseen in training (which the model has no column
# for) is dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Define model
model = XGBClassifier(verbosity=0, seed=42, learning_rate=0.5)

# Fit model
model.fit(X_train, y_train)

# Get predicted probability of the positive class, and classify at 0.5
y_probs = model.predict_proba(X_test)[:,1]
y_pred = y_probs > 0.5

# Show accuracy (proportion of test patients classified correctly)
accuracy = np.mean(y_pred == y_test)
print (f'Accuracy: {accuracy:.3f}')

Accuracy: 0.849


## Create k-fold data sets

In [7]:
# Stratify by thrombolysis and stroke team: each row gets a combined
# '<team>-<label>' key so every fold keeps the per-team label balance
strat = (synthetic_data_df['StrokeTeam'].map(str) + '-'
         + synthetic_data_df['S2Thrombolysis'].map(str))

# Set up splits
number_of_splits = 5
skf = StratifiedKFold(n_splits = number_of_splits)

# Stratification labels and column names
y = strat.values
X_col_names = list(synthetic_data_df)

# Loop through the k-fold splits. Rows are taken positionally with .iloc
# directly from the data frame, so column dtypes are preserved instead of
# being collapsed to a single object dtype by a round trip through NumPy.
for counter, (train_index, test_index) in enumerate(
        skf.split(synthetic_data_df, y)):

    # Get train and test rows for this fold
    train = synthetic_data_df.iloc[train_index]
    test = synthetic_data_df.iloc[test_index]

    # Save
    train.to_csv(f'./output/kfold_5fold/synth_train_{counter}.csv', index=False)
    test.to_csv(f'./output/kfold_5fold/synth_test_{counter}.csv', index=False)