In [1]:
import random
import numpy as np
import pandas as pd
from faker import Faker
fake = Faker()

# Seed every random source used by the data generator so that a fresh
# "Restart & Run All" reproduces the same synthetic records.
seed = 1234
np.random.seed(seed)
# Faker is seeded through the class-level `Faker.seed(...)` call; assigning to
# `fake.seed` merely sets an attribute and leaves the generator unseeded.
Faker.seed(seed)
# `random.seed` must be *called*: assigning to it replaces the stdlib function
# with an int and leaves `random.choice` unseeded.
random.seed(seed)

In [2]:
def generate_data(num_records):
    """Generate synthetic patient and medical-history tables linked by ``ssn``.

    Each record draws, in order: smoker flag, sex, consent flag, a
    sex-dependent weight, an optional coin flip for the risk note, the
    complication outcome, a hospital code and a fake SSN. The order of the
    random draws is significant for reproducibility under a fixed seed.

    Parameters
    ----------
    num_records : int
        Number of patient/history record pairs to create.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``patients`` with columns ``smokes``, ``weight``, ``gave_consent``,
        ``ssn`` and ``histories`` with columns ``notes``,
        ``has_complication``, ``ssn``, ``hospital``.
    """
    patient_rows, history_rows = [], []

    for _ in range(num_records):
        smokes = np.random.rand() > 0.75
        is_male = np.random.rand() > 0.5
        gave_consent = np.random.rand() > 0.02

        # Weight distribution depends on sex.
        mean_weight, weight_sd = (80, 5.0) if is_male else (60, 3.0)
        weight = np.random.normal(loc=mean_weight, scale=weight_sd)

        # Start from a base rate and add the contribution of each risk factor.
        risk = 0.15
        if smokes:
            risk += 0.60
        if is_male:
            if weight > 90:
                risk += 0.70
            elif weight > 85:
                risk += 0.20
        if weight < 45:
            risk += 0.50

        # Elevated-risk patients are only labelled "high risk" half of the
        # time; the coin flip is drawn only when the threshold is exceeded.
        flagged = risk > 0.30 and np.random.rand() > 0.5
        notes = "high risk" if flagged else "normal risk"

        has_complication = np.random.rand() < risk
        smokes_cat = 'yes' if smokes else 'no'
        hospital = random.choice(["AL", "AK", "AR", "AZ"])
        ssn = fake.ssn()

        patient_rows.append((smokes_cat, weight, gave_consent, ssn))
        history_rows.append((notes, has_complication, ssn, hospital))

    patients = pd.DataFrame.from_records(
        patient_rows, columns=['smokes', 'weight', 'gave_consent', 'ssn'])
    histories = pd.DataFrame.from_records(
        history_rows, columns=['notes', 'has_complication', 'ssn', 'hospital'])
    return patients, histories

In [3]:
# Quick look at a handful of generated records, joined on the shared `ssn` key.
tmp_patients, tmp_histories = generate_data(10)
pd.merge(tmp_patients, tmp_histories, on="ssn")

Unnamed: 0,smokes,weight,gave_consent,ssn,notes,has_complication,hospital
0,no,83.312545,True,429-75-2175,normal risk,False,AZ
1,no,83.376232,True,625-61-3089,normal risk,False,AZ
2,no,75.653765,True,025-67-8228,normal risk,False,AR
3,no,64.274952,True,272-10-8089,normal risk,False,AR
4,no,83.021544,True,041-10-9047,normal risk,False,AK
5,no,79.084929,True,381-25-3122,normal risk,False,AL
6,yes,61.324315,True,130-23-3826,high risk,True,AK
7,yes,57.690382,True,516-80-6194,normal risk,True,AL
8,no,77.041701,True,383-69-6773,normal risk,False,AZ
9,no,80.21578,True,893-85-0715,normal risk,False,AZ


In [4]:
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline

from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential


In [5]:
# Generate independent train and test populations.
train_patients, train_histories = generate_data(10_000)
test_patients, test_histories = generate_data(4_000)

# All patients live in one table; each split is recovered by joining that
# table with the split's own histories on `ssn`.
merged_patients = pd.concat([train_patients, test_patients])

train_data = pd.merge(merged_patients, train_histories, on="ssn")
test_data = pd.merge(merged_patients, test_histories, on="ssn")

In [6]:
encode = ColumnTransformer(transformers=[
    ('numerical_features', StandardScaler(), ['weight']),
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), ['smokes']),
    ('textual_features', HashingVectorizer(ngram_range=(1, 2), n_features=10), 'notes')])

def create_mlp():
    """Build and compile the small feed-forward classifier used by the pipeline.

    Architecture: Dense(8, relu) -> Dropout(0.3) -> Dense(4, relu) ->
    Dense(2, softmax), compiled with sparse categorical cross-entropy and
    the Adam optimizer, tracking accuracy.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled (untrained) model.
    """
    nn = Sequential([
        Dense(8, activation='relu'), Dropout(0.3),
        Dense(4, activation='relu'),
        Dense(2, activation='softmax')])
    # `metrics` is passed as a list, the documented form; a bare string is
    # rejected by some Keras versions.
    nn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return nn

# Chain feature encoding and the Keras classifier into one estimator.
pipeline = Pipeline(steps=[
    ('features', encode),
    ('learner', KerasClassifier(create_mlp, epochs=5)),
])

# Train on the training split, then report the score on the clean test split.
model = pipeline.fit(train_data, train_data['has_complication'])
model.score(test_data, test_data['has_complication'])

2022-10-14 16:21:38.753319: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-14 16:21:38.794313: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.815500020980835

In [7]:
# Fresh test population whose weights are then deliberately corrupted.
corrupted_test_patients, corrupted_test_histories = generate_data(4000)

# Two independent 20% samples: the first is zeroed, the second is set to an
# absurdly large value. The samples may overlap; the later assignment wins.
zeroed_rows = corrupted_test_patients.sample(frac=0.2).index
corrupted_test_patients.loc[zeroed_rows, 'weight'] = 0
inflated_rows = corrupted_test_patients.sample(frac=0.2).index
corrupted_test_patients.loc[inflated_rows, 'weight'] = 60000

# Score the already-trained model on the corrupted data.
corrupted_test_data = pd.merge(corrupted_test_patients, corrupted_test_histories, on="ssn")
model.score(corrupted_test_data, corrupted_test_data['has_complication'])



0.7072499990463257

In [8]:
# Persist the training patients plus the corrupted test patients, together
# with the two history tables, as CSV files without the index column.
merged_patients = pd.concat([train_patients, corrupted_test_patients])

for frame, csv_path in (
    (merged_patients, 'patients.csv'),
    (train_histories, 'histories.csv'),
    (corrupted_test_histories, 'test_histories.csv'),
):
    frame.to_csv(csv_path, index=False)