In [8]:
import random
import numpy as np
import pandas as pd
from faker import Faker
fake = Faker()

seed = 1234
np.random.seed(seed)
fake.seed = seed
random.seed = seed

In [9]:
def generate_data(num_records):

    patient_data = []
    history_data = []

    for _ in range(0, num_records):
        smokes = np.random.rand() > 0.75

        is_male = np.random.rand() > 0.5
        gave_consent = np.random.rand() > 0.02
        if is_male:
            weight = np.random.normal(loc=80, scale=5.0)
        else:
            weight = np.random.normal(loc=60, scale=3.0)

        complication_prob = 0.20

        if smokes:
           complication_prob += 0.10

        if is_male and weight > 90:
           complication_prob += 0.30
        elif is_male and weight > 85:
           complication_prob += 0.20

        if weight < 45:
           complication_prob += 0.10

        if complication_prob >= 0.20 and np.random.rand() > 0.3:
            notes = "high risk"
        else:
            notes = "normal risk"

        has_complication = np.random.rand() < complication_prob

        smokes_cat = 'no'
        if smokes:
            smokes_cat = 'yes'

        hospital = random.choice(["AL", "AK", "AR", "AZ"])

        ssn = fake.ssn()

        patient_data.append((smokes_cat, weight, gave_consent, ssn))
        history_data.append((notes, has_complication, ssn, hospital))

    patients = pd.DataFrame.from_records(patient_data, columns=['smokes', 'weight', 'gave_consent', 'ssn'])
    histories = pd.DataFrame.from_records(history_data, columns=['notes', 'has_complication', 'ssn', 'hospital'])
    return patients, histories

In [10]:
tmp_patients, tmp_histories = generate_data(10)
tmp_patients.merge(tmp_histories, on="ssn")

Unnamed: 0,smokes,weight,gave_consent,ssn,notes,has_complication,hospital
0,no,83.312545,True,145-30-4647,normal risk,False,AZ
1,yes,83.376232,True,585-16-5590,high risk,False,AZ
2,no,80.729046,True,329-35-0407,normal risk,False,AK
3,yes,68.682273,True,450-62-4605,normal risk,False,AZ
4,yes,76.697793,True,279-97-5059,high risk,False,AZ
5,no,85.20543,True,304-35-8430,high risk,False,AK
6,no,84.569758,True,513-31-4578,high risk,True,AK
7,no,65.067364,True,577-44-3249,normal risk,False,AL
8,no,57.983418,True,090-29-5357,high risk,False,AR
9,no,62.741663,True,103-60-4282,high risk,False,AK


In [11]:
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline

from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential


In [12]:
train_patients, train_histories = generate_data(10000)
test_patients, test_histories = generate_data(4000)
merged_patients = pd.concat([train_patients, test_patients])

train_data = merged_patients.merge(train_histories, on="ssn")
test_data = merged_patients.merge(test_histories, on="ssn")

merged_patients.to_csv('patients.csv', index=False)
train_histories.to_csv('histories.csv', index=False)
test_histories.to_csv('test_histories.csv', index=False)

In [13]:
encode = ColumnTransformer(transformers=[
    ('numerical_features', StandardScaler(), ['weight']),
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), ['smokes']),
    ('textual_features', HashingVectorizer(ngram_range=(1, 2), n_features=10), 'notes')])

def create_mlp():
    nn = Sequential([
        Dense(8, activation='relu'), Dropout(0.3),
        Dense(4, activation='relu'),
        Dense(2, activation='softmax')])
    nn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='accuracy')
    return nn

pipeline = Pipeline([
    ('features', encode),
    ('learner', KerasClassifier(create_mlp, epochs=5))])

model = pipeline.fit(train_data, train_data.has_complication)
model.score(test_data, test_data.has_complication)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.749750018119812

In [14]:
corrupted_test_patients, corrupted_test_histories = generate_data(1000)
corrupted_test_data = corrupted_test_patients.merge(corrupted_test_histories, on="ssn")

corrupted_test_data.loc[corrupted_test_data.sample(frac=0.25).index, 'weight'] = 0
corrupted_test_data.loc[corrupted_test_data.sample(frac=0.25).index, 'weight'] = 60000

model.score(corrupted_test_data, corrupted_test_data.has_complication)



0.6269999742507935