In [11]:
import numpy as np
import pandas as pd

In [19]:
def generate_data(num_records, random_state=None):
    """Simulate a toy patient table with a binary complication outcome.

    Every record is drawn independently:

    * ``smokes`` is 'yes' with probability 0.25, otherwise 'no'.
    * The (unreturned) sex is male with probability 0.5.
    * ``weight`` is drawn from Normal(80, 5) for males and Normal(60, 3)
      otherwise.
    * The complication probability starts at 0.20 and is raised by 0.10 for
      smokers, by 0.20 for males weighing more than 90, and by 0.10 for
      anyone weighing less than 45.
    * ``notes`` is "high risk" with probability 0.4 when at least one of the
      increments above applied, otherwise the empty string.
    * ``has_complication`` is True with the resulting probability.

    Parameters
    ----------
    num_records : int
        Number of rows to generate.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) draws from NumPy's
        global generator, exactly as before this parameter existed. An int
        seeds a private ``RandomState``; an existing ``RandomState`` is used
        as-is. Passing a value makes the returned frame reproducible without
        touching global state.

    Returns
    -------
    pandas.DataFrame
        Columns ``smokes``, ``weight``, ``notes`` and ``has_complication``,
        one row per record.
    """
    if random_state is None:
        # The numpy.random module exposes the same rand/normal functions as a
        # RandomState instance, backed by the global generator.
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    records = []

    # The order of the random draws below is kept fixed so that a seeded
    # global generator yields the same rows as it always did.
    for _ in range(num_records):
        smokes = rng.rand() > 0.75

        is_male = rng.rand() > 0.5
        if is_male:
            weight = rng.normal(loc=80, scale=5.0)
        else:
            weight = rng.normal(loc=60, scale=3.0)

        complication_prob = 0.20
        if smokes:
            complication_prob += 0.10
        if is_male and weight > 90:
            complication_prob += 0.20
        if weight < 45:
            complication_prob += 0.10

        # The note is only ever attached to records with an elevated risk;
        # the short-circuit means no random number is consumed otherwise.
        notes = ""
        if complication_prob > 0.20 and rng.rand() > 0.6:
            notes = "high risk"

        has_complication = rng.rand() < complication_prob

        smokes_cat = 'yes' if smokes else 'no'
        records.append((smokes_cat, weight, notes, has_complication))

    return pd.DataFrame.from_records(
        records, columns=['smokes', 'weight', 'notes', 'has_complication'])

In [20]:
# Preview a handful of synthetic records to sanity-check the columns.
generate_data(num_records=10)

Unnamed: 0,smokes,weight,notes,has_complication
0,no,59.802151,,False
1,no,80.313039,,False
2,no,64.794924,,False
3,yes,59.432555,,False
4,yes,78.519597,,False
5,no,84.463416,,False
6,no,56.135506,,True
7,yes,63.849577,,False
8,no,60.237643,,False
9,no,71.575839,,False


In [18]:
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline

from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

In [28]:
# Two independent draws from the same generator: 2500 rows to fit on and
# 1000 rows held out for scoring (generated in that order).
train_data, test_data = generate_data(2500), generate_data(1000)

In [29]:
encode = ColumnTransformer(transformers=[
    ('numerical_features', StandardScaler(), ['weight']),
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), ['smokes']),
    ('textual_features', HashingVectorizer(ngram_range=(1, 2), n_features=10), 'notes')])

def create_mlp():
    """Build and compile the small feed-forward classifier used by the pipeline.

    Architecture: Dense(8, relu) -> Dropout(0.3) -> Dense(4, relu) ->
    Dense(2, softmax). No input shape is declared here.

    The model is compiled with sparse categorical cross-entropy (integer class
    labels, two softmax outputs), the Adam optimizer and accuracy as the
    tracked metric.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled, untrained network.
    """
    nn = Sequential([
        Dense(8, activation='relu'),
        Dropout(0.3),
        Dense(4, activation='relu'),
        Dense(2, activation='softmax'),
    ])
    # `metrics` is documented as a list of metrics; a bare string is not part
    # of that contract and only works where Keras coerces it leniently.
    nn.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return nn

# Chain the feature encoding and the Keras network (trained for 5 epochs)
# into one estimator.
pipeline = Pipeline(steps=[
    ('features', encode),
    ('learner', KerasClassifier(create_mlp, epochs=5)),
])

# The whole frame is passed as X; the label column is passed separately as y.
model = pipeline.fit(train_data, train_data['has_complication'])

# Last expression of the cell, so the held-out score is displayed.
model.score(test_data, test_data['has_complication'])

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.7730000019073486

In [30]:
# Fresh evaluation set whose 'weight' column is then deliberately damaged.
corrupted_test_data = generate_data(1000)

# Overwrite a random 20% of rows with 0, then a random 20% with 1000.
# NOTE(review): the two samples are drawn independently, so a row picked by
# both ends up as 1000 and fewer than 20% of rows keep the value 0 -- confirm
# whether disjoint samples were intended.
for bad_weight in (0, 1000):
    rows_to_corrupt = corrupted_test_data.sample(frac=0.2).index
    corrupted_test_data.loc[rows_to_corrupt, 'weight'] = bad_weight

# Score the already-fitted pipeline on the damaged data.
model.score(corrupted_test_data, corrupted_test_data['has_complication'])



0.6489999890327454