In [249]:
%%bash
# These files appear to be from the original competition (that is unmodified)
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.db
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.sql

In [250]:
%matplotlib inline
import seaborn as sns

import sqlite3
import pandas as pd
import numpy as np

from contextlib import closing

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

seed = 13431
np.random.seed(seed)

with closing(sqlite3.connect('file:compData.db?mode=ro', uri=True)) as conn:    
    patientTranscripts = pd.read_sql_query("""
    SELECT PatientGuid,
        dmIndicator,
        Gender,
        YearOfBirth,
        VisitYear,
        CASE WHEN BMI = 0 THEN NULL ELSE BMI END AS BMI,
        CASE WHEN SystolicBP = "NULL" THEN NULL ELSE SystolicBP END AS SystolicBP,
        CASE WHEN DiastolicBP = "NULL" THEN NULL ELSE DiastolicBP END AS DiastolicBP
    FROM training_patientTranscript
""", conn)
  
# patientTranscripts.info()
# patientTranscripts.describe()

In [251]:
# https://www.cdc.gov/healthyweight/assessing/index.html
def translateBMI(bmi):
    if bmi <  18.5:
        return 0 # "Underweight"
    elif 18.5 <= bmi <= 24.9:
        return 1 # "Normal"
    elif 25 <= bmi <= 29.9:
        return 2 # "Overweight"
    else:
        return 3 # "Obese"
        
# https://www.heart.org/HEARTORG/Conditions/HighBloodPressure/KnowYourNumbers/Understanding-Blood-Pressure-Readings_UCM_301764_Article.jsp
def translateBP(systolicBP, diastolicBP):
    if systolicBP < 120 and diastolicBP < 80:
        return 0 # "Normal"
    elif systolicBP <= 129 and diastolicBP < 80:
        return 1 # "Elevated"
    elif systolicBP <= 139 or 80 <= diastolicBP <= 89:
        return 2 #  "HTN_S1"
    elif 140 <= systolicBP or 90 <= diastolicBP:
        return 3 # "HTN_S2"
    else:
        return 4 # "Hypertensive_Crisis"

def translateGender(gender):
    if gender == "F":
        return 0
    elif gender == "M":
        return 1
    else:
        return 2

def calcAge(visitYear, yearOfBirth):
    return visitYear - yearOfBirth

def translateDmIndicator(dmIndicator):
    return np.True_ if dmIndicator == 0 else np.False_

def processPatientTranscripts(pt):
    pt = pt.sort_values(['VisitYear'],  ascending=False)
    pt = pt.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])

    #pt["dmIndicator"] = patientTranscripts.apply(lambda row: translateDmIndicator(row.dmIndicator), axis=1)
    #pt["Gender"] = patientTranscripts["Gender"].astype('category')
    pt["Gender"] = pt.apply(lambda row: translateGender(row.Gender), axis=1).astype('category')
    pt['Age'] = pt.apply(lambda row: calcAge(row.VisitYear, row.YearOfBirth), axis=1)
    #pt['BMICategory'] = pt.apply(lambda row: translateBMI(row.BMI), axis=1).astype('category')
    #pt['BPCategory'] = pt.apply(lambda row: translateBP(row.SystolicBP, row.DiastolicBP), axis=1).astype('category')

    pt.drop(['VisitYear', 'PatientGuid', 'YearOfBirth'], axis=1, inplace=True)
    pt.dropna(how='any', inplace=True)
    
    return pt

In [252]:
patientTranscripts = processPatientTranscripts(patientTranscripts)
patientTranscripts.head()

Unnamed: 0,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age
0,0,1,27.67,122.0,76.0,53
1,0,0,22.463,128.0,82.0,58
2,0,0,16.654,80.0,60.0,87
3,0,1,28.19,125.0,75.0,42
4,0,0,19.388,112.0,80.0,82


In [253]:
X = patientTranscripts.values[:, 1:6]
y = patientTranscripts.values[:, 0]
X

array([[1, 27.67, 122.0, 76.0, 53],
       [0, 22.463, 128.0, 82.0, 58],
       [0, 16.654, 80.0, 60.0, 87],
       ..., 
       [0, 37.454, 124.0, 72.0, 47],
       [1, 28.749, 130.0, 79.0, 82],
       [0, 26.153, 138.0, 84.0, 65]], dtype=object)

In [254]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.77, random_state=seed)
train_X

array([[0, 32.673, 188.0, 68.0, 81],
       [1, 23.732, 102.0, 68.0, 62],
       [0, 24.241, 120.0, 70.0, 67],
       ..., 
       [1, 21.371, 160.0, 110.0, 47],
       [1, 27.82, 133.0, 68.0, 80],
       [0, 35.699, 120.0, 70.0, 42]], dtype=object)

In [256]:
model = Sequential()
model.add(Dense(12, input_dim=5, activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

model.fit(train_X, train_y, epochs=100, batch_size=10, verbose=0);

loss, accuracy = model.evaluate(test_X, test_y, verbose=0)
print("Accuracy = {:.2f}".format(accuracy))

Accuracy = 0.81
