In [112]:
%%bash
# These files appear to be from the original competition (that is unmodified)
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.db
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.sql

In [113]:
%matplotlib inline
import seaborn as sns

import sqlite3
import pandas as pd
import numpy as np

from contextlib import closing

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

seed = 13431
np.random.seed(seed)

with closing(sqlite3.connect('file:compData.db?mode=ro', uri=True)) as conn:    
    patientTranscripts = pd.read_sql_query("""
    SELECT PatientGuid,
        dmIndicator,
        Gender,
        YearOfBirth,
        VisitYear,
        CASE WHEN BMI = 0 THEN NULL ELSE BMI END AS BMI,
        CASE WHEN SystolicBP = "NULL" THEN NULL ELSE SystolicBP END AS SystolicBP,
        CASE WHEN DiastolicBP = "NULL" THEN NULL ELSE DiastolicBP END AS DiastolicBP
    FROM training_patientTranscript
""", conn)
  
    patientIcd9Codes = pd.read_sql_query("""
SELECT PatientGuid, ICD9Code
FROM training_diagnosis 
""", conn)
    
#     preDbMatch = pd.read_sql_query("""
# SELECT DISTINCT PatientGuid
# FROM training_diagnosis 
# WHERE ICD9Code IN ('790.29','648.83')
# """, conn)

In [114]:
# https://www.cdc.gov/healthyweight/assessing/index.html
def translateBMI(bmi):
    if bmi <  18.5:
        return 0 # "Underweight"
    elif 18.5 <= bmi <= 24.9:
        return 1 # "Normal"
    elif 25 <= bmi <= 29.9:
        return 2 # "Overweight"
    else:
        return 3 # "Obese"
        
# https://www.heart.org/HEARTORG/Conditions/HighBloodPressure/KnowYourNumbers/Understanding-Blood-Pressure-Readings_UCM_301764_Article.jsp
def translateBP(systolicBP, diastolicBP):
    if systolicBP < 120 and diastolicBP < 80:
        return 0 # "Normal"
    elif systolicBP <= 129 and diastolicBP < 80:
        return 1 # "Elevated"
    elif systolicBP <= 139 or 80 <= diastolicBP <= 89:
        return 2 #  "HTN_S1"
    elif 140 <= systolicBP or 90 <= diastolicBP:
        return 3 # "HTN_S2"
    else:
        return 4 # "Hypertensive_Crisis"

def translateGender(gender):
    if gender == "F":
        return 0
    elif gender == "M":
        return 1
    else:
        return 2

def calcAge(yearOfBirth):
    return 2012 - yearOfBirth

def translateDmIndicator(dmIndicator):
    return np.True_ if dmIndicator == 0 else np.False_

# def isPreDB(patientGuid):
#     return 1 if patientGuid in preDbMatch.PatientGuid.values else 0

def ohe_icd9Codes():
    icd9CodesByPatient = patientIcd9Codes.groupby('PatientGuid')["ICD9Code"].apply(list).reset_index()
    ohe_icd9CodesByPatient = icd9CodesByPatient['ICD9Code'].str.join('|').str.get_dummies()
    return ohe_icd9CodesByPatient

def processPatientTranscripts(pt):
    pt = pt.sort_values(['VisitYear'],  ascending=False)
    pt = pt.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])

    #pt["dmIndicator"] = patientTranscripts.apply(lambda row: translateDmIndicator(row.dmIndicator), axis=1)
    #pt["Gender"] = patientTranscripts["Gender"].astype('category')
    pt["Gender"] = pt.apply(lambda row: translateGender(row.Gender), axis=1).astype('category')
    pt['Age'] = pt.apply(lambda row: calcAge(row.YearOfBirth), axis=1)
    #pt['PreDB'] = pt.apply(lambda row: isPreDB(row.PatientGuid), axis=1)
    #pt['BMICategory'] = pt.apply(lambda row: translateBMI(row.BMI), axis=1).astype('category')
    #pt['BPCategory'] = pt.apply(lambda row: translateBP(row.SystolicBP, row.DiastolicBP), axis=1).astype('category')

    pt.drop(['VisitYear', 'PatientGuid', 'YearOfBirth'], axis=1, inplace=True)
    pt.dropna(how='any', inplace=True)
    
    return pt

In [115]:
patientTranscripts = processPatientTranscripts(patientTranscripts)
patientTranscripts

Unnamed: 0,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age
0,0,1,27.670,122.0,76.0,56
1,0,0,22.463,128.0,82.0,60
2,0,0,16.654,80.0,60.0,87
3,0,1,28.190,125.0,75.0,42
4,0,0,19.388,112.0,80.0,84
5,0,0,26.606,121.0,72.0,59
6,0,0,20.026,130.0,82.0,86
7,0,0,42.864,152.0,92.0,51
8,0,0,26.623,128.0,96.0,27
9,0,1,24.226,117.0,78.0,25


In [116]:
X = patientTranscripts.values[:, 1:]
y = patientTranscripts.values[:, 0].astype("int")
X

array([[1, 27.67, 122.0, 76.0, 56],
       [0, 22.463, 128.0, 82.0, 60],
       [0, 16.654, 80.0, 60.0, 87],
       ..., 
       [0, 37.454, 124.0, 72.0, 47],
       [1, 28.749, 130.0, 79.0, 82],
       [0, 26.153, 138.0, 84.0, 65]], dtype=object)

In [117]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.77, random_state=seed)

In [118]:
lr = LogisticRegressionCV()
lr.fit(train_X, train_y)

print("Accuracy = {:.2f}".format(lr.score(test_X, test_y)))

Accuracy = 0.80


In [130]:
patientTranscripts = pd.concat([patientTranscripts, ohe_icd9Codes()], axis=1)
patientTranscripts

Unnamed: 0,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,002.0,003.0,003.23,005.9,...,V85.41,V85.42,V85.52,V85.53,V87.31,V87.45,V88.01,V88.02,v43.3,v58.69
0,0.0,1.0,27.670,122.0,76.0,56.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,22.463,128.0,82.0,60.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,16.654,80.0,60.0,87.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,1.0,28.190,125.0,75.0,42.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,19.388,112.0,80.0,84.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0.0,0.0,26.606,121.0,72.0,59.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0.0,0.0,20.026,130.0,82.0,86.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0.0,0.0,42.864,152.0,92.0,51.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0.0,0.0,26.623,128.0,96.0,27.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0.0,1.0,24.226,117.0,78.0,25.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [135]:
X = patientTranscripts.values[:, 1:]
y = patientTranscripts.values[:, 0].astype("int")
X

array([[   1.   ,   27.67 ,  122.   , ...,    0.   ,    0.   ,    0.   ],
       [   0.   ,   22.463,  128.   , ...,    0.   ,    0.   ,    0.   ],
       [   0.   ,   16.654,   80.   , ...,    0.   ,    0.   ,    0.   ],
       ..., 
       [   0.   ,   37.454,  124.   , ...,    0.   ,    0.   ,    0.   ],
       [   1.   ,   28.749,  130.   , ...,    0.   ,    0.   ,    0.   ],
       [   0.   ,   26.153,  138.   , ...,    0.   ,    0.   ,    0.   ]])

In [139]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.77, random_state=seed)
input_shape = len(X[0])

In [140]:
model = Sequential()
model.add(Dense(16, input_shape=(input_shape,)))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

model.fit(train_X, train_y, epochs=100, batch_size=10, verbose=1);

loss, accuracy = model.evaluate(test_X, test_y, verbose=0)
print("Accuracy = {:.2f}".format(accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Accuracy = 0.75
