Perhaps add category values for hypertension and BMIs
Perhaps use dropout? See A Ng lectures on tuning

In [24]:
%%bash
# These files appear to be from the original competition (that is unmodified)
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.db
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.sql

In [25]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np

from contextlib import closing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

In [26]:
seed = 13431
np.random.seed(seed)

with closing(sqlite3.connect('file:compData.db?mode=ro', uri=True)) as conn:    
    patientTranscripts = pd.read_sql_query("""
    SELECT PatientGuid,
        dmIndicator,
        Gender,
        YearOfBirth,
        VisitYear,
        CASE WHEN BMI = 0 THEN NULL ELSE BMI END AS BMI,
        CASE WHEN SystolicBP = "NULL" THEN NULL ELSE SystolicBP END AS SystolicBP,
        CASE WHEN DiastolicBP = "NULL" THEN NULL ELSE DiastolicBP END AS DiastolicBP
    FROM training_patientTranscript
""", conn)
  
    patientIcd9Codes = pd.read_sql_query("""
SELECT PatientGuid, ICD9Code
FROM training_diagnosis 
""", conn)
    
    # Note: Need to use training_patientSmokingStatus directly rather than patient_smoking
    # as the NIST codes used translate to multiple statuses
    patientSmokingStatus = pd.read_sql_query("""
SELECT PatientGuid, 
    EffectiveYear,
    SmokingStatusGuid
FROM training_patientSmokingStatus
""", conn)
    
    preDbMatch = pd.read_sql_query("""
SELECT DISTINCT PatientGuid
FROM training_diagnosis 
WHERE ICD9Code IN ('790.29','648.83')
""", conn)    

In [27]:
def isPreDB(patientGuid):
   return patientGuid in preDbMatch.PatientGuid.values
 
def isSmoker(ps, patientGuid):
    return patientGuid in ps.index and ps.loc[patientGuid, 'SmokingStatusGuid'] in ('FA2B7AE4-4D14-4768-A8C7-55B5F0CDF4AF', 
                                                                                    '02116D5A-F26C-4A48-9A11-75AC21BC4FD3', \
                                                                                    'DD01E545-D7AF-4F00-B248-9FD40010D81D', \
                                                                                    'FCD437AA-0451-4D8A-9396-B6F19D8B25E8', \
                                                                                    '2548BD83-03AE-4287-A578-FA170F39E32F')

def calcAge(yearOfBirth):
    return 2012 - yearOfBirth

def ohe_icd9Codes():
    icd9CodesByPatient = patientIcd9Codes.groupby('PatientGuid', as_index=False).agg({"ICD9Code": lambda x: '|'.join(x)})#.reset_index()
    return icd9CodesByPatient[['PatientGuid']].join(icd9CodesByPatient.ICD9Code.str.get_dummies())

def processPatientTranscripts(pt):
    pt = pt.sort_values(['VisitYear'],  ascending=False)
    pt = pt.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])

    pt['dmIndicator'] = pt['dmIndicator'].astype('bool')
    pt['Gender'] = pt['Gender'].astype('category')
    pt['Age'] = pt.apply(lambda row: calcAge(row.YearOfBirth), axis=1)
    pt['PreDB'] = pt.apply(lambda row: isPreDB(row.PatientGuid), axis=1)
    
    pt.drop(['VisitYear', 'YearOfBirth'], axis=1, inplace=True)
    pt.dropna(how='any', inplace=True)
    
    pt[['SystolicBP', 'DiastolicBP']] = pt[['SystolicBP', 'DiastolicBP']].astype(int)
    
    return pt

def processPatientSmoking(ps):
    ps = ps.sort_values(['EffectiveYear'], ascending=False)
    ps = ps.groupby('PatientGuid').agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])
    
    ps['Smoker'] = ps.apply(lambda row: isSmoker(ps, row.name), axis=1)
    
    ps.drop(['EffectiveYear', 'SmokingStatusGuid'], axis=1, inplace=True)
    ps.dropna(how='any', inplace=True)
    ps.reset_index(level=0, inplace=True)

    return ps


In [28]:
smokingStatus = processPatientSmoking(patientSmokingStatus)
smokingStatus.head()

Unnamed: 0,PatientGuid,Smoker
0,000B4862-7CE7-4EC5-8043-A97FCD74BD78,False
1,0029BBC8-7C22-4444-9F44-87BEF05FE033,False
2,003CEE1F-5BF5-4171-9284-F5464EC12D41,False
3,005F61C9-E537-4AD2-B39C-37F25891F33A,False
4,0063B34F-C2C4-423A-A144-E51F7149253A,False


In [29]:
transcripts = processPatientTranscripts(patientTranscripts)
transcripts.head()

Unnamed: 0,PatientGuid,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,PreDB
0,00023761-9D8D-445B-874C-2424CC7CF620,False,M,27.67,122,76,56,False
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,False,F,22.463,128,82,60,False
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,False,F,16.654,80,60,87,False
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,False,M,28.19,125,75,42,False
4,002667F4-B9A3-4DE2-875F-8034DD637865,False,F,19.388,112,80,84,False


In [30]:
patients = pd.merge(transcripts, smokingStatus, on='PatientGuid', how='left')
#patients = pd.merge(patients, ohe_icd9Codes(), on='PatientGuid', how='left')
patients.fillna(False, inplace=True)
patients = pd.get_dummies(patients, columns=['Gender'])
#for column in patients.columns[patients.columns.str.contains(pat = 'SmokingStatusGuid_')]:
#    patients[column] = patients[column].astype(int)
patients.head()

Unnamed: 0,PatientGuid,dmIndicator,BMI,SystolicBP,DiastolicBP,Age,PreDB,Smoker,Gender_F,Gender_M
0,00023761-9D8D-445B-874C-2424CC7CF620,False,27.67,122,76,56,False,False,0,1
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,False,22.463,128,82,60,False,False,1,0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,False,16.654,80,60,87,False,False,1,0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,False,28.19,125,75,42,False,False,0,1
4,002667F4-B9A3-4DE2-875F-8034DD637865,False,19.388,112,80,84,False,False,1,0


In [31]:
X = patients.values[:, 2:]
y = patients.values[:, 1]
X

array([[27.67, 122, 76, ..., False, 0, 1],
       [22.463, 128, 82, ..., False, 1, 0],
       [16.654, 80, 60, ..., False, 1, 0],
       ..., 
       [37.454, 124, 72, ..., False, 1, 0],
       [28.749, 130, 79, ..., False, 0, 1],
       [26.153, 138, 84, ..., False, 1, 0]], dtype=object)

In [32]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.8, random_state=seed)
input_shape = len(X[0])

In [33]:
model = Sequential()
model.add(Dense(16, input_shape=(input_shape,)))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

model.fit(train_X, train_y, epochs=100, batch_size=10, verbose=1);

loss, accuracy = model.evaluate(test_X, test_y, verbose=0)
print("Accuracy = {:.3f}".format(accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Accuracy = 0.815
