Perhaps add category values for hypertension and BMIs
Perhaps use dropout? See A Ng lectures on tuning

In [15]:
%%bash
# These files appear to be from the original competition (that is unmodified)
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.db
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.sql

In [16]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np

from contextlib import closing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

seed = 13431
np.random.seed(seed)

with closing(sqlite3.connect('file:compData.db?mode=ro', uri=True)) as conn:    
    patientTranscripts = pd.read_sql_query("""
    SELECT PatientGuid,
        dmIndicator,
        Gender,
        YearOfBirth,
        VisitYear,
        CASE WHEN BMI = 0 THEN NULL ELSE BMI END AS BMI,
        CASE WHEN SystolicBP = "NULL" THEN NULL ELSE SystolicBP END AS SystolicBP,
        CASE WHEN DiastolicBP = "NULL" THEN NULL ELSE DiastolicBP END AS DiastolicBP
    FROM training_patientTranscript
""", conn)
  
    patientIcd9Codes = pd.read_sql_query("""
SELECT PatientGuid, ICD9Code
FROM training_diagnosis 
""", conn)
    
    # Note: Need to use training_patientSmokingStatus directly rather than patient_smoking
    # as the NIST codes used translate to multiple statuses
    patientSmokingStatus = pd.read_sql_query("""
SELECT PatientGuid, 
    EffectiveYear,
    SmokingStatusGuid
FROM training_patientSmokingStatus
""", conn)

In [17]:
def calcAge(yearOfBirth):
    return 2012 - yearOfBirth

def ohe_icd9Codes():
    icd9CodesByPatient = patientIcd9Codes.groupby('PatientGuid', as_index=False).agg({"ICD9Code": lambda x: '|'.join(x)})#.reset_index()
    return icd9CodesByPatient[['PatientGuid']].join(icd9CodesByPatient.ICD9Code.str.get_dummies())

def processPatientTranscripts(pt):
    pt = pt.sort_values(['VisitYear'],  ascending=False)
    pt = pt.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])

    pt['Age'] = pt.apply(lambda row: calcAge(row.YearOfBirth), axis=1)
    patients[['SystolicBP', 'DiastolicBP']] = patients[['SystolicBP', 'DiastolicBP']].astype(int)
    pt = pd.get_dummies(pt, columns=['Gender'])
    pt.drop(['VisitYear', 'YearOfBirth'], axis=1, inplace=True)
    pt.dropna(how='any', inplace=True)
    
    return pt

def processPatientSmoking(ps):
    ps = ps.sort_values(['EffectiveYear'],  ascending=False)
    ps = ps.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])
    
    ps.drop(['EffectiveYear'], axis=1, inplace=True)
    ps.dropna(how='any', inplace=True)
    ps = pd.get_dummies(ps, columns=['SmokingStatusGuid'])

    return ps

In [18]:
smokingStatus = processPatientSmoking(patientSmokingStatus)
smokingStatus

Unnamed: 0,PatientGuid,SmokingStatusGuid_02116D5A-F26C-4A48-9A11-75AC21BC4FD3,SmokingStatusGuid_0815F240-3DD3-43C6-8618-613CA9E41F9F,SmokingStatusGuid_1F3BFBBF-AB76-481B-B1E0-08A3689A54BC,SmokingStatusGuid_2548BD83-03AE-4287-A578-FA170F39E32F,SmokingStatusGuid_5ABBAB35-836F-4F3E-8632-CE063828DA15,SmokingStatusGuid_C12C2DB7-D31A-4514-88C0-42CBD339F764,SmokingStatusGuid_DD01E545-D7AF-4F00-B248-9FD40010D81D,SmokingStatusGuid_E86CA3A8-E35B-4BBF-80E2-0375AB4A1460,SmokingStatusGuid_FA2B7AE4-4D14-4768-A8C7-55B5F0CDF4AF,SmokingStatusGuid_FCD437AA-0451-4D8A-9396-B6F19D8B25E8
0,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,0,0,0,0,1,0,0,0,0
1,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,0,0,0,0,1,0,0,0,0
2,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,0,0,0,1,0,0,0,0,0
3,005F61C9-E537-4AD2-B39C-37F25891F33A,0,0,0,0,1,0,0,0,0,0
4,0063B34F-C2C4-423A-A144-E51F7149253A,0,0,1,0,0,0,0,0,0,0
5,006948F2-1118-4F56-A561-6A254EE357C6,0,0,0,0,1,0,0,0,0,0
6,006A9198-1BB4-42B9-A864-210BF14AD445,0,0,1,0,0,0,0,0,0,0
7,006E3A23-F786-4ED2-BE70-D91D23BA56EA,0,0,1,0,0,0,0,0,0,0
8,008383B8-BF24-4DBB-A848-A377A4899599,0,0,0,0,1,0,0,0,0,0
9,008A55FD-0735-47EC-A2FA-20043D1423C2,0,0,1,0,0,0,0,0,0,0


In [19]:
transcripts = processPatientTranscripts(patientTranscripts)
transcripts

Unnamed: 0,PatientGuid,dmIndicator,BMI,SystolicBP,DiastolicBP,Age,Gender_F,Gender_M
0,00023761-9D8D-445B-874C-2424CC7CF620,0,27.670,122.0,76.0,56,0,1
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0,22.463,128.0,82.0,60,1,0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,16.654,80.0,60.0,87,1,0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0,28.190,125.0,75.0,42,0,1
4,002667F4-B9A3-4DE2-875F-8034DD637865,0,19.388,112.0,80.0,84,1,0
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,26.606,121.0,72.0,59,1,0
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,20.026,130.0,82.0,86,1,0
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0,42.864,152.0,92.0,51,1,0
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0,26.623,128.0,96.0,27,1,0
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0,24.226,117.0,78.0,25,0,1


In [20]:
patients = pd.merge(transcripts, smokingStatus, on='PatientGuid', how='left')
patients = pd.merge(patients, ohe_icd9Codes(), on='PatientGuid', how='left')
patients.fillna(0, inplace=True)
for column in patients.columns[patients.columns.str.contains(pat = 'SmokingStatusGuid_')]:
    patients[column] = patients[column].astype(int)
patients

Unnamed: 0,PatientGuid,dmIndicator,BMI,SystolicBP,DiastolicBP,Age,Gender_F,Gender_M,SmokingStatusGuid_02116D5A-F26C-4A48-9A11-75AC21BC4FD3,SmokingStatusGuid_0815F240-3DD3-43C6-8618-613CA9E41F9F,...,V85.41,V85.42,V85.52,V85.53,V87.31,V87.45,V88.01,V88.02,v43.3,v58.69
0,00023761-9D8D-445B-874C-2424CC7CF620,0,27.670,122.0,76.0,56,0,1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0,22.463,128.0,82.0,60,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,16.654,80.0,60.0,87,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0,28.190,125.0,75.0,42,0,1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,002667F4-B9A3-4DE2-875F-8034DD637865,0,19.388,112.0,80.0,84,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,26.606,121.0,72.0,59,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,20.026,130.0,82.0,86,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0,42.864,152.0,92.0,51,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0,26.623,128.0,96.0,27,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0,24.226,117.0,78.0,25,0,1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
X = patients.values[:, 2:]
y = patients.values[:, 1]
X

array([[27.67, 122.0, 76.0, ..., 0, 0, 0],
       [22.463, 128.0, 82.0, ..., 0, 0, 0],
       [16.654, 80.0, 60.0, ..., 0, 0, 0],
       ..., 
       [37.454, 124.0, 72.0, ..., 0, 0, 0],
       [28.749, 130.0, 79.0, ..., 0, 0, 0],
       [26.153, 138.0, 84.0, ..., 0, 0, 0]], dtype=object)

In [22]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.8, random_state=seed)
input_shape = len(X[0])

3959

In [23]:
model = Sequential()
model.add(Dense(16, input_shape=(input_shape,)))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

model.fit(train_X, train_y, epochs=100, batch_size=10, verbose=0);

loss, accuracy = model.evaluate(test_X, test_y, verbose=0)
print("Accuracy = {:.2f}".format(accuracy))

Accuracy = 0.78


In [2]:
pd.DataFrame(zip(X.columns, np.transpose(model.coef_)))

NameError: name 'pd' is not defined