In [113]:
%%bash
# These files appear to be from the original competition (that is unmodified)
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.db
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.sql

In [114]:
%matplotlib inline
import seaborn as sns

import sqlite3
import pandas as pd
import numpy as np

from contextlib import closing

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

seed = 13431
np.random.seed(seed)

with closing(sqlite3.connect('file:compData.db?mode=ro', uri=True)) as conn:    
    patientTranscripts = pd.read_sql_query("""
    SELECT PatientGuid,
        dmIndicator,
        Gender,
        YearOfBirth,
        VisitYear,
        CASE WHEN BMI = 0 THEN NULL ELSE BMI END AS BMI,
        CASE WHEN SystolicBP = "NULL" THEN NULL ELSE SystolicBP END AS SystolicBP,
        CASE WHEN DiastolicBP = "NULL" THEN NULL ELSE DiastolicBP END AS DiastolicBP
    FROM training_patientTranscript
""", conn)
  
    patientIcd9Codes = pd.read_sql_query("""
SELECT PatientGuid, ICD9Code
FROM training_diagnosis 
""", conn)
    
    patientSmokingStatus = pd.read_sql_query("""
SELECT PatientGuid, 
    SmokeEffectiveYear,
    SmokingStatus_NISTCode
FROM training_smoke
""", conn)
    
#     preDbMatch = pd.read_sql_query("""
# SELECT DISTINCT PatientGuid
# FROM training_diagnosis 
# WHERE ICD9Code IN ('790.29','648.83')
# """, conn)

In [115]:
# https://www.cdc.gov/healthyweight/assessing/index.html
def translateBMI(bmi):
    if bmi <  18.5:
        return 0 # "Underweight"
    elif 18.5 <= bmi <= 24.9:
        return 1 # "Normal"
    elif 25 <= bmi <= 29.9:
        return 2 # "Overweight"
    else:
        return 3 # "Obese"
        
# https://www.heart.org/HEARTORG/Conditions/HighBloodPressure/KnowYourNumbers/Understanding-Blood-Pressure-Readings_UCM_301764_Article.jsp
def translateBP(systolicBP, diastolicBP):
    if systolicBP < 120 and diastolicBP < 80:
        return 0 # "Normal"
    elif systolicBP <= 129 and diastolicBP < 80:
        return 1 # "Elevated"
    elif systolicBP <= 139 or 80 <= diastolicBP <= 89:
        return 2 #  "HTN_S1"
    elif 140 <= systolicBP or 90 <= diastolicBP:
        return 3 # "HTN_S2"
    else:
        return 4 # "Hypertensive_Crisis"

def translateGender(gender):
    if gender == "F":
        return 0
    elif gender == "M":
        return 1
    else:
        return 2

def calcAge(yearOfBirth):
    return 2012 - yearOfBirth

def translateDmIndicator(dmIndicator):
    return np.True_ if dmIndicator == 0 else np.False_

# def isPreDB(patientGuid):
#     return 1 if patientGuid in preDbMatch.PatientGuid.values else 0

def ohe_icd9Codes():
    icd9CodesByPatient = patientIcd9Codes.groupby('PatientGuid')["ICD9Code"].apply(list).reset_index()
    ohe_icd9CodesByPatient = icd9CodesByPatient['ICD9Code'].str.join('|').str.get_dummies()
    return ohe_icd9CodesByPatient

def processPatientTranscripts(pt):
    pt = pt.sort_values(['VisitYear'],  ascending=False)
    pt = pt.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])

    #pt["dmIndicator"] = patientTranscripts.apply(lambda row: translateDmIndicator(row.dmIndicator), axis=1)
    #pt["Gender"] = patientTranscripts["Gender"].astype('category')
    pt["Gender"] = pt.apply(lambda row: translateGender(row.Gender), axis=1).astype('category')
    pt['Age'] = pt.apply(lambda row: calcAge(row.YearOfBirth), axis=1)
    #pt['PreDB'] = pt.apply(lambda row: isPreDB(row.PatientGuid), axis=1)
    #pt['BMICategory'] = pt.apply(lambda row: translateBMI(row.BMI), axis=1).astype('category')
    #pt['BPCategory'] = pt.apply(lambda row: translateBP(row.SystolicBP, row.DiastolicBP), axis=1).astype('category')

    pt.drop(['VisitYear', 'YearOfBirth'], axis=1, inplace=True)
    pt.dropna(how='any', inplace=True)
    
    return pt

def processPatientSmoking(ps):
    ps = ps.sort_values(['SmokeEffectiveYear'],  ascending=False)
    ps = ps.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])
    
    ps.drop(['SmokeEffectiveYear'], axis=1, inplace=True)
    ps.dropna(how='any', inplace=True)
    ps["SmokingStatus_NISTCode"] = ps["SmokingStatus_NISTCode"].astype('int')
    
    return ps


In [116]:
smokingStatus = processPatientSmoking(patientSmokingStatus)
smokingStatus

Unnamed: 0,PatientGuid,SmokingStatus_NISTCode
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,3
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,3
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,4
11,005F61C9-E537-4AD2-B39C-37F25891F33A,4
13,0063B34F-C2C4-423A-A144-E51F7149253A,0
15,006948F2-1118-4F56-A561-6A254EE357C6,4
16,006A9198-1BB4-42B9-A864-210BF14AD445,0
18,006E3A23-F786-4ED2-BE70-D91D23BA56EA,0
23,008383B8-BF24-4DBB-A848-A377A4899599,4
24,008A55FD-0735-47EC-A2FA-20043D1423C2,0


In [117]:
transcripts = processPatientTranscripts(patientTranscripts)
transcripts

Unnamed: 0,PatientGuid,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age
0,00023761-9D8D-445B-874C-2424CC7CF620,0,1,27.670,122.0,76.0,56
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0,0,22.463,128.0,82.0,60
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,0,16.654,80.0,60.0,87
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0,1,28.190,125.0,75.0,42
4,002667F4-B9A3-4DE2-875F-8034DD637865,0,0,19.388,112.0,80.0,84
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,0,26.606,121.0,72.0,59
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,0,20.026,130.0,82.0,86
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0,0,42.864,152.0,92.0,51
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0,0,26.623,128.0,96.0,27
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0,1,24.226,117.0,78.0,25


In [126]:
patients = pd.merge(transcripts, smokingStatus, on='PatientGuid', how='left')
patients['SmokingStatus_NISTCode'] = patients['SmokingStatus_NISTCode'].fillna(9)

patients

Unnamed: 0,PatientGuid,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,SmokingStatus_NISTCode
0,00023761-9D8D-445B-874C-2424CC7CF620,0,1,27.670,122.0,76.0,56,9.0
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0,0,22.463,128.0,82.0,60,9.0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,0,16.654,80.0,60.0,87,3.0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0,1,28.190,125.0,75.0,42,9.0
4,002667F4-B9A3-4DE2-875F-8034DD637865,0,0,19.388,112.0,80.0,84,9.0
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,0,26.606,121.0,72.0,59,3.0
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,0,20.026,130.0,82.0,86,4.0
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0,0,42.864,152.0,92.0,51,9.0
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0,0,26.623,128.0,96.0,27,9.0
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0,1,24.226,117.0,78.0,25,9.0


In [130]:
X = patients.values[:, 2:]
y = patients.values[:, 1].astype("int")
X

array([[1, 27.67, 122.0, 76.0, 56, 9.0],
       [0, 22.463, 128.0, 82.0, 60, 9.0],
       [0, 16.654, 80.0, 60.0, 87, 3.0],
       ..., 
       [0, 37.454, 124.0, 72.0, 47, 0.0],
       [1, 28.749, 130.0, 79.0, 82, 3.0],
       [0, 26.153, 138.0, 84.0, 65, 4.0]], dtype=object)

In [131]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.77, random_state=seed)

In [132]:
lr = LogisticRegressionCV()
lr.fit(train_X, train_y)

print("Accuracy = {:.2f}".format(lr.score(test_X, test_y)))

Accuracy = 0.81


In [133]:
patients = pd.concat([patients, ohe_icd9Codes()], axis=1)
patients
# Why do we end up with a NAN in last row?

Unnamed: 0,PatientGuid,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,SmokingStatus_NISTCode,002.0,003.0,...,V85.41,V85.42,V85.52,V85.53,V87.31,V87.45,V88.01,V88.02,v43.3,v58.69
0,00023761-9D8D-445B-874C-2424CC7CF620,0.0,1.0,27.670,122.0,76.0,56.0,9.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0.0,0.0,22.463,128.0,82.0,60.0,9.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0.0,0.0,16.654,80.0,60.0,87.0,3.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0.0,1.0,28.190,125.0,75.0,42.0,9.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,002667F4-B9A3-4DE2-875F-8034DD637865,0.0,0.0,19.388,112.0,80.0,84.0,9.0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0.0,0.0,26.606,121.0,72.0,59.0,3.0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0.0,0.0,20.026,130.0,82.0,86.0,4.0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0.0,0.0,42.864,152.0,92.0,51.0,9.0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0.0,0.0,26.623,128.0,96.0,27.0,9.0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0.0,1.0,24.226,117.0,78.0,25.0,9.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
X = patients.values[:, 1:]
y = patients.values[:, 0].astype("int")
X

array([[   1.   ,   27.67 ,  122.   , ...,    0.   ,    0.   ,    0.   ],
       [   0.   ,   22.463,  128.   , ...,    0.   ,    0.   ,    0.   ],
       [   0.   ,   16.654,   80.   , ...,    0.   ,    0.   ,    0.   ],
       ..., 
       [   1.   ,   28.749,  130.   , ...,    0.   ,    0.   ,    0.   ],
       [   0.   ,   26.153,  138.   , ...,    0.   ,    0.   ,    0.   ],
       [     nan,      nan,      nan, ...,    0.   ,    0.   ,    0.   ]])

In [124]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.77, random_state=seed)
input_shape = len(X[0])

In [125]:
model = Sequential()
model.add(Dense(16, input_shape=(input_shape,)))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

model.fit(train_X, train_y, epochs=100, batch_size=10, verbose=1);

loss, accuracy = model.evaluate(test_X, test_y, verbose=0)
print("Accuracy = {:.2f}".format(accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

KeyboardInterrupt: 