In [21]:
%%bash
# These files appear to be from the original competition (that is unmodified)
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.db
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.sql

In [22]:
%matplotlib inline
import seaborn as sns

import sqlite3
import pandas as pd
import numpy as np

from contextlib import closing

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

seed = 13431
np.random.seed(seed)

with closing(sqlite3.connect('file:compData.db?mode=ro', uri=True)) as conn:    
    patientTranscripts = pd.read_sql_query("""
    SELECT PatientGuid,
        dmIndicator,
        Gender,
        YearOfBirth,
        VisitYear,
        CASE WHEN BMI = 0 THEN NULL ELSE BMI END AS BMI,
        CASE WHEN SystolicBP = "NULL" THEN NULL ELSE SystolicBP END AS SystolicBP,
        CASE WHEN DiastolicBP = "NULL" THEN NULL ELSE DiastolicBP END AS DiastolicBP
    FROM training_patientTranscript
""", conn)
  
    patientIcd9Codes = pd.read_sql_query("""
SELECT PatientGuid, ICD9Code
FROM training_diagnosis 
""", conn)
    
    # Note: Need to use training_patientSmokingStatus directly rather than patient_smoking
    # as the NIST codes used translate to multiple statuses
    patientSmokingStatus = pd.read_sql_query("""
SELECT PatientGuid, 
    EffectiveYear,
    SmokingStatusGuid
FROM training_patientSmokingStatus
""", conn)
    
#     preDbMatch = pd.read_sql_query("""
# SELECT DISTINCT PatientGuid
# FROM training_diagnosis 
# WHERE ICD9Code IN ('790.29','648.83')
# """, conn)

In [23]:
# https://www.cdc.gov/healthyweight/assessing/index.html
def translateBMI(bmi):
    if bmi <  18.5:
        return 0 # "Underweight"
    elif 18.5 <= bmi <= 24.9:
        return 1 # "Normal"
    elif 25 <= bmi <= 29.9:
        return 2 # "Overweight"
    else:
        return 3 # "Obese"
        
# https://www.heart.org/HEARTORG/Conditions/HighBloodPressure/KnowYourNumbers/Understanding-Blood-Pressure-Readings_UCM_301764_Article.jsp
def translateBP(systolicBP, diastolicBP):
    if systolicBP < 120 and diastolicBP < 80:
        return 0 # "Normal"
    elif systolicBP <= 129 and diastolicBP < 80:
        return 1 # "Elevated"
    elif systolicBP <= 139 or 80 <= diastolicBP <= 89:
        return 2 #  "HTN_S1"
    elif 140 <= systolicBP or 90 <= diastolicBP:
        return 3 # "HTN_S2"
    else:
        return 4 # "Hypertensive_Crisis"

def translateGender(gender):
    if gender == "F":
        return 0
    elif gender == "M":
        return 1
    else:
        return 2

def calcAge(yearOfBirth):
    return 2012 - yearOfBirth

def translateDmIndicator(dmIndicator):
    return np.True_ if dmIndicator == 0 else np.False_

# def isPreDB(patientGuid):
#     return 1 if patientGuid in preDbMatch.PatientGuid.values else 0

def ohe_icd9Codes():
    icd9CodesByPatient = patientIcd9Codes.groupby('PatientGuid')["ICD9Code"].apply(list).reset_index()
    ohe_icd9CodesByPatient = icd9CodesByPatient['ICD9Code'].str.join('|').str.get_dummies()
    return ohe_icd9CodesByPatient

def processPatientTranscripts(pt):
    pt = pt.sort_values(['VisitYear'],  ascending=False)
    pt = pt.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])

    #pt["dmIndicator"] = patientTranscripts.apply(lambda row: translateDmIndicator(row.dmIndicator), axis=1)
    #pt["Gender"] = patientTranscripts["Gender"].astype('category')
    #pt["Gender"] = pt.apply(lambda row: translateGender(row.Gender), axis=1).astype('category')
    pt = pd.get_dummies(pt, columns=['Gender'])
    pt['Age'] = pt.apply(lambda row: calcAge(row.YearOfBirth), axis=1)
    #pt['PreDB'] = pt.apply(lambda row: isPreDB(row.PatientGuid), axis=1)
    #pt['BMICategory'] = pt.apply(lambda row: translateBMI(row.BMI), axis=1).astype('category')
    #pt['BPCategory'] = pt.apply(lambda row: translateBP(row.SystolicBP, row.DiastolicBP), axis=1).astype('category')

    pt.drop(['VisitYear', 'YearOfBirth'], axis=1, inplace=True)
    pt.dropna(how='any', inplace=True)
    
    return pt

def processPatientSmoking(ps):
    ps = ps.sort_values(['EffectiveYear'],  ascending=False)
    ps = ps.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])
    
    ps.drop(['EffectiveYear'], axis=1, inplace=True)
    ps.dropna(how='any', inplace=True)
    ps = pd.get_dummies(ps, columns=['SmokingStatusGuid'])

    return ps


In [24]:
patientIcd9Codes

Unnamed: 0,PatientGuid,ICD9Code
0,BA954BD7-5EE7-4CCF-AB31-07B9C7F72D07,825.0
1,F8D890EA-7920-40AF-BFCF-42010E1BF563,784.0
2,A3AD2D57-5589-47E2-BDB2-1D5B51764896,461.9
3,2AC1DCDA-9C11-44ED-A2BE-4B1F2393B68E,V72.31
4,9DB2B66C-A696-4308-BFA1-4C8F6E97977E,345.90
5,89B52928-EEED-4C1C-B4D5-0B9BA63A78F0,401.9
6,FC106A57-FBEB-4913-9130-50E4375277A0,466.0
7,32B1FAEF-B2D0-41BB-9379-84F35E142846,368.8
8,61484860-F8D8-4CB9-BEB0-74699DF8528E,401.1
9,535717FC-84EF-4B4C-A208-D71CC1EF0B64,911.5


In [25]:
# TODO Get cound of number of ICD9 codes
#len(patientIcd9Codes.ICD9Code.unique())
#3943

# Try to keep this as a DF
#data.groupby('month', as_index=False).agg({"duration": "sum"})
# USE THIS APPROACH INSTEAD? df.groupby('A').B.agg(['min', 'max'])
icd9CodesByPatient = patientIcd9Codes.groupby('PatientGuid', as_index=False).agg({"ICD9Code": lambda x: '|'.join(x)})#.reset_index()
#icd9CodesByPatient = patientIcd9Codes.groupby('PatientGuid')["ICD9Code"].apply(list).reset_index()
#icd9CodesByPatient
#ohe_icd9CodesByPatient = icd9CodesByPatient['ICD9Code'].str.get_dummies(sep='|')
ohe_icd9CodesByPatient = icd9CodesByPatient[['PatientGuid']].join(icd9CodesByPatient.ICD9Code.str.get_dummies())
ohe_icd9CodesByPatient
#ohe_icd9CodesByPatient
#df['label'].str.join(sep='*').str.get_dummies(sep='*')
#icd9CodesByPatient = patientIcd9Codes.groupby('PatientGuid').ICD9Code.agg(lambda x: list(x))#.reset_index()
#ohe_icd9CodesByPatient = icd9CodesByPatient['ICD9Code'].str.join('|').str.get_dummies()
#ohe_icd9CodesByPatient = icd9CodesByPatient.drop('ICD9Code', 1).join(icd9CodesByPatient.ICD9Code.str.join('|').str.get_dummies())
#ohe_icd9CodesByPatient =  icd9CodesByPatient['ICD9Code'].str.join('|').str.get_dummies()
#ohe_icd9CodesByPatient.describe() # Seems like same result we were getting previously?
#icd9CodesByPatient.describe()
#len(icd9CodesByPatient.PatientGuid.unique())
#icd9CodesByPatient
#pd.get_dummies(patientIcd9Codes, columns=['ICD9Code'])
# USE THIS APPROACH? Group by patient guid and then aggregate where if 1 in any row (sum >= 1?) then 1 else 0

Unnamed: 0,PatientGuid,002.0,003.0,003.23,005.9,007.1,007.8,008.43,008.45,008.5,...,V85.41,V85.42,V85.52,V85.53,V87.31,V87.45,V88.01,V88.02,v43.3,v58.69
0,00023761-9D8D-445B-874C-2424CC7CF620,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,002667F4-B9A3-4DE2-875F-8034DD637865,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
smokingStatus = processPatientSmoking(patientSmokingStatus)
smokingStatus

Unnamed: 0,PatientGuid,SmokingStatusGuid_02116D5A-F26C-4A48-9A11-75AC21BC4FD3,SmokingStatusGuid_0815F240-3DD3-43C6-8618-613CA9E41F9F,SmokingStatusGuid_1F3BFBBF-AB76-481B-B1E0-08A3689A54BC,SmokingStatusGuid_2548BD83-03AE-4287-A578-FA170F39E32F,SmokingStatusGuid_5ABBAB35-836F-4F3E-8632-CE063828DA15,SmokingStatusGuid_C12C2DB7-D31A-4514-88C0-42CBD339F764,SmokingStatusGuid_DD01E545-D7AF-4F00-B248-9FD40010D81D,SmokingStatusGuid_E86CA3A8-E35B-4BBF-80E2-0375AB4A1460,SmokingStatusGuid_FA2B7AE4-4D14-4768-A8C7-55B5F0CDF4AF,SmokingStatusGuid_FCD437AA-0451-4D8A-9396-B6F19D8B25E8
0,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,0,0,0,0,1,0,0,0,0
1,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,0,0,0,0,1,0,0,0,0
2,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,0,0,0,1,0,0,0,0,0
3,005F61C9-E537-4AD2-B39C-37F25891F33A,0,0,0,0,1,0,0,0,0,0
4,0063B34F-C2C4-423A-A144-E51F7149253A,0,0,1,0,0,0,0,0,0,0
5,006948F2-1118-4F56-A561-6A254EE357C6,0,0,0,0,1,0,0,0,0,0
6,006A9198-1BB4-42B9-A864-210BF14AD445,0,0,1,0,0,0,0,0,0,0
7,006E3A23-F786-4ED2-BE70-D91D23BA56EA,0,0,1,0,0,0,0,0,0,0
8,008383B8-BF24-4DBB-A848-A377A4899599,0,0,0,0,1,0,0,0,0,0
9,008A55FD-0735-47EC-A2FA-20043D1423C2,0,0,1,0,0,0,0,0,0,0


In [27]:
transcripts = processPatientTranscripts(patientTranscripts)
transcripts

Unnamed: 0,PatientGuid,dmIndicator,BMI,SystolicBP,DiastolicBP,Gender_F,Gender_M,Age
0,00023761-9D8D-445B-874C-2424CC7CF620,0,27.670,122.0,76.0,0,1,56
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0,22.463,128.0,82.0,1,0,60
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,16.654,80.0,60.0,1,0,87
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0,28.190,125.0,75.0,0,1,42
4,002667F4-B9A3-4DE2-875F-8034DD637865,0,19.388,112.0,80.0,1,0,84
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,26.606,121.0,72.0,1,0,59
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,20.026,130.0,82.0,1,0,86
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0,42.864,152.0,92.0,1,0,51
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0,26.623,128.0,96.0,1,0,27
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0,24.226,117.0,78.0,0,1,25


In [30]:
patients = pd.merge(transcripts, smokingStatus, on='PatientGuid', how='left')
#patients['SmokingStatus_NISTCode'] = patients['SmokingStatus_NISTCode'].fillna(9)
patients.fillna(0, inplace=True)
patients

Unnamed: 0,PatientGuid,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,SmokingStatusGuid_02116D5A-F26C-4A48-9A11-75AC21BC4FD3,SmokingStatusGuid_0815F240-3DD3-43C6-8618-613CA9E41F9F,SmokingStatusGuid_1F3BFBBF-AB76-481B-B1E0-08A3689A54BC,SmokingStatusGuid_2548BD83-03AE-4287-A578-FA170F39E32F,SmokingStatusGuid_5ABBAB35-836F-4F3E-8632-CE063828DA15,SmokingStatusGuid_C12C2DB7-D31A-4514-88C0-42CBD339F764,SmokingStatusGuid_DD01E545-D7AF-4F00-B248-9FD40010D81D,SmokingStatusGuid_E86CA3A8-E35B-4BBF-80E2-0375AB4A1460,SmokingStatusGuid_FA2B7AE4-4D14-4768-A8C7-55B5F0CDF4AF,SmokingStatusGuid_FCD437AA-0451-4D8A-9396-B6F19D8B25E8
0,00023761-9D8D-445B-874C-2424CC7CF620,0,1,27.670,122.0,76.0,56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0,0,22.463,128.0,82.0,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,0,16.654,80.0,60.0,87,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0,1,28.190,125.0,75.0,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,002667F4-B9A3-4DE2-875F-8034DD637865,0,0,19.388,112.0,80.0,84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,0,26.606,121.0,72.0,59,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,0,20.026,130.0,82.0,86,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0,0,42.864,152.0,92.0,51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0,0,26.623,128.0,96.0,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0,1,24.226,117.0,78.0,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
X = patients.values[:, 2:]
y = patients.values[:, 1].astype("int")
X

array([[1, 27.67, 122.0, ..., 0.0, 0.0, 0.0],
       [0, 22.463, 128.0, ..., 0.0, 0.0, 0.0],
       [0, 16.654, 80.0, ..., 0.0, 0.0, 0.0],
       ..., 
       [0, 37.454, 124.0, ..., 0.0, 0.0, 0.0],
       [1, 28.749, 130.0, ..., 0.0, 0.0, 0.0],
       [0, 26.153, 138.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [32]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.77, random_state=seed)

In [33]:
# TODO Is it OK to use OHE values in a linear model?
lr = LogisticRegressionCV()
lr.fit(train_X, train_y)

print("Accuracy = {:.2f}".format(lr.score(test_X, test_y)))

Accuracy = 0.81


In [34]:
patients

Unnamed: 0,PatientGuid,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,SmokingStatusGuid_02116D5A-F26C-4A48-9A11-75AC21BC4FD3,SmokingStatusGuid_0815F240-3DD3-43C6-8618-613CA9E41F9F,SmokingStatusGuid_1F3BFBBF-AB76-481B-B1E0-08A3689A54BC,SmokingStatusGuid_2548BD83-03AE-4287-A578-FA170F39E32F,SmokingStatusGuid_5ABBAB35-836F-4F3E-8632-CE063828DA15,SmokingStatusGuid_C12C2DB7-D31A-4514-88C0-42CBD339F764,SmokingStatusGuid_DD01E545-D7AF-4F00-B248-9FD40010D81D,SmokingStatusGuid_E86CA3A8-E35B-4BBF-80E2-0375AB4A1460,SmokingStatusGuid_FA2B7AE4-4D14-4768-A8C7-55B5F0CDF4AF,SmokingStatusGuid_FCD437AA-0451-4D8A-9396-B6F19D8B25E8
0,00023761-9D8D-445B-874C-2424CC7CF620,0,1,27.670,122.0,76.0,56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0,0,22.463,128.0,82.0,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0,0,16.654,80.0,60.0,87,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0,1,28.190,125.0,75.0,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,002667F4-B9A3-4DE2-875F-8034DD637865,0,0,19.388,112.0,80.0,84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0,0,26.606,121.0,72.0,59,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0,0,20.026,130.0,82.0,86,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0,0,42.864,152.0,92.0,51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0,0,26.623,128.0,96.0,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0,1,24.226,117.0,78.0,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
ohe_icd9Codes()

Unnamed: 0,002.0,003.0,003.23,005.9,007.1,007.8,008.43,008.45,008.5,008.69,...,V85.41,V85.42,V85.52,V85.53,V87.31,V87.45,V88.01,V88.02,v43.3,v58.69
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
patients = pd.concat([patients, ohe_icd9Codes()], axis=1)
patients
# Why do we end up with a NAN in last row?

Unnamed: 0,PatientGuid,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,SmokingStatusGuid_02116D5A-F26C-4A48-9A11-75AC21BC4FD3,SmokingStatusGuid_0815F240-3DD3-43C6-8618-613CA9E41F9F,SmokingStatusGuid_1F3BFBBF-AB76-481B-B1E0-08A3689A54BC,...,V85.41,V85.42,V85.52,V85.53,V87.31,V87.45,V88.01,V88.02,v43.3,v58.69
0,00023761-9D8D-445B-874C-2424CC7CF620,0.0,1.0,27.670,122.0,76.0,56.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,0.0,0.0,22.463,128.0,82.0,60.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,0.0,0.0,16.654,80.0,60.0,87.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,0.0,1.0,28.190,125.0,75.0,42.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,002667F4-B9A3-4DE2-875F-8034DD637865,0.0,0.0,19.388,112.0,80.0,84.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,0029BBC8-7C22-4444-9F44-87BEF05FE033,0.0,0.0,26.606,121.0,72.0,59.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,003CEE1F-5BF5-4171-9284-F5464EC12D41,0.0,0.0,20.026,130.0,82.0,86.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,004382BD-E31F-4091-8DE5-E86A59D70C2D,0.0,0.0,42.864,152.0,92.0,51.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,004AACFF-5D67-49B3-81F4-883A72A2AB97,0.0,0.0,26.623,128.0,96.0,27.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,004CC712-BAD3-4A0E-8C9D-659F6C83A3AD,0.0,1.0,24.226,117.0,78.0,25.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
X = patients.values[:, 1:]
y = patients.values[:, 0]#.astype("int")
X

array([[0.0, 1.0, 27.67, ..., 0, 0, 0],
       [0.0, 0.0, 22.463, ..., 0, 0, 0],
       [0.0, 0.0, 16.654, ..., 0, 0, 0],
       ..., 
       [1.0, 1.0, 28.749, ..., 0, 0, 0],
       [0.0, 0.0, 26.153, ..., 0, 0, 0],
       [nan, nan, nan, ..., 0, 0, 0]], dtype=object)

In [124]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.77, random_state=seed)
input_shape = len(X[0])

In [125]:
model = Sequential()
model.add(Dense(16, input_shape=(input_shape,)))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

model.fit(train_X, train_y, epochs=100, batch_size=10, verbose=1);

loss, accuracy = model.evaluate(test_X, test_y, verbose=0)
print("Accuracy = {:.2f}".format(accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

KeyboardInterrupt: 