In [50]:
%%bash
# These files appear to be from the original competition (that is unmodified)
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.db
wget -q -c https://raw.githubusercontent.com/yasminlucero/Kaggle/master/Data/compDataAsSQLiteDB/compData.sql

In [51]:
%matplotlib inline
import seaborn as sns

import sqlite3
import pandas as pd
import numpy as np

from patsy import dmatrices

from contextlib import closing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [52]:
seed = 13431
np.random.seed(seed)

with closing(sqlite3.connect('file:compData.db?mode=ro', uri=True)) as conn:    
    patientTranscripts = pd.read_sql_query("""
    SELECT PatientGuid,
        dmIndicator,
        Gender,
        YearOfBirth,
        VisitYear,
        CASE WHEN BMI = 0 THEN NULL ELSE BMI END AS BMI,
        CASE WHEN SystolicBP = 'NULL' THEN NULL ELSE SystolicBP END AS SystolicBP,
        CASE WHEN DiastolicBP = 'NULL' THEN NULL ELSE DiastolicBP END AS DiastolicBP
    FROM training_patientTranscript
""", conn)
  
    patientIcd9Codes = pd.read_sql_query("""
SELECT PatientGuid, ICD9Code
FROM training_diagnosis 
""", conn)
    
    # Note: Need to use training_patientSmokingStatus directly rather than patient_smoking
    # as the NIST codes used translate to multiple statuses
    patientSmokingStatus = pd.read_sql_query("""
SELECT PatientGuid, 
    EffectiveYear,
    SmokingStatusGuid
FROM training_patientSmokingStatus
""", conn)
    
    preDbMatch = pd.read_sql_query("""
SELECT DISTINCT PatientGuid
FROM training_diagnosis 
WHERE ICD9Code IN ('790.29','648.83')
""", conn)

In [53]:
def calcAge(yearOfBirth):
    return 2012 - yearOfBirth

def isPreDB(patientGuid):
   return patientGuid in preDbMatch.PatientGuid.values
 
def isSmoker(ps, patientGuid):
    return patientGuid in ps.index and ps.loc[patientGuid, 'SmokingStatusGuid'] in ('FA2B7AE4-4D14-4768-A8C7-55B5F0CDF4AF', 
                                                                                    '02116D5A-F26C-4A48-9A11-75AC21BC4FD3', \
                                                                                    'DD01E545-D7AF-4F00-B248-9FD40010D81D', \
                                                                                    'FCD437AA-0451-4D8A-9396-B6F19D8B25E8', \
                                                                                    '2548BD83-03AE-4287-A578-FA170F39E32F')

def processPatientTranscripts(pt):
    pt = pt.sort_values(['VisitYear'],  ascending=False)
    pt = pt.groupby('PatientGuid', as_index=False).agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])

    pt['dmIndicator'] = pt['dmIndicator'].astype('bool')
    pt['Gender'] = pt['Gender'].astype('category')
    pt['Age'] = pt.apply(lambda row: calcAge(row.YearOfBirth), axis=1)
    pt['PreDB'] = pt.apply(lambda row: isPreDB(row.PatientGuid), axis=1)
    
    pt.drop(['VisitYear', 'YearOfBirth'], axis=1, inplace=True)
    pt.dropna(how='any', inplace=True)
    
    pt['SystolicBP'] = pt['SystolicBP'].astype('int')
    pt['DiastolicBP'] = pt['DiastolicBP'].astype('int')
    
    return pt

def processPatientSmoking(ps):
    ps = ps.sort_values(['EffectiveYear'], ascending=False)
    ps = ps.groupby('PatientGuid').agg(lambda x: np.nan if x.first_valid_index() is None else x.loc[x.first_valid_index()])
    
    ps['Smoker'] = ps.apply(lambda row: isSmoker(ps, row.name), axis=1)
    
    ps.drop(['EffectiveYear', 'SmokingStatusGuid'], axis=1, inplace=True)
    ps.dropna(how='any', inplace=True)
    ps.reset_index(level=0, inplace=True)

    return ps


In [54]:
transcripts = processPatientTranscripts(patientTranscripts)
transcripts.head()

Unnamed: 0,PatientGuid,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,PreDB
0,00023761-9D8D-445B-874C-2424CC7CF620,False,M,27.67,122,76,56,False
1,0005D9BD-0247-4F02-B7EE-7C1B44825FA1,False,F,22.463,128,82,60,False
2,000B4862-7CE7-4EC5-8043-A97FCD74BD78,False,F,16.654,80,60,87,False
3,00110ABC-DAB9-49E3-A1C8-88BBF8D58109,False,M,28.19,125,75,42,False
4,002667F4-B9A3-4DE2-875F-8034DD637865,False,F,19.388,112,80,84,False


In [55]:
smokingStatus = processPatientSmoking(patientSmokingStatus)
smokingStatus.head()

Unnamed: 0,PatientGuid,Smoker
0,000B4862-7CE7-4EC5-8043-A97FCD74BD78,False
1,0029BBC8-7C22-4444-9F44-87BEF05FE033,False
2,003CEE1F-5BF5-4171-9284-F5464EC12D41,False
3,005F61C9-E537-4AD2-B39C-37F25891F33A,False
4,0063B34F-C2C4-423A-A144-E51F7149253A,False


In [56]:
patients = pd.merge(transcripts, smokingStatus, on='PatientGuid', how='left')
patients[['Smoker']] = patients[['Smoker']].fillna(True)
patients.drop(['PatientGuid'], axis=1, inplace=True)
patients.head()

Unnamed: 0,dmIndicator,Gender,BMI,SystolicBP,DiastolicBP,Age,PreDB,Smoker
0,False,M,27.67,122,76,56,False,True
1,False,F,22.463,128,82,60,False,True
2,False,F,16.654,80,60,87,False,False
3,False,M,28.19,125,75,42,False,True
4,False,F,19.388,112,80,84,False,True


In [57]:
y, X = dmatrices('dmIndicator ~ C(Gender) + BMI + SystolicBP + DiastolicBP + Age + PreDB + Smoker',
                  patients, return_type="dataframe")
print(X.columns)

Index(['Intercept', 'C(Gender)[T.M]', 'PreDB[T.True]', 'Smoker[T.True]', 'BMI',
       'SystolicBP', 'DiastolicBP', 'Age'],
      dtype='object')


In [58]:
y = y.iloc[:, 1].astype('bool')

In [59]:
model = LogisticRegression()
model = model.fit(X, y)

model.score(X, y)

0.80567005127174018

In [60]:
y.mean()

0.19141449683321604

In [61]:
pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_))))

Unnamed: 0,0,1
0,Intercept,[-2.39786148003]
1,C(Gender)[T.M],[0.370198194892]
2,PreDB[T.True],[0.577498774058]
3,Smoker[T.True],[-0.0948212633921]
4,BMI,[3.54883008786e-05]
5,SystolicBP,[0.0178644777476]
6,DiastolicBP,[-0.0159035868074]
7,Age,[0.0368764450791]


In [62]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=seed)

In [63]:
lr = LogisticRegression()
lr.fit(train_X, train_y)

print("Accuracy = {:.3f}".format(lr.score(test_X, test_y)))

Accuracy = 0.805


In [64]:
# predict class labels for the test set
predicted = model.predict(test_X)
# generate class probabilities
probs = model.predict_proba(test_X)

print(metrics.accuracy_score(test_y, predicted))
print(metrics.roc_auc_score(test_y, probs[:, 1]))

0.806532663317
0.712884715183


In [65]:
print(metrics.confusion_matrix(test_y, predicted))
print(metrics.classification_report(test_y, predicted))

[[1593   24]
 [ 361   12]]
             precision    recall  f1-score   support

      False       0.82      0.99      0.89      1617
       True       0.33      0.03      0.06       373

avg / total       0.72      0.81      0.74      1990

