In [172]:
# Importing the libraries
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import os
import re
import scipy.stats as scs
from scipy import spatial
import sklearn
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from patsy import dmatrices
import seaborn as sns

In [173]:
#Reading the csv files
diagnosisT = pd.read_csv('training_SyncDiagnosis.csv')
allergyT = pd.read_csv('training_SyncAllergy.csv')
immunT = pd.read_csv('training_SyncImmunization.csv')
labObsT = pd.read_csv('training_SyncLabObservation.csv')
labPanT = pd.read_csv('training_SyncLabPanel.csv')
labResT = pd.read_csv('training_SyncLabResult.csv')
medicationT = pd.read_csv('training_SyncMedication.csv')
patientT = pd.read_csv('training_SyncPatient.csv')
patientCondT = pd.read_csv('training_SyncPatientCondition.csv')
smokingT = pd.read_csv('training_SyncPatientSmokingStatus.csv')
prescripT = pd.read_csv('training_SyncPrescription.csv')
trnscrpT = pd.read_csv('training_SyncTranscript.csv')
trnscrpAT = pd.read_csv('training_SyncTranscriptAllergy.csv')
trnscrpDT = pd.read_csv('training_SyncTranscriptDiagnosis.csv')
trnscrptMT = pd.read_csv('training_SyncTranscriptMedication.csv')
conditionT = pd.read_csv('SyncCondition.csv')
smokingT = pd.read_csv('SyncSmokingStatus.csv')

In [174]:
#Defining a function retBMI for returning the BMI index
def retBMI():
    trnscrpT['BMI'] = trnscrpT['BMI'].replace(0.000, np.nan)
    bmi1 = trnscrpT.groupby(['PatientGuid'])['BMI'].mean()
    bmi2 = trnscrpT.groupby(['PatientGuid'])['BMI'].max()
    bmi3 = trnscrpT.groupby(['PatientGuid'])['BMI'].min()
    bmi = pd.concat([bmi1, bmi2, bmi3], axis = 1)
    bmi.columns = ['MeanBMI', 'MaxBMI', 'MinBMI']
    bmi['isOverweight'] = 0
    bmi['isObese'] = 0
    bmi.loc[(bmi['MeanBMI'] >= 25) & (bmi['MeanBMI'] < 30), 'isOverweight'] = 1
    bmi.loc[bmi['MeanBMI'] >= 30, 'isObese'] = 1
    bmi['BMIDiff'] = np.abs(bmi['MaxBMI'] - bmi['MinBMI'])
    bmi = bmi.reset_index()
    return bmi

In [175]:
#Defining a function retBP for retuning the bloodpressure value
def retBP():
    bloodpressure1 = trnscrpT.groupby(['PatientGuid'])[['SystolicBP', 'DiastolicBP']].mean()
    bloodpressure2 = trnscrpT.groupby(['PatientGuid'])[['SystolicBP', 'DiastolicBP']].max()
    bloodpressure3 = trnscrpT.groupby(['PatientGuid'])[['SystolicBP', 'DiastolicBP']].min()
    bloodpressure = pd.concat([bloodpressure1, bloodpressure2, bloodpressure3], axis = 1)
    bloodpressure.columns = ['meanSystolicBP', 'meanDiastolicBP', 'maxSystolicBP', 'maxDiastolicBP', 'minSystolicBP', 'minDiastolicBP']
    bloodpressure['SystDiff'] = np.abs(bloodpressure['maxSystolicBP'] - bloodpressure['minSystolicBP'])
    bloodpressure['DiastDiff'] = np.abs(bloodpressure['maxDiastolicBP'] - bloodpressure['minDiastolicBP'])
    bloodpressure['isPreHyp'] = 0
    bloodpressure['isStage1HBP'] = 0
    bloodpressure['isStage2HBP'] = 0
    bloodpressure.loc[((bloodpressure['meanSystolicBP'] >= 120) & (bloodpressure['meanSystolicBP'] < 140)) | ((bloodpressure['meanDiastolicBP'] >= 80) & (bloodpressure['meanDiastolicBP'] < 90)), 'isPreHyp'] = 1
    bloodpressure.loc[((bloodpressure['meanSystolicBP'] >= 140) & (bloodpressure['meanSystolicBP'] < 160)) | ((bloodpressure['meanDiastolicBP'] >= 90) & (bloodpressure['meanDiastolicBP'] < 100)), 'isStage1HBP'] = 1
    bloodpressure.loc[(bloodpressure['meanSystolicBP'] >= 160) | (bloodpressure['meanDiastolicBP'] >= 100), 'isStage2HBP'] = 1
    bloodpressure.loc[bloodpressure['isStage2HBP'] == 1, 'isStage1HBP'] = 0
    bloodpressure.loc[bloodpressure['isStage1HBP'] == 1, 'isPreHyp'] = 0
    bloodpressure = bloodpressure.reset_index()
    return bloodpressure

In [176]:
#Defining a function retConditions for returning the PatientGuid, DMIndicator and ICD9Code
def retConditions():
    diaganalysis = pd.merge(patientT, diagnosisT, on=['PatientGuid'])
    chidata = diaganalysis[['PatientGuid', 'DMIndicator', 'ICD9Code']].drop_duplicates()
    chidata.index = chidata['PatientGuid']
    dummies = pd.get_dummies(chidata['ICD9Code']).reset_index()
    dummies = dummies.groupby(['PatientGuid'])[dummies.columns[1:]].sum()
    dummies = dummies.replace(2, 1)
    chidata = chidata[['PatientGuid', 'DMIndicator']]
    chidata = chidata.drop_duplicates()
    chidata = pd.concat([chidata, dummies], axis=1)
    chidata['DMIndicator'] = chidata['DMIndicator'].astype(float)
    return chidata

In [177]:
#Defining a function to return heart disease or not 
def heartDisease(text):
    if bool(re.match('41[0-4]|42[0-5]|427|429|74[5-6]', text)) == True:
        return 1
    return 0
def CAD(text):
    if bool(re.match('41[0-4]|429', text)) == True:
        return 1
    return 0
def cardiomyopathy(text):
    if bool(re.match('42[0-5]', text)) == True:
        return 1
    return 0
def CHF(text):
    if bool(re.match('426', text)) == True:
        return 1
    return 0
def arrhythmias(text):
    if bool(re.match('427', text)) == True:
        return 1
    return 0
def heartdefects(text):
    if bool(re.match('74[5-6]', text)) == True:
        return 1
    return 0
def stroke(text):
    if bool(re.match('43[0-1]|43[3-6]|997.02', text)) == True:
        return 1
    return 0
def sleepApnea(text):
    if bool(re.match('727.23|780.57', text)) == True:
        return 1
    return 0
def gestDiab(text):
    if bool(re.match('648.8', text)) == True:
        return 1
    return 0
def polyOvary(text):
    if bool(re.match('256.4', text)) == True:
        return 1
    return 0
def frozenShoulder(text):
    if bool(re.match('726.0', text)) == True:
        return 1
    return 0
def hemochr(text):
    if bool(re.match('275.03', text)) == True:
        return 1
    return 0
def hepatitis(text):
    if bool(re.match('070.2|070.3', text)) == True:
        return 1
    return 0
def diabCompl(text):
    if bool(re.match('250.[1-3]|250.5|250.8|251.[0-2]|270.3|775.6|962.3', text)) == True:
        return 1
    return 0
def kidneyFailure(text):
    if bool(re.match('58[4-5]', text)) == True:
        return 1
    return 0
def dementia(text):
    if bool(re.match('331|290|294|797', text)) == True:
        return 1
    return 0
def acanthosis(text):
    if bool(re.match('701.2', text)) == True:
        return 1
    return 0
def blindness(text):
    if bool(re.match('369', text)) == True:
        return 1
    return 0
def sDysfunction(text):
    if bool(re.match('302.7', text)) == True:
        return 1
    return 0
def preDiabetes(text):
    if bool(re.match('790.29', text)) == True:
        return 1
    return 0

In [178]:
#Defining a function to tell about the ICD9 labels
def ICD9Label(text):
    if bool(re.match('14[0-9]|2[0-3][0-9]', text)) == True:
        return 'neoplasms'
    elif bool(re.match('2[4-7][0-9]', text)) == True:
        return 'endoctrine'
    elif bool(re.match('28[0-9]', text)) == True:
        return 'blood'
    elif bool(re.match('29[0-9]|3[0-1][0-9]', text)) == True:
        return 'mental'
    elif bool(re.match('3[2-5][0-9]', text)) == True:
        return 'nervous'
    elif bool(re.match('3[6-8][0-9]', text)) == True:
        return 'sense'
    elif bool(re.match('39[0-9]|4[0-5][0-9]', text)) == True:
        return 'circulatory'
    elif bool(re.match('4[6-9][0-9]|5[0-1][0-9]', text)) == True:
        return 'respiratory'
    elif bool(re.match('5[2-7][0-9]', text)) == True:
        return 'digestive'
    elif bool(re.match('5[8-9][0-9]|6[0-2][0-9]', text)) == True:
        return 'genitourinary'
    elif bool(re.match('6[3-7][0-9]', text)) == True:
        return 'pregnancy'    
    elif bool(re.match('6[8-9][0-9]|70[0-9]', text)) == True:
        return 'skin'   
    elif bool(re.match('7[1-3][0-9]', text)) == True:
        return 'musculoskeletal'       
    elif bool(re.match('7[4-5][0-9]', text)) == True:
        return 'congenital' 
    elif bool(re.match('7[6-7][0-9]', text)) == True:
        return 'perinatal' 
    elif bool(re.match('7[8-9][0-9]', text)) == True:
        return 'symptoms or ill-defined' 
    elif bool(re.match('8[0-9][0-9]|9[0-9][0-9]', text)) == True:
        return 'injuries' 
    elif bool(re.match('E|V', text)) == True:
        return 'suppl' 
    else:
        return 'infectious'

In [179]:
#Defining a function to remove fractional parts/ remove period
def stripPeriods(text):
    result = re.sub('\.', "", text)
    return result

In [180]:
#Defining a function for returning the Diagnosis Summary 
def retDiag():
    data = diagnosisT.copy()
    data['ICD9'] = data['ICD9Code'].apply(stripPeriods)
    data['hasHeartDisease'] = data['ICD9Code'].apply(heartDisease)
    data['hasCAD'] = data['ICD9Code'].apply(CAD)
    data['hasCardiomyo'] = data['ICD9Code'].apply(cardiomyopathy)
    data['hasArrhy'] = data['ICD9Code'].apply(arrhythmias)
    data['hasHeartDefects'] = data['ICD9Code'].apply(heartdefects)
    data['hasCHF'] = data['ICD9Code'].apply(CHF)
    data['hasStroke'] = data['ICD9Code'].apply(stroke)
    data['hasSleepA'] = data['ICD9Code'].apply(sleepApnea)
    data['hasGestDiab'] = data['ICD9Code'].apply(gestDiab)
    data['hasPolyO'] = data['ICD9Code'].apply(polyOvary)
    data['hasFrozenShoulder'] = data['ICD9Code'].apply(frozenShoulder)
    data['hasHemoChr'] = data['ICD9Code'].apply(hemochr)
    data['hasHepatitis'] = data['ICD9Code'].apply(hepatitis)
    data['hasDiabComp'] = data['ICD9Code'].apply(diabCompl)
    data['hasKidneyF'] = data['ICD9Code'].apply(kidneyFailure)
    data['hasDementia'] = data['ICD9Code'].apply(dementia)
    data['hasAcanthosis'] = data['ICD9Code'].apply(acanthosis)
    data['hasBlindness'] = data['ICD9Code'].apply(blindness)
    data['hasSDysf'] = data['ICD9Code'].apply(sDysfunction)
    data['hasPreDiab'] = data['ICD9Code'].apply(preDiabetes)
    diagnosisSummary = data.groupby(['PatientGuid'])[['hasHeartDisease', 'hasCHF', 'hasStroke', 'hasSleepA', 'hasGestDiab','hasPolyO', 'hasFrozenShoulder', 'hasHemoChr','hasHepatitis', 'hasDiabComp', 'hasKidneyF', 'hasDementia','hasAcanthosis', 'hasBlindness' ,'hasSDysf', 'hasPreDiab']].max()
    diagnosisSummary = diagnosisSummary.reset_index()
    return diagnosisSummary

In [181]:
#Defining the function for returning the ICD9 Labels
def makeICD9Labels():
    data = diagnosisT.copy()
    data['ICD9Label'] = data['ICD9Code'].apply(ICD9Label)
    data['Counts'] = 1
    ICD9Labeldata = data.groupby(['PatientGuid', 'ICD9Label'])['Counts'].max().reset_index().pivot('PatientGuid', 'ICD9Label', 'Counts').reset_index()
    ICD9Labeldata = ICD9Labeldata.fillna(0)
    return ICD9Labeldata

In [182]:
#Defining a function for returning the patient's age
def getPatientAge():
    data=patientT.copy()
    data.drop('PracticeGuid', axis=1, inplace=True)
    data.drop('State', axis=1, inplace=True)
    data['Age'] = 2015 - data['YearOfBirth']
    data.drop('YearOfBirth', axis=1, inplace=True)
    data.drop('Gender', axis=1, inplace=True)
    return data

In [183]:
bloodpressure = retBP()
bmi = retBMI()
conditions = retConditions()
specials = retDiag()
patients = getPatientAge()
icd9lab = makeICD9Labels()

In [184]:
len(conditions.PatientGuid.unique())

9948

In [185]:
diagnames = diagnosisT[['ICD9Code', 'DiagnosisDescription']].drop_duplicates()
diagnames.index = diagnames['ICD9Code']
diagnames = diagnames['DiagnosisDescription']
diagnames.head(5)

ICD9Code
825.0                         Fracture of calcaneus, closed
784.0                                              Headache
461.9                          Acute sinusitis, unspecified
V72.31                    Routine gynecological examination
345.90    Epilepsy, unspecified, without mention of intr...
Name: DiagnosisDescription, dtype: object

In [186]:
def makeSimilarity():
    columns = conditions.columns[2:]
    chitest = {}
    cmatrix = {}
    cos = {}
    percent = {}
    for column in columns:
        cm = confusion_matrix(np.array(conditions['DMIndicator']), np.array(conditions[column]))
        cmatrix[column] = cm
        chitest[column]= scs.chi2_contingency(cm)[1]
        cos[column] = 1 - spatial.distance.cosine(conditions['DMIndicator'], conditions[column])
        percent[column] = [float(cm[1][1])/(float(cm[0][1]) + float(cm[1][1])), (float(cm[0][1]) + float(cm[1][1]))]
    return cos, chitest, cmatrix, percent

In [187]:
perc = makeSimilarity()[3]

In [188]:
perc_elements = perc.items()
list(perc_elements)[1:5]

[('003.0', [0.5, 2.0]),
 ('003.23', [0.0, 1.0]),
 ('005.9', [0.0, 4.0]),
 ('007.1', [0.0, 4.0])]

In [189]:
output1 = pd.DataFrame(data=perc).T
output1.columns = ['PercentageDiab', 'PatientsWithCondition']
output1['PercentageOfTotalPop'] = output1['PatientsWithCondition']/9948
output1 = output1.sort_values('PatientsWithCondition', ascending=False)
output2 = pd.concat([diagnames, output1], axis=1).reset_index()
output2.head()

Unnamed: 0,index,DiagnosisDescription,PercentageDiab,PatientsWithCondition,PercentageOfTotalPop
0,825.0,"Fracture of calcaneus, closed",0.25,4.0,0.000402
1,784.0,Headache,0.151119,536.0,0.05388
2,461.9,"Acute sinusitis, unspecified",0.11871,775.0,0.077905
3,V72.31,Routine gynecological examination,0.080292,411.0,0.041315
4,345.90,"Epilepsy, unspecified, without mention of intr...",0.2,15.0,0.001508


In [190]:
def getICD9(threshold=0.2):
    ICD9 = []
    for key in perc.keys():
        if perc[key][0] > threshold:
            ICD9.append(key)
    return ICD9

In [191]:
print (bloodpressure.shape, bmi.shape, specials.shape, patients.shape, conditions.shape, icd9lab.shape)

(9948, 12) (9948, 7) (9948, 17) (9948, 3) (9948, 3945) (9948, 20)


In [192]:
import sys
sys.setrecursionlimit(10000)

In [193]:
def makeData(threshold = 0.2):
    ICD9 = getICD9(threshold)
    newconditions = pd.concat([conditions['PatientGuid'], conditions[ICD9]], axis=1)
    patientdata = pd.merge(patients, newconditions, how='inner', on =['PatientGuid'])
    patientdata = pd.merge(patientdata, specials, how='inner', on =['PatientGuid'])
    patientdata = pd.merge(patientdata, icd9lab, how='inner', on =['PatientGuid'])
    patientdata = pd.merge(patientdata, bloodpressure, how='inner', on =['PatientGuid'])
    patientdata = pd.merge(patientdata, bmi, how='inner', on =['PatientGuid'])

    #create model variables
    modelelements = '") + Q("'.join(patientdata.columns[2:])
    modelelements = 'Q("' + modelelements + '")'
    formula = 'DMIndicator ~ ' + modelelements
    y, X = dmatrices(formula, data=patientdata, return_type='dataframe')
    X = X.drop('Intercept', 1)

    #setting up test train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=46)
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train).ravel()
    y_test = np.array(y_test).ravel()
    
    columns = X.columns
    
    return X_train, y_train, X_test, y_test, columns

In [194]:
X_train, y_train, X_test, y_test, features = makeData()

In [195]:
len(features)

1688

In [196]:
#Importing the all the required libraries 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors  import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import f1_score

# helper functions
def train_clf(clf, X_train, y_train):
    
    return clf.fit(X_train, y_train)
    
def pred_clf(clf, features, target):
    
    y_pred = clf.predict(features)
    y_pred_proba = clf.predict_proba(features)
    return brier_score_loss(target, y_pred_proba[:,1])

def train_predict(clf, X_train, y_train, X_test, y_test):
    
    train_clf(clf, X_train, y_train)
    
    print("Brier score for training set is: {:.4f}".format(pred_clf(clf, X_train, y_train)))
    print("Brier score for testing set is: {:.4f}\n".format(pred_clf(clf, X_test, y_test)))

In [197]:
#Checking which algorithm is the best 
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=18)
log = LogisticRegression()
dtc = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0)
abc = AdaBoostClassifier(random_state=0)
gbc = GradientBoostingClassifier(random_state=0)

algorithms = [nb,knn, log, dtc, rfc, abc, gbc]

for clf in algorithms:
 
    print("\n{}: \n".format(clf.__class__.__name__))
    
    #create training data from first 100, then 200, then 300
    for n in [179, 358, 537]:
        train_predict(clf, X_train[:n], y_train[:n], X_test, y_test)
       
    print("{}:".format(clf))
    train_predict(clf, X_train, y_train, X_test, y_test)


GaussianNB: 

Brier score for training set is: 0.2291
Brier score for testing set is: 0.5317

Brier score for training set is: 0.2626
Brier score for testing set is: 0.5724

Brier score for training set is: 0.3557
Brier score for testing set is: 0.6151

GaussianNB():
Brier score for training set is: 0.1765
Brier score for testing set is: 0.1818


KNeighborsClassifier: 

Brier score for training set is: 0.1289
Brier score for testing set is: 0.1441

Brier score for training set is: 0.1329
Brier score for testing set is: 0.1422

Brier score for training set is: 0.1324
Brier score for testing set is: 0.1424

KNeighborsClassifier(n_neighbors=18):
Brier score for training set is: 0.1226
Brier score for testing set is: 0.1416


LogisticRegression: 

Brier score for training set is: 0.0845
Brier score for testing set is: 0.1625

Brier score for training set is: 0.0870
Brier score for testing set is: 0.1447

Brier score for training set is: 0.1064
Brier score for testing set is: 0.1360

Logis

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Brier score for training set is: 0.1326
Brier score for testing set is: 0.1343


DecisionTreeClassifier: 

Brier score for training set is: 0.0000
Brier score for testing set is: 0.2362

Brier score for training set is: 0.0000
Brier score for testing set is: 0.2377

Brier score for training set is: 0.0000
Brier score for testing set is: 0.2452

DecisionTreeClassifier(random_state=0):
Brier score for training set is: 0.0000
Brier score for testing set is: 0.2256


RandomForestClassifier: 

Brier score for training set is: 0.0183
Brier score for testing set is: 0.1390

Brier score for training set is: 0.0194
Brier score for testing set is: 0.1366

Brier score for training set is: 0.0182
Brier score for testing set is: 0.1336

RandomForestClassifier(random_state=0):
Brier score for training set is: 0.0169
Brier score for testing set is: 0.1251


AdaBoostClassifier: 

Brier score for training set is: 0.1482
Brier score for testing set is: 0.1890

Brier score for training set is: 0.1961
Bri

In [198]:
#Choosing GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(random_state = 1)
param_grid = {
'n_estimators': [100, 200, 300, 400], 'learning_rate': [0.2,0.6,1.2], 'max_depth' : [1, 5, 9]
}

In [199]:
#Using RandomizedSearchCV for finding the best parameters 
from sklearn.model_selection import RandomizedSearchCV
gb_Grid = RandomizedSearchCV(gb_clf, param_grid, cv = 5, scoring = 'roc_auc', refit = True, n_jobs=-1, verbose = 2)

In [200]:
gb_Grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 31.2min finished


RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=1),
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.2, 0.6, 1.2],
                                        'max_depth': [1, 5, 9],
                                        'n_estimators': [100, 200, 300, 400]},
                   scoring='roc_auc', verbose=2)

In [201]:
gb_Grid.best_params_

{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.2}

In [202]:
gb_clf = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False)

In [203]:
#The final Brier Score for the prediction model
train_predict(gb_clf, X_train, y_train, X_test, y_test)



Brier score for training set is: 0.1089
Brier score for testing set is: 0.1225



In [205]:
gb_Grid.score(X_train, y_train)

0.9482670009048588

In [208]:
#Reading the test_SyncPatient
test_Patient = pd.read_csv('test_SyncPatient.csv')

In [209]:
#Defining a function for returning the test_patient's age
def getPatientAge_nodrop():
    data=test_Patient.copy()
    #data.drop('PracticeGuid', axis=1, inplace=True)
    #data.drop('State', axis=1, inplace=True)
    data['Age'] = 2015 - data['YearOfBirth']
    data.drop('YearOfBirth', axis=1, inplace=True)
    return data

In [210]:
bloodpressure = retBP()
bmi = retBMI()
conditions = retConditions()
specials = retDiag()
patients = getPatientAge_nodrop()
icd9lab = makeICD9Labels()

In [211]:
import sys
sys.setrecursionlimit(10000)

In [212]:
ICD9 = getICD9(threshold=0.2)
newconditions = pd.concat([conditions['PatientGuid'], conditions[ICD9]], axis=1)
patientdata = pd.merge(patients, newconditions, how='inner', on =['PatientGuid'])
patientdata = pd.merge(patientdata, specials, how='inner', on =['PatientGuid'])
patientdata = pd.merge(patientdata, icd9lab, how='inner', on =['PatientGuid'])
patientdata = pd.merge(patientdata, bloodpressure, how='inner', on =['PatientGuid'])
patientdata = pd.merge(patientdata, bmi, how='inner', on =['PatientGuid'])

In [213]:
new_patientdata = patientdata.copy()

In [214]:
del new_patientdata['PatientGuid']
del new_patientdata['PracticeGuid']

In [215]:
del new_patientdata['Gender']
del new_patientdata['State']

In [217]:
sum(new_patientdata.isnull().sum())

8

In [229]:
new_patientdata.fillna(new_patientdata.mean(), inplace=True)

In [230]:
DMIndicatorForecast = gb_clf.predict(new_patientdata)

In [244]:
DMIndicatorForecast = pd.DataFrame(DMIndicatorForecast)
DMIndicatorForecast.columns =['DMIndicatorForecast'] 
DMIndicatorForecast

Unnamed: 0,DMIndicatorForecast
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
9943,0.0
9944,0.0
9945,0.0
9946,0.0


In [246]:
patientdata

Unnamed: 0,PatientGuid,Gender,State,PracticeGuid,Age,002.0,003.0,008.45,008.69,008.8,...,DiastDiff,isPreHyp,isStage1HBP,isStage2HBP,MeanBMI,MaxBMI,MinBMI,isOverweight,isObese,BMIDiff
0,FB6EFC3D-1A20-4497-9CBD-00027CC5D220,M,SD,7BF4DAD8-5F67-4985-B911-20C9E89A3737,86,0,0,0,0,0,...,26.0,1,0,0,19.047667,19.343,18.900,0,0,0.443
1,C6746626-6783-4650-A58F-00065649139A,F,TX,E7101967-2FF1-4B0F-8129-B0B429D1D15C,30,0,0,0,0,0,...,14.0,0,0,0,26.389000,27.025,25.753,1,0,1.272
2,E05C6E8F-779F-4594-A388-000C635AE4D3,F,NJ,FC01A799-1CAF-464F-A86F-8A666AB86F32,31,0,0,0,0,0,...,18.0,1,0,0,38.041000,48.217,31.923,0,1,16.294
3,EAEBD216-F847-4355-87B2-000D942E08F0,M,OH,EEBC95EF-79BE-4542-892E-98D3166BAB20,56,0,0,0,0,0,...,25.0,1,0,0,32.981333,34.241,32.004,0,1,2.237
4,C7F10A80-4934-42D2-8540-000FBEBA75C8,F,FL,677BA32E-B4C4-48F2-86E4-08C42B135401,25,0,0,0,0,0,...,20.0,0,0,0,32.917813,34.931,30.432,0,1,4.499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9943,96C0A4E6-1E3E-497E-9C4E-FFEC0E25AD3A,F,TX,E7101967-2FF1-4B0F-8129-B0B429D1D15C,37,0,0,0,0,0,...,26.0,1,0,0,25.303000,27.118,23.488,1,0,3.630
9944,5845977A-3014-4301-92B3-FFF0A2EBBAD2,F,WA,EADEC07A-9901-411F-BBE3-04376029E1E8,29,0,0,0,0,0,...,2.0,0,0,0,22.503000,22.503,22.503,0,0,0.000
9945,F948403A-ABE6-496D-B37D-FFF9A9D79767,F,CA,57B6F75F-CF0A-4225-BAD0-8222A7D4B489,60,0,0,0,0,0,...,30.0,1,0,0,30.318667,44.458,12.498,0,1,31.960
9946,F764BC86-0CFA-4661-8D84-FFFA8E2B6080,F,CA,1A69F223-8409-4FDC-A26C-114677D2D4C3,55,0,0,0,0,0,...,16.0,0,0,0,35.724615,36.576,33.922,0,1,2.654


In [247]:
Final_output = pd.concat([patientdata, DMIndicatorForecast], axis=1)

In [251]:
#Final output for the test_SyncPatientForecast.csv
Final_output = Final_output[['PatientGuid','DMIndicatorForecast']]
Final_output

Unnamed: 0,PatientGuid,DMIndicatorForecast
0,FB6EFC3D-1A20-4497-9CBD-00027CC5D220,0.0
1,C6746626-6783-4650-A58F-00065649139A,0.0
2,E05C6E8F-779F-4594-A388-000C635AE4D3,0.0
3,EAEBD216-F847-4355-87B2-000D942E08F0,0.0
4,C7F10A80-4934-42D2-8540-000FBEBA75C8,0.0
...,...,...
9943,96C0A4E6-1E3E-497E-9C4E-FFEC0E25AD3A,0.0
9944,5845977A-3014-4301-92B3-FFF0A2EBBAD2,0.0
9945,F948403A-ABE6-496D-B37D-FFF9A9D79767,0.0
9946,F764BC86-0CFA-4661-8D84-FFFA8E2B6080,0.0


In [252]:
#Converting the final output into csv 
Final_output.to_csv('test_SyncPatientForecast.csv', index= False)