



**Model details**

-  This model aims to enhance Risk Adjustment processes by predicting whether a member has a covered ICD code related to DM with complications. The model is purely predictive and not rules-based.
-  The model is a deep-learning neural network
-  The model is trained on the 2016 member population
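-  Validation accuracy on the held-out 30% of the 2016 data was about 87% in predicting whether someone has an HCC 18 related ICD code; on the 2017 test data, at the 0.80 probability threshold used below, overall accuracy was about 81%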
    -  Positive Predictive Value of about 93%
    -  At 93% PPV, the positive HCCs likely missed (false negatives) amount to about 17% of all members scored, i.e. about 39% of the members who actually have the HCC
             

#### James Perry 

In [29]:
#Dependencies for model
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import regularizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense
from keras.models import Sequential
from keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Training hyperparameters shared by the model-building and fitting cells.
# NOTE(review): despite its name, `learning_rate` is the value handed to the L2
# kernel regularizers when the layers are built; the optimizer is selected by
# name ('adam') and does not receive it.
learning_rate = 1e-2
# Upper bound on the number of passes over the training split.
epochs_value = 1500
# Number of rows per gradient update.
batch_size_value = 2000

#### Data for Model

-  The model data consists of a member ID, Age, Gender, indicators for both Hospice and DSNP, as well as indicators for the covering of each HCC
-  The data ranges from 2014 - 2017
-  No PHI in this data 

In [30]:
# Importing the dataset
# Pipe-delimited extract holding every year of member data; later cells slice it by
# the MMESource year column.
# NOTE(review): this is an absolute path on the author's machine - point it at your
# own copy of ProjectData.csv before running.
data_path = 'C:/Users/jperry/Documents/School/BDT_ProjectData/UPDATED DATA/ProjectData.csv'

# low_memory must be the boolean False: a non-empty string such as 'FALSE' is truthy
# and would leave chunked (low-memory) parsing switched on, which can produce mixed
# dtypes within a column.
datasetInitial = pd.read_csv(data_path, sep='|', low_memory=False)

In [31]:
# Training/validation population: only the rows whose MMESource year equals 2016.
# Later cells derive the feature matrix and the train/validation split from this slice.
dataset = datasetInitial[datasetInitial['MMESource'].eq(2016.0)]

# Preview the first ten rows of the slice.
dataset.head(10)

Unnamed: 0,MMESource,BISource,ID,Age,Gender,IS_HOSPICE_FLAG,IS_SNP_FLAG,HCC1,HCC2,HCC6,...,HCC166,HCC167,HCC169,HCC170,HCC173,HCC176,HCC186,HCC188,HCC189,HCC138
290197,2016.0,2016,516823SENH,94.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
290198,2016.0,2016,593464OSUB,85.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290199,2016.0,2016,235815XWOR,93.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290200,2016.0,2016,522003YDHY,93.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290201,2016.0,2016,296957RHRW,77.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290202,2016.0,2016,593124ISQR,78.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290203,2016.0,2016,785748DCOK,78.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290204,2016.0,2016,401581QNEI,80.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
290205,2016.0,2016,157244LTES,96.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290206,2016.0,2016,237072KSTP,94.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Final hold-out population: only the rows whose MMESource year equals 2017.
# This slice is not used for fitting; it is scored after the weights are restored.
testData = datasetInitial[datasetInitial['MMESource'].eq(2017.0)]

# Preview the first ten rows of the slice.
testData.head(10)

Unnamed: 0,MMESource,BISource,ID,Age,Gender,IS_HOSPICE_FLAG,IS_SNP_FLAG,HCC1,HCC2,HCC6,...,HCC166,HCC167,HCC169,HCC170,HCC173,HCC176,HCC186,HCC188,HCC189,HCC138
538272,2017.0,2017,435015EEDH,86.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538273,2017.0,2017,453806VLNO,94.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538274,2017.0,2017,183497WGTM,94.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538275,2017.0,2017,534656LTTY,78.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538276,2017.0,2017,663519YVAT,79.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538277,2017.0,2017,352323NQWE,79.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538278,2017.0,2017,130073OWVG,81.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
538279,2017.0,2017,99819POIT,97.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
538280,2017.0,2017,260646PFSM,95.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538281,2017.0,2017,370085DPNZ,96.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Report how many member rows are in the 2016 slice.
print('Number of rows: ', len(dataset))

Number of rows:  248072


In [34]:
# Creating matrix of features (X)
# Creating matrix of target variable (Y)

# Columns that identify the row or hold the label itself; they are removed from the inputs.
non_feature_columns = ['MMESource', 'BISource', 'ID', 'HCC18']

# Model inputs: age, gender and the two enrolment flags, followed by every HCC
# indicator except the target (HCC18). Listed once so the float conversion below
# cannot drift out of sync with itself.
feature_columns = [
    'Age', 'Gender', 'IS_HOSPICE_FLAG', 'IS_SNP_FLAG',
    'HCC1', 'HCC2', 'HCC6', 'HCC8', 'HCC9', 'HCC10', 'HCC11', 'HCC12', 'HCC17',
    'HCC19', 'HCC21', 'HCC22', 'HCC23', 'HCC27', 'HCC28', 'HCC29', 'HCC33', 'HCC34', 'HCC35', 'HCC39', 'HCC40', 'HCC46',
    'HCC47', 'HCC48', 'HCC54', 'HCC55', 'HCC57', 'HCC58', 'HCC70', 'HCC71', 'HCC72', 'HCC73', 'HCC74', 'HCC75', 'HCC76',
    'HCC77', 'HCC78', 'HCC79', 'HCC80', 'HCC82', 'HCC83', 'HCC84', 'HCC85', 'HCC86', 'HCC87', 'HCC88', 'HCC96', 'HCC99',
    'HCC100', 'HCC103', 'HCC104', 'HCC106', 'HCC107', 'HCC108', 'HCC110', 'HCC111', 'HCC112', 'HCC114', 'HCC115', 'HCC122', 'HCC124',
    'HCC134', 'HCC135', 'HCC136', 'HCC137', 'HCC157', 'HCC158', 'HCC161', 'HCC162', 'HCC166', 'HCC167', 'HCC169', 'HCC170', 'HCC173',
    'HCC176', 'HCC186', 'HCC188', 'HCC189', 'HCC138',
]

# `columns=` is used instead of a positional axis argument, which pandas 2 rejects.
# The listed feature columns are converted to float; a missing column raises KeyError.
X = dataset.drop(columns=non_feature_columns).astype(dict.fromkeys(feature_columns, "float"))

# Target: a one-column float DataFrame built as a new object, so nothing is
# assigned back into a slice of `dataset`.
y = dataset[['HCC18']].astype("float")

In [35]:
# Splitting the dataset into the Training set and Test set
# Using the train_test_split from sklearn to easily split our file
# 70% of the 2016 rows are used for fitting; the remaining 30% are passed to
# fit() as validation data and to evaluate() afterwards.
# random_state pins the shuffle so the split (and the validation figures that
# depend on it) is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [36]:
#Initializing Neural Network
# Fully connected binary classifier: three ReLU hidden layers (500 -> 250 -> 125 units)
# followed by one sigmoid unit that outputs the probability of the positive class.

# The configuration value named `learning_rate` is used here as the L2 weight-penalty
# factor of the regularized layers; the alias makes that role explicit.
l2_penalty = learning_rate

# Input width is taken from the training matrix instead of a hard-coded 83, so the
# network stays in sync if the feature list changes.
n_features = X_train.shape[1]

classifier = tf.keras.Sequential()
# Adding layers to the model
# The first hidden layer is the only one without a kernel regularizer.
classifier.add(tf.keras.layers.Dense(500, input_dim=n_features, activation='relu'))
# Regularizers come from tf.keras as well, so every object attached to the model
# belongs to the same Keras implementation as the layers.
for hidden_units in (250, 125):
    classifier.add(tf.keras.layers.Dense(hidden_units,
                                         kernel_regularizer=tf.keras.regularizers.l2(l2_penalty),
                                         activation='relu'))
classifier.add(tf.keras.layers.Dense(1,
                                     kernel_regularizer=tf.keras.regularizers.l2(l2_penalty),
                                     activation='sigmoid'))
classifier.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 500)               42000     
_________________________________________________________________
dense_10 (Dense)             (None, 250)               125250    
_________________________________________________________________
dense_11 (Dense)             (None, 125)               31375     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 126       
Total params: 198,751
Trainable params: 198,751
Non-trainable params: 0
_________________________________________________________________


In [37]:
# Compiling Neural Network
# Binary cross-entropy matches the single sigmoid output unit.
# The accuracy metric is requested under the name 'acc' so that Keras logs it as
# 'acc' / 'val_acc' - the key the ModelCheckpoint cell monitors and formats into
# its file names. Requesting 'accuracy' makes newer tf.keras releases log
# 'val_accuracy' instead, which leaves that callback with nothing to monitor.
# NOTE(review): the optimizer is chosen by name, so it runs with its default
# settings; the `learning_rate` value from the configuration cell is not passed to it.
classifier.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['acc'])

In [38]:
# checkpoint to save absolute best model
# File name pattern: the epoch number and the validation accuracy of that epoch are
# substituted in by the callback each time it saves.
filepath = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"

# The callback is taken from tf.keras, the same implementation the model is built
# with; handing a standalone-keras callback to a tf.keras model fails on TF 2.x.
# save_best_only + mode='max': a file is written only when val_acc exceeds the best
# value seen so far in this run.
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath,
                                                monitor='val_acc',
                                                verbose=1,
                                                save_best_only=True,
                                                mode='max')
callbacks_list = [checkpoint]

In [39]:
# Train the network on the training split.
# The held-out split is supplied as validation data, so each epoch also reports
# validation loss/accuracy, and the checkpoint callback runs after every epoch.
# The returned History object is kept for the loss plot further down.
HCCModel = classifier.fit(
    x=X_train,
    y=y_train,
    epochs=epochs_value,
    batch_size=batch_size_value,
    callbacks=callbacks_list,
    validation_data=(X_test, y_test),
)

Train on 173650 samples, validate on 74422 samples
Epoch 1/1500
Epoch 00001: val_acc improved from -inf to 0.76902, saving model to weights-improvement-01-0.77.hdf5
Epoch 2/1500
Epoch 00002: val_acc improved from 0.76902 to 0.86345, saving model to weights-improvement-02-0.86.hdf5
Epoch 3/1500
Epoch 00003: val_acc improved from 0.86345 to 0.86667, saving model to weights-improvement-03-0.87.hdf5
Epoch 4/1500
Epoch 00004: val_acc did not improve from 0.86667
Epoch 5/1500
Epoch 00005: val_acc did not improve from 0.86667
Epoch 6/1500
Epoch 00006: val_acc improved from 0.86667 to 0.86714, saving model to weights-improvement-06-0.87.hdf5
Epoch 7/1500
Epoch 00007: val_acc improved from 0.86714 to 0.86726, saving model to weights-improvement-07-0.87.hdf5
Epoch 8/1500
Epoch 00008: val_acc improved from 0.86726 to 0.86755, saving model to weights-improvement-08-0.87.hdf5
Epoch 9/1500
Epoch 00009: val_acc did not improve from 0.86755
Epoch 10/1500
Epoch 00010: val_acc did not improve from 0.867

In [None]:
# Persist the model as it stands after fit() returns, replacing any existing file.
# Note this is the state at the end of training, not necessarily the best checkpoint.
classifier.save(filepath='HCC18Prediction.keras', overwrite=True)

In [40]:
# Score the model on the held-out 30% of the 2016 data (the same split that was
# used as validation data during fitting). `score` holds the loss followed by the
# compiled metric; it is reused by the plotting cell.
score = classifier.evaluate(x=X_test, y=y_test)
for score_label, score_value in zip(('Test loss:', 'Test accuracy:'), score):
    print(score_label, score_value)

Test loss: 0.36188525956270534
Test accuracy: 0.8676466636289814


In [None]:
# Training vs. validation loss per epoch.
train_loss = HCCModel.history['loss']
validation_loss = HCCModel.history['val_loss']

# The x-axis is sized from the recorded history rather than from epochs_value:
# if training stopped before the configured number of epochs, the two lengths
# differ and plt.plot would raise on the shape mismatch.
epoch_axis = range(1, len(train_loss) + 1)

plt.plot(epoch_axis, train_loss, 'r+', label='training loss')
plt.plot(epoch_axis, validation_loss, 'bo', label='evaluation loss')
# score[1] is the accuracy returned by classifier.evaluate in the previous cell.
plt.xlabel('Epochs\n\n Accuracy: '+str(score[1]))
plt.ylabel('Loss')
plt.legend()
plt.show()

In [13]:
# Restore the best weights written by the ModelCheckpoint callback.
# The pinned file name belongs to one specific training run (epoch 753). When that
# file is present it is used exactly as before; otherwise fall back to the most
# recently written checkpoint. Because the callback saves only when validation
# accuracy improves, the newest file is the best epoch of the latest run.
pinned_checkpoint = 'weights-improvement-753-0.87.hdf5'

if os.path.exists(pinned_checkpoint):
    best_checkpoint = pinned_checkpoint
else:
    checkpoint_candidates = glob.glob('weights-improvement-*.hdf5')
    if not checkpoint_candidates:
        raise FileNotFoundError(
            'No weights-improvement-*.hdf5 checkpoint found; run the training cell first.')
    best_checkpoint = max(checkpoint_candidates, key=os.path.getmtime)

# Make it visible in the notebook output which weights the results below come from.
print('Loading weights from', best_checkpoint)
classifier.load_weights(best_checkpoint)

In [14]:
# Creating TESTING matrix of features (X)
# Creating TESTING matrix of target variable (Y)

# Columns that identify the row or hold the label itself; they are removed from the inputs.
non_feature_columns = ['MMESource', 'BISource', 'ID', 'HCC18']

# Same inputs, in the same order, as the training matrix: age, gender and the two
# enrolment flags, followed by every HCC indicator except the target (HCC18).
feature_columns = [
    'Age', 'Gender', 'IS_HOSPICE_FLAG', 'IS_SNP_FLAG',
    'HCC1', 'HCC2', 'HCC6', 'HCC8', 'HCC9', 'HCC10', 'HCC11', 'HCC12', 'HCC17',
    'HCC19', 'HCC21', 'HCC22', 'HCC23', 'HCC27', 'HCC28', 'HCC29', 'HCC33', 'HCC34', 'HCC35', 'HCC39', 'HCC40', 'HCC46',
    'HCC47', 'HCC48', 'HCC54', 'HCC55', 'HCC57', 'HCC58', 'HCC70', 'HCC71', 'HCC72', 'HCC73', 'HCC74', 'HCC75', 'HCC76',
    'HCC77', 'HCC78', 'HCC79', 'HCC80', 'HCC82', 'HCC83', 'HCC84', 'HCC85', 'HCC86', 'HCC87', 'HCC88', 'HCC96', 'HCC99',
    'HCC100', 'HCC103', 'HCC104', 'HCC106', 'HCC107', 'HCC108', 'HCC110', 'HCC111', 'HCC112', 'HCC114', 'HCC115', 'HCC122', 'HCC124',
    'HCC134', 'HCC135', 'HCC136', 'HCC137', 'HCC157', 'HCC158', 'HCC161', 'HCC162', 'HCC166', 'HCC167', 'HCC169', 'HCC170', 'HCC173',
    'HCC176', 'HCC186', 'HCC188', 'HCC189', 'HCC138',
]

# `columns=` is used instead of a positional axis argument, which pandas 2 rejects.
# The listed feature columns are converted to float; a missing column raises KeyError.
FTest_X = testData.drop(columns=non_feature_columns).astype(dict.fromkeys(feature_columns, "float"))

# Target: a one-column float DataFrame built as a new object, so nothing is
# assigned back into a slice of `testData`.
FTest_y = testData[['HCC18']].astype("float")

In [15]:
# Score the 2017 hold-out rows and turn the sigmoid outputs into boolean labels.
# Only scores strictly above 0.80 count as a positive prediction - a cut-off well
# above 0.5, which favours precision over recall.
y_pred = classifier.predict(FTest_X) > 0.80

In [16]:
# Confusion matrix for the 2017 hold-out data.
# Layout (scikit-learn convention): rows are the actual class, columns the predicted
# class, so cm[1, 1] counts true positives and cm[1, 0] counts false negatives.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=FTest_y, y_pred=y_pred)
print(cm)

#Note: You should be able to construct the confusion matrix below with the CSV files that get generated below

[[154856   5158]
 [ 47124  73020]]


In [17]:
# Positive predictive value (precision), not sensitivity:
# true positives divided by every row the model predicted positive (column 1 of cm).
cm[1, 1] / cm[:, 1].sum()

0.9340223592314973

In [18]:
# Likely missed: false negatives as a share of ALL scored members.
# This is not the false-negative rate, which would divide by the actual positives
# (cm[1, 0] + cm[1, 1]) only.
cm[1, 0] / cm.sum()

0.16820508427387404

In [27]:
# Overall accuracy on the 2017 data at the chosen threshold:
# correctly classified rows (the matrix diagonal) over all rows.
cm.diagonal().sum() / cm.sum()

0.8133838762412638

In [28]:
# Total number of rows counted in the confusion matrix.
cm.sum()

280158

In [19]:
# Wrap the single prediction column of the numpy array in a one-column DataFrame
# so it can be written out with to_csv.
# NOTE(review): this rebinds y_pred from an array to a DataFrame, so the cell is not
# re-runnable without first re-running the prediction cell.
y_pred = pd.DataFrame(y_pred[:, 0], columns=['Predicted Value'])

In [20]:
# Write the hold-out features, labels and predictions to pipe-delimited CSV files.
# NOTE(review): the predictions frame is built from a bare array and so carries a
# fresh 0-based index, while the features/labels keep the row labels of the source
# data - match the three files by row order, not by the index column.
for export_frame, export_name in (
    (FTest_X, 'Predictors.csv'),
    (FTest_y, 'Labels.csv'),
    (y_pred, 'Predicted Values.csv'),
):
    export_frame.to_csv(export_name, sep='|')