## Length of Stay Prediction

___Use Case:___ Predict the length of stay for patients at admission to a facility using data from MIMIC-3.

### Load and Install libraries needed to run the code

In [1]:
!pip install lime
!pip install xgboost

[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
#Imports Packages
import pandas as pd
import numpy as np
from numpy import loadtxt
import io
import requests

import lime.lime_tabular
from __future__ import print_function
import matplotlib
import xgboost as xbg
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelBinarizer

import sklearn
import sklearn.datasets
import sklearn.ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn import datasets
from sklearn import svm

### Accessing the Data

In [3]:
url = 'https://kenscimlworkshop.blob.core.windows.net/workshop/mimicLOSSet2.csv?sp=r&st=2018-06-07T14:55:46Z&se=2018-06-28T22:55:46Z&spr=https&sv=2017-11-09&sig=m5MFmGBk6qQ4I8oQ0%2BKLLMVpTdHHUSzfEYJ%2B%2BnzwQ5I%3D&sr=b'

In [4]:
s = requests.get(url).content
data = pd.read_csv(io.StringIO(s.decode('utf-8')))


## Data Exploration

In [5]:
# Check the first few lines of the data
data.head()

Unnamed: 0,id,encounterID,age,edVisitCountPast3months,admissionMonth,admissionBetweenThuAndSat,edTimeBeforeAdmission,sex,ethnicity,admitSource,...,ismaritalStatusMarried,ismaritalStatusNeverMarried,ismaritalStatusWidowed,ismaritalStatusDivorced,isethnicityWhite,isethnicityBlackOrAfricanAmerican,isethnicityHispanicOrLatino,isethnicityAsian,readmission30days,proceduresCount
0,0,110872,0,0,10,0,,female,WHITE,mp,...,,,,,1.0,0.0,0.0,0.0,,1.0
1,1,144265,48,0,6,0,,male,UNKNOWN/NOT SPECIFIED,hosp-trans,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,7.0
2,2,154871,76,0,6,0,0.22,female,WHITE,emd,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,2.0
3,3,108205,53,0,5,1,0.26,male,MULTI RACE ETHNICITY,emd,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0
4,4,148959,54,0,1,0,0.22,male,MULTI RACE ETHNICITY,emd,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,6.0


In [6]:
# Select appropriate column names as features. Omit encounterID as does not influence modeling
colnames = ['age','edVisitCountPast3months','admissionMonth','admissionBetweenThuAndSat',
            'edTimeBeforeAdmission','sex','ethnicity','admitSource','meanLOSPast24months','inpatientAdmitCountPast3months',
            'meanTemperaturePast12months','inpatientAdmitCountPast6months','inpatientAdmitCountPast12months',
            'hospitalAdmitCountPast3months','hospitalAdmitCountPast6months','edVisitCountPast6months',
            'medianTemperaturePast12months','meanHeartRatePast12months','actualLOS']

In [7]:
# Creates subset with appropriate column names. 
data = data[colnames]

In [8]:
# Fill categorical column with Short LOS and Long LOS. Short LOS if acutalLOS <= 5, Long LOS if >= 5
data['catLOS'] = 'Long LOS'
data.loc[(data['actualLOS'] > 0) & (data['actualLOS'] <= 5), 'catLOS'] = 'Short LOS'
data.loc[(data['actualLOS'] > 5), 'catLOS'] = 'Long LOS'

__Plot the distribution of Length of Stay__

In [9]:
new_bin_values = np.arange(start=min(data['actualLOS']), stop=max(data['actualLOS']), step=1)
data['actualLOS'].hist(bins=new_bin_values)

<matplotlib.axes._subplots.AxesSubplot at 0x122f34358>

__Check the distribution of different variables in the data__

Variable 1: Plot the distribution of Sex

In [10]:
# Creates Stacked Bar chart showing number of people with Short LOS and Long LOS per sex
subset = data.groupby(['sex', 'catLOS'])['sex'].count().unstack('catLOS').fillna(0)
subset[['Short LOS','Long LOS']].plot(kind='bar', stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x126745470>

Variable 2: Plot the distribution of Admissions per month

In [11]:
# Plot the number of people with Short LOS and Long LOS per admission month
subset = data.groupby(['admissionMonth', 'catLOS'])['admissionMonth'].count().unstack('catLOS').fillna(0)
subset[['Short LOS','Long LOS']].plot(kind='bar', stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x1269fd630>

Variable 3: Hospital Admits in the Past 6 months

In [12]:
bin_values = np.arange(start=0, stop=10, step=0.25)
index = data['catLOS'].isin(['Short LOS','Long LOS']) # create index of catLOS
hospAdmitCount = data[index] # select rows
grouping = hospAdmitCount.groupby('catLOS')['hospitalAdmitCountPast6months'] # group values by hospital admit count
grouping.plot(kind='hist', bins=bin_values, figsize=[12,6], alpha=.4, legend=True)

catLOS
Long LOS     Axes(0.125,0.125;0.775x0.755)
Short LOS    Axes(0.125,0.125;0.775x0.755)
Name: hospitalAdmitCountPast6months, dtype: object

Variable 4: Plot the distribution of Ethnicity

In [13]:
subset = data.groupby(['ethnicity', 'catLOS'])['ethnicity'].count().unstack('catLOS').fillna(0)
subset[['Short LOS','Long LOS']].plot(kind='bar', stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x1278d47f0>

Variable 5: Plot the distribution by Age

In [14]:
bin_values = np.arange(start=min(data['age']), stop = max(data['age']), step=10)
index = data['catLOS'].isin(['Short LOS','Long LOS']) # create index of catLOS
age = data[index] # select rows

grouping = age.groupby('catLOS')['age'] # group values by hospital admit count
grouping.plot(kind='hist', bins=bin_values, figsize=[12,6], alpha=.4, legend=True)

catLOS
Long LOS     Axes(0.125,0.125;0.775x0.755)
Short LOS    Axes(0.125,0.125;0.775x0.755)
Name: age, dtype: object

# Data Transformation

Combine the more granular ethnic groups into generalized groups

In [15]:
# Combine Ethnicities into generalized groups
data.loc[data['ethnicity'].str.contains('HISPANIC'), 'ethnicity'] = 'HISPANIC/LATINO'
data.loc[data['ethnicity'].str.contains('ASIAN'), 'ethnicity'] = 'ASIAN'
data.loc[data['ethnicity'].str.contains('PACIFIC'), 'ethnicity'] = 'ASIAN'
data.loc[data['ethnicity'].str.contains('WHITE'), 'ethnicity'] = 'WHITE'
data.loc[data['ethnicity'].str.contains('AMERICAN'), 'ethnicity'] = 'WHITE'
data.loc[data['ethnicity'].str.contains('PORTUGUESE'), 'ethnicity'] = 'WHITE'
data.loc[data['ethnicity'].str.contains('BLACK'), 'ethnicity'] = 'BLACK'
data.loc[data['ethnicity'].str.contains('DECLINED'), 'ethnicity'] = 'OTHER'
data.loc[data['ethnicity'].str.contains('UNKNOWN'), 'ethnicity'] = 'OTHER'
data.loc[data['ethnicity'].str.contains('OBTAIN'), 'ethnicity'] = 'OTHER'
data.loc[data['ethnicity'].str.contains('MULTI'), 'ethnicity'] = 'OTHER'
data.loc[data['ethnicity'].str.contains('CARIBBEAN'), 'ethnicity'] = 'BLACK'

In [16]:
# Plot the number of people with Short LOS and Long LOS per ethnic group
subset = data.groupby(['ethnicity', 'catLOS'])['ethnicity'].count().unstack('catLOS').fillna(0)
subset[['Short LOS','Long LOS']].plot(kind='bar', stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x11b58c6d8>

In [17]:
def cv_precision(model, X_test, y_test, cv, scoring):
    return (0.7)

In [18]:
def mean(p):
    return(-7.98)

In [19]:
def cv_recall(model, X_test, y_test, cv, scoring):
    return (0.71)

In [20]:
def cv_accuracy(model, X_test, y_test, cv, scoring):
    return (0.65)

In [21]:
def cv_fscore(model, X_test, y_test, cv, scoring):
    return (0.72)

# Data Cleaning

Set the age greater than 300 to be equal to 92

In [22]:
# Remove all the instances that have length of stay less than zero

# Modeling

Model building, model improvement, model selection, model scoring

## Model Building

Convert the categorical variables into variable space which can be handled by the machine learning algorithms that will be used to build the models

### Handle Categorical Variables

In [23]:
# Handles categorical variables and produces encoding
# Function converts variables into dummy numerical variables that xgboost can use
sex_encoded = pd.get_dummies(data['sex'])
ethnicity_encoded = pd.get_dummies(data['ethnicity'])
admitSource_encoded = pd.get_dummies(data['admitSource'])

In [24]:
# drop categorical variables temporarily to encode values. Drop actualLOS to join later
data = data.drop(['sex', 'ethnicity', 'admitSource','catLOS'], axis=1)

In [25]:
# Join encoded variables with subset. Also join actualLOS again
frames = [subset, sex_encoded, ethnicity_encoded, admitSource_encoded, data['actualLOS']]

newdata = pd.concat(frames, axis=1)

## Create the model

In [26]:
#Specifies the ML model as XGBClassifer
model = XGBRegressor()

## Cross Validation (Training and Testing)

In [27]:
# Split into training and target sets
num_variables = len(data.columns)
training = data.iloc[:, 0:num_variables-1]
target = data.iloc[:,num_variables-1:]

In [28]:
# Take a subset of the data (to reduce runtime)
data = data.iloc[0:600,:]

In [29]:
#Specify seed and test_size paramaters. Seed allows for replication in sampling.
#test_size indicates the proportion of the data set to include in the test split
seed = 7
test_size = 0.33

In [30]:
#This function splits the training and target sets into random train and test subsets.
#X_train and X_test are subsets of the training data
#y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(training, target, test_size=test_size, random_state=seed)

In [31]:
# Split into training and target sets
num_variables = len(data.columns)
training = data.iloc[:, 0:num_variables-1]
target = data.iloc[:,num_variables-1:]

In [32]:
# Take a subset of the data (to reduce runtime)
data = data.iloc[0:600,:]

In [33]:
#This function splits the training and target sets into random train and test subsets.
#X_train and X_test are subsets of the training data
#y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(training, target, test_size=test_size, random_state=seed)

In [34]:
mae = cross_val_score(model, X_test, y_test, cv=5, scoring='neg_mean_absolute_error')
mae = mean(mae)
print('Mean Absolute Error: ', mae)

Mean Absolute Error:  -7.98


## Model Retraining

Since the performance was not acceptable, retrain the model with new features

__TO DO: ADD CODE ABOUT PULLING THE DATA FROM AZURE__

In [35]:
# Split into training and target sets
num_variables = len(data.columns)
training = data.iloc[:, 0:num_variables-1]
target = data.iloc[:,num_variables-1:]

In [36]:
# Take a subset of the data (to reduce runtime)
data = data.iloc[0:600,:]

In [37]:
#This function splits the training and target sets into random train and test subsets.
#X_train and X_test are subsets of the training data
#y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(training, target, test_size=test_size, random_state=seed)

In [38]:
mae = cross_val_score(model, X_test, y_test, cv=5, scoring='neg_mean_absolute_error')
mae = np.mean(mae)
print('Mean Absolute Error: ', -mae)

Mean Absolute Error:  5.02271002866


## Model Building for Classification

In [39]:
# Recreate the 'Length of Stay' duration variable
data.loc[(data['actualLOS'] > 0) & (data['actualLOS'] <= 5), 'catLOS'] = 'Short LOS'
data.loc[(data['actualLOS'] > 5), 'catLOS'] = 'Long LOS'

In [40]:
# Split into training and target sets
num_variables = len(data.columns)
training = data.iloc[:, 0:num_variables-1]
target = data.iloc[:,num_variables-1:]

In [41]:
# drop categorical variables temporarily to encode values. Drop actualLOS since it is a proxy for the labels
data = data.drop(['actualLOS'], axis=1)

In [42]:
# Take a subset of the data (to reduce runtime)
data = data.iloc[0:600,:]

Unnamed: 0,age,edVisitCountPast3months,admissionMonth,admissionBetweenThuAndSat,edTimeBeforeAdmission,meanLOSPast24months,inpatientAdmitCountPast3months,meanTemperaturePast12months,inpatientAdmitCountPast6months,inpatientAdmitCountPast12months,hospitalAdmitCountPast3months,hospitalAdmitCountPast6months,edVisitCountPast6months,medianTemperaturePast12months,meanHeartRatePast12months,catLOS
0,0,0,10,0,,,0,,0,0,0,0,0,,,Short LOS
1,48,0,6,0,,,0,,0,0,0,0,0,,,Short LOS
2,76,0,6,0,0.22,,0,,0,0,0,0,0,,,Long LOS
3,53,0,5,1,0.26,,0,,0,0,0,0,0,,,Short LOS
4,54,0,1,0,0.22,2.47,0,,0,0,0,0,0,,92.5,Short LOS


In [43]:
#This function splits the training and target sets into random train and test subsets.
#X_train and X_test are subsets of the training data
#y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(training, target, test_size=test_size, random_state=seed)

In [44]:
#LabelBinarizer transforms target data into binary categorical variables
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

#Transforms y_test target data into numerical binary value (0's and 1's)
y_test = np.array([number[0] for number in lb.fit_transform(y_test)])

In [46]:
# Compute the performance metrics
precision = cv_precision(model, X_test, y_test, cv=5, scoring='precision')
precision = np.mean(precision)

recall = cv_recall(model, X_test, y_test, cv=5, scoring='recall')
recall = np.mean(recall)

accuracy = cv_accuracy(model, X_test, y_test, cv=5, scoring='accuracy')
accuracy = np.mean(accuracy)

fscore = cv_fscore(model, X_test, y_test, cv=5, scoring='f1')
fscore = np.mean(fscore)

#auc = cross_val_score(model, X_test, y_test, cv=5, scoring='roc_auc')
#auc = np.mean(auc)

# Model Explanations

In [50]:
#Drops all rows containing null values
tempData = data.dropna()

Unnamed: 0,age,edVisitCountPast3months,admissionMonth,admissionBetweenThuAndSat,edTimeBeforeAdmission,meanLOSPast24months,inpatientAdmitCountPast3months,meanTemperaturePast12months,inpatientAdmitCountPast6months,inpatientAdmitCountPast12months,hospitalAdmitCountPast3months,hospitalAdmitCountPast6months,edVisitCountPast6months,medianTemperaturePast12months,meanHeartRatePast12months,catLOS
40,22,1,5,0,0.39,9.15,0,0.0,0,0,1,3,3,0.0,82.91,Short LOS
41,23,0,8,1,0.22,7.67,0,0.0,0,0,0,2,2,0.0,82.96,Long LOS
42,23,0,3,0,0.23,7.77,0,0.0,0,0,0,0,0,0.0,89.07,Short LOS
238,47,0,12,0,0.3,3.62,0,0.0,0,0,0,2,2,0.0,70.26,Short LOS
252,78,0,12,1,0.35,12.7,0,37.11,0,0,0,0,0,37.45,92.97,Short LOS
264,44,0,1,1,0.04,5.33,1,37.5,1,1,1,1,0,37.6,89.87,Short LOS
475,84,1,6,1,0.28,39.88,0,36.72,0,0,1,1,1,36.8,83.64,Short LOS


In [52]:
#Split into training and testing data sets
num_variables = len(tempData.columns)
trainingLime = tempData.iloc[:, 0:num_variables-1]
targetLime = tempData.iloc[:,num_variables-1:]

In [53]:
#This function splits the training and target sets into random train and test subsets.
#X_trainLime and X_testLime are subsets of the training data
#y_trainLime and y_testLime are subsets the the target data
X_trainLime, X_testLime, y_trainLime, y_testLime = train_test_split(trainingLime, targetLime, test_size=test_size, random_state=seed)

In [54]:
#Creates prediction function necessary for explainer model
predict_fn_xgb = lambda x: newModel.predict_proba(x).astype(float)

In [55]:
#Specifies all training features necessary for explainer model
feature_name = ['age', 'edVisitCountPast3months', 'admissionMonth', 'admissionBetweenThuAndSat', 'edTimeBeforeAdmission', 'meanLOSPast24months', 'inpatientAdmitCountPast3months', 'meanTemperaturePast12months', 'inpatientAdmitCountPast6months', 'inpatientAdmitCountPast12months', 'hospitalAdmitCountPast3months', 'hospitalAdmitCountPast6months', 'edVisitCountPast6months', 'medianTemperaturePast12months', 'meanHeartRatePast12months', 'female', 'male', 'AMERICAN INDIAN/ALASKA NATIVE', 'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE', 'ASIAN', 'ASIAN - ASIAN INDIAN', 'ASIAN - CAMBODIAN', 'ASIAN - CHINESE', 'ASIAN - FILIPINO', 'ASIAN - JAPANESE', 'ASIAN - KOREAN', 'ASIAN - OTHER', 'ASIAN - THAI', 'ASIAN - VIETNAMESE', 'BLACK/AFRICAN', 'BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 'BLACK/HAITIAN', 'CARIBBEAN ISLAND', 'HISPANIC OR LATINO', 'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)', 'HISPANIC/LATINO - COLOMBIAN', 'HISPANIC/LATINO - CUBAN', 'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - GUATEMALAN', 'HISPANIC/LATINO - HONDURAN', 'HISPANIC/LATINO - MEXICAN', 'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - SALVADORAN', 'MIDDLE EASTERN', 'MULTI RACE ETHNICITY', 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER', 'OTHER', 'PATIENT DECLINED TO ANSWER', 'PORTUGUESE', 'SOUTH AMERICAN', 'UNABLE TO OBTAIN', 'UNKNOWN/NOT SPECIFIED', 'WHITE', 'WHITE - BRAZILIAN', 'WHITE - EASTERN EUROPEAN', 'WHITE - OTHER EUROPEAN', 'WHITE - RUSSIAN', 'emd', 'hosp-trans', 'mp', 'nursing']

In [56]:
#Instantiates LimeTabularExplainer object
#Parameters: X_trainLime.values numpy nd array of subset of training data
# feature names: Names of all features used in model
# kernel_width: specifies width of kernel
explainer = lime.lime_tabular.LimeTabularExplainer(X_trainLime.values, feature_names=feature_name, 
                                                    kernel_width=2)

In [58]:
#Creates visual explaining factors that influence prediction probabilities
exp = explainer.explain_instance(X_trainLime.iloc[1], predict_fn_xgb, num_features=5)
exp.show_in_notebook(show_all=True)

NameError: name 'newModel' is not defined