In [132]:
#Import packages
import pandas as pd
import numpy as np
import xgboost as xgb
#Functions for xgboost
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn import datasets
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier

from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

In [133]:
#Select appropriate column names as features. Omit encounterID as doesn't influence model
colnames = ['age','edVisitCountPast3months','admissionMonth','admissionBetweenThuAndSat',
            'edTimeBeforeAdmission','sex','ethnicity','admitSource','meanLOSPast24months','inpatientAdmitCountPast3months',
            'meanTemperaturePast12months','inpatientAdmitCountPast6months','inpatientAdmitCountPast12months',
            'hospitalAdmitCountPast3months','hospitalAdmitCountPast6months','edVisitCountPast6months',
            'medianTemperaturePast12months','meanHeartRatePast12months','actualLOS']

In [134]:
#Read CSV
data = pd.read_csv("mimicLOSKenSciFeaturesData.csv")

In [135]:
#Create subset with appropriate columns
subset = data[colnames]

In [136]:
#drop categorical variables temporarily to encode values. Drop actualLOS to join later
subset = subset.drop(['sex', 'ethnicity', 'admitSource', 'actualLOS'], axis=1)

In [137]:
#Print subset head
subset.head()

Unnamed: 0,age,edVisitCountPast3months,admissionMonth,admissionBetweenThuAndSat,edTimeBeforeAdmission,meanLOSPast24months,inpatientAdmitCountPast3months,meanTemperaturePast12months,inpatientAdmitCountPast6months,inpatientAdmitCountPast12months,hospitalAdmitCountPast3months,hospitalAdmitCountPast6months,edVisitCountPast6months,medianTemperaturePast12months,meanHeartRatePast12months
0,0,0,10,0,,,0,,0,0,0,0,0,,
1,48,0,6,0,,,0,,0,0,0,0,0,,
2,76,0,6,0,0.22,,0,,0,0,0,0,0,,
3,53,0,5,1,0.26,,0,,0,0,0,0,0,,
4,54,0,1,0,0.22,2.47,0,,0,0,0,0,0,,92.5


In [138]:
#Handles categorical variables and produces encoding
#Function converts variables into dummy numerical variables that xgboost can use
sex_encoded = pd.get_dummies(data['sex'])
ethnicity_encoded = pd.get_dummies(data['ethnicity'])
admitSource_encoded = pd.get_dummies(data['admitSource'])

In [139]:
#Join encoded variables with subset. Also join actualLOS again
frames = [subset, sex_encoded, ethnicity_encoded, admitSource_encoded, data['actualLOS']]
result = pd.concat(frames, axis=1)
result.head()

Unnamed: 0,age,edVisitCountPast3months,admissionMonth,admissionBetweenThuAndSat,edTimeBeforeAdmission,meanLOSPast24months,inpatientAdmitCountPast3months,meanTemperaturePast12months,inpatientAdmitCountPast6months,inpatientAdmitCountPast12months,...,WHITE - BRAZILIAN,WHITE - EASTERN EUROPEAN,WHITE - OTHER EUROPEAN,WHITE - RUSSIAN,emd,hosp-trans,mp,nursing,other,actualLOS
0,0,0,10,0,,,0,,0,0,...,0,0,0,0,0,0,1,0,0,0.37
1,48,0,6,0,,,0,,0,0,...,0,0,0,0,0,1,0,0,0,3.38
2,76,0,6,0,0.22,,0,,0,0,...,0,0,0,0,1,0,0,0,0,6.16
3,53,0,5,1,0.26,,0,,0,0,...,0,0,0,0,1,0,0,0,0,0.59
4,54,0,1,0,0.22,2.47,0,,0,0,...,0,0,0,0,1,0,0,0,0,1.36


In [140]:
#Drops all rows in which null values in actualLOS column. Prints number of rows after dropping null values
result = result.dropna(subset=['actualLOS'])
len(result)

58868

In [141]:
#Take only top 10% of rows for time sake. CHANGE LATER
result = result.iloc[0:600,:]
len(result)

600

In [142]:
#Split into training and target sets. First 63 columns (features) serve as training data
#Final column (acutalLOS) is target variable

training = result.iloc[:, 0:62]
target = result.iloc[:,63:]
target.head()

Unnamed: 0,actualLOS
0,0.37
1,3.38
2,6.16
3,0.59
4,1.36


In [143]:
#Specify seed and test_size paramaters. Seed allows for replication in sampling.
#test_size indicates the proportion of the data set to include in the test split
seed = 7
test_size = 0.33

In [144]:
#This function splits the training and target sets into random train and test subsets.
#X_train and X_test are subsets of the training data
#y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(training, target, test_size=test_size, random_state=seed)

In [145]:
#Specifies the ML model as XGBClassifer
model = XGBClassifier()


In [146]:
#Fits model to subsets of training and target data. Prints model parameters
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [147]:
#Mean Absolute Error (MAE) of the model
#cross_val_score uses cross validation strategy (CV)
#This strategy divides testing and target data into different folds to test for different scoring measures across
#varying subsets of data.  
#Parameters: X_test and y_test are initial random training and target data for CV strategy to be applied to
# cv specifies number of folds or number of folds of data
# scoring parameter specifies score to be tested.
scores = cross_val_score(model, X_test, y_test, cv=5, scoring='mean_absolute_error')
scores

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  sample_weight=sample_weight)
  if diff:
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  sample_weight=sample_weight)
  if diff:
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  sample_weight=sample_weight)
  if diff:
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  sample_weight=sample_weight)
  if diff:
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  sample_weight=sample_weight)
  if diff:


array([-4.5135    , -3.51725   , -4.15875   , -3.10076923, -4.33897436])

In [148]:
#Makes target prediction based on test data
#Rounds predictions to nearest integer
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

  if diff:


In [149]:
#Evaluates predictions using mean_absolute_error function
#Prints average MAE
MAE = mean_absolute_error(y_test, predictions)
print(MAE)

4.172727272727272


In [150]:
#Average MAE
print(sum(scores)/len(scores) *-1)

3.9258487179487176


In [151]:
#Acutal LOS Mean (for top 600 rows)
print(sum(result['actualLOS'])/len(result['actualLOS']))

5.366349999999997


In [152]:
# Experimenting with eli5 explainer model
# from eli5 import show_weights
# vec = DictVectorizer()

# show_weights(model, vec=vec)

In [153]:
# #Eli5 Prediction
# from eli5 import show_prediction
# show_prediction(model, X_test.iloc[1], vec=vec, show_feature_values=True)


In [154]:
#X_test.iloc[1]

In [155]:
#accuracy = accuracy_score(y_test, predictions, normalize=False)