## Predicting Frequent Emergency Department Visits
## CLT Hack-A-Thon 2019
## Team Black Falcons
## Authors: Rishi Hemwani, Vinay Palgiri, Varun Pappuri
## Date: 03/23/2019

## Dataset Used: MIMIC III
## Attributes Used: 
i. Patient Demographics (Gender, Language, Religion, Ethinicity, Marital Status, DOB)


ii. Admissions (Admission Date, Discharge Time, Admission Type, Drug Type) 

In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.metrics import confusion_matrix

## Load the MIMIC data 

In [2]:
mimic_data = pd.read_csv("C:\\Users\\rvh700\\Desktop\\Hack2019\\final_MIMIC.csv", low_memory= False)

## Replace NULL values with 'Others' for categorical variables

In [4]:
mimic_data = mimic_data.fillna('Others')
mimic_data['target'] = 0

## Generate the target variable using Admission Location
i. if Admission Location is either 'EMERGENCY ROOM ADMIT' or 'TRANSFER FROM HOSP/EXTRAM' or 'TRANSFER FROM OTHER HEALT' or 'TRANSFER FROM SKILLED NUR' then target variable = 1


ii. for all other Admission Location, target variable = 0

In [5]:
for i in range(0,len(mimic_data)):
    if((mimic_data['ADMISSION_LOCATION'].iloc[i]) == 'EMERGENCY ROOM ADMIT' or (mimic_data['ADMISSION_LOCATION'].iloc[i]) == 'TRANSFER FROM HOSP/EXTRAM' or (mimic_data['ADMISSION_LOCATION'].iloc[i]) == 'TRANSFER FROM OTHER HEALT' or (mimic_data['ADMISSION_LOCATION'].iloc[i]) == 'TRANSFER FROM SKILLED NUR'):
        mimic_data['target'].iloc[i] = 1
    else:
        mimic_data['target'].iloc[i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Drop all the date columns

In [6]:
mimic_data = mimic_data.drop(columns = ['DISCHTIME','DOB','ADMISSION_TYPE','ADMISSION_LOCATION'])

In [7]:
mimic_data

Unnamed: 0,SUBJECT_ID,GENDER,LANGUAGE,ETHNICITY,INSURANCE,MARITAL_STATUS,RELIGION,DRUG_TYPE,target
0,2,M,Others,ASIAN,Private,Others,NOT SPECIFIED,4,0
1,4,F,Others,WHITE,Private,SINGLE,PROTESTANT QUAKER,59,1
2,6,F,ENGL,WHITE,Medicare,MARRIED,NOT SPECIFIED,148,0
3,8,M,Others,WHITE,Private,Others,CATHOLIC,4,0
4,9,M,Others,UNKNOWN/NOT SPECIFIED,Medicaid,Others,UNOBTAINABLE,68,1
5,10,F,Others,BLACK/AFRICAN AMERICAN,Medicaid,Others,UNOBTAINABLE,10,0
6,11,F,Others,WHITE,Private,MARRIED,OTHER,91,1
7,12,M,Others,WHITE,Medicare,MARRIED,JEWISH,95,0
8,13,F,Others,WHITE,Medicaid,Others,OTHER,84,1
9,17,F,ENGL,WHITE,Private,MARRIED,CATHOLIC,118,0


## Generate dummy variables for Categorical Variables

In [8]:
processed_data = pd.get_dummies(mimic_data)
x = processed_data.drop(columns=['SUBJECT_ID','target'])

In [9]:
x

Unnamed: 0,DRUG_TYPE,GENDER_F,GENDER_M,LANGUAGE_*CHI,LANGUAGE_*DEA,LANGUAGE_*MAN,LANGUAGE_ALBA,LANGUAGE_CAMB,LANGUAGE_CANT,LANGUAGE_CAPE,...,RELIGION_GREEK ORTHODOX,RELIGION_JEHOVAH'S WITNESS,RELIGION_JEWISH,RELIGION_MUSLIM,RELIGION_NOT SPECIFIED,RELIGION_OTHER,RELIGION_Others,RELIGION_PROTESTANT QUAKER,RELIGION_UNITARIAN-UNIVERSALIST,RELIGION_UNOBTAINABLE
0,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,59,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,148,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,68,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,10,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,91,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,95,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,84,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,118,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split the data into Train and Validation data set (70:30)

In [10]:
y  = processed_data['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

In [11]:
x_train

Unnamed: 0,DRUG_TYPE,GENDER_F,GENDER_M,LANGUAGE_*CHI,LANGUAGE_*DEA,LANGUAGE_*MAN,LANGUAGE_ALBA,LANGUAGE_CAMB,LANGUAGE_CANT,LANGUAGE_CAPE,...,RELIGION_GREEK ORTHODOX,RELIGION_JEHOVAH'S WITNESS,RELIGION_JEWISH,RELIGION_MUSLIM,RELIGION_NOT SPECIFIED,RELIGION_OTHER,RELIGION_Others,RELIGION_PROTESTANT QUAKER,RELIGION_UNITARIAN-UNIVERSALIST,RELIGION_UNOBTAINABLE
105,2095,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
68,148,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
479,61,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
399,81,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
434,103,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
258,132,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
827,199,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
884,108,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
304,10,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
811,414,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Logistic Regression Model

In [20]:
thresh = 0.5
log_reg = LogisticRegression()
print(log_reg.fit(x_train,y_train))


y_train_preds = log_reg.predict(x_train)
y_test_preds = log_reg.predict(x_test)

print('Logistic Regression')
print('Training:')
auc = roc_auc_score(y_test, y_test_preds)
accuracy = accuracy_score(y_test, (y_test_preds > thresh))
recall = recall_score(y_test, (y_test_preds > thresh))
precision = precision_score(y_test, (y_test_preds > thresh))
print('AUC: %.3f'%auc)
print('Accuracy: %.3f'%accuracy)
print('Specificity: %.3f'%recall)
print('Precision: %.3f'%precision)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Logistic Regression
Training:
AUC: 0.652
Accuracy: 0.737
Recall: 0.879
Precision: 0.770




## Decision Tree Classifier

In [19]:
thresh = 0.5
tree = DecisionTreeClassifier(max_depth = 10, random_state = 42)
tree.fit(x_train, y_train)
y_train_preds = tree.predict(x_train)
y_valid_preds = tree.predict(x_test)
print('Decision Tree Classifier')
auc_train = roc_auc_score(y_test, y_valid_preds)
accuracy_train = accuracy_score(y_test, (y_valid_preds > thresh))
recall_train = recall_score(y_test, (y_valid_preds > thresh))
precision_train = precision_score(y_test, (y_valid_preds > thresh))
print("\n")
print('Training:')
print('AUC:%.3f'%auc_train)
print('Accuracy: %.3f'%accuracy_train)
print('Specificity: %.3f'%recall_train)
print('Precision: %.3f'%precision_train)

Decision Tree Classifier


Training:
AUC:0.595
Accuracy: 0.667
Recall: 0.786
Precision: 0.743


## Random Forest Classifier

In [16]:
rf = RandomForestClassifier(max_depth = 6, random_state = 42)
(rf.fit(x_train, y_train))
y_train_preds = rf.predict(x_train)
y_test_preds = rf.predict(x_test)
print('Random Forest Classifier')
auc_train = roc_auc_score(y_test, y_test_preds)
accuracy_train = accuracy_score(y_test, (y_test_preds > thresh))
recall_train = recall_score(y_test, (y_test_preds > thresh))
precision_train = precision_score(y_test, (y_test_preds > thresh))
print("\n")
print('Training:')
print('AUC:%.3f'%auc_train)
print('Accuracy: %.3f'%accuracy_train)
print('Specificity: %.3f'%recall_train)
print('Precision: %.3f'%precision_train)
print('Confusion Matrix:')
print(confusion_matrix(y_test,y_test_preds))

Random Forest Classifier


Training:
AUC:0.630
Accuracy: 0.750
Recall: 0.951
Precision: 0.751
Confusion Matrix:
[[ 29  65]
 [ 10 196]]




## Gradient Boosting Classifier

In [15]:
thresh = 0.5
gbc =GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=3, random_state=42)
(gbc.fit(x_train, y_train))
y_train_preds = gbc.predict(x_train)
y_valid_preds = gbc.predict(x_test)

print('Gradient Boosting Classifier')
auc_train = roc_auc_score(y_test, y_valid_preds)
accuracy_train = accuracy_score(y_test, (y_valid_preds > thresh))
recall_train = recall_score(y_test, (y_valid_preds > thresh))
precision_train = precision_score(y_test, (y_valid_preds > thresh))
print("\n")
print('Training:')
print('AUC:%.3f'%auc_train)
print('Accuracy: %.3f'%accuracy_train)
print('Specificity: %.3f'%recall_train)
print('Precision: %.3f'%precision_train)
print(confusion_matrix(y_test,y_valid_preds))

Gradient Boosting Classifier


Training:
AUC:0.629
Accuracy: 0.677
Recall: 0.757
Precision: 0.768
[[ 47  47]
 [ 50 156]]


## Support Vector Machine Classifier

In [17]:
svm = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
(svm.fit(x_train, y_train))
y_train_preds = svm.predict(x_train)
y_valid_preds = svm.predict(x_test)
print('SVM Classifier')
auc_train = roc_auc_score(y_train, y_train_preds)
accuracy_train = accuracy_score(y_test, (y_valid_preds > thresh))
recall_train = recall_score(y_test, (y_valid_preds > thresh))
precision_train = precision_score(y_test, (y_valid_preds > thresh))
print("\n")
print('Training:')
print('AUC:%.3f'%auc_train)
print('Accuracy: %.3f'%accuracy_train)
print('Specificity: %.3f'%recall_train)
print('Precision: %.3f'%precision_train)
print(confusion_matrix(y_test,y_valid_preds))

SVM Classifier


Training:
AUC:0.614
Accuracy: 0.710
Recall: 0.927
Precision: 0.726
[[ 22  72]
 [ 15 191]]


## Multi Layer Perceptron Classifier

In [18]:
mlp =  MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2))
(mlp.fit(x_train,y_train))
y_train_preds = mlp.predict(x_train)
y_test_preds = mlp.predict(x_test)

print('MLP NN Classifier')
auc_train = roc_auc_score(y_test, y_test_preds)
accuracy_train = accuracy_score(y_test, (y_test_preds > thresh))
recall_train = recall_score(y_test, (y_test_preds > thresh))
precision_train = precision_score(y_test, (y_test_preds > thresh))
print("\n")
print('Training:')
print('AUC:%.3f'%auc_train)
print('Accuracy: %.3f'%accuracy_train)
print('Specificity: %.3f'%recall_train)
print('Precision: %.3f'%precision_train)
print(confusion_matrix(y_test,y_test_preds))

MLP NN Classifier


Training:
AUC:0.500
Accuracy: 0.687
Recall: 1.000
Precision: 0.687
[[  0  94]
 [  0 206]]


## Important Features for Random Forest Classifier

In [146]:
important_features = pd.DataFrame(rf.feature_importances_,index = x_train.columns,columns=['importance']).sort_values('importance',ascending=False)

In [147]:
important_features

Unnamed: 0,importance
DRUG_TYPE,0.237751
MARITAL_STATUS_Others,0.133025
INSURANCE_Private,0.123259
MARITAL_STATUS_SINGLE,0.070608
INSURANCE_Medicare,0.070110
GENDER_F,0.031231
ETHNICITY_BLACK/AFRICAN AMERICAN,0.025981
RELIGION_EPISCOPALIAN,0.025633
INSURANCE_Medicaid,0.025159
RELIGION_PROTESTANT QUAKER,0.023768
