In [17]:
import pandas as pd
from numpy import loadtxt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

In [18]:
df_clean = pd.read_csv('data/cleaned.csv')
df_clean['label']  = df_clean["LOS"].apply(lambda x : 0 if x > 6 else 1)

# Target Variable (Length-of-Stay)
ctarget = df_clean['label'].values
# Prediction Features
cfeature = df_clean.drop(columns=['LOS', 'label'])

In [19]:
X_train, X_test, y_train, y_test = train_test_split(cfeature, 
                                                    ctarget, 
                                                    test_size = .20, 
                                                    random_state = 0)



## LogisticRegression

In [20]:
# LogisticRegression
clf = LogisticRegression().fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.71      0.73      0.72      5701
           1       0.68      0.66      0.67      4920

    accuracy                           0.70     10621
   macro avg       0.70      0.70      0.70     10621
weighted avg       0.70      0.70      0.70     10621



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc



0.6959625183429522

## KNeighborsClassifier

In [22]:
# KNeighborsClassifier
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)
predictions = neigh.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68      5701
           1       0.63      0.69      0.66      4920

    accuracy                           0.67     10621
   macro avg       0.67      0.67      0.67     10621
weighted avg       0.67      0.67      0.67     10621



In [23]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc



0.6683428809380183

## DecisionTreeClassifier


In [24]:


clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      5701
           1       0.61      0.61      0.61      4920

    accuracy                           0.64     10621
   macro avg       0.64      0.64      0.64     10621
weighted avg       0.64      0.64      0.64     10621



In [25]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc



0.6351347930686816

## RandomForestClassifier


In [26]:

clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.70      0.78      0.74      5701
           1       0.71      0.61      0.66      4920

    accuracy                           0.70     10621
   macro avg       0.70      0.70      0.70     10621
weighted avg       0.70      0.70      0.70     10621



In [27]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc



0.6977470433799234

## GradientBoosting

In [28]:
# GradientBoostingClassifier
clf = GradientBoostingClassifier(random_state=0)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.71      0.76      0.74      5701
           1       0.70      0.65      0.67      4920

    accuracy                           0.71     10621
   macro avg       0.71      0.70      0.70     10621
weighted avg       0.71      0.71      0.71     10621



In [29]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc


0.7036776460555343