## Modeling for predicting triage acuity at ED triage

Workflow:
1. Task-specific filter
2. Variable selection
3. Modeling script
4. Performance output


### Load train and test

In [1]:
import os
import time
import random
# import tensorflow as tf
import numpy as np
import pandas as pd
# from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras import optimizers, metrics
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.models import load_model
# from helpers import PlotROCCurve
from dataset_path import output_path

# path = output_path
# output_path = os.path.join(path, "Figure3")
# Read the pre-split train and test tables from the shared output directory.
df_train, df_test = (
    pd.read_csv(os.path.join(output_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Seed reused for Python/NumPy RNGs and for every estimator's random_state.
random_seed = 0
# Confidence level in percent; not referenced by the cells visible here.
confidence_interval = 95

In [2]:
# Pin both global random number generators so reruns are repeatable.
np.random.seed(random_seed)
random.seed(random_seed)

In [3]:
# Keep only encounters whose disposition is 'HOME' or 'ADMITTED'.
# .copy() makes each filtered frame independent of the frame it was sliced
# from: df_test receives a new 'label_triage' column further down, and
# assigning into a boolean-mask slice can raise pandas' SettingWithCopyWarning.
df_train = df_train.loc[df_train['disposition'].isin(['HOME', 'ADMITTED'])].copy()
df_test = df_test.loc[df_test['disposition'].isin(['HOME', 'ADMITTED'])].copy()

In [None]:
# Raise the display limits so wide/long frames are not truncated in output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Preview the first five training rows (must stay the last expression so the
# notebook renders it).
df_train.head(5)

### 1. Task-specific filter: No task-specific exclusion needed

In [None]:
# Report the number of rows left in each split after filtering.
print('training size =', df_train.shape[0], ', testing size =', df_test.shape[0])

### 2. Variable Selection

In [6]:
# Predictor columns, assembled from three groups (order is preserved).
variable = (
    # Demographics
    ["age", "gender"]
    # Vital signs and pain score recorded at triage
    + [
        "triage_temperature",
        "triage_heartrate",
        "triage_resprate",
        "triage_o2sat",
        "triage_sbp",
        "triage_dbp",
        "triage_pain",
    ]
    # Chief-complaint columns
    + [
        "chiefcom_chest_pain",
        "chiefcom_abdominal_pain",
        "chiefcom_headache",
        "chiefcom_shortness_of_breath",
        "chiefcom_back_pain",
        "chiefcom_cough",
        "chiefcom_nausea_vomiting",
        "chiefcom_fever_chills",
        "chiefcom_syncope",
        "chiefcom_dizziness",
    ]
)

# Target column the models are trained to predict.
outcome = "triage_acuity"

In [7]:
# Split each frame into a feature matrix and a target vector. Copies are taken
# so later in-place edits (e.g. encoding 'gender') leave df_train/df_test intact.
X_train, y_train = df_train[variable].copy(), df_train[outcome].copy()
X_test, y_test = df_test[variable].copy(), df_test[outcome].copy()

In [None]:
# Show the dtype of every feature as a single wide row.
X_train.dtypes.to_frame().transpose()

In [9]:
# Learn the gender -> integer mapping from the training split only, then apply
# the same mapping to both splits.
encoder = LabelEncoder().fit(X_train['gender'])
X_train['gender'] = encoder.transform(X_train['gender'])
X_test['gender'] = encoder.transform(X_test['gender'])

In [10]:
# print('class ratio')
# ratio = y_train.sum()/(~y_train).sum()
# print('positive : negative =', ratio, ': 1')

### 3.  Modeling script

In [11]:
def compare_labels(true_labels, predicted_labels):
    """
    Compare predicted labels against true labels element-wise.

    Parameters:
    - true_labels (array-like): Array of true labels.
    - predicted_labels (array-like): Array of predicted labels; must be
      comparable with (and broadcastable against) true_labels.

    Returns:
    - result (numpy array): Signed integer array with the shape of
                            true_labels, containing 0 if labels are the same,
                            1 if the predicted label is greater than the true label,
                            and -1 if the predicted label is less than the true label.
    """
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)

    # Allocate an explicitly signed integer result. Inheriting the dtype of
    # true_labels (as np.zeros_like would) cannot represent -1 for unsigned or
    # boolean labels and would silently return floats for float labels.
    result = np.zeros(true_labels.shape, dtype=np.int64)

    # Positions matching neither mask keep the initial 0 (labels agree).
    result[predicted_labels > true_labels] = 1
    result[predicted_labels < true_labels] = -1

    return result


In [None]:
print("Logistic Regression:")
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

# Library-default hyperparameters; only the seed is pinned.
logreg = LogisticRegression(random_state=random_seed)

# Time the fit call alone.
start = time.time()
logreg.fit(X_train, y_train)
runtime = time.time() - start
print('Training time:', runtime, 'seconds')

# Per-class probabilities for every test row.
probs = logreg.predict_proba(X_test)

In [None]:
# Hard class predictions and overall accuracy on the test split.
pred_label = logreg.predict(X_test)
accuracy = accuracy_score(y_test, pred_label)
print(f"Model Accuracy: {accuracy}")

# Per-class probabilities; column i corresponds to logreg.classes_[i].
y_prob = logreg.predict_proba(X_test)

# Compute macro-averaged multiclass AUROC under both schemes.
# The raw labels are passed (not a one-hot matrix): with a one-hot y_true
# roc_auc_score takes its multilabel path and ignores `multi_class`, so the
# "ovo" figure would just repeat the one-vs-rest value. `labels` ties the
# probability columns to the classes the model was actually fitted on instead
# of a hard-coded 1..5 range.
auc_ovr = roc_auc_score(y_test, y_prob, multi_class='ovr', labels=logreg.classes_)
auc_ovo = roc_auc_score(y_test, y_prob, multi_class='ovo', labels=logreg.classes_)

print(f"One-vs-Rest AUROC: {auc_ovr:.2f}")
print(f"One-vs-One AUROC: {auc_ovo:.2f}")

In [14]:
# Per-row comparison of predicted vs. recorded outcome:
# 0 = match, 1 = predicted value higher, -1 = predicted value lower.
# NOTE: the variable keeps its existing name although the outcome compared
# here is triage acuity, not hospitalization.
label_hospitalization = compare_labels(y_test, pred_label)
df_test['label_triage'] = label_hospitalization

# Build the path with os.path.join, as the input CSV paths are built.
df_test.to_csv(os.path.join(output_path, 'results_triage_lr.csv'), index=False)

In [None]:
print("RandomForest:")
from sklearn.ensemble import RandomForestClassifier

# Library-default hyperparameters; only the seed is pinned.
rf = RandomForestClassifier(random_state=random_seed)

# Time the fit call alone.
start = time.time()
rf.fit(X_train, y_train)
runtime = time.time() - start
print('Training time:', runtime, 'seconds')

# Importance scores follow the column order of X_train.
importances = rf.feature_importances_
# Per-class probabilities for every test row.
probs = rf.predict_proba(X_test)
print(importances)

In [None]:
# Hard class predictions and overall accuracy on the test split.
pred_label = rf.predict(X_test)
accuracy = accuracy_score(y_test, pred_label)
print(f"Model Accuracy: {accuracy}")

# Per-class probabilities; column i corresponds to rf.classes_[i].
y_prob = rf.predict_proba(X_test)

# Compute macro-averaged multiclass AUROC under both schemes.
# The raw labels are passed (not a one-hot matrix): with a one-hot y_true
# roc_auc_score takes its multilabel path and ignores `multi_class`, so the
# "ovo" figure would just repeat the one-vs-rest value. `labels` ties the
# probability columns to the classes the model was actually fitted on instead
# of a hard-coded 1..5 range.
auc_ovr = roc_auc_score(y_test, y_prob, multi_class='ovr', labels=rf.classes_)
auc_ovo = roc_auc_score(y_test, y_prob, multi_class='ovo', labels=rf.classes_)

print(f"One-vs-Rest AUROC: {auc_ovr:.2f}")
print(f"One-vs-One AUROC: {auc_ovo:.2f}")

In [17]:
# Per-row comparison of predicted vs. recorded outcome:
# 0 = match, 1 = predicted value higher, -1 = predicted value lower.
# NOTE: the variable keeps its existing name although the outcome compared
# here is triage acuity, not hospitalization.
label_hospitalization = compare_labels(y_test, pred_label)
df_test['label_triage'] = label_hospitalization

# Build the path with os.path.join, as the input CSV paths are built.
df_test.to_csv(os.path.join(output_path, 'results_triage_rf.csv'), index=False)

In [None]:
print("GradientBoosting:")
from sklearn.ensemble import GradientBoostingClassifier

# Library-default hyperparameters; only the seed is pinned.
gb = GradientBoostingClassifier(random_state=random_seed)

# Time the fit call alone.
start = time.time()
gb.fit(X_train, y_train)
runtime = time.time() - start
print('Training time:', runtime, 'seconds')

# Per-class probabilities for every test row.
probs = gb.predict_proba(X_test)

In [None]:
# Hard class predictions and overall accuracy on the test split.
pred_label = gb.predict(X_test)
accuracy = accuracy_score(y_test, pred_label)
print(f"Model Accuracy: {accuracy}")

# Per-class probabilities; column i corresponds to gb.classes_[i].
y_prob = gb.predict_proba(X_test)

# Compute macro-averaged multiclass AUROC under both schemes.
# The raw labels are passed (not a one-hot matrix): with a one-hot y_true
# roc_auc_score takes its multilabel path and ignores `multi_class`, so the
# "ovo" figure would just repeat the one-vs-rest value. `labels` ties the
# probability columns to the classes the model was actually fitted on instead
# of a hard-coded 1..5 range.
auc_ovr = roc_auc_score(y_test, y_prob, multi_class='ovr', labels=gb.classes_)
auc_ovo = roc_auc_score(y_test, y_prob, multi_class='ovo', labels=gb.classes_)

print(f"One-vs-Rest AUROC: {auc_ovr:.2f}")
print(f"One-vs-One AUROC: {auc_ovo:.2f}")

In [20]:
# Per-row comparison of predicted vs. recorded outcome:
# 0 = match, 1 = predicted value higher, -1 = predicted value lower.
# NOTE: the variable keeps its existing name although the outcome compared
# here is triage acuity, not hospitalization.
label_hospitalization = compare_labels(y_test, pred_label)
df_test['label_triage'] = label_hospitalization

# Build the path with os.path.join, as the input CSV paths are built.
df_test.to_csv(os.path.join(output_path, 'results_triage_gb.csv'), index=False)