## Modeling for predicting 72h ED revisit at ED discharge

Flow:
1. Task-specific filter
2. Variable selection
3. Modeling script
4. Performance output


### Load train and test

In [1]:
import os
import time
import random
# import tensorflow as tf
import numpy as np
import pandas as pd
# from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras import optimizers, metrics
# from tensorflow.keras.layers import Dense, LSTM
# from tensorflow.keras.layers import concatenate
# from tensorflow.keras.models import load_model
# from helpers import PlotROCCurve, get_lstm_data_gen
from dataset_path import output_path

# path = output_path
# output_path = os.path.join(path, "Figure3")
# if not os.path.exists(output_path):
#     os.makedirs(output_path)
# Run-wide settings. `random_seed` is reused below for RNG seeding and for
# every model's random_state; `confidence_interval` is not used in the cells
# visible here.
random_seed = 0
confidence_interval = 95

# Load the train and test tables from `output_path`.
train_csv = os.path.join(output_path, 'train.csv')
test_csv = os.path.join(output_path, 'test.csv')
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [2]:
# Seed both the stdlib and NumPy global RNGs with the shared seed.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(random_seed)

In [3]:
# Keep only visits whose disposition is HOME or ADMITTED, in both splits.
kept_dispositions = ['HOME', 'ADMITTED']
train_disposition_mask = df_train['disposition'].isin(kept_dispositions)
test_disposition_mask = df_test['disposition'].isin(kept_dispositions)
df_train = df_train[train_disposition_mask]
df_test = df_test[test_disposition_mask]

In [None]:
# Raise the notebook display limits to 100 columns and 100 rows, then preview
# the first rows of the training table.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)
df_train.head()

### 1. Task-specific filter: exclude hospitalized cases from train and test

In [None]:
# Report the split sizes, drop every visit flagged as hospitalized, and report
# the sizes again. The original row index is kept (no reset_index).
print('Before filtering: training size =', len(df_train), ', testing size =', len(df_test))

train_not_hospitalized = df_train['outcome_hospitalization'].eq(False)
test_not_hospitalized = df_test['outcome_hospitalization'].eq(False)
df_train = df_train[train_not_hospitalized]
df_test = df_test[test_not_hospitalized]

print('After filtering: training size =', len(df_train), ', testing size =', len(df_test))

### 2. Variable Selection

In [6]:
# Predictor columns, assembled group by group. The concatenation order below
# is the column order of the feature matrices built from `variable`.
demographic_vars = ["age", "gender"]

# n_<ed|hosp|icu>_<30|90|365>d count columns.
utilization_vars = [
    f"n_{setting}_{window}d"
    for setting in ("ed", "hosp", "icu")
    for window in (30, 90, 365)
]

triage_vars = ["triage_pain", "triage_acuity"]

# chiefcom_* columns.
chief_complaint_vars = [
    f"chiefcom_{name}"
    for name in (
        "chest_pain", "abdominal_pain", "headache", "shortness_of_breath",
        "back_pain", "cough", "nausea_vomiting", "fever_chills", "syncope",
        "dizziness",
    )
]

# cci_* columns (presumably Charlson comorbidity flags -- confirm upstream).
cci_vars = [
    f"cci_{name}"
    for name in (
        "MI", "CHF", "PVD", "Stroke", "Dementia", "Pulmonary", "Rheumatic",
        "PUD", "Liver1", "DM1", "DM2", "Paralysis", "Renal", "Cancer1",
        "Liver2", "Cancer2", "HIV",
    )
]

# eci_* columns (presumably Elixhauser comorbidity flags -- confirm upstream).
eci_vars = [
    f"eci_{name}"
    for name in (
        "Arrhythmia", "Valvular", "PHTN", "HTN1", "HTN2", "NeuroOther",
        "Hypothyroid", "Lymphoma", "Coagulopathy", "Obesity", "WeightLoss",
        "FluidsLytes", "BloodLoss", "Anemia", "Alcohol", "Drugs", "Psychoses",
        "Depression",
    )
]

# ed_<measure>_last columns, followed by the remaining ED-stay columns.
ed_stay_vars = [
    f"ed_{measure}_last"
    for measure in ("temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp")
] + ["ed_los_hours", "n_med", "n_medrecon"]

variable = (
    demographic_vars
    + utilization_vars
    + triage_vars
    + chief_complaint_vars
    + cci_vars
    + eci_vars
    + ed_stay_vars
)

# Name of the target column.
outcome = "outcome_ed_revisit_3d"

In [7]:
# Pull the feature matrix and the outcome vector out of each table. They are
# copies, so later edits to them (such as encoding `gender`) do not write back
# into df_train / df_test.
X_train, y_train = df_train[variable].copy(), df_train[outcome].copy()
X_test, y_test = df_test[variable].copy(), df_test[outcome].copy()

In [None]:
# One-row overview: the dtype of every selected feature column.
X_train.dtypes.to_frame().transpose()

In [9]:
# Learn the integer coding of `gender` from the training split only, then
# apply that same mapping to both splits.
encoder = LabelEncoder()
encoder.fit(X_train['gender'])
X_train['gender'] = encoder.transform(X_train['gender'])
X_test['gender'] = encoder.transform(X_test['gender'])

In [None]:
print('class ratio')
# Count each class by explicit comparison instead of `~y_train`. On a boolean
# Series `~` is a logical NOT, but on an integer 0/1 Series it is a bitwise
# complement (0 -> -1, 1 -> -2), which would make the negative count wrong.
n_positive = (y_train == 1).sum()
n_negative = (y_train == 0).sum()
ratio = n_positive/n_negative
print('positive : negative =', ratio, ': 1')

### 3. Modeling script

In [11]:
def compare_labels(true_labels, predicted_labels):
    """
    Encode each (true, predicted) label pair as a confusion-matrix cell code.

    Parameters:
    - true_labels (array-like): Iterable of true binary labels (0/1 or bool).
    - predicted_labels (array-like): Iterable of predicted binary labels (0/1 or bool).

    Returns:
    - results (list of int): One code per pair, in input order:
                            1 if true is 0 and predicted is 0 (true negative),
                            2 if true is 0 and predicted is 1 (false positive),
                            3 if true is 1 and predicted is 0 (false negative),
                            4 if true is 1 and predicted is 1 (true positive).

    Raises:
    - ValueError: If any label in a pair is neither 0 nor 1.

    Note: the inputs are paired with zip(), so when their lengths differ the
    surplus elements of the longer input are ignored.
    """
    # Keys compare equal to bools and to numeric 0/1 values, so True/False and
    # 0.0/1.0 labels resolve to the same cells as plain ints.
    cell_codes = {(0, 0): 1, (0, 1): 2, (1, 0): 3, (1, 1): 4}

    results = []
    for label_pair in zip(true_labels, predicted_labels):
        try:
            results.append(cell_codes[label_pair])
        except (KeyError, TypeError):
            # KeyError: a value other than 0/1; TypeError: an unhashable label.
            raise ValueError("Labels must be either 0 or 1.") from None
    return results


In [None]:
# Fit a logistic regression on the training split and time the fit.
print("Logistic Regression:")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Only random_state is set; every other hyperparameter is left at its default.
# NOTE(review): no feature scaling is applied in the visible cells -- check the
# fit for convergence warnings.
logreg = LogisticRegression(random_state=random_seed)
# The timer brackets fit() only, so `runtime` excludes prediction time.
start = time.time()
logreg.fit(X_train,y_train)
runtime = time.time()-start
print('Training time:', runtime, 'seconds')
# Class probabilities for the test split. The evaluation cell that follows
# recomputes them as `y_prob` rather than reading `probs`.
probs = logreg.predict_proba(X_test)

In [None]:
# Score the fitted logistic regression on the test split.
pred_label = logreg.predict(X_test)        # hard class predictions
y_prob = logreg.predict_proba(X_test)      # per-class probabilities

accuracy = accuracy_score(y_true=y_test, y_pred=pred_label)
# AUROC uses the second probability column (index 1).
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])

print(f"Model Accuracy: {accuracy}")
print(f"AUROC: {roc_auc:.3f}")

In [14]:
# Tag every test row with its confusion-matrix cell for the logistic
# regression predictions (compare_labels codes: 1 = TN, 2 = FP, 3 = FN,
# 4 = TP) and export the annotated test table.
label_revisit = compare_labels(y_test, pred_label)
# df_test is the result of boolean filtering, so adding a column in place can
# raise pandas' SettingWithCopyWarning; assign() returns a fresh frame with the
# column added (or replaced) instead. Values are matched to rows by position.
df_test = df_test.assign(label_revisit=label_revisit)

df_test.to_csv(f'{output_path}/results_revisit_lr.csv', index=False)

In [None]:
# Fit a random forest on the training split and time the fit.
print("RandomForest:")
from sklearn.ensemble import RandomForestClassifier

# Only random_state is set; every other hyperparameter is left at its default.
rf = RandomForestClassifier(random_state=random_seed)
# The timer brackets fit() only, so `runtime` excludes prediction time.
start = time.time()
rf.fit(X_train,y_train)
runtime = time.time()-start
print('Training time:', runtime, 'seconds')
# Class probabilities for the test split. The evaluation cell that follows
# recomputes them as `y_prob` rather than reading `probs`.
probs = rf.predict_proba(X_test)
# Printed as a raw array with no feature names attached; presumably one value
# per X_train column, in `variable` order -- confirm before interpreting.
importances = rf.feature_importances_
print(importances)

In [None]:
# Score the fitted random forest on the test split.
pred_label = rf.predict(X_test)        # hard class predictions
y_prob = rf.predict_proba(X_test)      # per-class probabilities

accuracy = accuracy_score(y_true=y_test, y_pred=pred_label)
# AUROC uses the second probability column (index 1).
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])

print(f"Model Accuracy: {accuracy}")
print(f"AUROC: {roc_auc:.3f}")

In [17]:
# Tag every test row with its confusion-matrix cell for the random forest
# predictions (compare_labels codes: 1 = TN, 2 = FP, 3 = FN, 4 = TP) and
# export the annotated test table.
label_revisit = compare_labels(y_test, pred_label)
# df_test is the result of boolean filtering, so adding a column in place can
# raise pandas' SettingWithCopyWarning; assign() returns a fresh frame with the
# column added (or replaced) instead. Values are matched to rows by position.
df_test = df_test.assign(label_revisit=label_revisit)

df_test.to_csv(f'{output_path}/results_revisit_rf.csv', index=False)

In [None]:
# Fit a gradient boosting classifier on the training split and time the fit.
print("GradientBoosting:")
from sklearn.ensemble import GradientBoostingClassifier

# Only random_state is set; every other hyperparameter is left at its default.
gb = GradientBoostingClassifier(random_state=random_seed)
# The timer brackets fit() only, so `runtime` excludes prediction time.
start = time.time()
gb.fit(X_train, y_train)
runtime = time.time()-start
print('Training time:', runtime, 'seconds')
# Class probabilities for the test split. The evaluation cell that follows
# recomputes them as `y_prob` rather than reading `probs`.
probs = gb.predict_proba(X_test)

In [None]:
# Score the fitted gradient boosting classifier on the test split.
pred_label = gb.predict(X_test)        # hard class predictions
y_prob = gb.predict_proba(X_test)      # per-class probabilities

accuracy = accuracy_score(y_true=y_test, y_pred=pred_label)
# AUROC uses the second probability column (index 1).
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])

print(f"Model Accuracy: {accuracy}")
print(f"AUROC: {roc_auc:.3f}")

In [20]:
# Tag every test row with its confusion-matrix cell for the gradient boosting
# predictions (compare_labels codes: 1 = TN, 2 = FP, 3 = FN, 4 = TP) and
# export the annotated test table.
label_revisit = compare_labels(y_test, pred_label)
# df_test is the result of boolean filtering, so adding a column in place can
# raise pandas' SettingWithCopyWarning; assign() returns a fresh frame with the
# column added (or replaced) instead. Values are matched to rows by position.
df_test = df_test.assign(label_revisit=label_revisit)

df_test.to_csv(f'{output_path}/results_revisit_gb.csv', index=False)