## Mortality rate prediction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score, roc_curve
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score, StratifiedKFold

# 1) Load data
# Note:
# - Both mimicdeathtrain and mimicdeathtest are assumed to be standardized and one-hot encoded already.
# - For other prediction tasks, switch files to:
#   mimicmodstrain.csv (MODS), mimichxtrain.csv (hypoxemia), mimichstrain.csv (hemorrhagic shock).
# NOTE(review): the paths below load the MODS files, not the mimicdeath* files
# named above -- confirm which task this run is meant to cover.

# Columns removed before modelling (presumably the stay identifier and the
# hour index -- confirm against the CSV schema).
dropped_columns = ['stay_id', 'hr']

# Last column is used as the label, every other remaining column as a feature.
df_train = pd.read_csv('mimicmodstrain.csv').drop(dropped_columns, axis=1)
X_train = df_train.iloc[:, :-1].values
y_train = df_train.iloc[:, -1].values

# The test frame must go through the same column drop as the training frame,
# otherwise the two feature matrices have different widths. errors='ignore'
# keeps this working if the test CSV was exported without those columns.
df_test = pd.read_csv('mimicmodstest.csv').drop(columns=dropped_columns, errors='ignore')
X_test = df_test.iloc[:, :-1].values
y_test = df_test.iloc[:, -1].values

# Fail early with a readable message instead of deep inside scaler.transform().
if X_train.shape[1] != X_test.shape[1]:
    raise ValueError(
        f'Train/test feature count mismatch: '
        f'{X_train.shape[1]} training features vs {X_test.shape[1]} test features'
    )

# 2) Standardization
# The scaler is fitted on the training features only and then reused on the
# test features, so no test-set statistics enter the fit.
# NOTE(review): the note above says the inputs are already standardized; if
# that holds, this second scaling is redundant -- confirm.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Shared stratified 5-fold splitter used by every cross-validated objective below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- LightGBM (Bayesian Optimization) ---
def lgb_cv(n_estimators, max_depth, learning_rate, subsample, colsample_bytree):
    """Return the mean cross-validated ROC AUC of a LightGBM classifier.

    Used as the objective for BayesianOptimization, which proposes every
    hyperparameter as a float; the integer-valued ones are truncated with
    ``int`` before the model is built.

    Args:
        n_estimators: Number of boosting rounds (truncated to int).
        max_depth: Maximum tree depth (truncated to int).
        learning_rate: Boosting learning rate.
        subsample: Row subsampling fraction.
        colsample_bytree: Column subsampling fraction per tree.

    Returns:
        Mean ``roc_auc`` over the folds of the module-level ``cv`` splitter,
        evaluated on ``X_train`` / ``y_train``.
    """
    params = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'subsample': subsample,
        # LightGBM only applies row subsampling when subsample_freq > 0; with
        # the default of 0 the tuned `subsample` value would have no effect.
        'subsample_freq': 1,
        'colsample_bytree': colsample_bytree,
        'objective': 'binary',
        'random_state': 42
    }
    model = LGBMClassifier(**params)
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv).mean()

# Search the hyperparameter box below for the point maximizing lgb_cv.
lgb_bo = BayesianOptimization(
    f=lgb_cv,
    pbounds={
        'n_estimators': (100, 500),
        'max_depth': (3, 10),
        'learning_rate': (0.01, 0.3),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0)
    },
    random_state=42,
    verbose=0
)
lgb_bo.maximize(init_points=5, n_iter=15)

# Convert the best point back to model arguments with the same int truncation
# that lgb_cv applied, so the final model matches what was scored.
lgb_best_params = lgb_bo.max['params']
lgb_best_params['n_estimators'] = int(lgb_best_params['n_estimators'])
lgb_best_params['max_depth'] = int(lgb_best_params['max_depth'])
# subsample_freq=1 mirrors lgb_cv so the tuned `subsample` is actually applied.
lgbm_best = LGBMClassifier(objective='binary', random_state=42, subsample_freq=1, **lgb_best_params)

# --- Logistic Regression (Bayesian Optimization) ---
def lr_cv(C, solver_idx):
    """Score a logistic regression configuration by mean cross-validated ROC AUC.

    Args:
        C: Inverse regularization strength passed to LogisticRegression.
        solver_idx: Continuous solver selector; rounded to the nearest
            integer and used as an index into ('liblinear', 'lbfgs').

    Returns:
        Mean ``roc_auc`` over the folds of the module-level ``cv`` splitter,
        evaluated on ``X_train`` / ``y_train``.
    """
    # The optimizer works on floats, so the categorical solver choice is
    # encoded as a number and decoded here by rounding.
    chosen_solver = ('liblinear', 'lbfgs')[int(round(solver_idx))]
    estimator = LogisticRegression(
        C=C,
        solver=chosen_solver,
        max_iter=1000,
        random_state=42,
    )
    fold_scores = cross_val_score(estimator, X_train, y_train, scoring='roc_auc', cv=cv)
    return fold_scores.mean()

# Search over C and the encoded solver choice for the point maximizing lr_cv.
lr_bo = BayesianOptimization(
    f=lr_cv,
    pbounds={'C': (0.001, 10.0), 'solver_idx': (0, 1)},
    random_state=42,
    verbose=0,
)
lr_bo.maximize(init_points=3, n_iter=7)

# Decode the best point: C is used as-is, the solver index is rounded back to
# a solver name exactly as lr_cv does during the search.
lr_best_params = lr_bo.max['params']
lr_best_C = lr_best_params['C']
lr_best_solver = ['liblinear', 'lbfgs'][int(round(lr_best_params['solver_idx']))]
lr_best = LogisticRegression(
    C=lr_best_C,
    solver=lr_best_solver,
    max_iter=1000,
    random_state=42,
)

# --- Random Forest (Bayesian Optimization) ---
def rf_cv(n_estimators, max_depth, max_features, min_samples_split):
    """Score a random forest configuration by mean cross-validated ROC AUC.

    Args:
        n_estimators: Number of trees (truncated to int).
        max_depth: Maximum tree depth (truncated to int).
        max_features: Fraction of features considered per split.
        min_samples_split: Minimum samples needed to split a node
            (truncated to int).

    Returns:
        Mean ``roc_auc`` over the folds of the module-level ``cv`` splitter,
        evaluated on ``X_train`` / ``y_train``.
    """
    # The optimizer proposes floats; the count-like arguments are truncated.
    forest = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        max_features=max_features,
        min_samples_split=int(min_samples_split),
        random_state=42,
        n_jobs=-1,
    )
    fold_scores = cross_val_score(forest, X_train, y_train, scoring='roc_auc', cv=cv)
    return fold_scores.mean()

# Search the hyperparameter box below for the point maximizing rf_cv.
rf_bo = BayesianOptimization(
    f=rf_cv,
    pbounds={
        'n_estimators': (100, 500),
        'max_depth': (3, 20),
        'max_features': (0.1, 1.0),
        'min_samples_split': (2, 20),
    },
    random_state=42,
    verbose=0,
)
rf_bo.maximize(init_points=5, n_iter=15)

# Truncate the count-like values of the best point in place, matching the
# int() conversion rf_cv applied while scoring.
rf_best_params = rf_bo.max['params']
rf_best_params.update({
    key: int(rf_best_params[key])
    for key in ('n_estimators', 'max_depth', 'min_samples_split')
})
rf_best = RandomForestClassifier(**rf_best_params, random_state=42, n_jobs=-1)

# --- XGBoost (Bayesian Optimization) ---
def xgb_cv(n_estimators, max_depth, learning_rate, subsample, colsample_bytree, gamma):
    """Score an XGBoost configuration by mean cross-validated ROC AUC.

    Args:
        n_estimators: Number of boosting rounds (truncated to int).
        max_depth: Maximum tree depth (truncated to int).
        learning_rate: Boosting learning rate.
        subsample: Row subsampling fraction.
        colsample_bytree: Column subsampling fraction per tree.
        gamma: Value passed through as XGBoost's ``gamma`` parameter.

    Returns:
        Mean ``roc_auc`` over the folds of the module-level ``cv`` splitter,
        evaluated on ``X_train`` / ``y_train``.
    """
    # NOTE(review): use_label_encoder may be deprecated or ignored by newer
    # XGBoost releases -- confirm against the installed version.
    booster = XGBClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    )
    fold_scores = cross_val_score(booster, X_train, y_train, scoring='roc_auc', cv=cv)
    return fold_scores.mean()

# Search the hyperparameter box below for the point maximizing xgb_cv.
xgb_bo = BayesianOptimization(
    f=xgb_cv,
    pbounds={
        'n_estimators': (100, 500),
        'max_depth': (3, 10),
        'learning_rate': (0.01, 0.3),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
        'gamma': (0.0, 5.0),
    },
    random_state=42,
    verbose=0,
)
xgb_bo.maximize(init_points=5, n_iter=15)

# Truncate the count-like values of the best point in place, matching the
# int() conversion xgb_cv applied while scoring.
xgb_best_params = xgb_bo.max['params']
xgb_best_params.update({
    key: int(xgb_best_params[key])
    for key in ('n_estimators', 'max_depth')
})
xgb_best = XGBClassifier(
    **xgb_best_params,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# 3) Model dictionary (only LR, RF, XGBoost, LightGBM)
# Maps the legend label to the tuned estimator; the estimators are fitted in
# the loop below.
models = {
    'Logistic Regression (BayesOpt)': lr_best,
    'Random Forest (BayesOpt)': rf_best,
    'XGBoost (BayesOpt)': xgb_best,
    'LightGBM (BayesOpt)': lgbm_best,
}

# 4) Train and plot ROC curves
plt.figure(figsize=(10, 8))
for name, model in models.items():
    # Fit on the full training set, then take the second predict_proba
    # column as the score for the test set.
    y_proba = model.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_proba)
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.3f})')

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')

plt.title('ROC Curve Comparison (Bayesian-Optimized Models)')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.grid(True)
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()
