In [1]:
import sys
import numpy as np
import pandas as pd
from pathlib import Path

# Go from .../cml_final/notebooks/HEF -> .../cml_final
# (this relies on the kernel's working directory being the notebook's own folder)
BASE_DIR = Path.cwd().parents[1]
SRC_DIR = BASE_DIR / "src"

# Make sure Python can see the src folder. The membership guard keeps sys.path
# free of duplicate entries when this cell is re-run in the same kernel.
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

from hef_prep import prepare_data
from hef_prep import add_engineered_features  # and any other functions you want


# Classification task.
# NOTE(review): apply_fe=False presumably switches the engineered features OFF
# inside hef_prep.prepare_data -- confirm against that module.
# Columns handed to prepare_data through its leak_cols argument.
LEAK_COLS = [
    "ADMITTIME",
    "ICD9_diagnosis",
    "DIAGNOSIS",
    "DOB",
    "DEATHTIME",
    "DISCHTIME",
    "DOD",
    "LOS",
    "HOSPITAL_EXPIRE_FLAG",
]

X, y, X_test = prepare_data(task="class", leak_cols=LEAK_COLS, apply_fe=False)

# Quick sanity check on the returned training features / target
print(X.shape, y.shape)
print("Positive rate:", y.mean().round(3))


Base dir: c:\Users\sffra\Downloads\BSE 2025-2026\cml_final
Train path: c:\Users\sffra\Downloads\BSE 2025-2026\cml_final\data\raw\MIMIC III dataset HEF\mimic_train_HEF.csv
Test path: c:\Users\sffra\Downloads\BSE 2025-2026\cml_final\data\raw\MIMIC III dataset HEF\mimic_test_HEF.csv
Train shape: (20885, 44)
Test shape: (5221, 39)
Task: class (target = HOSPITAL_EXPIRE_FLAG)
X_train_raw shape: (20885, 35)
X_test_raw shape: (5221, 35)
y_train shape: (20885,)
Positive rate (death): 0.112
[clean_min_bp_outliers] SysBP_Min: setting 99 values (0.530% of valid) below 40.0 mmHg to NaN
[clean_min_bp_outliers] DiasBP_Min: setting 6 values (0.032% of valid) below 10.0 mmHg to NaN
[clean_min_bp_outliers] MeanBP_Min: setting 797 values (4.262% of valid) below 30.0 mmHg to NaN
[clean_min_bp_outliers] SysBP_Min: setting 31 values (0.664% of valid) below 40.0 mmHg to NaN
[clean_min_bp_outliers] DiasBP_Min: setting 1 values (0.021% of valid) below 10.0 mmHg to NaN
[clean_min_bp_outliers] MeanBP_Min: settin

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Hold out 20% of the rows for validation; stratifying on y keeps the class
# balance the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_valid, y_train_split, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", X_train.shape[0], "Valid size:", X_valid.shape[0])
# Report the share of positives in each part of the split
for rate_label, target_part in (
    ("Train positive rate:", y_train_split),
    ("Valid positive rate:", y_valid),
):
    print(rate_label, target_part.mean().round(3))

# Split the columns by dtype: numeric ones versus everything else
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
cat_cols = list(X_train.select_dtypes(exclude=[np.number]).columns)

# Numeric branch: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Non-numeric branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" stops unseen categories from raising at
# transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)


Train size: 16708 Valid size: 4177
Train positive rate: 0.112
Valid positive rate: 0.112


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from scipy.stats import randint

# Starting-point forest; the randomized search further down overrides
# n_estimators and class_weight (among others) through the "clf__" parameters.
rf = RandomForestClassifier(
    n_estimators=300, class_weight="balanced", n_jobs=-1, random_state=42
)

# Preprocessing and classifier bundled into a single estimator
rf_pipe = Pipeline([("preprocess", preprocessor), ("clf", rf)])

# Hyperparameter search space (tweaked). Keys are prefixed with "clf__" so they
# reach the classifier step of the pipeline. `randint` is already imported at
# the top of this cell, so it is not imported a second time here.
# Note: scipy's randint(low, high) excludes `high`, hence the "+1" upper bounds.
param_distributions = {
    # Fewer trees → big runtime win, tiny performance loss (if any)
    "clf__n_estimators": randint(150, 301),  # 150–300

    # Try a tiny bit more variety in depth
    "clf__max_depth": [10, 14, 18, None],    # add moderate & full depth

    # Slightly wider and more standard ranges
    "clf__min_samples_split": randint(2, 21),  # 2–20
    "clf__min_samples_leaf":  randint(1, 11),  # 1–10

    # Still just sqrt is fine; it's the usual RF default for classification
    "clf__max_features": ["sqrt"],

    # Worth considering class imbalance
    "clf__class_weight": [None, "balanced"],
}

# Randomized search over the pipeline, scored by ROC-AUC.
rf_search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=param_distributions,
    n_iter=30,       # can keep 20 if time is tight
    scoring="roc_auc",
    cv=3,            # 3-fold: ~40% faster than 5-fold
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Fit exactly once: with a fixed random_state a second identical fit call
# only repeats the same 90 fits and doubles the runtime of this cell.
rf_search.fit(X_train, y_train_split)

# Summarise the search: cross-validated AUC of the winning candidate and its settings
print("Best CV AUC:", rf_search.best_score_)
print("Best params:")
for k, v in rf_search.best_params_.items():
    print(f"  {k}: {v}")

# Score the winning pipeline on the hold-out validation rows, using the
# predicted probability of the positive class (column 1).
best_rf_pipe = rf_search.best_estimator_
y_valid_proba_rf = best_rf_pipe.predict_proba(X_valid)[:, 1]
rf_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_proba_rf)
print(f"Random Forest (tuned) ROC-AUC on validation: {rf_auc:.4f}")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best CV AUC: 0.8096721827600112
Best params:
  clf__class_weight: balanced
  clf__max_depth: None
  clf__max_features: sqrt
  clf__min_samples_leaf: 2
  clf__min_samples_split: 7
  clf__n_estimators: 203
Random Forest (tuned) ROC-AUC on validation: 0.8181


In [4]:
from sklearn.dummy import DummyClassifier

# Chance-level reference model: DummyClassifier with the "stratified" strategy
# and a fixed seed so the comparison is repeatable.
dummy = DummyClassifier(strategy="stratified", random_state=42)

# Same preprocessing step as the real model (still impute/encode, even if dummy ignores X)
dummy_pipe = Pipeline([("preprocess", preprocessor), ("clf", dummy)])
dummy_pipe.fit(X_train, y_train_split)

# Validation AUC of the dummy, from its positive-class column
y_valid_proba_dummy = dummy_pipe.predict_proba(X_valid)[:, 1]
dummy_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_proba_dummy)

# Side-by-side comparison with the tuned forest
print(f"Dummy (stratified) ROC-AUC on validation: {dummy_auc:.4f}")
print(f"Random Forest (tuned) ROC-AUC on validation: {rf_auc:.4f}")
print(f"AUC improvement over dummy: {rf_auc - dummy_auc:.4f}")


Dummy (stratified) ROC-AUC on validation: 0.4849
Random Forest (tuned) ROC-AUC on validation: 0.8181
AUC improvement over dummy: 0.3331


In [5]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

def evaluate_probs_and_labels(name, y_true, y_proba, y_pred):
    """Print a short classification report for one model.

    Parameters
    ----------
    name : str
        Heading printed above the metrics (underlined with dashes of the same length).
    y_true
        Ground-truth labels.
    y_proba
        Scores passed to ``roc_auc_score`` (callers in this notebook pass the
        positive-class probability).
    y_pred
        Hard label predictions used for accuracy, precision, recall, F1 and
        the confusion matrix.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Compute every scalar metric up front so a failing metric leaves no partial report
    auc = roc_auc_score(y_true, y_proba)
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0: score 0 instead of warning when a ratio has a zero denominator
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    report_lines = [
        f"\n{name}",
        "-" * len(name),
        f"ROC-AUC:  {auc:.4f}",
        f"Accuracy: {acc:.4f}",
        f"Precision:{prec:.4f}",
        f"Recall:   {rec:.4f}",
        f"F1:       {f1:.4f}",
        "Confusion matrix [ [TN FP], [FN TP] ]:",
    ]
    print("\n".join(report_lines))
    print(confusion_matrix(y_true, y_pred))


In [6]:
# Baseline that predicts class 0 for every row
dummy_neg = DummyClassifier(strategy="constant", constant=0)

dummy_neg_pipe = Pipeline([("preprocess", preprocessor), ("clf", dummy_neg)])
dummy_neg_pipe.fit(X_train, y_train_split)

# Positive-class column of predict_proba (0 everywhere for this baseline)
# plus the hard label predictions
y_valid_proba_neg = dummy_neg_pipe.predict_proba(X_valid)[:, 1]
y_valid_pred_neg = dummy_neg_pipe.predict(X_valid)

evaluate_probs_and_labels(
    "Always NEGATIVE (0) baseline",
    y_valid,
    y_valid_proba_neg,
    y_valid_pred_neg,
)



Always NEGATIVE (0) baseline
----------------------------
ROC-AUC:  0.5000
Accuracy: 0.8877
Precision:0.0000
Recall:   0.0000
F1:       0.0000
Confusion matrix [ [TN FP], [FN TP] ]:
[[3708    0]
 [ 469    0]]


In [7]:
# Baseline that predicts class 1 for every row
dummy_pos = DummyClassifier(strategy="constant", constant=1)

dummy_pos_pipe = Pipeline([("preprocess", preprocessor), ("clf", dummy_pos)])
dummy_pos_pipe.fit(X_train, y_train_split)

# Positive-class column of predict_proba plus the hard label predictions
y_valid_proba_pos = dummy_pos_pipe.predict_proba(X_valid)[:, 1]
y_valid_pred_pos = dummy_pos_pipe.predict(X_valid)

evaluate_probs_and_labels(
    "Always POSITIVE (1) baseline",
    y_valid,
    y_valid_proba_pos,
    y_valid_pred_pos,
)



Always POSITIVE (1) baseline
----------------------------
ROC-AUC:  0.5000
Accuracy: 0.1123
Precision:0.1123
Recall:   1.0000
F1:       0.2019
Confusion matrix [ [TN FP], [FN TP] ]:
[[   0 3708]
 [   0  469]]


In [None]:
# Full metric report for the tuned forest. best_rf_pipe is the best estimator
# found by the randomized search, so the heading must say "tuned".
# The probabilities are recomputed here so the cell does not depend on the
# value left behind by the search cell.
y_valid_proba_rf = best_rf_pipe.predict_proba(X_valid)[:, 1]
y_valid_pred_rf  = best_rf_pipe.predict(X_valid)

evaluate_probs_and_labels("Random Forest (tuned)", y_valid, y_valid_proba_rf, y_valid_pred_rf)



Random Forest (tuned)
---------------------
ROC-AUC:  0.8181
Accuracy: 0.9002
Precision:0.7203
Recall:   0.1812
F1:       0.2896
Confusion matrix [ [TN FP], [FN TP] ]:
[[3675   33]
 [ 384   85]]
