In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, f1_score,
    precision_score, recall_score,
    mean_squared_error, mean_absolute_error
)

import lightgbm as lgb

# ================================
# 0. Configuration
# ================================
# Input cohort CSV (edit as needed) and the CSV that receives the
# test-set predictions.
DATA_PATH = "./cohort_ver151_reorder_col.csv"
OUT_PATH = "./ppm_pred_results_test.csv"

# Single seed: applied to NumPy's global RNG here and reused further down
# for the hadm_id split and the LightGBM parameter sets.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ================================
# 1. Data loading
# ================================
df = pd.read_csv(filepath_or_buffer=DATA_PATH, low_memory=False)
print("[LOAD] 데이터 로딩 완료", f" - shape : {df.shape}", sep="\n")

# ================================
# 2. Preprocessing
#    - label-encode race
#    - clip / log1p time_to_next
#    - derive delay_label
# ================================

# (1) Label-encode race. The column is optional: when it is missing we only
#     warn and no race_enc column is created.
if "race" not in df.columns:
    print("[WARN] race 컬럼이 없어 race_enc를 만들지 않았습니다.")
else:
    le_race = LabelEncoder()
    df["race_enc"] = le_race.fit_transform(df["race"].astype(str))
    print("[PREP] race -> race_enc 인코딩 완료")

# (2) Cap time_to_next at its 99.5th percentile, then take log1p of the
#     capped value. This column is mandatory.
if "target_time_to_next" not in df.columns:
    raise ValueError("target_time_to_next 컬럼이 없습니다. cohort를 확인해 주세요.")

clip_value = df["target_time_to_next"].quantile(q=0.995)
df["time_to_next_clip"] = df["target_time_to_next"].clip(upper=clip_value)
df["time_to_next_log1p"] = np.log1p(df["time_to_next_clip"])

# (3) delay_label is 1 when the capped time is strictly greater than its
#     75th percentile, 0 otherwise.
# NOTE(review): both quantiles are computed on the full frame, i.e. before
# the train/val/test split further down -- confirm this is intended.
delay_threshold = df["time_to_next_clip"].quantile(q=0.75)
df["delay_label"] = df["time_to_next_clip"].gt(delay_threshold).astype(int)

print(
    f"[PREP] time_to_next_clip 상한(99.5%): {clip_value:.2f}",
    f"[PREP] delay_threshold(75%): {delay_threshold:.2f}",
    "[PREP] delay_label 분포:",
    sep="\n",
)
print(df["delay_label"].value_counts(normalize=True).rename("ratio"))

# ================================
# 3. Feature / target definition
# ================================

# Identifier columns, restricted to those present in the cohort.
id_cols = [c for c in ("subject_id", "hadm_id", "timestamp") if c in df.columns]

# Every raw target must exist; fail on the first one that is missing.
required_targets = [
    "target_mortality",
    "target_next_evt",
    "target_time_to_next",
    "target_remain_los",
]
for t in required_targets:
    if t in df.columns:
        continue
    raise ValueError(f"{t} 컬럼이 없습니다. cohort를 확인해 주세요.")

target_cols = [*required_targets, "delay_label"]

# Columns that must never be used as model inputs: ids, targets, the raw
# race string and the two columns derived from target_time_to_next.
exclude_cols = {
    *id_cols,
    *target_cols,
    "race",
    "time_to_next_clip",
    "time_to_next_log1p",
}

# Base features: every remaining column whose dtype is not object.
feature_cols = [
    c for c in df.columns
    if not (c in exclude_cols or df[c].dtype == "object")
]

# Make sure race_enc is part of the features whenever it exists.
if "race_enc" in df.columns and "race_enc" not in feature_cols:
    feature_cols += ["race_enc"]

print(f"[INFO] 사용 feature 수: {len(feature_cols)}")
print(f"[INFO] Feature columns 예시: {feature_cols[:20]}")

# ================================
# 4. hadm_id 기준 Train / Val / Test 분할
# ================================
def split_by_hadm(df, random_state=42, train_ratio=0.7, val_ratio=0.15):
    """Split rows into train / validation / test frames by ``hadm_id``.

    The unique ``hadm_id`` values are shuffled with a ``RandomState`` seeded
    by ``random_state``. The first ``int(n * train_ratio)`` ids go to train,
    the next ``int(n * val_ratio)`` ids to validation, and the remaining ids
    to test. Rows are then selected by id membership, so rows sharing a
    ``hadm_id`` end up in the same split.

    Args:
        df: DataFrame containing a ``hadm_id`` column.
        random_state: Seed for the id shuffle.
        train_ratio: Fraction of unique ids assigned to train.
        val_ratio: Fraction of unique ids assigned to validation.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of copied sub-frames.

    Raises:
        ValueError: If ``df`` has no ``hadm_id`` column.
    """
    if "hadm_id" not in df.columns:
        raise ValueError("hadm_id 컬럼이 없습니다. split이 불가능합니다.")

    # unique() returns a fresh array, so the in-place shuffle leaves df intact.
    shuffled_ids = df["hadm_id"].unique()
    np.random.RandomState(random_state).shuffle(shuffled_ids)

    total = len(shuffled_ids)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    id_groups = (
        shuffled_ids[:train_end],
        shuffled_ids[train_end:val_end],
        shuffled_ids[val_end:],
    )
    admission = df["hadm_id"]
    frames = [df[admission.isin(ids)].copy() for ids in id_groups]

    print("[SPLIT] hadm_id 기준 분할 완료")
    print(" - 전체 hadm_id:", total)
    # Tags are padded so the log lines keep their column alignment.
    for tag, ids, frame in zip(("train", "val  ", "test "), id_groups, frames):
        print(f" - {tag} hadm_id:", len(ids), ", rows:", len(frame))

    return tuple(frames)

# Split once by admission id, then project every split onto the feature columns.
df_train, df_val, df_test = split_by_hadm(df, random_state=RANDOM_STATE)

X_train, X_val, X_test = (
    part[feature_cols] for part in (df_train, df_val, df_test)
)

# ================================
# 5. LightGBM 학습 함수 (early stopping 없음)
# ================================
def train_lgbm_binary(X_tr, y_tr, X_va, y_va, model_name, num_boost_round=800):
    """Train a LightGBM binary classifier and print validation metrics.

    The booster is trained for ``num_boost_round`` rounds; no early-stopping
    callback is passed to ``lgb.train``. After training, the validation set
    is scored, probabilities are thresholded at 0.5, and AUC, average
    precision, accuracy, precision, recall and F1 are printed.

    Args:
        X_tr, y_tr: Training features and binary labels.
        X_va, y_va: Validation features and binary labels.
        model_name: Tag used in the log lines.
        num_boost_round: Number of boosting rounds.

    Returns:
        The object returned by ``lgb.train``.
    """
    booster_params = dict(
        objective="binary",
        metric=["auc", "binary_logloss"],
        boosting_type="gbdt",
        learning_rate=0.03,
        num_leaves=63,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=1,
        max_depth=-1,
        min_data_in_leaf=50,
        lambda_l2=1.0,
        verbose=-1,
        seed=RANDOM_STATE,
    )
    fit_set = lgb.Dataset(X_tr, label=y_tr)
    holdout_set = lgb.Dataset(X_va, label=y_va)

    print(f"\n[TRAIN] Binary 모델 학습 시작: {model_name}")
    model = lgb.train(
        booster_params,
        fit_set,
        num_boost_round=num_boost_round,
        valid_sets=[holdout_set],
        valid_names=["valid"],
    )

    # Score the validation split; hard labels use a fixed 0.5 cut-off.
    scores = model.predict(X_va)
    hard_labels = (scores >= 0.5).astype(int)

    auc = roc_auc_score(y_va, scores)
    ap = average_precision_score(y_va, scores)
    acc = accuracy_score(y_va, hard_labels)
    f1 = f1_score(y_va, hard_labels)
    prec = precision_score(y_va, hard_labels)
    rec = recall_score(y_va, hard_labels)

    report_lines = [
        f"[EVAL-{model_name}]",
        f"  AUC       : {auc:.4f}",
        f"  AP        : {ap:.4f}",
        f"  Accuracy  : {acc:.4f}",
        f"  Precision : {prec:.4f}",
        f"  Recall    : {rec:.4f}",
        f"  F1-score  : {f1:.4f}",
    ]
    print("\n".join(report_lines))

    return model


def train_lgbm_multiclass(X_tr, y_tr, X_va, y_va, model_name, num_boost_round=800):
    """Train a LightGBM multiclass classifier and print validation metrics.

    Labels are expected to be integer class ids starting at 0. The booster is
    trained for ``num_boost_round`` rounds; no early-stopping callback is
    passed to ``lgb.train``. After training, the validation set is scored,
    the arg-max class is taken per row, and accuracy plus macro-averaged
    precision, recall and F1 are printed.

    Args:
        X_tr, y_tr: Training features and 0-based integer class labels.
        X_va, y_va: Validation features and 0-based integer class labels.
        model_name: Tag used in the log lines.
        num_boost_round: Number of boosting rounds.

    Returns:
        The object returned by ``lgb.train``.
    """
    # LightGBM needs every label to lie in [0, num_class). Counting the
    # distinct training labels undercounts whenever a class id is absent from
    # the training split (a gap) or shows up only in validation, so size the
    # output from the largest label seen in either split instead. For
    # contiguous labels 0..K-1 this yields the same K as before.
    seen_labels = np.concatenate([np.ravel(y_tr), np.ravel(y_va)])
    num_class = int(seen_labels.max()) + 1 if seen_labels.size else 0

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data   = lgb.Dataset(X_va, label=y_va)

    params = {
        "objective": "multiclass",
        "num_class": num_class,
        "metric": ["multi_logloss"],
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 63,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "max_depth": -1,
        "min_data_in_leaf": 50,
        "lambda_l2": 1.0,
        "verbose": -1,
        "seed": RANDOM_STATE,
    }

    print(f"\n[TRAIN] Multiclass 모델 학습 시작: {model_name} (num_class={num_class})")
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[val_data],
        valid_names=["valid"]
    )

    # predict() yields one probability column per class; pick the most likely.
    y_pred = np.argmax(model.predict(X_va), axis=1)
    acc = accuracy_score(y_va, y_pred)
    macro_f1   = f1_score(y_va, y_pred, average="macro")
    macro_prec = precision_score(y_va, y_pred, average="macro")
    macro_rec  = recall_score(y_va, y_pred, average="macro")

    print(f"[EVAL-{model_name}]")
    print(f"  Accuracy        : {acc:.4f}")
    print(f"  Macro Precision : {macro_prec:.4f}")
    print(f"  Macro Recall    : {macro_rec:.4f}")
    print(f"  Macro F1-score  : {macro_f1:.4f}")

    return model


def train_lgbm_regression(X_tr, y_tr, X_va, y_va, model_name, num_boost_round=800):
    """Train a LightGBM regressor and print validation RMSE / MAE.

    The booster is trained for ``num_boost_round`` rounds; no early-stopping
    callback is passed to ``lgb.train``. Errors are reported in the units of
    ``y_va`` as given (no inverse transform is applied here).

    Args:
        X_tr, y_tr: Training features and numeric target.
        X_va, y_va: Validation features and numeric target.
        model_name: Tag used in the log lines.
        num_boost_round: Number of boosting rounds.

    Returns:
        The object returned by ``lgb.train``.
    """
    booster_params = dict(
        objective="regression",
        metric=["rmse", "l1"],
        boosting_type="gbdt",
        learning_rate=0.05,
        num_leaves=63,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=1,
        max_depth=-1,
        min_data_in_leaf=50,
        lambda_l2=1.0,
        verbose=-1,
        seed=RANDOM_STATE,
    )
    fit_set = lgb.Dataset(X_tr, label=y_tr)
    holdout_set = lgb.Dataset(X_va, label=y_va)

    print(f"\n[TRAIN] Regression 모델 학습 시작: {model_name}")
    model = lgb.train(
        booster_params,
        fit_set,
        num_boost_round=num_boost_round,
        valid_sets=[holdout_set],
        valid_names=["valid"],
    )

    estimates = model.predict(X_va)

    # RMSE is derived from the MSE rather than requested directly.
    rmse = np.sqrt(mean_squared_error(y_va, estimates))
    mae = mean_absolute_error(y_va, estimates)

    report_lines = [
        f"[EVAL-{model_name}]",
        f"  RMSE : {rmse:.4f}",
        f"  MAE  : {mae:.4f}",
    ]
    print("\n".join(report_lines))

    return model

# ================================
# 6. Train one model per task
# ================================

# (1) Mortality probability (binary)
y_train_mort, y_val_mort, y_test_mort = (
    part["target_mortality"].values for part in (df_train, df_val, df_test)
)
model_mort = train_lgbm_binary(
    X_train, y_train_mort, X_val, y_val_mort, model_name="Mortality"
)

# (2) Next event (multiclass); labels 1..K are shifted down to 0..K-1
y_train_next, y_val_next, y_test_next = (
    part["target_next_evt"].values - 1 for part in (df_train, df_val, df_test)
)
model_next = train_lgbm_multiclass(
    X_train, y_train_next, X_val, y_val_next, model_name="NextEvent"
)

# (3) Time until the next event: regression on log1p(time_to_next_clip)
y_train_ttn, y_val_ttn, y_test_ttn = (
    part["time_to_next_log1p"].values for part in (df_train, df_val, df_test)
)
model_ttn = train_lgbm_regression(
    X_train, y_train_ttn, X_val, y_val_ttn, model_name="TimeToNext_log1p"
)

# (4) Remaining length of stay (regression)
y_train_los, y_val_los, y_test_los = (
    part["target_remain_los"].values for part in (df_train, df_val, df_test)
)
model_los = train_lgbm_regression(
    X_train, y_train_los, X_val, y_val_los, model_name="RemainLOS"
)

# (5) Delay flag (delay_label, binary)
y_train_delay, y_val_delay, y_test_delay = (
    part["delay_label"].values for part in (df_train, df_val, df_test)
)
model_delay = train_lgbm_binary(
    X_train, y_train_delay, X_val, y_val_delay, model_name="DelayRisk"
)

print("\n[INFO] 모든 모델 학습 완료")

# ================================
# 7. 공통 Test 평가 함수
# ================================
def evaluate_binary(model, X, y, name="Binary-Test"):
    """Score a binary model on ``(X, y)``, print the metrics and return them.

    ``model.predict(X)`` is used as the positive-class score and thresholded
    at 0.5 for the label-based metrics.

    Args:
        model: Object exposing ``predict(X)``.
        X: Feature matrix.
        y: True binary labels.
        name: Tag used in the printed header.

    Returns:
        Dict with keys ``AUC``, ``AP``, ``ACC``, ``PREC``, ``REC``, ``F1``.
    """
    scores = model.predict(X)
    hard_labels = (scores >= 0.5).astype(int)

    auc = roc_auc_score(y, scores)
    ap = average_precision_score(y, scores)
    acc = accuracy_score(y, hard_labels)
    f1 = f1_score(y, hard_labels)
    prec = precision_score(y, hard_labels)
    rec = recall_score(y, hard_labels)

    report_lines = [
        f"\n=== {name} (Binary) ===",
        f"AUC        : {auc:.4f}",
        f"AP         : {ap:.4f}",
        f"Accuracy   : {acc:.4f}",
        f"Precision  : {prec:.4f}",
        f"Recall     : {rec:.4f}",
        f"F1-score   : {f1:.4f}",
    ]
    print("\n".join(report_lines))

    return dict(AUC=auc, AP=ap, ACC=acc, PREC=prec, REC=rec, F1=f1)


def evaluate_multiclass(model, X, y, name="Multiclass-Test"):
    """Score a multiclass model on ``(X, y)``, print the metrics and return them.

    The predicted class is the arg-max over axis 1 of ``model.predict(X)``.
    Precision, recall and F1 are macro-averaged.

    Args:
        model: Object exposing ``predict(X)``.
        X: Feature matrix.
        y: True class ids, comparable to the arg-max column indices.
        name: Tag used in the printed header.

    Returns:
        Dict with keys ``ACC``, ``PREC``, ``REC``, ``F1``.
    """
    top_class = np.argmax(model.predict(X), axis=1)

    acc = accuracy_score(y, top_class)
    macro_f1 = f1_score(y, top_class, average="macro")
    macro_prec = precision_score(y, top_class, average="macro")
    macro_rec = recall_score(y, top_class, average="macro")

    report_lines = [
        f"\n=== {name} (Multiclass) ===",
        f"Accuracy         : {acc:.4f}",
        f"Macro Precision  : {macro_prec:.4f}",
        f"Macro Recall     : {macro_rec:.4f}",
        f"Macro F1-score   : {macro_f1:.4f}",
    ]
    print("\n".join(report_lines))

    return dict(ACC=acc, PREC=macro_prec, REC=macro_rec, F1=macro_f1)


def evaluate_regression(model, X, y, name="Regression-Test"):
    """Score a regression model on ``(X, y)``, print RMSE / MAE and return them.

    Errors are computed directly between ``y`` and ``model.predict(X)``; no
    inverse transform is applied.

    Args:
        model: Object exposing ``predict(X)``.
        X: Feature matrix.
        y: True target values.
        name: Tag used in the printed header.

    Returns:
        Dict with keys ``RMSE`` and ``MAE``.
    """
    estimates = model.predict(X)

    # RMSE is derived from the MSE rather than requested directly.
    rmse = np.sqrt(mean_squared_error(y, estimates))
    mae = mean_absolute_error(y, estimates)

    report_lines = [
        f"\n=== {name} (Regression) ===",
        f"RMSE : {rmse:.4f}",
        f"MAE  : {mae:.4f}",
    ]
    print("\n".join(report_lines))

    return dict(RMSE=rmse, MAE=mae)

# ================================
# 8. Test-set evaluation
# ================================
print("\n[INFO] Test 셋 평가 시작")

# Binary tasks
evaluate_binary(model=model_mort, X=X_test, y=y_test_mort, name="Mortality-Test")
evaluate_binary(model=model_delay, X=X_test, y=y_test_delay, name="DelayRisk-Test")

# Multiclass task
evaluate_multiclass(model=model_next, X=X_test, y=y_test_next, name="NextEvent-Test")

# Regression tasks (time-to-next is scored in log1p space)
evaluate_regression(
    model=model_ttn, X=X_test, y=y_test_ttn, name="TimeToNext_log1p-Test"
)
evaluate_regression(model=model_los, X=X_test, y=y_test_los, name="RemainLOS-Test")

# ================================
# 9. Test predictions and CSV export
# ================================
print("\n[INFO] Test 예측 및 CSV 저장 시작")

# Mortality
test_pred_mort_proba = model_mort.predict(X_test)

# Next event: arg-max class index, shifted back to the original 1..K labels
test_pred_next_proba = model_next.predict(X_test)
test_pred_next_label = np.argmax(test_pred_next_proba, axis=1) + 1

# Time to next: the model predicts log1p(time), so expm1 maps it back
test_pred_ttn_log1p = model_ttn.predict(X_test)
test_pred_ttn_real  = np.expm1(test_pred_ttn_log1p)

# Remaining LOS
test_pred_los = model_los.predict(X_test)

# Delay: probability plus a hard label at the 0.5 cut-off
test_pred_delay_proba = model_delay.predict(X_test)
test_pred_delay_label = (test_pred_delay_proba >= 0.5).astype(int)

# Assemble the result frame. The feature-definition step treats the
# identifier columns as optional, so copy only those that are present
# instead of raising KeyError on a cohort without e.g. subject_id.
result_dict = {
    col: df_test[col].values
    for col in ("subject_id", "hadm_id", "timestamp")
    if col in df_test.columns
}

result_dict.update({
    "mortality_true": df_test["target_mortality"].values,
    "mortality_pred_proba": test_pred_mort_proba,

    "next_evt_true": df_test["target_next_evt"].values,
    "next_evt_pred": test_pred_next_label,

    "ttn_log1p_true": df_test["time_to_next_log1p"].values,
    "ttn_log1p_pred": test_pred_ttn_log1p,
    "ttn_minutes_pred": test_pred_ttn_real,

    "remain_los_true": df_test["target_remain_los"].values,
    "remain_los_pred": test_pred_los,

    "delay_true": df_test["delay_label"].values,
    "delay_pred_label": test_pred_delay_label,
    "delay_pred_proba": test_pred_delay_proba,
})

df_result = pd.DataFrame(result_dict)
df_result.to_csv(OUT_PATH, index=False)

print(f"[SAVE] Test 예측 결과 CSV 저장 완료: {OUT_PATH}")
print(df_result.head())


[LOAD] 데이터 로딩 완료
 - shape : (40817, 27)
[PREP] race -> race_enc 인코딩 완료
[PREP] time_to_next_clip 상한(99.5%): 9920.20
[PREP] delay_threshold(75%): 168.10
[PREP] delay_label 분포:
delay_label
0    0.750006
1    0.249994
Name: ratio, dtype: float64
[INFO] 사용 feature 수: 20
[INFO] Feature columns 예시: ['age', 'gender', 'arrival_transport', 'prefix_len', 'current_event_id', 'time_since_start_min', 'time_since_ed', 'time_since_last', 'is_night', 'cum_ecg_cnt', 'cum_trop_cnt', 'stemi_flag', 'trop_pos_flag', 'last_trop', 'run_max_trop', 'trop_trend', 'pci_status', 'current_heart_rate', 'current_mean_bp', 'race_enc']
[SPLIT] hadm_id 기준 분할 완료
 - 전체 hadm_id: 1929
 - train hadm_id: 1350 , rows: 28412
 - val   hadm_id: 289 , rows: 5723
 - test  hadm_id: 290 , rows: 6682

[TRAIN] Binary 모델 학습 시작: Mortality
[EVAL-Mortality] AUC=0.5381, AP=0.1291, ACC=0.8835, F1=0.0800

[TRAIN] Multiclass 모델 학습 시작: NextEvent (num_class=14)
[EVAL-NextEvent] ACC=0.5843, Macro-F1=0.3849

[TRAIN] Regression 모델 학습 시작: TimeToNext