In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score,
    mean_squared_error, mean_absolute_error
)


# Seed NumPy's global random generator with a fixed value.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Cohort CSV, addressed relative to the current working directory.
DATA_PATH = "./../cohort/cohort_ver151_reorder_col.csv"


# ======================================================
# 1. Data loading
# ======================================================
df = pd.read_csv(DATA_PATH, low_memory=False)
print("[LOAD] 데이터 로딩 완료")
print(" - shape :", df.shape)

# Identifier and raw target columns the rest of the script reads.
required_cols = [
    "hadm_id",
    "target_mortality",
    "target_next_evt",
    "target_time_to_next",
    "target_remain_los",
]

# Fail fast on the first required column (in list order) absent from the CSV.
first_missing = next(
    (col for col in required_cols if col not in df.columns), None
)
if first_missing is not None:
    raise ValueError(f"필수 컬럼 {first_missing} 이(가) 없습니다.")


# ======================================================
# 2. Basic preprocessing: race encoding, delay_label,
#    clipped / log1p-transformed time_to_next
# ======================================================
if "race" not in df.columns:
    # No race column: keep the feature name available with a constant value.
    print("[WARN] race 컬럼 없음, race_enc 사용 불가. 대신 0으로 채웁니다.")
    df["race_enc"] = 0
else:
    # Map every distinct race string to an integer code.
    le_race = LabelEncoder()
    race_as_text = df["race"].astype(str)
    df["race_enc"] = le_race.fit_transform(race_as_text)
    print("[PREP] race -> race_enc 인코딩 완료")

# Cap time_to_next at its 99.5th percentile, then apply log1p
# (the original note says the LGBM/Transformer runs use a similar scheme).
clip_value = df["target_time_to_next"].quantile(0.995)
clipped_time = df["target_time_to_next"].clip(upper=clip_value)
df["time_to_next_clip"] = clipped_time
df["time_to_next_log1p"] = np.log1p(clipped_time)

# delay_label: 1 when the clipped time exceeds its own 75th percentile.
# A delay_label column already present in the CSV is left untouched.
if "delay_label" in df.columns:
    print("[PREP] 기존 delay_label 사용")
else:
    delay_thr = df["time_to_next_clip"].quantile(0.75)
    exceeds_threshold = df["time_to_next_clip"] > delay_thr
    df["delay_label"] = exceeds_threshold.astype(int)
    print(f"[PREP] delay_label 생성 완료 (75% 기준: {delay_thr:.2f})")


# ======================================================
# 3. Feature engineering (reuses the features from the Transformer pipeline)
# ======================================================
def add_features(df_in: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df_in`` with six derived feature columns added.

    The input frame is not modified. Each derived column falls back to a
    constant zero when the source columns it needs are absent:

    * ``hr_bp_ratio``     - current_heart_rate / (|current_mean_bp| + 1)
    * ``time_last_ratio`` - time_since_last / (|time_since_start_min| + 1)
    * ``event_progress``  - pathway_stage / (prefix_len + 1)
    * ``risk_delay``      - 1 where time_since_last exceeds the median of
      ``df_in["time_since_last"]``
    * ``risk_stemi``      - stemi_flag * cum_stemi_cnt, or just stemi_flag
      when cum_stemi_cnt is missing
    * ``trop_abnormal``   - 1 where last_trop > 0.04

    Args:
        df_in: Frame holding whichever of the source columns are available.

    Returns:
        A new DataFrame with the original columns plus the six above.
    """
    df_fe = df_in.copy()
    source_cols = df_in.columns

    # Heart-rate / blood-pressure ratio; the +1 keeps the denominator non-zero.
    if "current_heart_rate" in source_cols and "current_mean_bp" in source_cols:
        df_fe["hr_bp_ratio"] = df_fe["current_heart_rate"] / (
            df_fe["current_mean_bp"].abs() + 1.0
        )
    else:
        df_fe["hr_bp_ratio"] = 0.0

    # Time since the last event relative to total elapsed time.
    if "time_since_last" in source_cols and "time_since_start_min" in source_cols:
        df_fe["time_last_ratio"] = df_fe["time_since_last"] / (
            df_fe["time_since_start_min"].abs() + 1.0
        )
    else:
        df_fe["time_last_ratio"] = 0.0

    # Pathway progress relative to the prefix length.
    if "pathway_stage" in source_cols and "prefix_len" in source_cols:
        df_fe["event_progress"] = df_fe["pathway_stage"] / (df_fe["prefix_len"] + 1.0)
    else:
        df_fe["event_progress"] = 0.0

    # Delay risk against the median of the frame that was passed in.
    # (Previously this read the module-level ``df``, which made the function
    # depend on a global and break when called on any other frame.)
    if "time_since_last" in source_cols:
        median_since_last = df_in["time_since_last"].median()
        df_fe["risk_delay"] = (df_fe["time_since_last"] > median_since_last).astype(int)
    else:
        df_fe["risk_delay"] = 0

    # Cumulative STEMI risk.
    has_stemi = "stemi_flag" in source_cols
    has_cum_stemi = "cum_stemi_cnt" in source_cols
    if has_stemi and has_cum_stemi:
        df_fe["risk_stemi"] = df_fe["stemi_flag"] * df_fe["cum_stemi_cnt"]
    elif has_stemi:
        df_fe["risk_stemi"] = df_fe["stemi_flag"]
    else:
        df_fe["risk_stemi"] = 0

    # Troponin abnormality flag (fixed cut-off of 0.04).
    if "last_trop" in source_cols:
        df_fe["trop_abnormal"] = (df_fe["last_trop"] > 0.04).astype(int)
    else:
        df_fe["trop_abnormal"] = 0

    return df_fe


# Append the derived feature columns to the working frame.
df = add_features(df)


# ======================================================
# 4. Feature definitions
# ======================================================
# Candidate model inputs. The list order determines the column order of the
# feature matrix built later, so it should not be reshuffled casually.
candidate_cols = [
    "age", "gender", "race_enc",
    "arrival_transport",
    "prefix_len", "current_event_id",
    "time_since_start_min", "time_since_ed", "time_since_last",
    "is_night",
    "cum_ecg_cnt", "cum_trop_cnt",
    "stemi_flag", "trop_pos_flag",
    "last_trop", "run_max_trop", "trop_trend",
    "pci_status",
    "current_heart_rate", "current_mean_bp",
    "hr_bp_ratio", "time_last_ratio", "event_progress",
    "risk_delay", "risk_stemi", "trop_abnormal",
]

# Keep only the candidates that actually exist in the loaded data.
feature_cols = [c for c in candidate_cols if c in df.columns]
print("[INFO] 사용 feature 수:", len(feature_cols))
print("[INFO] Feature 예시:", feature_cols[:10])

# Task name -> column holding that task's training target.
target_cols = {
    "mortality": "target_mortality",
    "next_event": "target_next_evt",
    "time_to_next": "time_to_next_log1p",   # regression is fit on the log1p-transformed value
    "delay": "delay_label",
    "remain_los": "target_remain_los",
}


# ======================================================
# 5. Drop rows with missing values
# ======================================================
# A row is kept only if every model input and every target is non-null.
non_null_cols = [*feature_cols, *target_cols.values()]
df = df.dropna(subset=non_null_cols)
print("[PREP] 결측 제거 후:", df.shape)


# ======================================================
# 6. Train / Val / Test split grouped by hadm_id
# ======================================================
def split_by_hadm(df_in, random_state=42, train_ratio=0.7, val_ratio=0.15):
    """Split rows into train/val/test so each hadm_id lands in exactly one split.

    The distinct ``hadm_id`` values are shuffled with a seeded
    ``numpy.random.RandomState`` and cut by count: the first
    ``int(n * train_ratio)`` ids form the training set, the next
    ``int(n * val_ratio)`` the validation set, and the remainder the test set.

    Args:
        df_in: DataFrame containing a ``hadm_id`` column.
        random_state: Seed for the shuffle.
        train_ratio: Fraction of distinct ids assigned to training.
        val_ratio: Fraction of distinct ids assigned to validation.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of independent copies.

    Raises:
        ValueError: If ``hadm_id`` is missing, or if the ratios are negative
            or sum to more than 1 (which would silently yield empty or
            mis-sized splits).
    """
    if "hadm_id" not in df_in.columns:
        raise ValueError("hadm_id 컬럼이 없습니다.")

    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1.0 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})."
        )

    # unique() returns a fresh array, so the in-place shuffle leaves df_in intact.
    hadm_ids = df_in["hadm_id"].unique()
    rng = np.random.RandomState(random_state)
    rng.shuffle(hadm_ids)

    n = len(hadm_ids)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)

    hadm_train = hadm_ids[:n_train]
    hadm_val = hadm_ids[n_train:n_train + n_val]
    hadm_test = hadm_ids[n_train + n_val:]

    df_train = df_in[df_in["hadm_id"].isin(hadm_train)].copy()
    df_val = df_in[df_in["hadm_id"].isin(hadm_val)].copy()
    df_test = df_in[df_in["hadm_id"].isin(hadm_test)].copy()

    print("[SPLIT] hadm_id 기준 분할 완료")
    print(" - 전체 hadm_id:", n)
    print(" - train hadm_id:", len(hadm_train), ", rows:", len(df_train))
    print(" - val   hadm_id:", len(hadm_val),   ", rows:", len(df_val))
    print(" - test  hadm_id:", len(hadm_test),  ", rows:", len(df_test))

    return df_train, df_val, df_test


df_train, df_val, df_test = split_by_hadm(df, random_state=RANDOM_STATE)


# ======================================================
# 7. Scaling (StandardScaler)
# ======================================================
# Fit the scaler on the training rows only, then apply it to every split.
scaler = StandardScaler()
scaler.fit(df_train[feature_cols])

X_train, X_val, X_test = (
    scaler.transform(part[feature_cols])
    for part in (df_train, df_val, df_test)
)

# One dict per split: task name -> NumPy array of that task's target values.
y_train, y_val, y_test = (
    {task: part[column].values for task, column in target_cols.items()}
    for part in (df_train, df_val, df_test)
)

print("[PREP] StandardScaler 적용 완료")


# ======================================================
# 8. Model definitions (LogisticRegression / LinearRegression)
# ======================================================
# Binary mortality classifier; class_weight="balanced" is set for this target.
mort_model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    solver="lbfgs",
    n_jobs=-1
)

# Binary delay classifier, configured identically to the mortality model.
delay_model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    solver="lbfgs",
    n_jobs=-1
)

# next_event: multi-class target (the original note assumes target_next_evt
# takes values 1..K - not verified here).
# The explicit multi_class="multinomial" argument was dropped: it is
# deprecated in scikit-learn >= 1.5, and the lbfgs solver already fits a
# multinomial model whenever the target has more than two classes.
next_event_model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    n_jobs=-1
)

# Ordinary least-squares regressors for the two continuous targets.
time_to_next_model = LinearRegression()
remain_los_model   = LinearRegression()


# ======================================================
# 9. Training
# ======================================================
# (log message, estimator, key into y_train), fitted in this order.
training_plan = [
    ("[TRAIN] Mortality LR 학습", mort_model, "mortality"),
    ("[TRAIN] Delay LR 학습", delay_model, "delay"),
    ("[TRAIN] Next-event LR(multinomial) 학습", next_event_model, "next_event"),
    ("[TRAIN] Time-to-next LinearRegression 학습 (log1p scale)", time_to_next_model, "time_to_next"),
    ("[TRAIN] Remain LOS LinearRegression 학습 (days)", remain_los_model, "remain_los"),
]

for banner, estimator, target_key in training_plan:
    print(banner)
    estimator.fit(X_train, y_train[target_key])


# ======================================================
# 10. Evaluation functions
# ======================================================
def eval_binary(model, X, y_true, name=""):
    """Compute binary-classification metrics for a fitted model.

    Hard predictions come from ``model.predict``; scores are column 1 of
    ``model.predict_proba``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Feature matrix to score.
        y_true: Ground-truth labels.
        name: Unused; accepted for call-site symmetry with the other evaluators.

    Returns:
        Dict with keys AUC, AP, ACC, PREC, REC, F1. AUC is NaN when
        ``y_true`` contains a single distinct value.
    """
    hard_labels = model.predict(X)
    pos_scores = model.predict_proba(X)[:, 1]

    # ROC AUC is only computed when more than one class is present in y_true.
    n_classes_seen = len(np.unique(y_true))
    auc = roc_auc_score(y_true, pos_scores) if n_classes_seen > 1 else np.nan

    metrics = {"AUC": auc, "AP": average_precision_score(y_true, pos_scores)}
    metrics["ACC"] = accuracy_score(y_true, hard_labels)
    threshold_scorers = (
        ("PREC", precision_score),
        ("REC", recall_score),
        ("F1", f1_score),
    )
    for key, scorer in threshold_scorers:
        metrics[key] = scorer(y_true, hard_labels, zero_division=0)
    return metrics


def eval_multiclass(model, X, y_true, name=""):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to score.
        y_true: Ground-truth class labels.
        name: Unused; accepted for call-site symmetry with the other evaluators.

    Returns:
        Dict with keys ACC, PREC, REC, F1.
    """
    predicted = model.predict(X)
    macro_opts = {"average": "macro", "zero_division": 0}
    return {
        "ACC": accuracy_score(y_true, predicted),
        "PREC": precision_score(y_true, predicted, **macro_opts),
        "REC": recall_score(y_true, predicted, **macro_opts),
        "F1": f1_score(y_true, predicted, **macro_opts),
    }


def eval_reg_time_to_next(model, X, y_log1p_true, name=""):
    """Score a log1p-scale regressor with RMSE/MAE on the back-transformed scale.

    Both the predictions and the targets are passed through ``np.expm1``
    before the errors are computed (the original comment describes the
    restored unit as minutes).

    Args:
        model: Fitted regressor predicting log1p-scale values.
        X: Feature matrix to score.
        y_log1p_true: Ground-truth targets on the log1p scale.
        name: Unused; accepted for call-site symmetry with the other evaluators.

    Returns:
        Dict with keys RMSE and MAE.
    """
    restored_pred = np.expm1(model.predict(X))
    restored_true = np.expm1(y_log1p_true)

    squared_error = mean_squared_error(restored_true, restored_pred)
    return {
        "RMSE": np.sqrt(squared_error),
        "MAE": mean_absolute_error(restored_true, restored_pred),
    }


def eval_reg_simple(model, X, y_true, name=""):
    """Compute RMSE and MAE of a regressor directly on the target scale.

    Args:
        model: Fitted regressor exposing ``predict``.
        X: Feature matrix to score.
        y_true: Ground-truth target values.
        name: Unused; accepted for call-site symmetry with the other evaluators.

    Returns:
        Dict with keys RMSE and MAE.
    """
    predicted = model.predict(X)
    squared_error = mean_squared_error(y_true, predicted)
    return {
        "RMSE": np.sqrt(squared_error),
        "MAE": mean_absolute_error(y_true, predicted),
    }


# ======================================================
# 11. Train / Val / Test evaluation
# ======================================================
results = {
    "Train": {},
    "Val": {},
    "Test": {},
}

# (task key, printed label, evaluator, fitted model), run in this order:
# mortality, delay, next-event type, time to next event (scored after the
# log1p targets are back-transformed), remaining length of stay.
eval_plan = [
    ("mortality", "  mortality:", eval_binary, mort_model),
    ("delay", "  delay    :", eval_binary, delay_model),
    ("next_event", "  next_evt :", eval_multiclass, next_event_model),
    ("time_to_next", "  time_to_next:", eval_reg_time_to_next, time_to_next_model),
    ("remain_los", "  remain_los  :", eval_reg_simple, remain_los_model),
]

split_inputs = {
    "Train": (X_train, y_train),
    "Val": (X_val, y_val),
    "Test": (X_test, y_test),
}

for split_name, (X, y) in split_inputs.items():
    res_split = {}
    print(f"\n[EVAL] {split_name}")

    for task, label, evaluate, estimator in eval_plan:
        res_split[task] = evaluate(estimator, X, y[task], name=task)
        print(label, res_split[task])

    results[split_name] = res_split

print("\n[FINAL RESULT]")
print(results)


[LOAD] 데이터 로딩 완료
 - shape : (40817, 27)
[PREP] race -> race_enc 인코딩 완료
[PREP] delay_label 생성 완료 (75% 기준: 168.10)
[INFO] 사용 feature 수: 26
[INFO] Feature 예시: ['age', 'gender', 'race_enc', 'arrival_transport', 'prefix_len', 'current_event_id', 'time_since_start_min', 'time_since_ed', 'time_since_last', 'is_night']
[PREP] 결측 제거 후: (40817, 37)
[SPLIT] hadm_id 기준 분할 완료
 - 전체 hadm_id: 1929
 - train hadm_id: 1350 , rows: 28412
 - val   hadm_id: 289 , rows: 5723
 - test  hadm_id: 290 , rows: 6682
[PREP] StandardScaler 적용 완료
[TRAIN] Mortality LR 학습
[TRAIN] Delay LR 학습
[TRAIN] Next-event LR(multinomial) 학습




[TRAIN] Time-to-next LinearRegression 학습 (log1p scale)
[TRAIN] Remain LOS LinearRegression 학습 (days)

[EVAL] Train
  mortality: {'AUC': np.float64(0.7018918255141731), 'AP': np.float64(0.1815052336599004), 'ACC': 0.6424398141630298, 'PREC': 0.13388614553246994, 'REC': 0.6247826086956522, 'F1': 0.22051714877618353}
  delay    : {'AUC': np.float64(0.7867874900077182), 'AP': np.float64(0.49911552915024454), 'ACC': 0.702132901590877, 'PREC': 0.444867221561633, 'REC': 0.793552036199095, 'F1': 0.5701224158073855}
  next_evt : {'ACC': 0.5457553146557792, 'PREC': 0.26699927086561914, 'REC': 0.23411485462769788, 'F1': 0.20646943772105933}
  time_to_next: {'RMSE': np.float64(1196.5561539806702), 'MAE': 354.6302704665939}
  remain_los  : {'RMSE': np.float64(1.7727524384644449), 'MAE': 1.2323418173469307}

[EVAL] Val
  mortality: {'AUC': np.float64(0.5945130709566216), 'AP': np.float64(0.17221669423168534), 'ACC': 0.6192556351563865, 'PREC': 0.12112541726275632, 'REC': 0.43050847457627117, 'F1': 0