# In [1]:  (Jupyter notebook cell marker)
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb

# ==============================
# 0. Paths and basic settings
# ==============================

# Input CSV with the prefix -> next-event samples (read by load_ppm_dataset).
PPM_DATA_PATH = "./cohort/cohort_ver142_ppm_prefix_next_event.csv"
# Directory the trained models are written to; created at import time if missing.
OUTPUT_DIR = "./cohort"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Output files for the two LightGBM models (passed to Booster.save_model).
MODEL_NEXT_EVENT_PATH = os.path.join(OUTPUT_DIR, "lgbm_ver142_next_event.txt")
MODEL_TIME_TO_NEXT_PATH = os.path.join(OUTPUT_DIR, "lgbm_ver142_time_to_next.txt")

# Seed shared by the train/val/test split and by both LightGBM models.
RANDOM_STATE = 42
TEST_SIZE = 0.15   # 15% of all hadm_id go to test
VAL_SIZE = 0.15    # 15% of all hadm_id go to validation (remaining 70% is train)


# ==============================
# 1. Data loading & basic preprocessing
# ==============================

def load_ppm_dataset(path: str) -> pd.DataFrame:
    """
    Load the ver142 PPM prefix -> next_event CSV and apply basic sanity filters.

    Required columns:
      - subject_id
      - hadm_id
      - case_id
      - prefix_len
      - prefix_events_str
      - current_event
      - current_event_id
      - next_event
      - next_event_id
      - time_since_start_min
      - time_to_next_min
      - full_trace_len

    Rows with a missing ``next_event_id``, ``time_since_start_min`` or
    ``time_to_next_min`` are dropped, and so are rows where either of the
    two time columns is negative.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The filtered DataFrame. Index labels of the surviving rows are kept
        as read from the file (the index is not reset).

    Raises:
        pandas.errors.EmptyDataError: If the file contains no columns to
            parse (e.g. a zero-byte file). The message names ``path``.
        ValueError: If a required column is missing.
    """
    try:
        df = pd.read_csv(path)
    except pd.errors.EmptyDataError as err:
        # pandas' own message ("No columns to parse from file") does not say
        # which file was empty; re-raise the same exception type with the path.
        raise pd.errors.EmptyDataError(
            f"PPM 데이터 파일이 비어 있어 읽을 수 없습니다: '{path}'"
        ) from err

    required_cols = (
        "subject_id",
        "hadm_id",
        "case_id",
        "prefix_len",
        "prefix_events_str",
        "current_event",
        "current_event_id",
        "next_event",
        "next_event_id",
        "time_since_start_min",
        "time_to_next_min",
        "full_trace_len",
    )
    for c in required_cols:
        if c not in df.columns:
            raise ValueError(f"PPM 데이터셋에 '{c}' 컬럼이 없습니다. 현재 컬럼: {list(df.columns)}")

    # Basic sanity checks: drop rows with missing targets / timestamps ...
    time_cols = ["time_since_start_min", "time_to_next_min"]
    df = df.dropna(subset=["next_event_id", *time_cols])
    # ... and rows where any time value is negative (treated as outliers).
    df = df[(df[time_cols] >= 0).all(axis=1)]

    print(f"[LOAD] PPM 데이터 로딩 완료: {len(df)} rows, {df['hadm_id'].nunique()} hadm_id")
    print(f"[LOAD] next_event_id 고유 개수: {df['next_event_id'].nunique()}")
    return df


# ==============================
# 2. Train / Val / Test split by hadm_id
# ==============================

def split_by_hadm(df: pd.DataFrame,
                  test_size: float = TEST_SIZE,
                  val_size: float = VAL_SIZE,
                  random_state: int = RANDOM_STATE):
    """
    Partition ``df`` into train / validation / test frames by admission.

    The split is drawn over the distinct ``hadm_id`` values, so all rows
    of one ``hadm_id`` end up in exactly one of the three frames.

    Args:
        df: Frame containing a ``hadm_id`` column.
        test_size: Fraction of all distinct ``hadm_id`` assigned to test.
        val_size: Fraction of all distinct ``hadm_id`` assigned to validation.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(df_train, df_val, df_test)`` as independent copies of the
        matching rows of ``df``.
    """
    admissions = df["hadm_id"].drop_duplicates().values

    # Stage 1: hold out the test admissions.
    remaining_ids, test_ids = train_test_split(
        admissions,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )
    # Stage 2: carve validation out of the remainder. The fraction is
    # rescaled because it now applies to (1 - test_size) of the admissions.
    train_ids, val_ids = train_test_split(
        remaining_ids,
        test_size=val_size / (1.0 - test_size),
        random_state=random_state,
        shuffle=True,
    )

    # Materialise each split as its own copy of the matching rows.
    train_df, val_df, test_df = (
        df[df["hadm_id"].isin(ids)].copy()
        for ids in (train_ids, val_ids, test_ids)
    )

    print(f"[SPLIT] train hadm_id: {len(train_ids)}, rows: {len(train_df)}")
    print(f"[SPLIT] val   hadm_id: {len(val_ids)}, rows: {len(val_df)}")
    print(f"[SPLIT] test  hadm_id: {len(test_ids)}, rows: {len(test_df)}")

    return train_df, val_df, test_df


# ==============================
# 3. Feature construction
# ==============================

def build_feature_matrices(df_train: pd.DataFrame,
                           df_val: pd.DataFrame,
                           df_test: pd.DataFrame):
    """
    Build the LightGBM feature matrices and label vectors for all three splits.

    Targets:
      - classification: ``next_event_id - 1`` (shifted to be 0-based)
      - regression: ``time_to_next_min`` (unchanged)

    Features:
      - prefix_len (numeric)
      - time_since_start_min (numeric)
      - full_trace_len (numeric)
      - current_event_id (categorical)
      - prefix_events_str (categorical) -- the whole prefix sequence is
        treated as a single category (baseline)

    The categorical columns of every split use the category set observed in
    ``df_train``; a value in val/test that never occurs in train becomes
    missing (NaN) in the returned matrix. The input frames are not modified.

    Args:
        df_train: Training rows.
        df_val: Validation rows.
        df_test: Test rows.

    Returns:
        An 11-tuple ``(X_train, X_val, X_test, y_train_cls, y_val_cls,
        y_test_cls, y_train_reg, y_val_reg, y_test_reg, num_cols, cat_cols)``.
    """
    num_cols = ["prefix_len", "time_since_start_min", "full_trace_len"]
    cat_cols = ["current_event_id", "prefix_events_str"]
    feature_cols = num_cols + cat_cols

    # Work on copies so the category casts below never touch the caller's frames.
    X_train = df_train[feature_cols].copy()
    X_val = df_val[feature_cols].copy()
    X_test = df_test[feature_cols].copy()

    for c in cat_cols:
        # Train defines the category set; val/test are re-expressed in it so
        # the same value maps to the same category code in every split.
        train_cat = X_train[c].astype("category")
        train_categories = train_cat.cat.categories
        X_train[c] = train_cat
        X_val[c] = X_val[c].astype("category").cat.set_categories(train_categories)
        X_test[c] = X_test[c].astype("category").cat.set_categories(train_categories)

    # LightGBM's multiclass objective expects labels in 0 .. num_class-1.
    # NOTE(review): the "- 1" shift assumes next_event_id starts at 1 -- confirm
    # against the code that produces the CSV.
    y_train_cls = df_train["next_event_id"].values - 1
    y_val_cls = df_val["next_event_id"].values - 1
    y_test_cls = df_test["next_event_id"].values - 1

    # Regression target is used as-is.
    y_train_reg = df_train["time_to_next_min"].values
    y_val_reg = df_val["time_to_next_min"].values
    y_test_reg = df_test["time_to_next_min"].values

    print(f"[FEATURE] 사용 feature 컬럼: {feature_cols}")
    print(f"[FEATURE] 분류 target 클래스 수: {len(np.unique(y_train_cls))}")

    return (
        X_train, X_val, X_test,
        y_train_cls, y_val_cls, y_test_cls,
        y_train_reg, y_val_reg, y_test_reg,
        num_cols, cat_cols
    )


# ==============================
# 4. LightGBM training functions
# ==============================

def train_lgbm_classifier(
    X_train, y_train,
    X_val, y_val,
    num_cols, cat_cols,
    model_path: str
):
    """
    Train the LightGBM multiclass model that predicts ``next_event_id``.

    Args:
        X_train: Training feature matrix.
        y_train: 0-based integer class labels for ``X_train``.
        X_val: Validation feature matrix (used for early stopping).
        y_val: 0-based integer class labels for ``X_val``.
        num_cols: Names of the numeric features. Not used by this function;
            accepted so the signature mirrors ``train_lgbm_regressor``.
        cat_cols: Names of the categorical features, passed to LightGBM as
            ``categorical_feature``.
        model_path: File the trained booster is saved to.

    Returns:
        The trained ``lgb.Booster``.
    """
    # Labels are 0-based, so the number of classes is the largest label + 1.
    # Counting distinct train labels instead would under-count whenever a
    # class is absent from train, and LightGBM then rejects the labels.
    num_class = int(max(np.max(y_train), np.max(y_val))) + 1

    # Build the LightGBM datasets.
    train_data = lgb.Dataset(
        X_train,
        label=y_train,
        categorical_feature=cat_cols,
        free_raw_data=False
    )
    val_data = lgb.Dataset(
        X_val,
        label=y_val,
        categorical_feature=cat_cols,
        free_raw_data=False
    )

    params = {
        "objective": "multiclass",
        "num_class": num_class,
        "metric": ["multi_logloss", "multi_error"],
        "learning_rate": 0.05,
        "num_leaves": 31,
        "max_depth": -1,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "min_data_in_leaf": 30,
        "lambda_l2": 1.0,
        "verbosity": -1,
        "force_col_wise": True,
        "seed": RANDOM_STATE,
    }

    print("\n[TRAIN] LightGBM 분류 모델 학습 시작 (next_event_id)...")
    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
    # lgb.train() were removed in LightGBM 4.0.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=500,
        valid_sets=[train_data, val_data],
        valid_names=["train", "val"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),
        ],
    )

    print(f"[TRAIN] 최적 반복 수: {model.best_iteration}")
    model.save_model(model_path)
    print(f"[SAVE] 분류 모델 저장: {model_path}")
    return model


def train_lgbm_regressor(
    X_train, y_train,
    X_val, y_val,
    num_cols, cat_cols,
    model_path: str
):
    """
    Train the LightGBM regression model that predicts ``time_to_next_min``.

    Args:
        X_train: Training feature matrix.
        y_train: Regression targets for ``X_train``.
        X_val: Validation feature matrix (used for early stopping).
        y_val: Regression targets for ``X_val``.
        num_cols: Names of the numeric features. Not used by this function;
            accepted so the signature mirrors ``train_lgbm_classifier``.
        cat_cols: Names of the categorical features, passed to LightGBM as
            ``categorical_feature``.
        model_path: File the trained booster is saved to.

    Returns:
        The trained ``lgb.Booster``.
    """
    train_data = lgb.Dataset(
        X_train,
        label=y_train,
        categorical_feature=cat_cols,
        free_raw_data=False
    )
    val_data = lgb.Dataset(
        X_val,
        label=y_val,
        categorical_feature=cat_cols,
        free_raw_data=False
    )

    params = {
        "objective": "regression",
        "metric": ["l2", "l1"],
        "learning_rate": 0.05,
        "num_leaves": 31,
        "max_depth": -1,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "min_data_in_leaf": 30,
        "lambda_l2": 1.0,
        "verbosity": -1,
        "force_col_wise": True,
        "seed": RANDOM_STATE,
    }

    print("\n[TRAIN] LightGBM 회귀 모델 학습 시작 (time_to_next_min)...")
    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
    # lgb.train() were removed in LightGBM 4.0.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=500,
        valid_sets=[train_data, val_data],
        valid_names=["train", "val"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),
        ],
    )

    print(f"[TRAIN] 최적 반복 수: {model.best_iteration}")
    model.save_model(model_path)
    print(f"[SAVE] 회귀 모델 저장: {model_path}")
    return model


# ==============================
# 5. Evaluation functions
# ==============================

def eval_classifier(model, X_test, y_test):
    """
    Print test-set metrics for the next-event classifier.

    Reported:
      - Top-1 accuracy
      - Top-3 accuracy
      - a classification report based on the Top-1 prediction

    Args:
        model: Trained model exposing ``predict`` and ``best_iteration``.
            ``predict`` must return a 2-D score array (rows = samples,
            columns = classes), since reductions are taken along axis 1.
        X_test: Test feature matrix.
        y_test: NumPy array of 0-based true class labels.

    Returns:
        None. Results are printed.
    """
    scores = model.predict(X_test, num_iteration=model.best_iteration)

    # Top-1: the highest-scoring class per row.
    top1_pred = np.argmax(scores, axis=1)
    top1_acc = accuracy_score(y_test, top1_pred)

    # Top-3: a row counts as a hit when the true label is among the three
    # highest-scoring classes (the last three columns of the ascending argsort).
    best_three = np.argsort(scores, axis=1)[:, -3:]
    true_col = y_test.reshape(-1, 1)
    hits = np.any(best_three == true_col, axis=1)
    top3_acc = hits.mean()

    print("\n[EVAL][CLASS] Test Top-1 Accuracy :", top1_acc)
    print("[EVAL][CLASS] Test Top-3 Accuracy :", top3_acc)
    print("\n[EVAL][CLASS] Classification Report (Top-1 기준):")
    print(classification_report(y_test, top1_pred, digits=4))


def eval_regressor(model, X_test, y_test):
    """
    Print test-set error metrics for the time-to-next-event regressor.

    Reported:
      - MAE  (mean absolute error)
      - RMSE (root mean squared error)

    Args:
        model: Trained model exposing ``predict`` and ``best_iteration``.
        X_test: Test feature matrix.
        y_test: True target values, aligned with the rows of ``X_test``.

    Returns:
        None. Results are printed.
    """
    residual = model.predict(X_test, num_iteration=model.best_iteration) - y_test

    abs_err_mean = np.mean(np.abs(residual))
    root_mean_sq = np.sqrt(np.mean(residual ** 2))

    print("\n[EVAL][REG] Test MAE (min) :", abs_err_mean)
    print("[EVAL][REG] Test RMSE (min):", root_mean_sq)


# ==============================
# 6. MAIN
# ==============================

def main():
    """
    Run the baseline pipeline end to end.

    Steps: load the PPM CSV, split it by ``hadm_id``, build features and
    labels, train the next-event classifier and the time-to-next regressor,
    then print test-set metrics for both.
    """
    # 1) Load data
    dataset = load_ppm_dataset(PPM_DATA_PATH)

    # 2) Train/Val/Test split by hadm_id
    train_df, val_df, test_df = split_by_hadm(dataset)

    # 3) Features and labels
    (
        X_tr, X_va, X_te,
        cls_tr, cls_va, cls_te,
        reg_tr, reg_va, reg_te,
        numeric_cols, categorical_cols,
    ) = build_feature_matrices(train_df, val_df, test_df)

    # 4) Classifier (next_event_id)
    next_event_model = train_lgbm_classifier(
        X_train=X_tr, y_train=cls_tr,
        X_val=X_va, y_val=cls_va,
        num_cols=numeric_cols, cat_cols=categorical_cols,
        model_path=MODEL_NEXT_EVENT_PATH,
    )

    # 5) Regressor (time_to_next_min)
    time_to_next_model = train_lgbm_regressor(
        X_train=X_tr, y_train=reg_tr,
        X_val=X_va, y_val=reg_va,
        num_cols=numeric_cols, cat_cols=categorical_cols,
        model_path=MODEL_TIME_TO_NEXT_PATH,
    )

    # 6) Test-set evaluation
    eval_classifier(model=next_event_model, X_test=X_te, y_test=cls_te)
    eval_regressor(model=time_to_next_model, X_test=X_te, y_test=reg_te)

    print("\n[INFO] LightGBM 기반 PPM 베이스라인 학습 및 평가 완료.")


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()


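# [notebook output of the cell above]
# EmptyDataError: No columns to parse from file
# (pandas raises this from read_csv when the input file is empty.)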