In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb

# ==============================
# 0. 경로 및 기본 설정
# ==============================

# Input: the version-140 event log (the new full event log).
INPUT_EVENT_LOG_PATH = "./cohort/cohort_ver140_event_log.csv"

# Directory for the version-142 artifacts; created when this cell/module runs
# if it does not exist yet.
OUTPUT_DIR = "./cohort"
os.makedirs(OUTPUT_DIR, exist_ok=True)

CLEAN_EVENT_LOG_PATH = os.path.join(OUTPUT_DIR, "cohort_ver142_event_log_clean.csv")
EVENT_ID_MAP_PATH = os.path.join(OUTPUT_DIR, "cohort_ver142_event_id_map.csv")
PPM_DATA_PATH = os.path.join(OUTPUT_DIR, "cohort_ver142_ppm_prefix_next_event.csv")

# LightGBM model files
MODEL_NEXT_EVENT_PATH = os.path.join(OUTPUT_DIR, "lgbm_ver142_next_event.txt")
MODEL_TIME_TO_NEXT_PATH = os.path.join(OUTPUT_DIR, "lgbm_ver142_time_to_next.txt")

# Shared settings
RANDOM_STATE = 42
TEST_SIZE = 0.15   # 15% of all hadm_id values go to the test split
VAL_SIZE = 0.15    # 15% of all hadm_id values go to validation (remaining 70% is train)

# Trace filtering thresholds
MIN_EVENTS_PER_CASE = 2
MAX_EPISODE_DAYS = 365          # a hadm_id whose events span more than 365 days is dropped as an outlier

# Largest gap (in minutes) allowed for the regression target; larger values
# are clipped to this bound when the label vectors are built.
# e.g. 30 days = 30 * 24 * 60
MAX_TIME_TO_NEXT_MIN = 30 * 24 * 60

# === (added) Extreme-value trimming thresholds for analysis/training ===
# Rows are kept only when both time_to_next_min and time_since_start_min are
# within 30 days. Because the trim bound equals MAX_TIME_TO_NEXT_MIN, clipping
# applied after trimming leaves the values unchanged.
MAX_TIME_TO_NEXT_TRIM_MIN = 30 * 24 * 60       # 30 days
MAX_TIME_SINCE_START_TRIM_MIN = 30 * 24 * 60   # 30 days


# ==============================
# 1. 공통 유틸
# ==============================

def _to_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Convert column ``col`` of ``df`` to datetimes in place.

    Values that cannot be parsed become ``NaT``. When ``col`` is not a column
    of ``df`` the frame is returned untouched.

    Args:
        df: Frame to modify (the same object is returned).
        col: Name of the column to convert.

    Returns:
        The input frame ``df``.
    """
    if col not in df.columns:
        return df

    parsed = pd.to_datetime(df[col], errors="coerce")
    df[col] = parsed
    return df


# ==============================
# 2. 이벤트 로그 로딩
# ==============================

def load_event_log(path: str) -> pd.DataFrame:
    """Read the event-log CSV and return it sorted and free of bad timestamps.

    The CSV must contain the columns ``case_id``, ``subject_id``, ``hadm_id``,
    ``event_name`` and ``timestamp``. The ``timestamp`` column is parsed to
    datetimes, rows whose timestamp could not be parsed are dropped, and the
    result is ordered by (hadm_id, timestamp, event_name) with a fresh index.

    Args:
        path: Location of the event-log CSV.

    Returns:
        The cleaned, globally sorted event log.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        ValueError: A required column is missing (the first one is reported).
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"입력 이벤트 로그 파일을 찾을 수 없습니다: {path}")

    df = pd.read_csv(path)

    required_cols = ("case_id", "subject_id", "hadm_id", "event_name", "timestamp")
    missing = [name for name in required_cols if name not in df.columns]
    if missing:
        c = missing[0]
        raise ValueError(f"입력 이벤트 로그에 '{c}' 컬럼이 없습니다. 현재 컬럼: {list(df.columns)}")

    df = _to_datetime(df, "timestamp")

    # Discard rows whose timestamp failed to parse.
    before = len(df)
    df = df[df["timestamp"].notna()]
    print(f"[LOAD] timestamp NaT 제거: {before} -> {len(df)} rows")

    # Global ordering: by admission, then time, then event name.
    sort_keys = ["hadm_id", "timestamp", "event_name"]
    df = df.sort_values(by=sort_keys)
    df = df.reset_index(drop=True)

    print(f"[LOAD] Event Log 로딩 완료: {len(df)} rows, {df['hadm_id'].nunique()} hadm_id")
    print(f"[LOAD] event_name 분포:\n{df['event_name'].value_counts()}")
    return df


# ==============================
# 3. 1차 시간 sanity 체크
#    - episode 길이만 필터 (연도 이상치는 허용)
# ==============================

def sanity_filter_time(raw_events: pd.DataFrame,
                       max_episode_days: int = MAX_EPISODE_DAYS) -> pd.DataFrame:
    """Drop admissions whose events span more than ``max_episode_days`` days.

    The span of an admission is the difference between its latest and earliest
    ``timestamp``. No filter on the calendar year is applied. The input frame
    is not modified.

    Args:
        raw_events: Event log with ``hadm_id``, ``timestamp`` and
            ``event_name`` columns.
        max_episode_days: Largest allowed span in days (inclusive).

    Returns:
        A copy containing only the admissions within the limit, sorted by
        (hadm_id, timestamp, event_name) with a fresh index.
    """
    df = raw_events.copy()

    # Per-admission first/last timestamp and the span between them in days.
    span = df.groupby("hadm_id")["timestamp"].agg(["min", "max"])
    elapsed = span["max"] - span["min"]
    span["duration_days"] = elapsed.dt.total_seconds() / 86400.0

    exceeds_limit = span["duration_days"] > max_episode_days
    within_limit = span["duration_days"] <= max_episode_days
    long_span = span[exceeds_limit]
    good_hadm = span[within_limit].index

    print(f"[TIME] episode 길이 통계 (일 단위):")
    print(span["duration_days"].describe())

    if long_span.empty:
        print(f"[TIME] episode 길이>{max_episode_days}일 hadm_id 없음")
    else:
        print(f"[TIME] episode 길이>{max_episode_days}일 hadm_id 수: {len(long_span)}")

    keep_rows = df["hadm_id"].isin(good_hadm)
    df = df[keep_rows].copy()
    df = df.sort_values(by=["hadm_id", "timestamp", "event_name"]).reset_index(drop=True)

    print(f"[TIME] 시간 sanity 필터 후 rows: {len(df)}, hadm_id: {df['hadm_id'].nunique()}")

    return df


# ==============================
# 4. Event Log 클린업
#    - 시작 기준: ED_ARRIVAL > ED_ARRIVAL_SURR > 첫 이벤트
#    - DISCHARGE/DEATH 이후 제거
#    - 너무 짧은 trace 제거
# ==============================

def clean_event_log(raw_events: pd.DataFrame,
                    min_events_per_case: int = MIN_EVENTS_PER_CASE) -> pd.DataFrame:
    """Trim every admission to its start/end window and drop short traces.

    Rules applied per ``hadm_id``:

      1) Start time, by priority:
         - earliest ``ED_ARRIVAL`` timestamp,
         - else earliest ``ED_ARRIVAL_SURR`` timestamp,
         - else the earliest timestamp of the admission.
         Events before the start time are removed.

      2) End time: the earliest ``DISCHARGE`` or ``DEATH`` timestamp among the
         remaining events, if any. Events after it are removed (events sharing
         the end timestamp are kept).

      3) Admissions left with fewer than ``min_events_per_case`` events are
         dropped.

    Args:
        raw_events: Event log with ``hadm_id``, ``subject_id``, ``case_id``,
            ``event_name`` and ``timestamp`` columns.
        min_events_per_case: Minimum number of events a trace must keep.

    Returns:
        The cleaned log sorted by (hadm_id, timestamp, event_name), or an
        empty frame with the input's columns when no trace survives.
    """
    survivors = []
    dropped_too_short = 0

    # Counters for the summary: which rule decided the start of each admission.
    cnt_start_ed = 0
    cnt_start_ed_surr = 0
    cnt_start_first = 0

    for _, group in raw_events.groupby("hadm_id"):
        g = group.sort_values(["timestamp", "event_name"]).copy()

        # Identifiers are taken from the first row before any trimming.
        subject_id = g["subject_id"].iloc[0]
        case_id = g["case_id"].iloc[0]

        # 1) Decide the start time.
        ed_times = g.loc[g["event_name"] == "ED_ARRIVAL", "timestamp"]
        surr_times = g.loc[g["event_name"] == "ED_ARRIVAL_SURR", "timestamp"]

        if not ed_times.empty:
            start_time = ed_times.min()
            cnt_start_ed += 1
        elif not surr_times.empty:
            start_time = surr_times.min()
            cnt_start_ed_surr += 1
        else:
            start_time = g["timestamp"].min()
            cnt_start_first += 1

        g = g[g["timestamp"] >= start_time].copy()

        # 2) Cut everything after the first DISCHARGE/DEATH.
        end_times = g.loc[g["event_name"].isin(["DISCHARGE", "DEATH"]), "timestamp"]
        if not end_times.empty:
            end_time = end_times.min()
            g = g[g["timestamp"] <= end_time].copy()

        # 3) Skip traces that became too short.
        if len(g) < min_events_per_case:
            dropped_too_short += 1
            continue

        survivors.append(g.assign(subject_id=subject_id, case_id=case_id))

    if len(survivors) == 0:
        print("[CLEAN] 남아 있는 trace가 없습니다.")
        print(f"[CLEAN] 원본 hadm_id 수: {raw_events['hadm_id'].nunique()}")
        print(f"[CLEAN] 이벤트 수<{min_events_per_case}로 제거된 hadm_id 수: {dropped_too_short}")
        return pd.DataFrame(columns=raw_events.columns)

    clean_df = pd.concat(survivors, ignore_index=True)
    clean_df = clean_df.sort_values(by=["hadm_id", "timestamp", "event_name"])
    clean_df = clean_df.reset_index(drop=True)

    print("\n[CLEAN] === 요약 ===")
    print(f"원본 hadm_id 수: {raw_events['hadm_id'].nunique()}")
    print(f"최종 남은 hadm_id 수: {clean_df['hadm_id'].nunique()}")
    print(f"최종 이벤트 row 수: {len(clean_df)}")
    print(f"이벤트 수<{min_events_per_case}로 제거된 hadm_id 수: {dropped_too_short}")
    print("\n[CLEAN] 시작 기준 통계 (hadm 단위):")
    print(f"  ED_ARRIVAL 기준 시작 hadm 수       : {cnt_start_ed}")
    print(f"  ED_ARRIVAL_SURR 기준 시작 hadm 수  : {cnt_start_ed_surr}")
    print(f"  첫 이벤트 기준 시작 hadm 수        : {cnt_start_first}")

    return clean_df


# ==============================
# 5. event_name ↔ event_id 매핑
# ==============================

def build_event_id_map(events: pd.DataFrame) -> pd.DataFrame:
    """Assign a 1-based integer id to every distinct ``event_name``.

    Ids follow the sorted order of the event names.

    Args:
        events: Frame with an ``event_name`` column.

    Returns:
        A frame with columns ``event_name`` and ``event_id``, one row per
        distinct event name.
    """
    unique_events = sorted(events["event_name"].unique())
    n_types = len(unique_events)

    event_id_map = pd.DataFrame({"event_name": unique_events})
    event_id_map["event_id"] = range(1, n_types + 1)

    print(f"[MAP] 이벤트 종류 개수: {len(unique_events)}")
    return event_id_map


# ==============================
# 6. PPM prefix–next_event 데이터셋 생성
# ==============================

def build_ppm_prefix_dataset(clean_events: pd.DataFrame,
                             event_id_map: pd.DataFrame) -> pd.DataFrame:
    """Build the prefix -> next-event dataset used for PPM training.

    For every admission (``hadm_id``) the events are ordered by
    (timestamp, event_name) and one row is emitted per prefix of length
    1 .. n-1, describing the prefix and the event that follows it.

    Output columns (always present, even when no row is produced):
      - subject_id, hadm_id, case_id
      - prefix_len: number of events in the prefix
      - prefix_events_str: prefix event names joined with ``>``
      - current_event / current_event_id: last event of the prefix
      - next_event / next_event_id: the event to predict
      - time_since_start_min: minutes from the first event to the prefix end
      - time_to_next_min: minutes from the prefix end to the next event
      - full_trace_len: number of events in the whole trace

    Event names missing from ``event_id_map`` get the id ``-1``. Rows whose
    time values are negative or not a number (e.g. caused by ``NaT``
    timestamps) are skipped.

    Args:
        clean_events: Event log with ``hadm_id``, ``subject_id``,
            ``event_name`` and ``timestamp`` columns; ``case_id`` is optional
            and falls back to ``hadm_id``.
        event_id_map: Frame with ``event_name`` and ``event_id`` columns.

    Returns:
        The prefix dataset, one row per (admission, prefix).
    """
    output_cols = [
        "subject_id",
        "hadm_id",
        "case_id",
        "prefix_len",
        "prefix_events_str",
        "current_event",
        "current_event_id",
        "next_event",
        "next_event_id",
        "time_since_start_min",
        "time_to_next_min",
        "full_trace_len",
    ]

    name_to_id = dict(zip(event_id_map["event_name"], event_id_map["event_id"]))
    has_case_id = "case_id" in clean_events.columns
    records = []

    for hadm_id, g in clean_events.groupby("hadm_id"):
        g = g.sort_values(["timestamp", "event_name"])
        subject_id = g["subject_id"].iloc[0]
        case_id = g["case_id"].iloc[0] if has_case_id else hadm_id

        events = list(g["event_name"])
        times = list(g["timestamp"])
        full_trace_len = len(events)

        # A single-event trace has no "next event" to predict.
        if full_trace_len < 2:
            continue

        first_time = times[0]

        for i, (prefix_end_time, next_time) in enumerate(zip(times, times[1:])):
            time_since_start_min = (prefix_end_time - first_time).total_seconds() / 60.0
            time_to_next_min = (next_time - prefix_end_time).total_seconds() / 60.0

            # Keep only rows where both durations are real, non-negative
            # numbers. The check is written positively because any comparison
            # with NaN is False, so a "< 0" test would let NaN through.
            if not (time_since_start_min >= 0 and time_to_next_min >= 0):
                continue

            current_event = events[i]
            next_event = events[i + 1]

            records.append({
                "subject_id": subject_id,
                "hadm_id": hadm_id,
                "case_id": case_id,
                "prefix_len": i + 1,
                "prefix_events_str": ">".join(events[: i + 1]),
                "current_event": current_event,
                "current_event_id": name_to_id.get(current_event, -1),
                "next_event": next_event,
                "next_event_id": name_to_id.get(next_event, -1),
                "time_since_start_min": time_since_start_min,
                "time_to_next_min": time_to_next_min,
                "full_trace_len": full_trace_len,
            })

    # Passing the column list keeps the schema stable when `records` is empty,
    # so a CSV written from the result always has a header.
    ppm_df = pd.DataFrame(records, columns=output_cols)
    print(f"[PPM] prefix–next_event row 수: {len(ppm_df)}")
    print(f"[PPM] hadm_id 수: {ppm_df['hadm_id'].nunique() if not ppm_df.empty else 0}")

    if len(ppm_df) > 0:
        percentiles = [0.5, 0.9, 0.99, 0.999]
        print("\n[PPM] time_to_next_min 분포 (raw):")
        print(ppm_df["time_to_next_min"].describe(percentiles=percentiles))
        print("\n[PPM] time_since_start_min 분포 (raw):")
        print(ppm_df["time_since_start_min"].describe(percentiles=percentiles))

    return ppm_df


# ==============================
# 7. ver142 cohort 구축 main
# ==============================

def build_ver142_cohort():
    """Build and save the version-142 cohort artifacts from the 140 event log.

    Steps: load the event log, filter admissions by episode length, clean the
    traces, then write the clean event log, the event-id map and the PPM
    prefix dataset as CSV files. When no trace survives cleaning a message is
    printed and nothing is written.
    """
    # Load -> episode-length filter -> trace cleaning.
    raw_events = load_event_log(INPUT_EVENT_LOG_PATH)
    time_filtered = sanity_filter_time(raw_events)
    clean_events = clean_event_log(
        time_filtered,
        min_events_per_case=MIN_EVENTS_PER_CASE,
    )

    if clean_events.empty:
        print("[MAIN] clean_events가 비어 있습니다. 이전 단계 이벤트 생성 로직을 확인하세요.")
        return

    # Clean event log.
    clean_events.to_csv(CLEAN_EVENT_LOG_PATH, index=False)
    print(f"[SAVE] 클린 이벤트 로그 저장: {CLEAN_EVENT_LOG_PATH}")

    # event_name <-> event_id mapping.
    event_id_map = build_event_id_map(clean_events)
    event_id_map.to_csv(EVENT_ID_MAP_PATH, index=False)
    print(f"[SAVE] 이벤트 ID 매핑 저장: {EVENT_ID_MAP_PATH}")

    # Prefix -> next-event dataset.
    ppm_df = build_ppm_prefix_dataset(clean_events, event_id_map)
    ppm_df.to_csv(PPM_DATA_PATH, index=False)
    print(f"[SAVE] PPM prefix–next_event 데이터셋 저장: {PPM_DATA_PATH}")

    summary_lines = (
        "\n[INFO] ver142 cohort 구축 완료.",
        f"  - clean event log : {CLEAN_EVENT_LOG_PATH}",
        f"  - event_id map    : {EVENT_ID_MAP_PATH}",
        f"  - PPM dataset     : {PPM_DATA_PATH}",
    )
    for line in summary_lines:
        print(line)


# ==============================
# 8. PPM 데이터 로딩 (모델용)
# ==============================

def load_ppm_dataset(path: str) -> pd.DataFrame:
    """Load the PPM prefix dataset and remove unusable and extreme rows.

    Rows are removed when ``next_event_id``, ``time_since_start_min`` or
    ``time_to_next_min`` is missing, when either time value is negative, or
    when either time value exceeds its trim threshold
    (``MAX_TIME_TO_NEXT_TRIM_MIN`` / ``MAX_TIME_SINCE_START_TRIM_MIN``).
    Distribution summaries are printed before and after trimming.

    Args:
        path: Location of the PPM dataset CSV.

    Returns:
        The filtered dataset.

    Raises:
        ValueError: A required column is missing (the first one is reported).
    """
    df = pd.read_csv(path)

    required_cols = (
        "subject_id",
        "hadm_id",
        "case_id",
        "prefix_len",
        "prefix_events_str",
        "current_event",
        "current_event_id",
        "next_event",
        "next_event_id",
        "time_since_start_min",
        "time_to_next_min",
        "full_trace_len",
    )
    missing = [name for name in required_cols if name not in df.columns]
    if missing:
        c = missing[0]
        raise ValueError(f"PPM 데이터셋에 '{c}' 컬럼이 없습니다. 현재 컬럼: {list(df.columns)}")

    pcts = [0.5, 0.9, 0.99, 0.999]

    # 1) Remove rows with missing targets/times, then rows with negative times.
    df = df.dropna(subset=["next_event_id", "time_since_start_min", "time_to_next_min"])

    before = len(df)
    non_negative = (df["time_since_start_min"] >= 0) & (df["time_to_next_min"] >= 0)
    df = df[non_negative]
    print(f"[LOAD] 음수 time 제거: {before} -> {len(df)} rows")

    print(f"[LOAD] PPM 데이터 로딩 완료: {len(df)} rows, {df['hadm_id'].nunique()} hadm_id")
    print(f"[LOAD] next_event_id 고유 개수: {df['next_event_id'].nunique()}")

    # 2) Distributions before trimming.
    print("\n=== time_to_next_min 분포 (raw) ===")
    print(df["time_to_next_min"].describe(percentiles=pcts))
    print("\n=== time_since_start_min 분포 (raw) ===")
    print(df["time_since_start_min"].describe(percentiles=pcts))

    # 3) Trim rows beyond the thresholds.
    before_trim = len(df)
    gap_ok = df["time_to_next_min"] <= MAX_TIME_TO_NEXT_TRIM_MIN
    elapsed_ok = df["time_since_start_min"] <= MAX_TIME_SINCE_START_TRIM_MIN
    df = df[gap_ok & elapsed_ok].copy()
    after_trim = len(df)

    print(f"\n[TRIM] 극단값 제거 기준:")
    print(f"  - time_to_next_min <= {MAX_TIME_TO_NEXT_TRIM_MIN} 분 (~30일)")
    print(f"  - time_since_start_min <= {MAX_TIME_SINCE_START_TRIM_MIN} 분 (~30일)")
    print(f"[TRIM] 극단값 제거: {before_trim} -> {after_trim} rows (제거: {before_trim - after_trim})")

    # 4) Distributions after trimming.
    if len(df) == 0:
        print("[TRIM] 모든 row가 제거되었습니다. 극단값 기준을 다시 조정해야 합니다.")
    else:
        print("\n=== time_to_next_min 분포 (trimmed) ===")
        print(df["time_to_next_min"].describe(percentiles=pcts))
        print("\n=== time_since_start_min 분포 (trimmed) ===")
        print(df["time_since_start_min"].describe(percentiles=pcts))

    return df


# ==============================
# 9. hadm_id 기준 Train / Val / Test Split
# ==============================

def split_by_hadm(df: pd.DataFrame,
                  test_size: float = TEST_SIZE,
                  val_size: float = VAL_SIZE,
                  random_state: int = RANDOM_STATE):
    """Split rows into train/validation/test by ``hadm_id``.

    The distinct ``hadm_id`` values are split, not the rows, so all rows of
    one admission land in the same subset. ``test_size`` and ``val_size`` are
    fractions of the full set of admissions.

    Args:
        df: Dataset with a ``hadm_id`` column.
        test_size: Fraction of admissions for the test subset.
        val_size: Fraction of admissions for the validation subset.
        random_state: Seed passed to both splits.

    Returns:
        ``(df_train, df_val, df_test)`` as independent copies.
    """
    unique_hadm = df["hadm_id"].drop_duplicates().values

    # First carve off the test admissions ...
    hadm_train_val, hadm_test = train_test_split(
        unique_hadm,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )

    # ... then split the remainder; the validation share is rescaled because
    # it is expressed relative to the full set.
    val_ratio = val_size / (1.0 - test_size)
    hadm_train, hadm_val = train_test_split(
        hadm_train_val,
        test_size=val_ratio,
        random_state=random_state,
        shuffle=True,
    )

    df_train, df_val, df_test = (
        df[df["hadm_id"].isin(ids)].copy()
        for ids in (hadm_train, hadm_val, hadm_test)
    )

    print(f"[SPLIT] train hadm_id: {len(hadm_train)}, rows: {len(df_train)}")
    print(f"[SPLIT] val   hadm_id: {len(hadm_val)}, rows: {len(df_val)}")
    print(f"[SPLIT] test  hadm_id: {len(hadm_test)}, rows: {len(df_test)}")

    return df_train, df_val, df_test


# ==============================
# 10. Feature 구성
# ==============================

def build_feature_matrices(df_train: pd.DataFrame,
                           df_val: pd.DataFrame,
                           df_test: pd.DataFrame):
    """Build the LightGBM feature matrices and label vectors for all splits.

    Features: ``prefix_len``, ``time_since_start_min``, ``full_trace_len``
    (numeric) and ``current_event_id``, ``prefix_events_str`` (categorical).

    The categorical columns of all three matrices share one categorical dtype
    derived from the training split, so a given value maps to the same
    category code everywhere; values that never occur in the training split
    become missing in the validation/test matrices. The input frames are not
    modified: the matrices are copies.

    Labels:
      - classification: ``next_event_id - 1`` (0-based class index)
      - regression: ``time_to_next_min`` clipped at ``MAX_TIME_TO_NEXT_MIN``

    NOTE(review): ``full_trace_len`` is the length of the complete trace. For
    a still-running case that value would presumably not be known at
    prediction time; confirm that using it as a feature is intended.

    Args:
        df_train: Training rows of the PPM dataset.
        df_val: Validation rows of the PPM dataset.
        df_test: Test rows of the PPM dataset.

    Returns:
        ``(X_train, X_val, X_test, y_train_cls, y_val_cls, y_test_cls,
        y_train_reg, y_val_reg, y_test_reg, num_cols, cat_cols)``.
    """
    num_cols = ["prefix_len", "time_since_start_min", "full_trace_len"]
    cat_cols = ["current_event_id", "prefix_events_str"]
    feature_cols = num_cols + cat_cols

    # Work on copies so the caller's frames keep their original dtypes.
    X_train = df_train[feature_cols].copy()
    X_val = df_val[feature_cols].copy()
    X_test = df_test[feature_cols].copy()

    for c in cat_cols:
        # One vocabulary per column, taken from the training split only.
        train_dtype = X_train[c].astype("category").dtype
        X_train[c] = X_train[c].astype(train_dtype)
        X_val[c] = X_val[c].astype(train_dtype)
        X_test[c] = X_test[c].astype(train_dtype)

    # LightGBM multiclass expects labels in 0 .. num_class-1; event ids start at 1.
    y_train_cls = df_train["next_event_id"].values - 1
    y_val_cls = df_val["next_event_id"].values - 1
    y_test_cls = df_test["next_event_id"].values - 1

    # Regression target: cap overly large gaps.
    def _clip_time_to_next(arr):
        return np.minimum(arr, MAX_TIME_TO_NEXT_MIN)

    y_train_reg = _clip_time_to_next(df_train["time_to_next_min"].values)
    y_val_reg = _clip_time_to_next(df_val["time_to_next_min"].values)
    y_test_reg = _clip_time_to_next(df_test["time_to_next_min"].values)

    print(f"[FEATURE] 사용 feature 컬럼: {feature_cols}")
    print(f"[FEATURE] 분류 target 클래스 수: {len(np.unique(y_train_cls))}")
    print(f"[FEATURE] 회귀 target 클리핑 상한 (min): {MAX_TIME_TO_NEXT_MIN}")

    return (
        X_train, X_val, X_test,
        y_train_cls, y_val_cls, y_test_cls,
        y_train_reg, y_val_reg, y_test_reg,
        num_cols, cat_cols
    )


# ==============================
# 11. LightGBM 학습 함수들
# ==============================

def train_lgbm_classifier(
    X_train, y_train,
    X_val, y_val,
    num_cols, cat_cols,
    model_path: str
):
    """Train the multiclass LightGBM model for the next event and save it.

    Training runs for up to 500 rounds with early stopping (patience 50) and
    evaluates on both the training and the validation data.

    The number of classes is ``max(label) + 1`` over the training and
    validation labels. Counting the distinct training labels is not enough:
    when the label ids have gaps, or a class appears only in the validation
    data, some label would be ``>= num_class``.

    Args:
        X_train: Training feature matrix.
        y_train: 0-based class labels for ``X_train``.
        X_val: Validation feature matrix.
        y_val: 0-based class labels for ``X_val``.
        num_cols: Names of the numeric features (not used here; kept so both
            training functions share a signature).
        cat_cols: Names of the categorical features.
        model_path: File the trained model is written to.

    Returns:
        The trained ``lgb.Booster``.
    """
    observed_labels = np.concatenate([np.ravel(y_train), np.ravel(y_val)])
    num_class = int(observed_labels.max()) + 1

    dataset_kwargs = {"categorical_feature": cat_cols, "free_raw_data": False}
    train_data = lgb.Dataset(X_train, label=y_train, **dataset_kwargs)
    val_data = lgb.Dataset(X_val, label=y_val, **dataset_kwargs)

    params = {
        "objective": "multiclass",
        "num_class": num_class,
        "metric": ["multi_logloss", "multi_error"],
        "learning_rate": 0.05,
        "num_leaves": 31,
        "max_depth": -1,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "min_data_in_leaf": 30,
        "lambda_l2": 1.0,
        "verbosity": -1,
        "force_col_wise": True,
        "seed": RANDOM_STATE,
    }

    print("\n[TRAIN] LightGBM 분류 모델 학습 시작 (next_event_id)...")
    model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        valid_names=["train", "val"],
        num_boost_round=500,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),
        ],
    )

    # Same resolution rule the evaluation functions use.
    best_iter = _get_best_iter(model)

    print(f"[TRAIN] 최적 반복 수: {best_iter}")
    model.save_model(model_path)
    print(f"[SAVE] 분류 모델 저장: {model_path}")
    return model


def train_lgbm_regressor(
    X_train, y_train,
    X_val, y_val,
    num_cols, cat_cols,
    model_path: str
):
    """Train the LightGBM regression model for the time to the next event.

    Training runs for up to 500 rounds with early stopping (patience 50) and
    evaluates L2 and L1 on both the training and the validation data.

    Args:
        X_train: Training feature matrix.
        y_train: Regression targets for ``X_train``.
        X_val: Validation feature matrix.
        y_val: Regression targets for ``X_val``.
        num_cols: Names of the numeric features (not used here; kept so both
            training functions share a signature).
        cat_cols: Names of the categorical features.
        model_path: File the trained model is written to.

    Returns:
        The trained ``lgb.Booster``.
    """
    dataset_kwargs = {"categorical_feature": cat_cols, "free_raw_data": False}
    train_data = lgb.Dataset(X_train, label=y_train, **dataset_kwargs)
    val_data = lgb.Dataset(X_val, label=y_val, **dataset_kwargs)

    params = {
        "objective": "regression",
        "metric": ["l2", "l1"],
        "learning_rate": 0.05,
        "num_leaves": 31,
        "max_depth": -1,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "min_data_in_leaf": 30,
        "lambda_l2": 1.0,
        "verbosity": -1,
        "force_col_wise": True,
        "seed": RANDOM_STATE,
    }

    print("\n[TRAIN] LightGBM 회귀 모델 학습 시작 (time_to_next_min)...")
    model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        valid_names=["train", "val"],
        num_boost_round=500,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),
        ],
    )

    # Same resolution rule the evaluation functions use.
    best_iter = _get_best_iter(model)

    print(f"[TRAIN] 최적 반복 수: {best_iter}")
    model.save_model(model_path)
    print(f"[SAVE] 회귀 모델 저장: {model_path}")
    return model


# ==============================
# 12. 평가 함수
# ==============================

def _get_best_iter(model: "lgb.Booster") -> int:
    """Return the iteration count to use when predicting with ``model``.

    Resolution order:
      1. ``model.best_iteration`` when it is a positive number;
      2. otherwise ``model.current_iteration()`` when available and not None;
      3. otherwise ``model.num_trees()``.

    Any non-positive ``best_iteration`` is treated as "not set", not only
    ``0``, so a negative sentinel value is never returned as an iteration
    count.

    Args:
        model: A trained booster exposing ``best_iteration``,
            ``current_iteration()`` and ``num_trees()``.

    Returns:
        The resolved number of iterations.
    """
    best_iter = getattr(model, "best_iteration", None)
    if best_iter is not None and best_iter > 0:
        return best_iter

    if hasattr(model, "current_iteration"):
        current = model.current_iteration()
        if current is not None:
            return current

    return model.num_trees()


def eval_classifier(model, X_test, y_test):
    """Print top-1/top-3 accuracy and a classification report on the test set.

    Args:
        model: Trained multiclass booster; ``predict`` is expected to return
            one row of class scores per sample.
        X_test: Test feature matrix.
        y_test: 0-based true class labels as a NumPy array.
    """
    prob = model.predict(X_test, num_iteration=_get_best_iter(model))

    # Top-1: the highest-scoring class per row.
    y_pred = np.argmax(prob, axis=1)
    acc_top1 = accuracy_score(y_test, y_pred)

    # Top-3: a hit when the true label is among the three highest scores.
    ranked = np.argsort(prob, axis=1)
    top3 = ranked[:, -3:]
    true_column = y_test.reshape(-1, 1)
    correct_top3 = np.any(top3 == true_column, axis=1)
    acc_top3 = correct_top3.mean()

    print("\n[EVAL][CLASS] Test Top-1 Accuracy :", acc_top1)
    print("[EVAL][CLASS] Test Top-3 Accuracy :", acc_top3)
    print("\n[EVAL][CLASS] Classification Report (Top-1 기준):")
    report = classification_report(y_test, y_pred, digits=4, zero_division=0)
    print(report)


def eval_regressor(model, X_test, y_test):
    """Print MAE and RMSE of the regression model on the test set.

    Args:
        model: Trained regression booster.
        X_test: Test feature matrix.
        y_test: True target values (minutes, per the printed labels).
    """
    pred = model.predict(X_test, num_iteration=_get_best_iter(model))

    residual = pred - y_test
    mae = np.mean(np.abs(residual))
    rmse = np.sqrt(np.mean(residual ** 2))

    print("\n[EVAL][REG] Test MAE (min) :", mae)
    print("[EVAL][REG] Test RMSE (min):", rmse)


# ==============================
# 13. 전체 MAIN
# ==============================

def main():
    """Run the full pipeline: cohort build (if needed), training, evaluation.

    Steps:
      1. Build the version-142 cohort when the PPM dataset file is missing.
      2. Load the PPM dataset (including extreme-value trimming).
      3. Split by ``hadm_id`` into train/validation/test.
      4. Build feature matrices and labels.
      5. Train the next-event classifier and the time-to-next regressor.
      6. Evaluate both models on the test split.

    The function stops early with a message when the cohort build did not
    produce the PPM dataset file, or when no rows are left after loading.
    """
    # 1) Build the cohort only when the PPM dataset is not there yet.
    if not os.path.exists(PPM_DATA_PATH):
        print("[MAIN] ver142 PPM 데이터가 없어 cohort를 먼저 구축합니다.")
        build_ver142_cohort()
        # The cohort build returns without writing anything when no trace
        # survives cleaning; loading would then fail on a missing file.
        if not os.path.exists(PPM_DATA_PATH):
            print("[MAIN] cohort 구축 후에도 PPM 데이터 파일이 없습니다. 학습을 중단합니다.")
            return
    else:
        print(f"[MAIN] PPM 데이터가 이미 존재합니다: {PPM_DATA_PATH}")

    # 2) Load the PPM dataset (extreme values are trimmed inside).
    df = load_ppm_dataset(PPM_DATA_PATH)
    if df.empty:
        # Nothing to split or train on.
        print("[MAIN] 학습에 사용할 PPM row가 없습니다. 학습을 중단합니다.")
        return

    # 3) Train/validation/test split by hadm_id.
    df_train, df_val, df_test = split_by_hadm(df)

    # 4) Features and labels.
    (
        X_train, X_val, X_test,
        y_train_cls, y_val_cls, y_test_cls,
        y_train_reg, y_val_reg, y_test_reg,
        num_cols, cat_cols
    ) = build_feature_matrices(df_train, df_val, df_test)

    # 5) Classifier (next_event_id).
    cls_model = train_lgbm_classifier(
        X_train, y_train_cls,
        X_val, y_val_cls,
        num_cols, cat_cols,
        MODEL_NEXT_EVENT_PATH
    )

    # 6) Regressor (time_to_next_min).
    reg_model = train_lgbm_regressor(
        X_train, y_train_reg,
        X_val, y_val_reg,
        num_cols, cat_cols,
        MODEL_TIME_TO_NEXT_PATH
    )

    # 7) Test-set evaluation.
    eval_classifier(cls_model, X_test, y_test_cls)
    eval_regressor(reg_model, X_test, y_test_reg)

    print("\n[INFO] LightGBM 기반 PPM 베이스라인 학습 및 평가 완료.")

# Run the whole pipeline only when executed as a script/notebook cell,
# not when this code is imported as a module.
if __name__ == "__main__":
    main()


[MAIN] PPM 데이터가 이미 존재합니다: ./cohort\cohort_ver142_ppm_prefix_next_event.csv
[LOAD] 음수 time 제거: 24991 -> 24991 rows
[LOAD] PPM 데이터 로딩 완료: 24991 rows, 1869 hadm_id
[LOAD] next_event_id 고유 개수: 13

=== time_to_next_min 분포 (raw) ===
count    2.499100e+04
mean     2.014538e+03
std      1.161987e+05
min      0.000000e+00
50%      8.100000e+01
90%      1.440000e+03
99%      8.736903e+03
99.9%    4.778485e+04
max      1.448079e+07
Name: time_to_next_min, dtype: float64

=== time_since_start_min 분포 (raw) ===
count    2.499100e+04
mean     2.040683e+04
std      5.081532e+05
min      0.000000e+00
50%      6.110000e+02
90%      5.558000e+03
99%      2.929445e+04
99.9%    1.086022e+07
max      1.633377e+07
Name: time_since_start_min, dtype: float64

[TRIM] 극단값 제거 기준:
  - time_to_next_min <= 43200 분 (~30일)
  - time_since_start_min <= 43200 분 (~30일)
[TRIM] 극단값 제거: 24991 -> 24832 rows (제거: 159)

=== time_to_next_min 분포 (trimmed) ===
count    24832.000000
mean       573.512138
std       1676.927713
min  

In [4]:
# Reload the trimmed PPM dataset (load_ppm_dataset prints its own raw/trimmed
# summaries) and keep it in `df` for the following cells.
df = load_ppm_dataset(PPM_DATA_PATH)

# Summary of the gap to the next event after trimming.
print("=== time_to_next_min (trimmed) ===")
print(df["time_to_next_min"].describe(percentiles=[0.5, 0.9, 0.99, 0.999]))


[LOAD] 음수 time 제거: 24991 -> 24991 rows
[LOAD] PPM 데이터 로딩 완료: 24991 rows, 1869 hadm_id
[LOAD] next_event_id 고유 개수: 13

=== time_to_next_min 분포 (raw) ===
count    2.499100e+04
mean     2.014538e+03
std      1.161987e+05
min      0.000000e+00
50%      8.100000e+01
90%      1.440000e+03
99%      8.736903e+03
99.9%    4.778485e+04
max      1.448079e+07
Name: time_to_next_min, dtype: float64

=== time_since_start_min 분포 (raw) ===
count    2.499100e+04
mean     2.040683e+04
std      5.081532e+05
min      0.000000e+00
50%      6.110000e+02
90%      5.558000e+03
99%      2.929445e+04
99.9%    1.086022e+07
max      1.633377e+07
Name: time_since_start_min, dtype: float64

[TRIM] 극단값 제거 기준:
  - time_to_next_min <= 43200 분 (~30일)
  - time_since_start_min <= 43200 분 (~30일)
[TRIM] 극단값 제거: 24991 -> 24832 rows (제거: 159)

=== time_to_next_min 분포 (trimmed) ===
count    24832.000000
mean       573.512138
std       1676.927713
min          0.000000
50%         80.000000
90%       1418.000000
99%       8051

In [5]:

# Summary of the elapsed time since the first event after trimming;
# relies on `df` assigned in an earlier cell.
print("\n=== time_since_start_min (trimmed) ===")
print(df["time_since_start_min"].describe(percentiles=[0.5, 0.9, 0.99, 0.999]))



=== time_since_start_min (trimmed) ===
count    24832.000000
mean      2020.633524
std       4164.794255
min          0.000000
50%        604.000000
90%       5309.000000
99%      21981.657167
99.9%    38034.704000
max      42786.000000
Name: time_since_start_min, dtype: float64
