In [4]:
import os
import pandas as pd
import numpy as np

# ============================================
# 0. 경로 설정
# ============================================
INPUT_PATH = "./../cohort/cohort_102_recalc_cum.csv"
EVENT_ID_MAP_PATH = "./../cohort/cohort_ver142_event_id_map.csv"
OUTPUT_PATH = "./../cohort/cohort_ver147_ppm_event_only_with_targets.csv"


# ============================================
# 1. Event ID Map 로딩
# ============================================
def load_event_map(path):
    m = pd.read_csv(path)
    m["event_id"] = m["event_id"].astype(int)
    return dict(zip(m["event_name"], m["event_id"]))


# ============================================
# 2. Event-only PPM 생성
# ============================================
def build_event_only_ppm(df, event_map):

    # ---- 기존 prefix 관련 컬럼 전체 삭제 ----
    drop_cols = [
        "prefix_len", "prefix_events_str",
        "current_event_id", "current_event",
        "next_event_id", "next_event",
        "time_to_next_min", "full_trace_len",
        "target_time_to_next", "target_next_evt", "target_remain_los",
    ]
    df = df[[c for c in df.columns if c not in drop_cols]]

    event_records = []

    for (sid, hid), g in df.groupby(["subject_id", "hadm_id"]):

        g = g.sort_values("time_since_start_min").reset_index(drop=True)
        n = len(g)

        # delta 계산용 prev 값
        g["prev_cum_ecg"] = g["cum_ecg_cnt"].shift(1).fillna(0)
        g["prev_cum_stemi"] = g["cum_stemi_cnt"].shift(1).fillna(0)
        g["prev_cum_trop"] = g["cum_trop_cnt"].shift(1).fillna(0)
        g["prev_trop_flag"] = g["trop_pos_flag"].shift(1).fillna(0)
        g["prev_pci_status"] = g["pci_status"].shift(1).fillna(0)

        # --------------------------------------------------------
        # ① 첫 row에서 ED_ARRIVAL 1회
        # --------------------------------------------------------
        row0 = g.iloc[0]
        base0 = row0.to_dict()
        event_records.append({
            **base0,
            "current_event": "ED_ARRIVAL",
            "current_event_id": event_map["ED_ARRIVAL"],
        })

        # --------------------------------------------------------
        # ② 중간 이벤트들 (ECG, TROP, PCI)
        # --------------------------------------------------------
        for i in range(1, n):
            row = g.iloc[i]
            base = row.to_dict()
            events = []

            # ECG 이벤트
            if row["cum_ecg_cnt"] > row["prev_cum_ecg"]:
                if row["cum_stemi_cnt"] > row["prev_cum_stemi"]:
                    events.append("ECG_STEMI_FLAG")
                else:
                    events.append("ECG_TAKEN")

            # TROP 이벤트
            if row["cum_trop_cnt"] > row["prev_cum_trop"]:
                if (row["prev_trop_flag"] == 0) and (row["trop_pos_flag"] == 1):
                    events.append("TROP_POSITIVE")
                else:
                    events.append("TROP_TAKEN")

            # PCI 시작
            if (row["prev_pci_status"] == 0) and (row["pci_status"] != 0):
                events.append("PCI_START")

            # 기록
            for ev in events:
                event_records.append({
                    **base,
                    "current_event": ev,
                    "current_event_id": event_map[ev],
                })

        # --------------------------------------------------------
        # ③ 종료 이벤트 DISCHARGE / DEATH
        # --------------------------------------------------------
        last_row = g.iloc[-1]
        base_last = last_row.to_dict()

        if int(last_row["target_mortality"]) == 1:
            ev = "DEATH"
        else:
            ev = "DISCHARGE"

        event_records.append({
            **base_last,
            "current_event": ev,
            "current_event_id": event_map[ev],
        })

        # --------------------------------------------------------
        # ④ EOS 1회
        # --------------------------------------------------------
        event_records.append({
            **base_last,
            "current_event": "EOS",
            "current_event_id": event_map["EOS"],
        })

    # 전체 event df 생성
    evt = pd.DataFrame(event_records)

    # 정렬
    evt = evt.sort_values(
        ["subject_id", "hadm_id", "time_since_start_min", "current_event_id"]
    ).reset_index(drop=True)

    # --------------------------------------------------------
    # prefix_len / next_event_id / time_to_next 계산
    # --------------------------------------------------------
    def _add_prefix_meta(g):
        g = g.sort_values(["time_since_start_min", "current_event_id"]).reset_index(drop=True)
        n = len(g)
        g["prefix_len"] = np.arange(1, n + 1)
        g["full_trace_len"] = n
        g["next_event_id"] = g["current_event_id"].shift(-1)
        g["next_event"] = g["current_event"].shift(-1)
        g["time_to_next_min"] = g["time_since_start_min"].shift(-1) - g["time_since_start_min"]
        return g

    evt = evt.groupby(["subject_id", "hadm_id"], group_keys=False).apply(_add_prefix_meta)

    # --------------------------------------------------------
    # prefix_events_str – 안전한 방식 (index mismatch 없음)
    # --------------------------------------------------------
    def _build_prefix_events(g):
        g = g.sort_values("prefix_len").copy()

        events = g["current_event"].tolist()
        prefixes = []
        acc = []
        for ev in events:
            acc.append(ev)
            prefixes.append(">".join(acc))

        g["prefix_events_str"] = prefixes
        return g

    evt = evt.groupby(["subject_id", "hadm_id"], group_keys=False).apply(_build_prefix_events)

    # --------------------------------------------------------
    # target_next_evt / target_time_to_next / target_remain_los 재계산
    # --------------------------------------------------------
    evt["target_next_evt"] = evt["next_event_id"]
    evt["target_time_to_next"] = evt["time_to_next_min"]

    def _calc_remaining(g):
        eos_time = g[g["current_event"] == "EOS"]["time_since_start_min"].iloc[0]
        g["target_remain_los"] = eos_time - g["time_since_start_min"]
        return g

    evt = evt.groupby(["subject_id", "hadm_id"], group_keys=False).apply(_calc_remaining)

    return evt


# ============================================
# 3. MAIN
# ============================================
def main():
    df = pd.read_csv(INPUT_PATH)
    event_map = load_event_map(EVENT_ID_MAP_PATH)

    evt = build_event_only_ppm(df, event_map)

    evt.to_csv(OUTPUT_PATH, index=False)
    print(f"[SAVE] ver147 저장 완료: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()

  evt = evt.groupby(["subject_id", "hadm_id"], group_keys=False).apply(_add_prefix_meta)
  evt = evt.groupby(["subject_id", "hadm_id"], group_keys=False).apply(_build_prefix_events)
  evt = evt.groupby(["subject_id", "hadm_id"], group_keys=False).apply(_calc_remaining)


[SAVE] ver147 저장 완료: ./../cohort/cohort_ver147_ppm_event_only_with_targets.csv
