In [9]:
import os
import pandas as pd
import numpy as np

# Cohort CSV that main() reads and passes to build_events().
INPUT_PATH = "./../cohort/cohort_102_recalc_cum.csv"
# CSV with event_name / event_id columns, read by load_event_map().
EVENT_ID_MAP_PATH = "./../cohort/cohort_ver142_event_id_map.csv"
# Destination CSV for the event-level table written by main().
OUTPUT_PATH = "./../cohort/cohort_ver146_ppm_event_only.csv"


def load_event_map(path):
    """Read the event-id lookup CSV and return it as a dict.

    Parameters
    ----------
    path : str or path-like
        CSV file containing at least the columns ``event_name`` and
        ``event_id``.

    Returns
    -------
    dict
        Mapping of each ``event_name`` to its ``event_id`` cast to int.
        If a name occurs more than once, the last occurrence wins.
    """
    mapping_table = pd.read_csv(path)
    # Cast in the frame itself (as before) so ids parsed as floats become ints.
    mapping_table["event_id"] = mapping_table["event_id"].astype(int)
    names = mapping_table["event_name"]
    ids = mapping_table["event_id"]
    return {name: event_id for name, event_id in zip(names, ids)}


def _row_events(row):
    """Return the event names implied by one snapshot row versus its predecessor.

    ``row`` must carry the ``prev_*`` helper values next to the current
    cumulative counters / flags. Events come back in the fixed order
    ECG, troponin, PCI; the list is empty when nothing changed.
    """
    events = []

    # The ECG counter rose: STEMI flag if the STEMI counter rose too,
    # otherwise a plain ECG.
    if row["cum_ecg_cnt"] > row["prev_cum_ecg"]:
        if row["cum_stemi_cnt"] > row["prev_cum_stemi"]:
            events.append("ECG_STEMI_FLAG")
        else:
            events.append("ECG_TAKEN")

    # The troponin counter rose: positive only on the 0 -> 1 flag transition.
    if row["cum_trop_cnt"] > row["prev_cum_trop"]:
        if (row["prev_trop_flag"] == 0) and (row["trop_pos_flag"] == 1):
            events.append("TROP_POSITIVE")
        else:
            events.append("TROP_TAKEN")

    # PCI start: status leaves 0.
    if (row["prev_pci_status"] == 0) and (row["pci_status"] != 0):
        events.append("PCI_START")

    return events


def build_events(df, event_map):
    """Expand per-admission snapshot rows into an event-level table.

    For every ``(subject_id, hadm_id)`` group, rows are ordered by
    ``time_since_start_min`` and the following events are emitted:

    1. one ``ED_ARRIVAL`` from the first row (changes on the first row itself
       produce no other event);
    2. for every later row, the events detected by comparing the cumulative
       counters / flags with the previous row (ECG, troponin, PCI start);
    3. one terminal ``DEATH`` (``target_mortality == 1`` on the last row) or
       ``DISCHARGE`` event built from the last row;
    4. one ``EOS`` event, also built from the last row.

    Each emitted record copies all columns of its source row, so the
    ``prev_*`` helper columns are part of the output as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``subject_id``, ``hadm_id``, ``time_since_start_min``,
        ``cum_ecg_cnt``, ``cum_stemi_cnt``, ``cum_trop_cnt``,
        ``trop_pos_flag``, ``pci_status`` and ``target_mortality``.
        Pre-existing prefix / next-event columns are discarded and rebuilt.
    event_map : dict
        Maps event name to event id; must contain every name emitted here.

    Returns
    -------
    pandas.DataFrame
        One row per event, sorted by admission, time and ``current_event_id``,
        with ``current_event``, ``current_event_id``, ``prefix_len``,
        ``full_trace_len``, ``next_event_id``, ``next_event`` and
        ``time_to_next_min`` added. The last event of each admission has NaN
        in the three "next" columns. An empty input yields an empty frame
        with the same columns.

    Raises
    ------
    KeyError
        If a required column or an emitted event name is missing.
    """
    # Discard all pre-existing prefix / event columns; they are recomputed below.
    stale_cols = [
        "prefix_len", "prefix_events_str", "current_event_id", "current_event",
        "next_event_id", "next_event", "time_to_next_min", "full_trace_len",
        "target_time_to_next",
    ]
    df = df.drop(columns=[c for c in stale_cols if c in df.columns])

    # helper column -> source column whose previous-row value it holds
    prev_sources = {
        "prev_cum_ecg": "cum_ecg_cnt",
        "prev_cum_stemi": "cum_stemi_cnt",
        "prev_cum_trop": "cum_trop_cnt",
        "prev_trop_flag": "trop_pos_flag",
        "prev_pci_status": "pci_status",
    }
    keys = ["subject_id", "hadm_id"]
    derived_cols = [
        "prefix_len", "full_trace_len", "next_event_id", "next_event",
        "time_to_next_min",
    ]
    records = []

    def emit(row, name):
        # One output record = full copy of the source row + the event label/id.
        records.append({
            **row.to_dict(),
            "current_event": name,
            "current_event_id": event_map[name],
        })

    for _, g in df.groupby(keys):
        g = g.sort_values("time_since_start_min").reset_index(drop=True)

        # Previous-row values for delta detection (0 for the first row).
        for prev_col, src_col in prev_sources.items():
            g[prev_col] = g[src_col].shift(1).fillna(0)

        # (1) The first row yields exactly one ED_ARRIVAL.
        emit(g.iloc[0], "ED_ARRIVAL")

        # (2) Every later row yields the events detected against its predecessor.
        for i in range(1, len(g)):
            row = g.iloc[i]
            for name in _row_events(row):
                emit(row, name)

        # (3) Terminal event, then (4) a single EOS, both copied from the last row.
        last = g.iloc[-1]
        emit(last, "DEATH" if last["target_mortality"] == 1 else "DISCHARGE")
        emit(last, "EOS")

    if not records:
        # No admissions at all: return an empty frame with the usual columns
        # instead of failing in sort_values on a column-less frame.
        return pd.DataFrame(
            columns=list(df.columns) + list(prev_sources)
            + ["current_event", "current_event_id"] + derived_cols
        )

    # NOTE(review): events sharing a timestamp (e.g. last-row events, the
    # terminal event and EOS) are ordered by current_event_id, so the final
    # order depends on the ids in event_map - confirm the ids are assigned so
    # that ED_ARRIVAL sorts first and EOS last.
    evt = pd.DataFrame(records)
    evt = evt.sort_values(
        keys + ["time_since_start_min", "current_event_id"]
    ).reset_index(drop=True)

    # Recompute prefix / next-event columns per admission. Vectorised groupby
    # operations replace groupby.apply, which operated on the grouping columns
    # (deprecated in pandas) and would lose subject_id / hadm_id once that
    # behaviour is removed.
    grouped = evt.groupby(keys, sort=False)
    evt = evt.assign(
        prefix_len=grouped.cumcount() + 1,
        full_trace_len=grouped["current_event_id"].transform("size"),
        next_event_id=grouped["current_event_id"].shift(-1),
        next_event=grouped["current_event"].shift(-1),
        time_to_next_min=(
            grouped["time_since_start_min"].shift(-1)
            - evt["time_since_start_min"]
        ),
    )

    return evt


def main():
    """Build the event-level table from the cohort CSV and save it.

    Reads ``INPUT_PATH`` and ``EVENT_ID_MAP_PATH``, runs ``build_events``,
    writes the result to ``OUTPUT_PATH`` (without the index) and prints a
    summary of how many ED_ARRIVAL events each admission received.
    """
    cohort = pd.read_csv(INPUT_PATH)
    name_to_id = load_event_map(EVENT_ID_MAP_PATH)
    evt = build_events(cohort, name_to_id)

    evt.to_csv(OUTPUT_PATH, index=False)
    print(f"[SAVE] {OUTPUT_PATH} 저장 완료")

    # Sanity check: distribution of ED_ARRIVAL counts per (subject_id, hadm_id).
    arrivals = evt.loc[evt["current_event"].eq("ED_ARRIVAL")]
    arrivals_per_admission = (
        arrivals.groupby(["subject_id", "hadm_id"])["current_event"].count()
    )
    print("\n[CHECK] ED_ARRIVAL 개수 (환자당):")
    print(arrivals_per_admission.describe())


# Run the pipeline when this code is executed directly (script or notebook
# cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


  evt = evt.groupby(["subject_id","hadm_id"], group_keys=False).apply(add_prefix)


[SAVE] ./../cohort/cohort_ver146_ppm_event_only.csv 저장 완료

[CHECK] ED_ARRIVAL 개수 (환자당):
count    1929.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: current_event, dtype: float64


In [14]:
# Check on the saved file: distribution of EOS events per (subject_id, hadm_id).
df = pd.read_csv("./../cohort/cohort_ver146_ppm_event_only.csv")
eos_rows = df.loc[df["current_event"].eq("EOS")]
eos_per_admission = eos_rows.groupby(["subject_id", "hadm_id"])["current_event"].count()
print(eos_per_admission.describe())


count    1929.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: current_event, dtype: float64


In [15]:
# Check on the saved file: distribution of ED_ARRIVAL events per (subject_id, hadm_id).
df = pd.read_csv("./../cohort/cohort_ver146_ppm_event_only.csv")
arrival_rows = df.loc[df["current_event"].eq("ED_ARRIVAL")]
arrivals_per_admission = arrival_rows.groupby(["subject_id", "hadm_id"])["current_event"].count()
print(arrivals_per_admission.describe())


count    1929.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: current_event, dtype: float64
