In [None]:
import os
import json
import numpy as np
import pandas as pd

# ----------------------------------------
# 0. Paths and constants
# ----------------------------------------
BASE_DIR = "./../cohort"

# Input CSV files, all resolved relative to BASE_DIR.
COHORT_PATH, EVENT_LOG_PATH, CHARTEVENTS_PATH = (
    os.path.join(BASE_DIR, csv_name)
    for csv_name in (
        "cohort_ver183_add_delay_missing_flag.csv",
        "event_log_stemi_all.csv",
        "chartevents_ver50.csv",
    )
)

# EOS token ID: the target_next_evt value of an admission's last event,
# i.e. the one that has no following event.
EOS_ID = 0

# event_name -> current_event_id (same mapping as cohort_ver127).
# IDs are assigned by position starting at 1, so the order below is significant.
EVENT_ID_MAP = {
    event_name: event_id
    for event_id, event_name in enumerate(
        (
            "ED_ARRIVAL",      # 1
            "ECG_STEMI_FLAG",  # 2
            "ECG_TAKEN",       # 3
            "TROP_POSITIVE",   # 4
            "TROP_TAKEN",      # 5
            "ANTI_PLT_ADMIN",  # 6
            "ANTI_PLT_ORDER",  # 7
            "PCI_START",       # 8
            "ED_DEPARTURE",    # 9
            "DEATH",           # 10
            "DISCHARGE",       # 11
            "ICU_INTIME",      # 12
            "ICU_OUTTIME",     # 13
        ),
        start=1,
    )
}

# ----------------------------------------
# 1. Data loading
# ----------------------------------------
print("[LOAD] cohort_ver183_add_delay_missing_flag.csv 로딩 중...")
cohort = pd.read_csv(COHORT_PATH)

print("[LOAD] event_log_stemi_all.csv 로딩 중...")
event_log = pd.read_csv(EVENT_LOG_PATH, parse_dates=["timestamp"])

print("[LOAD] chartevents_ver50.csv 로딩 중 (HR/BP 전용)...")
chartevents = pd.read_csv(CHARTEVENTS_PATH, parse_dates=["charttime"])

# Normalise identifier columns to int64 so the three tables compare and
# join on the same dtype.
cohort = cohort.astype({"hadm_id": "int64", "subject_id": "int64"})
event_log = event_log.astype({"hadm_id": "int64", "subject_id": "int64"})
chartevents = chartevents.astype({"hadm_id": "int64"})

# Keep only the events whose admission is part of the cohort.
event_log = event_log.loc[event_log["hadm_id"].isin(cohort["hadm_id"])].copy()

# ----------------------------------------
# 2. Event filtering and current_event_id mapping
# ----------------------------------------
# Keep only the event types that have an ID, attach that ID, then order the
# rows by admission, time and event ID.
event_log = (
    event_log.loc[lambda df: df["event_name"].isin(list(EVENT_ID_MAP))]
    .assign(current_event_id=lambda df: df["event_name"].map(EVENT_ID_MAP))
    .sort_values(by=["hadm_id", "timestamp", "current_event_id"])
)

print(f"[INFO] 이벤트 수: {len(event_log)} (hadm_id {event_log['hadm_id'].nunique()}개)")

# ----------------------------------------
# 3. hadm_id별 이벤트 시퀀스 기반 feature 계산
#    (17~31, 35~38, 34 포함)
# ----------------------------------------
def _extract_valuenum(attr_str):
    """Return the ``valuenum`` field of a JSON attribute string as float, or NaN."""
    if not isinstance(attr_str, str) or not attr_str.strip():
        return np.nan
    try:
        parsed = json.loads(attr_str)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return np.nan
    # Valid JSON that is not an object (list, number, string) has no fields.
    if not isinstance(parsed, dict):
        return np.nan
    raw = parsed.get("valuenum")
    if raw is None:
        return np.nan
    try:
        return float(raw)
    except (TypeError, ValueError, OverflowError):
        return np.nan


def _flag_since_first(g, event_name):
    """Return 1 for rows timestamped at/after the first ``event_name`` row, else 0."""
    hit_times = g.loc[g["event_name"] == event_name, "timestamp"]
    if hit_times.empty:
        return pd.Series(0, index=g.index, dtype="int64")
    return (g["timestamp"] >= hit_times.iloc[0]).astype(int)


def build_features_per_admission(g: pd.DataFrame) -> pd.DataFrame:
    """
    Compute event-level features and targets for the events of one admission.

    Parameters
    ----------
    g : pd.DataFrame
        Events of a single hadm_id. The columns read here are ``timestamp``
        (datetime), ``event_name``, ``current_event_id`` and ``attributes``
        (JSON string; only read for troponin events).

    Returns
    -------
    pd.DataFrame
        A time-ordered copy of ``g`` with these columns added:
        - prefix_len, time_since_ed, time_since_last, is_night
        - cum_ecg_cnt, cum_stemi_cnt, cum_trop_cnt
        - stemi_flag, trop_pos_flag
        - trop_value, last_trop, run_max_trop, trop_trend
        - pci_status (0/1)
        - time_since_start_min
        - target_next_evt, target_time_to_next
        - target_mortality, target_remain_los (days)
    """
    # A single-key sort uses quicksort, which is not stable and may shuffle
    # events that share a timestamp. Sorting on both keys keeps simultaneous
    # events in event-id order, so the next-event targets are deterministic.
    g = g.sort_values(["timestamp", "current_event_id"]).copy()
    names = g["event_name"]
    ts = g["timestamp"]

    # 17. prefix_len: 1-based position of the event within the admission
    g["prefix_len"] = np.arange(1, len(g) + 1)

    # Reference time: first ED_ARRIVAL, falling back to the earliest event
    ed_times = ts[names == "ED_ARRIVAL"]
    ed_time = ed_times.iloc[0] if not ed_times.empty else ts.min()

    # 18. time_since_ed (minutes; events before ED arrival are clamped to 0)
    g["time_since_ed"] = ((ts - ed_time).dt.total_seconds() / 60.0).clip(lower=0.0)

    # 19. time_since_last (minutes; 0 for the first event)
    g["time_since_last"] = (ts.diff().dt.total_seconds() / 60.0).fillna(0.0)

    # 20. is_night (hour of day >= 22 or <= 7)
    # NOTE(review): this counts 07:00-07:59 as night; confirm whether the
    # window is meant to end at 07:00 sharp.
    hours = ts.dt.hour
    g["is_night"] = ((hours >= 22) | (hours <= 7)).astype(int)

    # 21-23. cumulative event counts
    g["cum_ecg_cnt"] = (names == "ECG_TAKEN").cumsum()
    g["cum_stemi_cnt"] = (names == "ECG_STEMI_FLAG").cumsum()
    g["cum_trop_cnt"] = (names == "TROP_TAKEN").cumsum()

    # 24. stemi_flag: 1 from the first ECG_STEMI_FLAG onwards
    g["stemi_flag"] = _flag_since_first(g, "ECG_STEMI_FLAG")

    # 25. trop_pos_flag: 1 from the first TROP_POSITIVE onwards
    g["trop_pos_flag"] = _flag_since_first(g, "TROP_POSITIVE")

    # 26-28. troponin values, parsed from the JSON in `attributes`
    trop_mask = names.isin(["TROP_TAKEN", "TROP_POSITIVE"])
    trop_vals = pd.Series(np.nan, index=g.index, dtype="float64")
    if trop_mask.any():
        trop_vals.loc[trop_mask] = (
            g.loc[trop_mask, "attributes"].map(_extract_valuenum).astype("float64")
        )
    g["trop_value"] = trop_vals

    # 26. last_trop: most recent troponin value (0 before any measurement)
    measured = g["trop_value"].ffill()
    g["last_trop"] = measured.fillna(0.0)

    # 27. run_max_trop: running maximum of the troponin value so far
    g["run_max_trop"] = g["last_trop"].cummax()

    # 28. trop_trend: change versus the previously known troponin value.
    # It is 0 while no earlier measurement exists, so the first test does not
    # register as a jump from 0 to its own value.
    prev_measured = measured.shift(1)
    g["trop_trend"] = (g["last_trop"] - prev_measured).where(prev_measured.notna(), 0.0)

    # 29. pci_status: 0 before PCI_START, 1 from it onwards
    g["pci_status"] = _flag_since_first(g, "PCI_START")

    # 30. time_since_start_min: minutes elapsed since the first event
    g["time_since_start_min"] = (ts - ts.min()).dt.total_seconds() / 60.0

    # 36. target_next_evt: current_event_id of the next event, EOS_ID if none
    g["target_next_evt"] = g["current_event_id"].shift(-1).fillna(EOS_ID).astype(int)

    # 37. target_time_to_next: minutes until the next event (0 for the last)
    g["target_time_to_next"] = (
        (ts.shift(-1) - ts).dt.total_seconds() / 60.0
    ).fillna(0.0)

    # 35. target_mortality: whether the admission contains a DEATH event
    g["target_mortality"] = int((names == "DEATH").any())

    # 38. target_remain_los: time from the current event to discharge, in days.
    # Discharge time is the first DISCHARGE, else the first DEATH, else the
    # last event of the admission.
    end_times = ts[names == "DISCHARGE"]
    if end_times.empty:
        end_times = ts[names == "DEATH"]
    dischtime = end_times.iloc[0] if not end_times.empty else ts.max()

    # 86400 seconds per day (dividing by 60 and then 24 would give minutes/24).
    g["target_remain_los"] = (dischtime - ts).dt.total_seconds() / 86400.0

    return g

print("[FEAT] hadm_id별 이벤트 기반 feature 계산 중...")
# Build the features one admission at a time and concatenate the pieces.
# groupby(...).apply(...) is avoided on purpose: passing the grouping column
# to the applied function is deprecated in pandas, and once it is excluded
# the result would no longer carry hadm_id, which later steps rely on.
ev_feat = pd.concat(
    [
        build_features_per_admission(adm_events)
        for _, adm_events in event_log.groupby("hadm_id")
    ]
)

# ----------------------------------------
# 4. Attach HR / BP (32, 33)
#    - per hadm_id: merge_asof (previous / next reading), then average
# ----------------------------------------
signal_cols = ["hadm_id", "charttime", "valuenum"]
hr = chartevents.loc[chartevents["variable_name"] == "heart_rate", signal_cols].copy()
bp = chartevents.loc[chartevents["variable_name"] == "mean_bp", signal_cols].copy()

def attach_signal_mean_prev_next(df_events: pd.DataFrame,
                                 df_signal: pd.DataFrame,
                                 out_col: str,
                                 time_col_evt: str = "timestamp",
                                 time_col_sig: str = "charttime") -> pd.Series:
    """
    Estimate a signal value at each event time from the surrounding readings.

    For every hadm_id, the reading at or before the event time (prev) and the
    reading at or after it (next) are looked up with ``merge_asof`` and
    averaged. When only one side exists that value is used; when the
    admission has no usable reading the result is NaN.

    Parameters
    ----------
    df_events : pd.DataFrame
        Events; must contain ``hadm_id`` and ``time_col_evt``.
    df_signal : pd.DataFrame
        Readings; must contain ``hadm_id``, ``time_col_sig`` and ``valuenum``.
        Rows with a missing time or value are ignored.
    out_col : str
        Name given to the returned Series.
    time_col_evt, time_col_sig : str
        Time column names in ``df_events`` / ``df_signal``.

    Returns
    -------
    pd.Series
        Float values indexed by the index labels of ``df_events`` (ordered by
        hadm_id, then event time), so it can be assigned back by label.
    """
    if df_events.empty:
        return pd.Series(index=df_events.index, dtype="float64", name=out_col)

    # merge_asof rejects null keys, and a NaN reading would hide the nearest
    # real value, so unusable rows are dropped up front.
    usable_signal = df_signal.dropna(subset=[time_col_sig, "valuenum"])

    # Split the signal once instead of scanning the whole frame per admission.
    signal_by_adm = {
        adm: grp.sort_values(time_col_sig)
        for adm, grp in usable_signal.groupby("hadm_id")
    }

    pieces = []
    for hadm_id, sub_evt in df_events.groupby("hadm_id"):
        sub_evt = sub_evt.sort_values(time_col_evt)
        values = np.full(len(sub_evt), np.nan, dtype="float64")

        sig = signal_by_adm.get(hadm_id)
        has_time = sub_evt[time_col_evt].notna().to_numpy()

        if sig is not None and has_time.any():
            left = sub_evt.loc[has_time, [time_col_evt]].rename(
                columns={time_col_evt: "t"}
            )
            right = sig[[time_col_sig, "valuenum"]].rename(
                columns={time_col_sig: "t"}
            )

            prev = pd.merge_asof(left, right, on="t", direction="backward")["valuenum"]
            nxt = pd.merge_asof(left, right, on="t", direction="forward")["valuenum"]
            mean_vals = pd.concat([prev, nxt], axis=1).mean(axis=1, skipna=True)

            # merge_asof returns a fresh 0..n-1 index, so the values are
            # written back by position; aligning by label would mismatch the
            # events' own index and scatter or drop values.
            values[has_time] = mean_vals.to_numpy(dtype="float64")

        pieces.append(pd.Series(values, index=sub_evt.index, name=out_col))

    return pd.concat(pieces)

print("[FEAT] current_heart_rate 계산 중...")
ev_feat = ev_feat.sort_values(by=["hadm_id", "timestamp"])
ev_feat["current_heart_rate"] = attach_signal_mean_prev_next(
    df_events=ev_feat, df_signal=hr, out_col="current_heart_rate"
)

print("[FEAT] current_mean_bp 계산 중...")
ev_feat["current_mean_bp"] = attach_signal_mean_prev_next(
    df_events=ev_feat, df_signal=bp, out_col="current_mean_bp"
)

# ----------------------------------------
# 5. Merge the static cohort columns (1-16) with the event features
# ----------------------------------------
# Drop the event-side subject_id so the merged table uses the cohort's one.
ev_feat_for_merge = (
    ev_feat.drop(columns=["subject_id"], errors="ignore")
    .astype({"hadm_id": "int64"})
)

final = pd.merge(ev_feat_for_merge, cohort, how="left", on="hadm_id")

# Give the door_to_*_missing columns their *_missing_flag names.
final = final.rename(columns={
    f"door_to_{step}_missing": f"door_to_{step}_missing_flag"
    for step in ("ecg", "trop", "anti", "pci")
})

# Make sure target_mortality is stored as an integer.
final = final.astype({"target_mortality": int})

# ----------------------------------------
# 6. Final column order (1-38)
# ----------------------------------------
cols_order = [
    # 1-8: identifiers and static patient attributes
    *("subject_id", "hadm_id", "age", "gender", "race",
      "arrival_transport", "cci_score", "hfrs_score"),
    # 9-12: door-to-X delays
    *("door_to_ecg", "door_to_trop", "door_to_anti", "door_to_pci"),
    # 13-16: missing flags for the delays
    *("door_to_ecg_missing_flag", "door_to_trop_missing_flag",
      "door_to_anti_missing_flag", "door_to_pci_missing_flag"),
    # 17-20: position and timing within the admission
    *("prefix_len", "time_since_ed", "time_since_last", "is_night"),
    # 21-25: cumulative counts and flags
    *("cum_ecg_cnt", "cum_stemi_cnt", "cum_trop_cnt",
      "stemi_flag", "trop_pos_flag"),
    # 26-28: troponin features
    *("last_trop", "run_max_trop", "trop_trend"),
    # 29-34: PCI status (0/1), elapsed time, current event, vitals, timestamp
    *("pci_status", "time_since_start_min", "current_event_id",
      "current_heart_rate", "current_mean_bp", "timestamp"),
    # 35-38: targets
    *("target_mortality", "target_next_evt",
      "target_time_to_next", "target_remain_los"),
]

final_out = final.loc[:, cols_order].copy()

print(f"[DONE] 최종 event-level 테이블: {final_out.shape[0]} rows, {final_out.shape[1]} columns")

# ----------------------------------------
# 7. Save (change the file name as needed)
# ----------------------------------------
OUTPUT_PATH = os.path.join(BASE_DIR, "cohort_ver150_event_level_full.csv")
final_out.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f"[SAVE] {OUTPUT_PATH} 에 저장 완료")


[LOAD] cohort_ver183_add_delay_missing_flag.csv 로딩 중...
[LOAD] event_log_stemi_all.csv 로딩 중...
[LOAD] chartevents_ver50.csv 로딩 중 (HR/BP 전용)...
[INFO] 이벤트 수: 40817 (hadm_id 1929개)
[FEAT] hadm_id별 이벤트 기반 feature 계산 중...


  ev_feat = event_log.groupby("hadm_id", group_keys=False).apply(build_features_per_admission)


[FEAT] current_heart_rate 계산 중...
[FEAT] current_mean_bp 계산 중...
[DONE] 최종 event-level 테이블: 40817 rows, 38 columns
[SAVE] ./../cohort\cohort_verXXX_event_level_full.csv 에 저장 완료
