In [6]:
import pandas as pd
import json
import csv

# -----------------------------------------
# 0. File paths
# -----------------------------------------
# Inputs.
cohort_path = "./../cohort/cohort_ver183_add_delay_missing_flag.csv"
ecg_path = "./../cohort/ecg_ver50_stemi_flag.csv"
chart_path = "./../cohort/chartevents_ver50.csv"

# Outputs.
# NOTE(review): outputs are tagged "ver148" while the cohort input is
# "ver183" -- confirm the version tag is intended.
# Cohort without the array columns.
out_cohort_flat_csv = "./../cohort/cohort_ver148_flat.csv"
# Cohort including the array columns, one JSON record per line.
out_arrays_json = "./../cohort/cohort_ver148_arrays.jsonl"
# ECG long table.
out_ecg_series_csv = "./../cohort/ecg_series_ver148.csv"
# Heart-rate long table.
out_hr_series_csv = "./../cohort/hr_series_ver148.csv"
# Mean blood-pressure long table.
out_bp_series_csv = "./../cohort/bp_series_ver148.csv"

# -----------------------------------------
# 1. Load the input tables
# -----------------------------------------
# The timestamp columns of the ECG and chart tables are parsed to datetimes
# on read; the cohort table is read without any date parsing.
cohort = pd.read_csv(cohort_path, low_memory=False)
ecg = pd.read_csv(ecg_path, low_memory=False, parse_dates=["ecg_time"])
chart = pd.read_csv(chart_path, low_memory=False, parse_dates=["charttime"])

# -----------------------------------------
# 2. ECG preparation (keyed by subject_id)
#    columns: ['subject_id', 'study_id', 'cart_id', 'ecg_time', 'stemi_flag']
# -----------------------------------------
# Sort chronologically inside each subject before grouping.
ecg = ecg.sort_values(by=["subject_id", "ecg_time"])
ecg_group = ecg.groupby("subject_id")

# -----------------------------------------
# 3. HR / BP preparation (keyed by hadm_id)
#    columns: ['subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid',
#              'valuenum', 'variable_name']
# -----------------------------------------
is_heart_rate = chart["variable_name"] == "heart_rate"
is_mean_bp = chart["variable_name"] == "mean_bp"
vital_sort_keys = ["hadm_id", "charttime"]

# sort_values returns a new frame, so `chart` itself is left untouched.
hr = chart.loc[is_heart_rate].sort_values(by=vital_sort_keys)
bp = chart.loc[is_mean_bp].sort_values(by=vital_sort_keys)

hr_group = hr.groupby("hadm_id")
bp_group = bp.groupby("hadm_id")

# -----------------------------------------
# 4. Add the array columns to cohort (cells hold Python list objects)
# -----------------------------------------
for array_col in (
    "door_to_ecg_series",
    "cum_ecg_cnt",
    "cum_stemi_cnt",
    "current_heart_rate",
    "current_mean_bp",
):
    # A fresh list per row so that no two cells share the same object.
    cohort[array_col] = [[] for _ in range(len(cohort))]

# Accumulators for the long-format tables (one dict per output row).
ecg_rows, hr_rows, bp_rows = [], [], []

# -----------------------------------------
# 5. Build the per-row series arrays and the long-format tables
#    (ECG is looked up by subject_id, HR / BP by hadm_id)
# -----------------------------------------
def _iso_points(times, values):
    """Pair timestamps with values as ``{"time": <ISO string>, "value": v}`` dicts.

    ``zip`` stops at the shorter input, so an empty ``values`` yields ``[]``.
    """
    return [{"time": t.isoformat(), "value": v} for t, v in zip(times, values)]


def _ecg_features(g, door_scalar):
    """Compute the ECG-derived series for one subject's ECG rows.

    Args:
        g: ECG rows of a single subject. They are expected to be ordered by
            ``ecg_time`` already (the ECG table is sorted by subject_id and
            ecg_time before it is grouped), so no re-sort is done here.
        door_scalar: the cohort row's ``door_to_ecg`` value, interpreted as
            minutes between ED arrival and the first ECG.

    Returns:
        ``(times, flags, door_min, cum_ecg, cum_stemi)`` as parallel lists.
        ``door_min`` is empty when ``door_scalar`` is missing or negative.
    """
    times = list(g["ecg_time"])
    flags = list(g["stemi_flag"])

    # Running number of ECGs and running number of STEMI-flagged ECGs.
    cum_ecg = list(range(1, len(times) + 1))
    cum_stemi = []
    stemi_cnt = 0
    for flag in flags:
        if flag == 1:
            stemi_cnt += 1
        cum_stemi.append(stemi_cnt)

    door_min = []
    if times and pd.notna(door_scalar) and door_scalar >= 0:
        # ed_intime = first ECG time - door_to_ecg (minutes)
        ed_intime = times[0] - pd.to_timedelta(door_scalar, unit="m")
        door_min = [(t - ed_intime).total_seconds() / 60.0 for t in times]

    return times, flags, door_min, cum_ecg, cum_stemi


def _vital_points(grouped, hadm, value_key, rows_out):
    """Collect one admission's chart values.

    Appends one long-format dict per measurement to ``rows_out`` (the value is
    stored under ``value_key``) and returns the JSON-friendly point list for
    the cohort cell. Returns ``[]`` when ``hadm`` has no rows in ``grouped``.
    """
    if hadm not in grouped.groups:
        return []
    g = grouped.get_group(hadm)
    times = list(g["charttime"])
    values = list(g["valuenum"])
    rows_out.extend(
        {"subject_id": s, "hadm_id": hadm, "charttime": t, value_key: v}
        for s, t, v in zip(g["subject_id"], times, values)
    )
    return _iso_points(times, values)


# Per-row results are collected here and assigned as whole columns after the
# loop, so the frame is never modified while it is being iterated.
door_col, cum_ecg_col, cum_stemi_col = [], [], []
hr_col, bp_col = [], []

# Iterate only the needed columns: unlike iterrows(), this keeps each column's
# own dtype (ids are not upcast) and avoids building a Series per row.
# NOTE(review): ECGs are matched by subject_id only, so every cohort row of a
# subject receives all of that subject's ECGs and ed_intime is derived from the
# subject's earliest ECG -- confirm this is intended for multi-admission subjects.
for subj, hadm, door_scalar in zip(
    cohort["subject_id"], cohort["hadm_id"], cohort["door_to_ecg"]
):
    # =========================
    # ECG (by subject)
    # =========================
    if subj in ecg_group.groups:
        times, flags, door_min, cum_ecg, cum_stemi = _ecg_features(
            ecg_group.get_group(subj), door_scalar
        )
    else:
        times, flags, door_min, cum_ecg, cum_stemi = [], [], [], [], []

    # Long-format ECG rows.
    for i, (t, flag) in enumerate(zip(times, flags)):
        ecg_rows.append({
            "subject_id": subj,
            "hadm_id": hadm,
            "ecg_time": t,
            "door_to_ecg_min": door_min[i] if i < len(door_min) else None,
            "cum_ecg_cnt": cum_ecg[i],
            "cum_stemi_cnt": cum_stemi[i],
            "stemi_flag": flag,
        })

    door_col.append(_iso_points(times, door_min))
    cum_ecg_col.append(_iso_points(times, cum_ecg))
    cum_stemi_col.append(_iso_points(times, cum_stemi))

    # =========================
    # HR / BP (by admission)
    # =========================
    hr_col.append(_vital_points(hr_group, hadm, "heart_rate", hr_rows))
    bp_col.append(_vital_points(bp_group, hadm, "mean_bp", bp_rows))

# An explicit object-dtype Series keeps every cell a Python list, whatever the
# inner lengths happen to be.
for col_name, col_values in (
    ("door_to_ecg_series", door_col),
    ("cum_ecg_cnt", cum_ecg_col),
    ("cum_stemi_cnt", cum_stemi_col),
    ("current_heart_rate", hr_col),
    ("current_mean_bp", bp_col),
):
    cohort[col_name] = pd.Series(col_values, index=cohort.index, dtype=object)

# -----------------------------------------
# 6. Save
# -----------------------------------------

# 6-1) Flat cohort CSV: the array columns are left out so the file opens
#      cleanly in a spreadsheet.
nested_cols = {
    "door_to_ecg_series",
    "cum_ecg_cnt",
    "cum_stemi_cnt",
    "current_heart_rate",
    "current_mean_bp",
}
flat_cols = [name for name in cohort.columns if name not in nested_cols]
cohort.loc[:, flat_cols].to_csv(out_cohort_flat_csv, index=False)
print(f"[SAVE] flat cohort CSV: {out_cohort_flat_csv}")

# 6-2) JSONL with the array columns kept as nested structures
#      (one record per line, for analysis in Python).
cohort.to_json(
    out_arrays_json,
    orient="records",
    lines=True,
    force_ascii=False,
)
print(f"[SAVE] arrays JSONL: {out_arrays_json}")

# 6-3) ECG / HR / BP long-format CSVs (easy to inspect in a spreadsheet).
ecg_df = pd.DataFrame(ecg_rows)
hr_df = pd.DataFrame(hr_rows)
bp_df = pd.DataFrame(bp_rows)

long_tables = (
    (ecg_df, out_ecg_series_csv, "ECG series CSV: "),
    (hr_df, out_hr_series_csv, "HR series CSV:  "),
    (bp_df, out_bp_series_csv, "BP series CSV:  "),
)

# Write every table first, then report, matching the original ordering of
# file writes and console output.
for table, path, _ in long_tables:
    table.to_csv(path, index=False)

for _, path, label in long_tables:
    print(f"[SAVE] {label}{path}")


[SAVE] flat cohort CSV: ./../cohort/cohort_ver148_flat.csv
[SAVE] arrays JSONL: ./../cohort/cohort_ver148_arrays.jsonl
[SAVE] ECG series CSV: ./../cohort/ecg_series_ver148.csv
[SAVE] HR series CSV:  ./../cohort/hr_series_ver148.csv
[SAVE] BP series CSV:  ./../cohort/bp_series_ver148.csv
