In [6]:
import pandas as pd
import json

# -----------------------------------------
# 0. File paths
# -----------------------------------------
base = "./../cohort/"
ts_path    = base + "timestamp_array.csv"
ecg_path   = base + "ecg_ver50_stemi_flag.csv"
chart_path = base + "chartevents_ver50.csv"
out_path   = base + "cohort_ver148_with_ecg_hr_bp_arrays.csv"

# -----------------------------------------
# 1. Load
# -----------------------------------------
# timestamp_array: hadm_id, event_sequence_array (JSON string)
ts = pd.read_csv(ts_path, low_memory=False)
# ECG and chart timestamps are parsed up front so they can be compared and
# formatted as datetimes later on.
ecg = pd.read_csv(ecg_path, parse_dates=["ecg_time"], low_memory=False)
chart = pd.read_csv(chart_path, parse_dates=["charttime"], low_memory=False)

# -----------------------------------------
# 2. Attach subject_id to timestamp_array
#    - the hadm_id -> subject_id mapping is taken from chartevents
# -----------------------------------------
_id_cols = ["hadm_id", "subject_id"]
hadm_subject = chart[_id_cols].dropna(subset=_id_cols).drop_duplicates()

# Left join keeps every row of ts; admissions absent from chartevents end up
# with a missing subject_id.
ts = pd.merge(ts, hadm_subject, on="hadm_id", how="left")
# ts now holds: hadm_id, event_sequence_array, subject_id

# -----------------------------------------
# 3. Select the HR / BP time-series rows (key: subject_id + hadm_id)
# -----------------------------------------
_vital_cols = ["subject_id", "hadm_id", "charttime", "valuenum"]
hr = chart.loc[chart["variable_name"] == "heart_rate", _vital_cols]
bp = chart.loc[chart["variable_name"] == "mean_bp", _vital_cols]

def make_array(df: pd.DataFrame) -> str:
    """Serialize charttime/valuenum rows to a JSON array string.

    Args:
        df: Frame with a datetime ``charttime`` column and a numeric
            ``valuenum`` column. Any other columns are ignored.

    Returns:
        JSON text of a list of ``{"time": "YYYY-MM-DD HH:MM:SS", "value": float}``
        objects ordered by ``charttime``, or ``"[]"`` when there are no
        usable rows.
    """
    if df.empty:
        return "[]"

    # Rows missing either field cannot be serialized faithfully: NaT has no
    # strftime(), and json.dumps would write NaN as the bare token `NaN`,
    # which is not valid JSON for strict parsers.
    df = df.dropna(subset=["charttime", "valuenum"])
    if df.empty:
        return "[]"

    df = df.sort_values("charttime")
    arr = [
        {
            "time": t.strftime("%Y-%m-%d %H:%M:%S"),
            "value": float(v),
        }
        for t, v in zip(df["charttime"], df["valuenum"])
    ]
    return json.dumps(arr)

# Build one HR array and one BP array per (subject_id, hadm_id).
# Only the columns make_array reads are handed to apply(): letting apply()
# operate on the grouping columns is deprecated in recent pandas and emits a
# DeprecationWarning for each of these calls.
_value_cols = ["charttime", "valuenum"]

hr_array = (
    hr.groupby(["subject_id", "hadm_id"])[_value_cols]
      .apply(make_array)
      .rename("hr_array")
      .reset_index()
)

bp_array = (
    bp.groupby(["subject_id", "hadm_id"])[_value_cols]
      .apply(make_array)
      .rename("bp_array")
      .reset_index()
)

# -----------------------------------------
# 4. Merge HR/BP into ts (key: subject_id + hadm_id)
# -----------------------------------------
merged = ts.merge(hr_array, on=["subject_id", "hadm_id"], how="left")
merged = merged.merge(bp_array, on=["subject_id", "hadm_id"], how="left")

# Admissions without any HR/BP rows come out of the left joins as NaN.
# Store them as an empty JSON array instead, so every *_array column always
# holds parseable JSON, matching what make_array returns for an empty frame.
merged[["hr_array", "bp_array"]] = merged[["hr_array", "bp_array"]].fillna("[]")

# -----------------------------------------
# 5. Build the ECG array
#    - The ECG table has no hadm_id, so ECGs are attached by:
#      (matching subject_id) AND (ecg_time inside the time range covered by
#      the admission's event_sequence_array, widened by 1 day on each side)
# -----------------------------------------
def build_ecg_array(row) -> str:
    """Return the ECGs belonging to one admission row as a JSON array string.

    Reads the module-level ``ecg`` frame (columns ``subject_id``,
    ``ecg_time``, ``stemi_flag``). The frame is filtered once per call, so
    the cost grows with rows x ECG records.

    Args:
        row: Mapping-like row providing ``subject_id`` and
            ``event_sequence_array`` (JSON text of a list of objects, each
            optionally carrying a ``"time"`` entry).

    Returns:
        JSON text of a list of
        ``{"time": "YYYY-MM-DD HH:MM:SS", "stemi_flag": int | null}`` objects
        ordered by ECG time. ``"[]"`` is returned when the event sequence
        cannot be parsed, contains no usable timestamps, or no ECG falls in
        the window.
    """
    sid = row["subject_id"]

    # Parse event_sequence_array. json.loads raises TypeError for non-string
    # input (e.g. a NaN cell) and ValueError (JSONDecodeError) for bad text.
    try:
        events = json.loads(row["event_sequence_array"])
    except (TypeError, ValueError):
        return "[]"

    # Anything other than a JSON array carries no event list to scan.
    if not isinstance(events, list):
        return "[]"

    times = []
    for e in events:
        if not isinstance(e, dict):
            continue
        t = e.get("time")
        if not t:
            continue
        try:
            stamp = pd.to_datetime(t)
        except (ValueError, TypeError, OverflowError):
            continue
        # Keep real timestamps only: NaT (e.g. from "nan") is not a
        # pd.Timestamp and would make min()/max() below unreliable.
        if isinstance(stamp, pd.Timestamp):
            times.append(stamp)

    if not times:
        return "[]"

    start = min(times)
    end   = max(times)
    buffer = pd.Timedelta("1D")

    # Match on subject_id only, restricted to the window around this admission.
    sub_ecg = ecg[
        (ecg["subject_id"] == sid) &
        (ecg["ecg_time"] >= start - buffer) &
        (ecg["ecg_time"] <= end + buffer)
    ].sort_values("ecg_time")

    if sub_ecg.empty:
        return "[]"

    arr = [
        {
            "time": t.strftime("%Y-%m-%d %H:%M:%S"),
            # A missing flag is written as JSON null; int(NaN) would raise.
            "stemi_flag": None if pd.isna(sf) else int(sf),
        }
        for t, sf in zip(sub_ecg["ecg_time"], sub_ecg["stemi_flag"])
    ]
    return json.dumps(arr)

# Compute the ECG JSON string for each admission row, then store it as a column.
_ecg_json = merged.apply(build_ecg_array, axis=1)
merged["ecg_array"] = _ecg_json

# -----------------------------------------
# 6. Drop duplicated column names (defensive)
# -----------------------------------------
# duplicated() marks every repeat after the first occurrence of a name.
_repeated = merged.columns.duplicated()
merged = merged.loc[:, ~_repeated]

# -----------------------------------------
# 7. Save
# -----------------------------------------
merged.to_csv(out_path, index=False)

# Short run summary: output location, frame shape, final column order.
print("[DONE] saved:", out_path)
print("shape:", merged.shape)
print("columns:", merged.columns.tolist())


  hr.groupby(["subject_id", "hadm_id"])
  bp.groupby(["subject_id", "hadm_id"])


[DONE] saved: ./../cohort/cohort_ver148_with_ecg_hr_bp_arrays.csv
shape: (1929, 6)
columns: ['hadm_id', 'event_sequence_array', 'subject_id', 'hr_array', 'bp_array', 'ecg_array']
