In [None]:
import pandas as pd

# ----------------------------------
# 0. File path configuration
# ----------------------------------
# Input cohort CSV; it must contain an 'hadm_id' column (checked right after loading).
cohort_path = "./cohort/cohort_ver41_remove_old_col.csv"
# Lab item dictionary, used to look up the Troponin itemids by label.
d_labitems_path = "../../data/MIMIC4-hosp-icu/d_labitems.csv"
# Lab event records, read in chunks further down.
labevents_path = "../../data/MIMIC4-hosp-icu/labevents.csv"
# Destination CSV for the cohort with the Troponin columns added.
output_path = "./cohort/cohort_ver42_with_troponin.csv"

# ----------------------------------
# 1. Load the cohort
# ----------------------------------
cohort = pd.read_csv(cohort_path)

# hadm_id is the join key for every merge performed later, so fail fast
# when the column is missing.
if "hadm_id" not in cohort.columns:
    raise ValueError("cohort에 'hadm_id' 컬럼이 없습니다. join key를 확인해주세요.")

# Nullable Int64 keeps missing admission ids as <NA> instead of forcing
# the whole column to float.
cohort = cohort.astype({"hadm_id": "Int64"})

# ----------------------------------
# 2. Extract the Troponin I / Troponin T itemids from d_labitems
# ----------------------------------
d_lab = pd.read_csv(d_labitems_path)

troponin_labels = ["Troponin I", "Troponin T"]

# Exact label match; keep one row per (itemid, label) pair, ordered by itemid.
is_troponin_label = d_lab["label"].isin(troponin_labels)
troponin_items = d_lab.loc[is_troponin_label, ["itemid", "label"]]
troponin_items = troponin_items.drop_duplicates().sort_values("itemid")

if troponin_items.empty:
    raise ValueError("d_labitems에서 'Troponin I' / 'Troponin T' label을 찾지 못했습니다.")

# Distinct itemids in ascending order (the frame is already sorted by itemid).
troponin_itemids = troponin_items["itemid"].drop_duplicates().tolist()

print("[INFO] Troponin 관련 label 및 itemid")
print(troponin_items)
print(f"[INFO] Troponin itemid 리스트: {troponin_itemids}")

# ----------------------------------
# 3. Stream labevents in chunks, collecting only the Troponin rows
# ----------------------------------
usecols = [
    "labevent_id",
    "subject_id",
    "hadm_id",
    "itemid",
    "charttime",
    "storetime",
    "value",
    "valuenum",
    "flag",
]

troponin_rows = []
chunksize = 1_000_000  # rows per chunk; adjust if needed

for chunk in pd.read_csv(
    labevents_path,
    usecols=usecols,
    chunksize=chunksize,
    parse_dates=["charttime", "storetime"],
):
    # Keep rows that have an hadm_id AND whose itemid is a Troponin I/T item.
    # Building one combined mask avoids materialising (and type-converting)
    # the much larger "has hadm_id" intermediate frame.
    keep = chunk["hadm_id"].notna() & chunk["itemid"].isin(troponin_itemids)
    if not keep.any():
        continue

    # Take an explicit copy before assigning: writing a column into a
    # boolean-filtered slice raises SettingWithCopyWarning on pandas
    # versions without Copy-on-Write.
    chunk_tp = chunk.loc[keep].copy()
    chunk_tp["hadm_id"] = chunk_tp["hadm_id"].astype("Int64")
    troponin_rows.append(chunk_tp)

if not troponin_rows:
    raise ValueError("labevents에서 Troponin(I/T) 행을 찾지 못했습니다. itemid 및 경로를 확인해주세요.")

troponin = pd.concat(troponin_rows, ignore_index=True)
print(f"[INFO] Troponin(I/T) 행 개수: {len(troponin)}")

# ----------------------------------
# 4. Compute the Troponin-derived variables (one row per hadm_id)
#    - first_troponin_charttime          : earliest charttime per hadm_id
#    - troponin_end_charttime            : latest storetime per hadm_id
#    - first_troponin_positive_charttime : earliest charttime with flag == 'abnormal'
#    - troponin_count                    : number of Troponin(I/T) measurements
# ----------------------------------

# 4-1. Time of the first Troponin measurement
first_trop = (
    troponin.groupby("hadm_id")["charttime"]
    .min()
    .rename("first_troponin_charttime")
    .reset_index()
)

# 4-2. End time of Troponin testing (maximum storetime, not charttime)
trop_end = (
    troponin.groupby("hadm_id")["storetime"]
    .max()
    .rename("troponin_end_charttime")
    .reset_index()
)

# 4-3. Time of the first positive Troponin measurement
#      Criterion: flag == 'abnormal' (case-insensitive); a missing flag
#      counts as not positive.
is_abnormal = troponin["flag"].str.lower().eq("abnormal").fillna(False)
tp_pos = troponin.loc[is_abnormal].copy()

first_pos = (
    tp_pos.groupby("hadm_id")["charttime"]
    .min()
    .rename("first_troponin_positive_charttime")
    .reset_index()
)

# 4-4. Number of Troponin measurements (non-null labevent_id per hadm_id)
trop_count = (
    troponin.groupby("hadm_id")["labevent_id"]
    .count()
    .rename("troponin_count")
    .reset_index()
)

# ----------------------------------
# 5. Merge the derived variables onto the cohort
# ----------------------------------
# Left joins on hadm_id: admissions without any Troponin row are kept and
# simply get missing values in the new columns.
for derived in (first_trop, trop_end, first_pos, trop_count):
    cohort = cohort.merge(derived, on="hadm_id", how="left")

# ----------------------------------
# 5-1. Default troponin_count to 0
# ----------------------------------
# Admissions with no Troponin measurement come out of the left join as
# missing; treat them as zero measurements.
cohort["troponin_count"] = cohort["troponin_count"].fillna(0).astype(int)

# ----------------------------------
# 6. Save
# ----------------------------------
cohort.to_csv(output_path, index=False)
print(f"[DONE] Troponin 변수 4개를 추가한 cohort 저장 완료: {output_path}")


[INFO] Troponin 관련 label 및 itemid
      itemid       label
198    51002  Troponin I
199    51003  Troponin T
1528   52642  Troponin I
[INFO] Troponin itemid 리스트: [51002, 51003, 52642]
[INFO] Troponin(I/T) 행 개수: 243985
[DONE] Troponin 변수 4개를 추가한 cohort 저장 완료: ./cohort/cohort_ver41_with_troponin.csv
