In [1]:
import pandas as pd
import os

# Extract the Troponin rows from the labevents CSV into their own CSV file.
# The input is streamed in fixed-size chunks so it never has to fit in memory
# all at once; each chunk is filtered on ``itemid`` and appended to the output.

LABEVENTS_PATH = "../../data/MIMIC4-hosp-icu/labevents.csv"
OUTPUT_PATH = "../../data/MIMIC4-hosp-icu/labevents_troponin.csv"

# ``itemid`` values that this extraction keeps as Troponin measurements.
TROPONIN_ITEMIDS = [52642, 51002, 51003]
CHUNKSIZE = 1_000_000  # rows per chunk; adjust to the available memory

# Remove any previous output first: the loop below appends, so a leftover
# file would otherwise end up with old rows followed by a second header.
if os.path.exists(OUTPUT_PATH):
    os.remove(OUTPUT_PATH)
    print(f"[INFO] 기존 파일 삭제: {OUTPUT_PATH}")

header_written = False
total_rows = 0

for i, chunk in enumerate(pd.read_csv(LABEVENTS_PATH, chunksize=CHUNKSIZE)):
    print(f"[INFO] chunk {i} 로딩, rows={len(chunk)}")

    # Keep only the Troponin rows. Boolean indexing already returns a new
    # object and ``sub`` is only read below, so no extra copy is needed.
    sub = chunk[chunk["itemid"].isin(TROPONIN_ITEMIDS)]
    print(f"[INFO]   → chunk {i} Troponin rows={len(sub)}")

    # Skip empty subsets only once the header exists. The very first write
    # happens even when nothing matched, so the output file is created with
    # its column header as soon as one chunk has been read (an empty frame
    # written with header=True produces just the header line).
    if sub.empty and header_written:
        continue

    sub.to_csv(OUTPUT_PATH, mode="a", header=not header_written, index=False)
    header_written = True

    total_rows += len(sub)

print("[INFO] 전체 Troponin row 수:", total_rows)
print("[INFO] Saved:", os.path.abspath(OUTPUT_PATH))


[INFO] chunk 0 로딩, rows=1000000
[INFO]   → chunk 0 Troponin rows=2901
[INFO] chunk 1 로딩, rows=1000000
[INFO]   → chunk 1 Troponin rows=2949
[INFO] chunk 2 로딩, rows=1000000
[INFO]   → chunk 2 Troponin rows=2698
[INFO] chunk 3 로딩, rows=1000000
[INFO]   → chunk 3 Troponin rows=2956
[INFO] chunk 4 로딩, rows=1000000
[INFO]   → chunk 4 Troponin rows=2893
[INFO] chunk 5 로딩, rows=1000000
[INFO]   → chunk 5 Troponin rows=2884
[INFO] chunk 6 로딩, rows=1000000
[INFO]   → chunk 6 Troponin rows=3118
[INFO] chunk 7 로딩, rows=1000000
[INFO]   → chunk 7 Troponin rows=2736
[INFO] chunk 8 로딩, rows=1000000
[INFO]   → chunk 8 Troponin rows=2998
[INFO] chunk 9 로딩, rows=1000000
[INFO]   → chunk 9 Troponin rows=2848
[INFO] chunk 10 로딩, rows=1000000
[INFO]   → chunk 10 Troponin rows=3185
[INFO] chunk 11 로딩, rows=1000000
[INFO]   → chunk 11 Troponin rows=2823
[INFO] chunk 12 로딩, rows=1000000
[INFO]   → chunk 12 Troponin rows=2877
[INFO] chunk 13 로딩, rows=1000000
[INFO]   → chunk 13 Troponin rows=2886
[INFO] chunk