In [12]:
# === Path configuration (replace these with your own paths) ===

# hosp-icu SQLite file
SQLITE_DB_PATH = "/content/drive/MyDrive/Colab Notebooks/MIMIC4-hosp-icu.db"

# Location of diagnosis.csv, edstays.csv, medrecon.csv, etc.
ED_CSV_DIR = "/content/drive/MyDrive/Colab Notebooks/ed_csvs"

# Folder where results are saved
OUT_DIR = "/content/drive/MyDrive/Colab Notebooks/stemi_out"

import os, re, sqlite3
import pandas as pd
import numpy as np

# Create the output folder if it does not exist yet; exist_ok=True keeps
# re-running this cell from raising when the folder is already there.
os.makedirs(OUT_DIR, exist_ok=True)

# Shared helper functions
def to_dt(df, cols):
    """Convert the named columns of ``df`` to pandas datetimes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The frame is mutated directly, so callers may
        ignore the return value.
    cols : str or iterable of str
        Column name(s) to convert. A bare string is treated as one column
        name instead of being iterated character by character. Names that
        are not present in ``df`` are skipped silently.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Values that cannot be parsed become ``NaT`` (``errors="coerce"``).
    """
    # Guard against to_dt(df, "intime"): iterating a str yields single
    # characters, which would match no column and convert nothing.
    if isinstance(cols, str):
        cols = [cols]
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors="coerce")
    return df

def minutes_between(t_end, t_start):
    """Return the elapsed time from ``t_start`` to ``t_end`` in minutes.

    Returns ``np.nan`` when either timestamp is missing (as judged by
    ``pd.isna``/``pd.notna``). The result is negative when ``t_end`` is
    earlier than ``t_start``.
    """
    both_present = pd.notna(t_end) and pd.notna(t_start)
    if not both_present:
        return np.nan
    elapsed = t_end - t_start
    return elapsed.total_seconds() / 60.0


In [13]:
# Open the SQLite database at SQLITE_DB_PATH. The try/finally guarantees the
# connection is closed even if one of the queries below raises.
con = sqlite3.connect(SQLITE_DB_PATH)
try:
    # STEMI definition used by the query:
    #   ICD-10: I21.0-I21.3
    #   ICD-9 : 410.0-410.6 and 410.8
    # Codes are upper-cased and stripped of dots before prefix matching.
    stemi = pd.read_sql("""
WITH dx AS (
  SELECT subject_id, hadm_id, icd_version, REPLACE(UPPER(icd_code), '.', '') AS code
  FROM diagnoses_icd
)
SELECT DISTINCT subject_id, hadm_id
FROM dx
WHERE (icd_version=10 AND (code LIKE 'I210%' OR code LIKE 'I211%' OR code LIKE 'I212%' OR code LIKE 'I213%'))
   OR (icd_version=9  AND (code GLOB '410[0-6]*' OR code LIKE '4108%'));
""", con)

    # Admission and discharge times for every hospital admission.
    admissions = pd.read_sql("""
  SELECT subject_id, hadm_id, admittime, dischtime
  FROM admissions
""", con)
finally:
    con.close()

# Parse the timestamp columns; to_dt modifies `admissions` in place.
to_dt(admissions, ["admittime", "dischtime"])

print("STEMI hadm 개수:", stemi["hadm_id"].nunique())
stemi.head()


STEMI hadm 개수: 2685


Unnamed: 0,subject_id,hadm_id
0,10002155,23822395
1,10009686,29681222
2,10010058,26359957
3,10012438,22764825
4,10013310,21243435


In [14]:
# Required ED file: edstays.csv
ed = pd.read_csv(f"{ED_CSV_DIR}/edstays.csv")
to_dt(ed, ["intime", "outtime"])

# If the ED table already has hadm_id, join on it directly. Otherwise assign
# each ED stay the same subject's admission whose admittime is closest to the
# ED intime.
if "hadm_id" in ed.columns:
    ed_link = ed.merge(admissions[["subject_id","hadm_id","admittime","dischtime"]],
                       on=["subject_id","hadm_id"], how="left")
else:
    tmp = ed.merge(admissions, on="subject_id", how="inner")
    tmp["dist_min"] = (tmp["admittime"] - tmp["intime"]).abs().dt.total_seconds()/60.0
    # NOTE(review): there is no upper bound on dist_min, so an ED stay is linked
    # to the subject's nearest admission however far apart they are in time.
    # Consider adding a maximum window if this branch is ever used.
    #
    # Keep the whole nearest row per ED stay. GroupBy.first() would instead take
    # the first non-null value of each column independently (possibly mixing
    # fields from different admissions) and would drop stays whose outtime is NaT.
    ed_link = (tmp.sort_values(["subject_id","intime","dist_min"])
                 .drop_duplicates(subset=["subject_id","intime","outtime"], keep="first")
                 .loc[:, ["subject_id","hadm_id","intime","outtime","admittime","dischtime"]]
                 .reset_index(drop=True))

# Match STEMI admissions to ED stays; rows lacking an ED arrival time or an
# admission time are dropped.
stemi_ed = (stemi
            .merge(ed_link, on=["subject_id","hadm_id"], how="inner")
            .dropna(subset=["intime","admittime"]))

print("STEMI+ED hadm 개수:", stemi_ed["hadm_id"].nunique())
stemi_ed.head()


STEMI+ED hadm 개수: 889


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition,admittime,dischtime
0,10010058,26359957,39919549,2147-11-18 00:50:00,2147-11-18 03:19:00,M,ASIAN - SOUTH EAST ASIAN,AMBULANCE,ADMITTED,2147-11-18 02:34:00,2147-11-19 04:00:00
1,10012438,22764825,35750515,2178-06-07 19:33:00,2178-06-07 21:57:00,M,WHITE,AMBULANCE,ADMITTED,2178-06-07 20:02:00,2178-06-10 15:30:00
2,10013310,21243435,31736720,2153-05-26 08:56:00,2153-05-26 14:18:39,F,BLACK/AFRICAN,UNKNOWN,ADMITTED,2153-05-26 14:18:00,2153-06-05 19:30:00
3,10013310,27682188,35160955,2153-05-06 10:21:00,2153-05-06 18:28:00,F,BLACK/AFRICAN,AMBULANCE,ADMITTED,2153-05-06 18:03:00,2153-05-13 13:45:00
4,10033552,26487381,32542054,2136-02-29 22:09:00,2136-03-01 02:26:37,F,OTHER,AMBULANCE,ADMITTED,2136-03-01 02:25:00,2136-03-03 20:30:00
