In [2]:
import os
import pandas as pd

# ---------------------------------------------------
# 0. File path configuration
# ---------------------------------------------------
# Directory prefixes shared by the paths below.
_cohort_dir = "./cohort"
_core_dir = "../../data/MIMIC4-core"
_hosp_icu_dir = "../../data/MIMIC4-hosp-icu"

# Input: cohort that already has the ED columns attached.
cohort_path = f"{_cohort_dir}/cohort_ver44_with_ed.csv"
procedures_icd_path = f"{_hosp_icu_dir}/procedures_icd.csv"
procedureevents_path = f"{_hosp_icu_dir}/procedureevents.csv"
# itemid -> label dictionary.
d_items_path = f"{_hosp_icu_dir}/d_items.csv"

# d_icd_procedures may live under either core or hosp-icu; the first
# candidate that exists on disk is the one that gets used.
d_icd_candidates = [
    f"{_data_dir}/d_icd_procedures.csv"
    for _data_dir in (_core_dir, _hosp_icu_dir)
]

prescriptions_path = f"{_hosp_icu_dir}/prescriptions.csv"
# Optional inputs: used only when the file is present.
inputevents_path = f"{_hosp_icu_dir}/inputevents.csv"
emar_path = f"{_hosp_icu_dir}/emar.csv"
admissions_path = f"{_hosp_icu_dir}/admissions.csv"
patients_path = f"{_hosp_icu_dir}/patients.csv"

# Output file.
output_path = f"{_cohort_dir}/cohort_ver45_with_pci_rx_admin.csv"


# ---------------------------------------------------
# 1. Load the cohort and apply basic preprocessing
# ---------------------------------------------------
print(f"Loading cohort: {cohort_path}")
cohort = pd.read_csv(cohort_path)

# Both key columns are needed by the merges further down.
required_cols = ["subject_id", "hadm_id"]
_cohort_columns = set(cohort.columns)
missing = [name for name in required_cols if name not in _cohort_columns]
if missing:
    raise ValueError(f"Cohort에 필요한 키 컬럼이 없습니다: {missing}")

print("Cohort rows:", len(cohort))

# Parse ed_intime as datetime unless it already is one; unparseable values
# become NaT.
if "ed_intime" in cohort.columns:
    _ed_raw = cohort["ed_intime"]
    if not pd.api.types.is_datetime64_any_dtype(_ed_raw):
        cohort["ed_intime"] = pd.to_datetime(_ed_raw, errors="coerce")


# ---------------------------------------------------
# 2. PCI information (ICD + d_icd_procedures + procedureevents + d_items)
#    - pci_code
#    - pci_starttime
#    - pci_endtime
# ---------------------------------------------------
print(f"Loading procedures_icd: {procedures_icd_path}")
# Force icd_code to text at parse time: a purely numeric code such as "0066"
# would otherwise be inferred as an integer and lose its leading zeros before
# the astype(str) below could help.
procedures_icd = pd.read_csv(procedures_icd_path, dtype={"icd_code": str})

for c in ["subject_id", "hadm_id", "icd_code"]:
    if c not in procedures_icd.columns:
        raise ValueError(f"procedures_icd에 {c} 컬럼이 없습니다.")

procedures_icd["icd_code"] = procedures_icd["icd_code"].astype(str)

# (2-1) PCI candidates by ICD code prefix
PCI_ICD_PREFIXES = [
    "00.66",   # PTCA
    "36.0",    # coronary family
    "36.06",
    "36.07",
    "36.09",
    "929",     # example in case CPT-style codes are mixed in
    "0270",    # ICD-10-PCS dilation of coronary
    "0271",
    "0272",
]

# Compare on a dot-free form of both the codes and the prefixes. The prefixes
# above are written with a decimal point, so a plain startswith() can never
# match codes that are stored without one; stripping the dot from both sides
# works for either storage convention.
icd_code_nodot = procedures_icd["icd_code"].str.replace(".", "", regex=False)
pci_prefixes_nodot = tuple(p.replace(".", "") for p in PCI_ICD_PREFIXES)
mask_icd_prefix = icd_code_nodot.str.startswith(pci_prefixes_nodot)

# (2-2) PCI candidates by d_icd_procedures.long_title text
#       (pci/stent/angio/cath/coronary)
d_icd = None
for path in d_icd_candidates:
    if os.path.exists(path):
        print(f"Loading d_icd_procedures: {path}")
        # Same leading-zero concern as for procedures_icd above.
        d_icd = pd.read_csv(path, dtype={"icd_code": str})
        break

# Default: no text-based matches (used when the dictionary is unavailable).
mask_icd_text = pd.Series(False, index=procedures_icd.index)

if d_icd is not None and "icd_code" in d_icd.columns and "long_title" in d_icd.columns:
    d_icd["icd_code"] = d_icd["icd_code"].astype(str)
    d_icd["long_title_lower"] = d_icd["long_title"].astype(str).str.lower()

    # NOTE(review): these are plain substring tests, so e.g. "cath" also
    # matches any title containing "catheter" and "stent" matches any stent,
    # not only coronary ones. Confirm this breadth is intended.
    pci_text_keywords = ["pci", "percutaneous coronary", "stent",
                         "angioplasty", "coronary", "cath"]

    d_icd["is_pci_longtitle"] = d_icd["long_title_lower"].apply(
        lambda x: any(k in x for k in pci_text_keywords)
    )
    pci_icd_codes_from_text = d_icd.loc[d_icd["is_pci_longtitle"], "icd_code"].unique()
    mask_icd_text = procedures_icd["icd_code"].isin(pci_icd_codes_from_text)

    print("d_icd_procedures 기반 PCI 추정 코드 수:",
          len(pci_icd_codes_from_text))
else:
    print("d_icd_procedures를 사용한 텍스트 매칭은 생략되었습니다.")

# Final ICD-based PCI flag: prefix match OR title-text match.
procedures_icd["is_pci_icd"] = mask_icd_prefix | mask_icd_text
pci_icd_rows = procedures_icd[procedures_icd["is_pci_icd"]].copy()
print("ICD 기준 PCI 후보 row 수:", len(pci_icd_rows))

# Pick one representative icd_code per hadm_id (lowest seq_num when available).
if "seq_num" in pci_icd_rows.columns:
    pci_icd_rows = pci_icd_rows.sort_values(["hadm_id", "seq_num"])
else:
    pci_icd_rows = pci_icd_rows.sort_values(["hadm_id"])

pci_icd_unique = pci_icd_rows.drop_duplicates(subset=["hadm_id"], keep="first")[
    ["hadm_id", "icd_code"]
].copy()
pci_icd_unique.rename(columns={"icd_code": "pci_code_icd"}, inplace=True)

print("ICD 기준 PCI hadm 수:", len(pci_icd_unique))


# (2-3) Extract PCI items from procedureevents + d_items
#       (label contains cath/pci/stent/angio/coronary)
print(f"Loading procedureevents: {procedureevents_path}")
procedureevents = pd.read_csv(procedureevents_path)

# Fail on the first required column that is absent.
_pe_required = ("subject_id", "hadm_id", "itemid", "starttime", "endtime")
_pe_absent = [name for name in _pe_required if name not in procedureevents.columns]
if _pe_absent:
    raise ValueError(f"procedureevents에 {_pe_absent[0]} 컬럼이 없습니다.")

# Parse both timestamps; unparseable values become NaT.
for _time_col in ("starttime", "endtime"):
    procedureevents[_time_col] = pd.to_datetime(
        procedureevents[_time_col], errors="coerce"
    )

if not os.path.exists(d_items_path):
    raise ValueError(f"d_items 파일을 찾을 수 없습니다: {d_items_path}")

print(f"Loading d_items: {d_items_path}")
d_items = pd.read_csv(d_items_path)

if "itemid" not in d_items.columns:
    raise ValueError("d_items에 itemid 컬럼이 없습니다.")

# The label column may be called either "label" or "item_label".
label_col = next(
    (name for name in ("label", "item_label") if name in d_items.columns),
    None,
)
if label_col is None:
    raise ValueError("d_items에 label / item_label 컬럼이 없습니다.")

d_items["label_lower"] = d_items[label_col].astype(str).str.lower()

# NOTE(review): plain substring tests -- "cath" also matches any label that
# contains "catheter". Confirm this breadth is intended.
pci_item_keywords = ["pci", "angioplasty", "stent", "coronary", "cath"]

d_items["is_pci_item"] = d_items["label_lower"].map(
    lambda text: any(kw in text for kw in pci_item_keywords)
)

pci_itemids = d_items.loc[d_items["is_pci_item"], "itemid"].unique()
print("d_items 기준 PCI 추정 itemid 수:", len(pci_itemids))

proc_pci_rows = procedureevents[procedureevents["itemid"].isin(pci_itemids)].copy()
print("procedureevents에서 PCI itemid를 가진 row 수:", len(proc_pci_rows))

if proc_pci_rows.empty:
    # Nothing matched: keep an empty frame with the expected columns so the
    # later merge still works.
    pci_time_by_item = pd.DataFrame(columns=["hadm_id", "pci_starttime_item",
                                             "pci_endtime_item", "pci_itemid"])
else:
    # Per hadm_id: earliest start, latest end, and the itemid of the row that
    # sorts first by starttime.
    proc_pci_rows = proc_pci_rows.sort_values(["hadm_id", "starttime"])
    pci_time_by_item = (
        proc_pci_rows.groupby("hadm_id")
        .agg(**{
            "pci_starttime_item": ("starttime", "min"),
            "pci_endtime_item": ("endtime", "max"),
            "pci_itemid": ("itemid", "first"),
        })
        .reset_index()
    )

# Fallback: earliest start / latest end over ALL procedureevents per hadm_id.
proc_all_times = (
    procedureevents.groupby("hadm_id")
    .agg(**{
        "proc_any_start": ("starttime", "min"),
        "proc_any_end": ("endtime", "max"),
    })
    .reset_index()
)


# (2-4) Combine the PCI lookups, keyed on the cohort's distinct hadm_ids
cohort_hadm = cohort[["hadm_id"]].drop_duplicates()

# Left joins throughout, so every cohort hadm_id is kept even when a lookup
# has no row for it.
pci_merged = (
    cohort_hadm
    .merge(pci_icd_unique, on="hadm_id", how="left")
    .merge(pci_time_by_item, on="hadm_id", how="left")
    .merge(proc_all_times, on="hadm_id", how="left")
)

def decide_pci_code(row):
    """Pick the PCI code for one row.

    The ICD code wins when present; otherwise the item id is rendered as
    ``ITEM_<itemid>``; with neither available the result is ``pd.NA``.
    ``row`` only needs a ``.get`` method (a Series or a dict both work).
    """
    icd_code = row.get("pci_code_icd")
    if not pd.isna(icd_code):
        return str(icd_code)

    item_id = row.get("pci_itemid")
    if not pd.isna(item_id):
        # int() removes the ".0" of a float-typed id before formatting.
        return f"ITEM_{int(item_id)}"

    return pd.NA

def decide_pci_start(row):
    """Pick the PCI start time for one row.

    Uses the item-based start time when there is one. Otherwise, only if the
    row has an ICD-based PCI code, falls back to ``proc_any_start``. Returns
    ``pd.NaT`` when neither applies.
    """
    item_start = row.get("pci_starttime_item")
    if pd.notna(item_start):
        return item_start

    has_icd_code = pd.notna(row.get("pci_code_icd"))
    fallback_start = row.get("proc_any_start")
    if has_icd_code and pd.notna(fallback_start):
        return fallback_start
    return pd.NaT

def decide_pci_end(row):
    """Pick the PCI end time for one row.

    Uses the item-based end time when there is one. Otherwise, only if the
    row has an ICD-based PCI code, falls back to ``proc_any_end``. Returns
    ``pd.NaT`` when neither applies.
    """
    item_end = row.get("pci_endtime_item")
    if pd.notna(item_end):
        return item_end

    has_icd_code = pd.notna(row.get("pci_code_icd"))
    fallback_end = row.get("proc_any_end")
    if has_icd_code and pd.notna(fallback_end):
        return fallback_end
    return pd.NaT

# Row-wise resolution of the final PCI code / start / end values.
_pci_deciders = {
    "pci_code_final": decide_pci_code,
    "pci_starttime_final": decide_pci_start,
    "pci_endtime_final": decide_pci_end,
}
for _final_col, _decider in _pci_deciders.items():
    pci_merged[_final_col] = pci_merged.apply(_decider, axis=1)

# Remove any existing pci_* columns, then attach the freshly computed ones.
_pci_public_names = {
    "pci_code_final": "pci_code",
    "pci_starttime_final": "pci_starttime",
    "pci_endtime_final": "pci_endtime",
}
cohort = cohort.drop(columns=list(_pci_public_names.values()), errors="ignore")

# validate="m:1": pci_merged must hold at most one row per hadm_id.
_pci_lookup = pci_merged[["hadm_id", *_pci_public_names]]
cohort = cohort.merge(_pci_lookup, on="hadm_id", how="left", validate="m:1")
cohort = cohort.rename(columns=_pci_public_names)

print("PCI 정보 업데이트 완료.")


# ---------------------------------------------------
# 3. Antithrombotic prescription / administration + door_to_antithrombotic
#    - first_antithrombotic_ordertime
#    - first_antithrombotic_admintime (falls back to the order time when no
#      administration record exists)
#    - door_to_antithrombotic_order
#    - door_to_antithrombotic_admin
#    - door_to_antithrombotic (administration-based)
# ---------------------------------------------------
print(f"Loading prescriptions: {prescriptions_path}")
# low_memory=False parses each column in one pass, which avoids the chunked
# type inference that leaves columns with mixed dtypes (and the DtypeWarning
# pandas emits for them).
prescriptions = pd.read_csv(prescriptions_path, low_memory=False)

for col in ["subject_id", "hadm_id", "drug", "starttime"]:
    if col not in prescriptions.columns:
        raise ValueError(f"prescriptions에 {col} 컬럼이 없습니다.")

prescriptions["starttime"] = pd.to_datetime(prescriptions["starttime"], errors="coerce")
# Nullable integers keep missing order ids as <NA> instead of forcing floats.
if "orderid" in prescriptions.columns:
    prescriptions["orderid"] = prescriptions["orderid"].astype("Int64")
if "linkorderid" in prescriptions.columns:
    prescriptions["linkorderid"] = prescriptions["linkorderid"].astype("Int64")

# NOTE(review): matched as case-insensitive substrings of the drug name, so
# any product whose name merely contains one of these words is included.
# Confirm that is the intended definition of "antithrombotic".
ANTITHROMBOTIC_KEYWORDS = [
    "aspirin",
    "clopidogrel",
    "ticagrelor",
    "prasugrel",
    "heparin",
    "enoxaparin",
    "warfarin",
    "apixaban",
    "rivaroxaban",
    "dabigatran",
]

prescriptions["drug_lower"] = prescriptions["drug"].astype(str).str.lower()
mask_anti = prescriptions["drug_lower"].apply(
    lambda x: any(k in x for k in ANTITHROMBOTIC_KEYWORDS)
)
presc_anti = prescriptions[mask_anti].copy()
print("항혈전제 처방 row 수:", len(presc_anti))

# 3-1) Order-based time: earliest antithrombotic starttime per hadm_id.
# Rows without a hadm_id cannot be linked to an admission; drop them so a NaN
# key never reaches the merge (pandas matches null keys against each other).
first_order = (
    presc_anti.dropna(subset=["hadm_id"])
    .sort_values(["hadm_id", "starttime"])  # NaT sorts last
    .drop_duplicates(subset=["hadm_id"], keep="first")
    [["hadm_id", "starttime"]]
    .rename(columns={"starttime": "first_antithrombotic_ordertime"})
)

# Drop a stale copy of the column first (same approach as for the pci_*
# columns) so re-running on an already-enriched cohort does not produce
# _x/_y suffixed duplicates.
cohort = cohort.drop(columns=["first_antithrombotic_ordertime"], errors="ignore")

cohort = cohort.merge(
    first_order,
    on="hadm_id",
    how="left",
    validate="m:1"
)

# 3-2) Administration-based time: inputevents + emar
# Both stay None unless the corresponding file exists and yields matches.
admin_times_input = None
admin_times_emar  = None

# inputevents (optional file)
if os.path.exists(inputevents_path):
    print(f"Loading inputevents: {inputevents_path}")
    inputevents = pd.read_csv(inputevents_path)

    needed_ie_cols = ["subject_id", "hadm_id", "starttime"]
    _ie_absent = [name for name in needed_ie_cols if name not in inputevents.columns]
    if _ie_absent:
        # Report the first missing column.
        raise ValueError(f"inputevents에 {_ie_absent[0]} 컬럼이 없습니다.")

    inputevents["starttime"] = pd.to_datetime(inputevents["starttime"], errors="coerce")
    for _id_col in ("orderid", "linkorderid"):
        if _id_col in inputevents.columns:
            inputevents[_id_col] = inputevents[_id_col].astype("Int64")

    # Order ids of the antithrombotic prescriptions; empty when the
    # prescriptions table has no such column.
    if "orderid" in presc_anti.columns:
        order_ids = set(presc_anti["orderid"].dropna().tolist())
    else:
        order_ids = set()
    if "linkorderid" in presc_anti.columns:
        link_ids = set(presc_anti["linkorderid"].dropna().tolist())
    else:
        link_ids = set()

    # An inputevents row counts when either of its ids matches a prescription.
    mask_ie = pd.Series(False, index=inputevents.index)
    for _id_col, _wanted_ids in (("orderid", order_ids), ("linkorderid", link_ids)):
        if _wanted_ids and _id_col in inputevents.columns:
            mask_ie = mask_ie | inputevents[_id_col].isin(_wanted_ids)

    input_anti = inputevents[mask_ie].copy()
    print("inputevents 기반 항혈전제 투약 row 수:", len(input_anti))

    if len(input_anti) > 0:
        # Earliest matching starttime per hadm_id.
        _ie_sorted = input_anti.sort_values(["hadm_id", "starttime"])
        _ie_first = _ie_sorted.drop_duplicates(subset=["hadm_id"], keep="first")
        admin_times_input = _ie_first[["hadm_id", "starttime"]].rename(
            columns={"starttime": "first_antithrombotic_admin_input"}
        )

# emar (optional file)
if os.path.exists(emar_path):
    print(f"Loading emar: {emar_path}")
    emar = pd.read_csv(emar_path)

    # The medication name column may be called either of these.
    if "medication" in emar.columns:
        med_col = "medication"
    elif "medication_name" in emar.columns:
        med_col = "medication_name"
    else:
        raise ValueError("emar에서 medication 관련 컬럼을 찾지 못했습니다.")

    needed_emar_cols = ["subject_id", "hadm_id", med_col, "charttime"]
    for c in needed_emar_cols:
        if c not in emar.columns:
            raise ValueError(f"emar에 {c} 컬럼이 없습니다.")

    emar["charttime"] = pd.to_datetime(emar["charttime"], errors="coerce")
    emar["med_lower"] = emar[med_col].astype(str).str.lower()

    # Case-insensitive substring match against the antithrombotic keywords.
    mask_emar = emar["med_lower"].apply(
        lambda x: any(k in x for k in ANTITHROMBOTIC_KEYWORDS)
    )
    emar_anti = emar[mask_emar].copy()
    print("emar 기반 항혈전제 투약 row 수:", len(emar_anti))

    if len(emar_anti) > 0:
        # Earliest charttime per hadm_id. Rows without a hadm_id cannot be
        # tied to an admission; drop them so no NaN key reaches the later
        # merges (pandas matches null keys against each other).
        admin_times_emar = (
            emar_anti.dropna(subset=["hadm_id"])
            .sort_values(["hadm_id", "charttime"])  # NaT sorts last
            .drop_duplicates(subset=["hadm_id"], keep="first")
            [["hadm_id", "charttime"]]
            .rename(columns={"charttime": "first_antithrombotic_admin_emar"})
        )

# Pick the earlier of the inputevents / emar administration times
admin_all = None
if admin_times_input is not None and admin_times_emar is not None:
    # Outer join keeps admissions that appear in only one of the two sources.
    admin_all = pd.merge(
        admin_times_input,
        admin_times_emar,
        on="hadm_id",
        how="outer"
    )
    # Row-wise minimum; a missing value on one side is skipped.
    admin_all["first_antithrombotic_admintime"] = admin_all[
        ["first_antithrombotic_admin_input", "first_antithrombotic_admin_emar"]
    ].min(axis=1)
elif admin_times_input is not None:
    admin_all = admin_times_input.copy()
    admin_all.rename(columns={"first_antithrombotic_admin_input": "first_antithrombotic_admintime"}, inplace=True)
elif admin_times_emar is not None:
    admin_all = admin_times_emar.copy()
    admin_all.rename(columns={"first_antithrombotic_admin_emar": "first_antithrombotic_admintime"}, inplace=True)

# Drop a stale copy of the column first (same approach as for the pci_*
# columns). Otherwise the merge below would produce _x/_y suffixed columns
# and the fallback further down would be skipped.
cohort = cohort.drop(columns=["first_antithrombotic_admintime"], errors="ignore")

if admin_all is not None:
    cohort = cohort.merge(
        admin_all[["hadm_id", "first_antithrombotic_admintime"]],
        on="hadm_id",
        how="left",
        validate="m:1"
    )
else:
    # No administration source available: start from an all-NaT column.
    cohort["first_antithrombotic_admintime"] = pd.NaT

# Fallback: where the administration time is missing but an order time
# exists, use the order time instead.
if "first_antithrombotic_admintime" in cohort.columns and "first_antithrombotic_ordertime" in cohort.columns:
    mask_fallback = cohort["first_antithrombotic_admintime"].isna() & cohort["first_antithrombotic_ordertime"].notna()
    cohort.loc[mask_fallback, "first_antithrombotic_admintime"] = cohort.loc[
        mask_fallback, "first_antithrombotic_ordertime"
    ]


# door_to_antithrombotic_* 계산
def compute_door_minutes(start_series, ed_series):
    """Return the minutes from ``ed_series`` to ``start_series``, element-wise.

    Negative intervals (the event precedes the ED time) are replaced by NaN;
    a missing timestamp on either side also gives NaN.
    """
    elapsed_minutes = (start_series - ed_series).dt.total_seconds().div(60.0)
    # Blank out negative values.
    return elapsed_minutes.mask(elapsed_minutes < 0)

# Output column -> timestamp column it is measured from.
_door_sources = {
    "door_to_antithrombotic_order": "first_antithrombotic_ordertime",
    "door_to_antithrombotic_admin": "first_antithrombotic_admintime",
}

if "ed_intime" not in cohort.columns:
    # Without the ED arrival time none of the intervals can be computed.
    print("경고: ed_intime이 없어 door_to_antithrombotic을 계산할 수 없습니다.")
    for _door_col in (*_door_sources, "door_to_antithrombotic"):
        cohort[_door_col] = pd.NA
else:
    for _door_col, _time_col in _door_sources.items():
        if _time_col in cohort.columns:
            cohort[_door_col] = compute_door_minutes(
                cohort[_time_col], cohort["ed_intime"]
            )
        else:
            cohort[_door_col] = pd.NA

    # The final metric is the administration-based one (the order-time
    # fallback is already folded into the administration time).
    cohort["door_to_antithrombotic"] = cohort["door_to_antithrombotic_admin"]


# ---------------------------------------------------
# 4. Discharge time (dischtime), age
# ---------------------------------------------------
print(f"Loading admissions: {admissions_path}")
admissions = pd.read_csv(admissions_path)

for col in ["subject_id", "hadm_id", "dischtime"]:
    if col not in admissions.columns:
        raise ValueError(f"admissions에 {col} 컬럼이 없습니다.")

admissions["dischtime"] = pd.to_datetime(admissions["dischtime"], errors="coerce")
admissions_info = admissions[["subject_id", "hadm_id", "dischtime"]].copy()

# Drop a stale copy of the column first (same approach as for the pci_*
# columns) so the merge cannot produce dischtime_x / dischtime_y.
cohort = cohort.drop(columns=["dischtime"], errors="ignore")

# validate="m:1": admissions must hold at most one row per
# (subject_id, hadm_id).
cohort = cohort.merge(
    admissions_info,
    on=["subject_id", "hadm_id"],
    how="left",
    validate="m:1"
)

print(f"Loading patients: {patients_path}")
patients = pd.read_csv(patients_path)

if "subject_id" not in patients.columns:
    raise ValueError("patients 테이블에 subject_id 컬럼이 없습니다.")
if "anchor_age" not in patients.columns:
    raise ValueError("patients 테이블에 anchor_age 컬럼이 없습니다.")

# NOTE(review): "age" is a straight copy of anchor_age, with no adjustment
# for the admission date. Confirm that is the age definition wanted here.
patients_info = patients[["subject_id", "anchor_age"]].copy()
patients_info.rename(columns={"anchor_age": "age"}, inplace=True)

# Same stale-column guard as above, for "age".
cohort = cohort.drop(columns=["age"], errors="ignore")

cohort = cohort.merge(
    patients_info,
    on="subject_id",
    how="left",
    validate="m:1"
)


# ---------------------------------------------------
# 5. Save and summary
# ---------------------------------------------------
# os.makedirs("") raises, so only create the directory when output_path
# actually has a directory component.
_output_dir = os.path.dirname(output_path)
if _output_dir:
    os.makedirs(_output_dir, exist_ok=True)
cohort.to_csv(output_path, index=False)
print(f"\nSaved cohort to: {output_path}")

print("\n[Summary]")
print("최종 cohort row 수:", len(cohort))

# (label, column) pairs: each line reports the number of non-null values.
_summary_rows = [
    ("pci_code 존재 row 수:", "pci_code"),
    ("pci_starttime 존재 row 수:", "pci_starttime"),
    ("first_antithrombotic_ordertime 존재 row 수:", "first_antithrombotic_ordertime"),
    ("first_antithrombotic_admintime 존재 row 수:", "first_antithrombotic_admintime"),
    ("door_to_antithrombotic_order 값 존재 row 수:", "door_to_antithrombotic_order"),
    ("door_to_antithrombotic_admin 값 존재 row 수:", "door_to_antithrombotic_admin"),
    ("door_to_antithrombotic (최종) 값 존재 row 수:", "door_to_antithrombotic"),
    ("dischtime 값 존재 row 수:", "dischtime"),
    ("age 값 존재 row 수:", "age"),
]
for _label, _column in _summary_rows:
    print(_label, cohort[_column].notna().sum())


Loading cohort: ./cohort/cohort_ver44_with_ed.csv
Cohort rows: 1930
Loading procedures_icd: ../../data/MIMIC4-hosp-icu/procedures_icd.csv
Loading d_icd_procedures: ../../data/MIMIC4-hosp-icu/d_icd_procedures.csv
d_icd_procedures 기반 PCI 추정 코드 수: 897
ICD 기준 PCI 후보 row 수: 113216
ICD 기준 PCI hadm 수: 58795
Loading procedureevents: ../../data/MIMIC4-hosp-icu/procedureevents.csv
Loading d_items: ../../data/MIMIC4-hosp-icu/d_items.csv
d_items 기준 PCI 추정 itemid 수: 130
procedureevents에서 PCI itemid를 가진 row 수: 38032
PCI 정보 업데이트 완료.
Loading prescriptions: ../../data/MIMIC4-hosp-icu/prescriptions.csv


  prescriptions = pd.read_csv(prescriptions_path)


항혈전제 처방 row 수: 1280093
Loading admissions: ../../data/MIMIC4-hosp-icu/admissions.csv
Loading patients: ../../data/MIMIC4-hosp-icu/patients.csv

Saved cohort to: ./cohort/cohort_ver45_with_pci_rx_admin.csv

[Summary]
최종 cohort row 수: 1930
pci_code 존재 row 수: 996
pci_starttime 존재 row 수: 690
first_antithrombotic_ordertime 존재 row 수: 1784
first_antithrombotic_admintime 존재 row 수: 1784
door_to_antithrombotic_order 값 존재 row 수: 1627
door_to_antithrombotic_admin 값 존재 row 수: 1627
door_to_antithrombotic (최종) 값 존재 row 수: 1627
dischtime 값 존재 row 수: 1930
age 값 존재 row 수: 1930
