In [3]:
import pandas as pd

# Locations of the two input CSVs and of the CSV this step writes.
ver8_path = "/content/drive/MyDrive/cohort_ver8_fill_admissions.csv"
icustays_path = "/content/drive/MyDrive/icustays.csv"
out_path = "/content/drive/MyDrive/cohort_ver9_fill_icustays.csv"

ver8 = pd.read_csv(ver8_path)
icu = pd.read_csv(icustays_path)

# Keep only the first occurrence of any repeated column label.
_ver8_first_seen = ~ver8.columns.duplicated()
_icu_first_seen = ~icu.columns.duplicated()
ver8 = ver8.loc[:, _ver8_first_seen]
icu = icu.loc[:, _icu_first_seen]

# Both tables are joined on subject_id later, so it must exist in each one.
for _label, _frame in (("ver8", ver8), ("icustays", icu)):
    if "subject_id" not in _frame.columns:
        raise ValueError(f"{_label}에 subject_id 컬럼이 없습니다.")

# Compare ids as text so both sides of the join share one dtype.
ver8["subject_id"] = ver8["subject_id"].astype(str)
icu["subject_id"] = icu["subject_id"].astype(str)

# icustays columns that are candidates for filling the cohort table.
icu_cols = [
    "subject_id",
    "hadm_id",
    "stay_id",
    "first_careunit",
    "last_careunit",
    "intime",
    "outtime",
    "los",
]

# `los` and `los_days` are treated as one and the same column.
# Pass 1: the cohort only knows `los_days` while icustays only has `los`,
#         so icustays gets a temporary `los_days` alias.
# Pass 2: the mirrored (unlikely) case, handled for symmetry.
# The passes run in this order and each one sees the result of the previous.
for _cohort_name, _icu_name in (("los_days", "los"), ("los", "los_days")):
    _needs_alias = (
        _cohort_name in ver8.columns
        and _icu_name in icu.columns
        and _cohort_name not in icu.columns
    )
    if _needs_alias:
        icu = icu.copy()
        icu[_cohort_name] = icu[_icu_name]
        icu_cols = [_cohort_name if c == _icu_name else c for c in icu_cols]

# Only icustays columns that also exist in the cohort can be filled;
# subject_id is the join key and is therefore excluded.
fillable_cols = sorted(
    c for c in set(icu_cols) if c != "subject_id" and c in ver8.columns
)

# Reduce icustays to a single row per subject: the stay with the latest
# `intime` when that column exists, otherwise the last row in file order.
if "intime" in icu.columns:
    _admitted_at = pd.to_datetime(icu["intime"], errors="coerce")
    icu_tmp = (
        icu.assign(_intime_parsed=_admitted_at)
        # Unparseable timestamps become NaT. sort_values puts missing values
        # last by default, which would make keep="last" select a stay with no
        # usable intime as the "latest" one; sorting them first avoids that.
        .sort_values(["subject_id", "_intime_parsed"], na_position="first")
        .drop_duplicates(subset=["subject_id"], keep="last")
        .drop(columns=["_intime_parsed"])
    )
else:
    icu_tmp = icu.drop_duplicates(subset=["subject_id"], keep="last")

icu_tmp = icu_tmp[["subject_id"] + fillable_cols]

# NOTE(review): the join is per subject only. Every cohort row of a subject
# receives values from that subject's latest stay, even when the row already
# carries a different hadm_id / stay_id. Confirm this is intended before
# trusting per-stay values such as the length of stay.
# Left join keeps every cohort row; ICU values arrive in "<col>_icu" columns.
merged = ver8.merge(icu_tmp, on="subject_id", how="left", suffixes=("", "_icu"))

def is_blank(s):
    """Return a boolean mask marking the cells of *s* that hold no value.

    A cell is blank when it is missing (NaN/None/NA). For text-like columns
    (object dtype or a pandas string dtype) a cell is also blank when its
    text is empty or consists only of whitespace.

    Parameters
    ----------
    s : pandas.Series
        Column to inspect.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``s``; True where the cell is blank.
    """
    missing = s.isna()
    text_like = pd.api.types.is_object_dtype(s.dtype) or pd.api.types.is_string_dtype(
        s.dtype
    )
    if not text_like:
        return missing
    # Non-string objects are stringified first; missing cells are already
    # covered by `missing`, so whatever text they turn into does not matter.
    stripped = s.astype(str).str.strip()
    return missing | (stripped.str.len() == 0)

# For every fillable column, copy the ICU value into cohort cells that are
# blank, count how many cells were filled, then drop the helper "<col>_icu"
# column that the merge produced.
filled_counts = {}
for col in fillable_cols:
    icu_col = f"{col}_icu"
    if icu_col not in merged.columns:
        continue
    mask_to_fill = is_blank(merged[col]) & merged[icu_col].notna()
    filled_counts[col] = int(mask_to_fill.sum())
    if mask_to_fill.any():
        # Series.where upcasts when the two columns have different dtypes
        # (e.g. an all-NaN float column receiving timestamp strings), whereas
        # assigning through .loc into an incompatible dtype is deprecated in
        # recent pandas. The guard keeps the dtype of untouched columns.
        merged[col] = merged[col].where(~mask_to_fill, merged[icu_col])
    merged = merged.drop(columns=[icu_col])

merged.to_csv(out_path, index=False)

# Summary of what was read, written and filled.
print("Saved:", out_path)
print("ver8 rows:", len(ver8))
print("icustays rows:", len(icu))
print("merged rows:", len(merged))
print("fillable columns:", fillable_cols)
print("filled cell counts per column:", filled_counts)


Saved: /content/drive/MyDrive/cohort_ver9_fill_icustays.csv
ver8 rows: 836
icustays rows: 94458
merged rows: 836
fillable columns: ['hadm_id', 'intime', 'los_days', 'outtime', 'stay_id']
filled cell counts per column: {'hadm_id': 0, 'intime': 0, 'los_days': 297, 'outtime': 0, 'stay_id': 0}
