In [2]:
import pandas as pd

# 경로
ver10_path = "/content/drive/MyDrive/cohort_ver10_fill_patients.csv"
transfers_path = "/content/drive/MyDrive/transfers.csv"
out_path = "/content/drive/MyDrive/cohort_ver11_fill_transfers.csv"

# 로드
ver10 = pd.read_csv(ver10_path)
trans = pd.read_csv(transfers_path)

# 중복 컬럼 제거
ver10 = ver10.loc[:, ~ver10.columns.duplicated()]
trans = trans.loc[:, ~trans.columns.duplicated()]

# 키 확인 및 타입 정규화
if "subject_id" not in ver10.columns:
    raise ValueError("ver10에 subject_id 컬럼이 없습니다.")
if "subject_id" not in trans.columns:
    raise ValueError("transfers에 subject_id 컬럼이 없습니다.")
ver10["subject_id"] = ver10["subject_id"].astype(str)
trans["subject_id"] = trans["subject_id"].astype(str)

# transfers 기본 컬럼
trans_cols = ["subject_id", "hadm_id", "transfer_id", "eventtype", "careunit", "intime", "outtime"]

# ver10에 존재하는 transfers 컬럼만 채움 대상
fillable_cols = sorted(set(ver10.columns).intersection(trans_cols) - {"subject_id"})

# transfer_type <-> eventtype 동의어 매핑
# ver10에 transfer_type이 있고 transfers에 eventtype이 있으면 transfer_type을 채움 대상에 포함
alias_pairs = []
if ("transfer_type" in ver10.columns) and ("eventtype" in trans.columns):
    alias_pairs.append(("transfer_type", "eventtype"))

# 동일 subject_id가 여러 건인 경우 최신 intime 기준으로 1건 선택
trans_tmp = trans.copy()
if "intime" in trans_tmp.columns:
    trans_tmp["_intime_parsed"] = pd.to_datetime(trans_tmp["intime"], errors="coerce")
    trans_tmp = trans_tmp.sort_values(["subject_id", "_intime_parsed"])
    trans_tmp = trans_tmp.drop_duplicates(subset=["subject_id"], keep="last")
    trans_tmp = trans_tmp.drop(columns=["_intime_parsed"])
else:
    trans_tmp = trans_tmp.drop_duplicates(subset=["subject_id"], keep="last")

# 병합에 사용할 컬럼 구성
keep_cols = ["subject_id"] + fillable_cols + [src for _, src in alias_pairs if src not in fillable_cols]
keep_cols = [c for c in keep_cols if c in trans_tmp.columns]
trans_tmp = trans_tmp[keep_cols]

# 병합
merged = ver10.merge(trans_tmp, on="subject_id", how="left", suffixes=("", "_tr"))

# 결측 또는 빈칸 판정
def is_blank(s):
    if s.dtype == object:
        return s.isna() | (s.astype(str).str.strip() == "")
    return s.isna()

filled_counts = {}

# 동명이인 컬럼 채우기
for col in fillable_cols:
    tr_col = f"{col}_tr"
    if tr_col in merged.columns:
        mask_to_fill = is_blank(merged[col]) & (~merged[tr_col].isna())
        filled_counts[col] = int(mask_to_fill.sum())
        merged.loc[mask_to_fill, col] = merged.loc[mask_to_fill, tr_col]
        merged.drop(columns=[tr_col], inplace=True, errors="ignore")

# 동의어 매핑 채우기: transfer_type <- eventtype
for tgt, src in alias_pairs:
    src_col = f"{src}_tr"
    if src_col in merged.columns:
        mask_to_fill = is_blank(merged[tgt]) & (~merged[src_col].isna())
        filled_counts[tgt] = filled_counts.get(tgt, 0) + int(mask_to_fill.sum())
        merged.loc[mask_to_fill, tgt] = merged.loc[mask_to_fill, src_col]
        merged.drop(columns=[src_col], inplace=True, errors="ignore")

# 저장
merged.to_csv(out_path, index=False)

print("Saved:", out_path)
print("ver10 rows:", len(ver10))
print("transfers rows:", len(trans))
print("merged rows:", len(merged))
print("fillable columns:", fillable_cols, "aliases:", alias_pairs)
print("filled cell counts per column:", filled_counts)


Saved: /content/drive/MyDrive/cohort_ver11_fill_transfers.csv
ver10 rows: 836
transfers rows: 2413581
merged rows: 836
fillable columns: ['hadm_id', 'intime', 'outtime'] aliases: [('transfer_type', 'eventtype')]
filled cell counts per column: {'hadm_id': 0, 'intime': 0, 'outtime': 0}
