In [None]:
import pandas as pd
from pathlib import Path

# ----------------------------------------
# 0. Path configuration
# ----------------------------------------
# Every location is expressed relative to BASE_DIR (the current directory).
BASE_DIR = Path(".")
COHORT_PATH = BASE_DIR.joinpath("cohort", "cohort_ver45_with_pci_rx_admin.csv")
DIAGNOSES_PATH = BASE_DIR.joinpath(
    "..", "..", "data", "MIMIC4-hosp-icu", "diagnoses_icd.csv"
)
CHARLSON_MAP_PATH = BASE_DIR.joinpath("ref", "charlson_icd_map.txt")

# Destination of the cohort table once the CCI columns have been added.
OUTPUT_PATH = BASE_DIR.joinpath("cohort", "cohort_ver46_with_cci.csv")


# ----------------------------------------
# 1. Load data
# ----------------------------------------
# ICD codes are identifiers, not numbers. Without an explicit dtype, read_csv
# infers the column type, and all-numeric ICD-9 codes can be parsed as
# integers/floats (dropping leading zeros or gaining a trailing ".0").
# Reading the code columns as str keeps them exactly as written in the file.
print("Loading cohort...")
cohort = pd.read_csv(COHORT_PATH)
print("Cohort rows:", len(cohort))

print("Loading diagnoses_icd...")
dx = pd.read_csv(DIAGNOSES_PATH, dtype={"icd_code": str})
print("Diagnoses rows:", len(dx))

print("Loading Charlson ICD map...")
charlson_map = pd.read_csv(CHARLSON_MAP_PATH, dtype={"icd_code_prefix": str})
print("Charlson map rows:", len(charlson_map))


# ----------------------------------------
# 2. Keep only the diagnoses that belong to cohort admissions (hadm_id)
# ----------------------------------------
hadm_ids = pd.unique(cohort["hadm_id"])
dx_sub = dx.loc[dx["hadm_id"].isin(hadm_ids)].copy()
print("Diagnoses rows (in cohort hadm):", len(dx_sub))

# NOTE: icd_version in MIMIC's diagnoses_icd distinguishes ICD-9 from ICD-10.
# (If the mapping table only lists ICD-10 prefixes, only the version == 10
# rows will end up being matched further below.)


# ----------------------------------------
# 3. ICD code normalisation (drop dots + upper-case)
# ----------------------------------------
def normalize_icd(code: object) -> str:
    """
    Normalise one ICD code for prefix matching.

    Leading/trailing whitespace is stripped, letters are upper-cased and
    every '.' is removed.

    Args:
        code: The raw ICD code. Usually a string; missing values (None/NaN)
            and numeric values are accepted as well.

    Returns:
        The normalised code, or "" when ``code`` is missing.
    """
    if pd.isna(code):
        return ""
    # A code that arrives as an integral float (e.g. 4280.0) would otherwise
    # be rendered as "4280.0" and turn into "42800" once the dot is removed.
    if isinstance(code, float) and code.is_integer():
        code = int(code)
    return str(code).strip().upper().replace(".", "")

# Diagnosis side: normalise every code element-wise with normalize_icd.
dx_sub["icd_code_norm"] = dx_sub["icd_code"].map(normalize_icd)

# Mapping side: same clean-up, done with vectorised string methods, one step
# at a time (cast to str -> trim -> upper-case -> drop literal dots).
charlson_map["icd_code_prefix_norm"] = charlson_map["icd_code_prefix"].astype(str)
charlson_map["icd_code_prefix_norm"] = (
    charlson_map["icd_code_prefix_norm"].str.strip().str.upper()
)
charlson_map["icd_code_prefix_norm"] = charlson_map[
    "icd_code_prefix_norm"
].str.replace(".", "", regex=False)


# ----------------------------------------
# 4. ICD -> Charlson comorbidity mapping
#    - prefix matching (e.g. I21, I22, C77, ...)
#    - icd_version is part of the match key
# ----------------------------------------

def map_icd_to_charlson(dx_df: pd.DataFrame, map_df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the Charlson mapping to a diagnoses subset.

    A diagnosis matches a mapping row when the ICD versions are equal and the
    normalised diagnosis code starts with the row's normalised prefix. Each
    mapping prefix is compared at its own full length: it is never truncated,
    so a 4-character prefix such as "I252" does not match "I2510", and
    prefixes of any length are supported.

    Args:
        dx_df: Diagnoses with columns ``hadm_id``, ``icd_version`` and
            ``icd_code_norm``.
        map_df: Mapping with columns ``icd_version``,
            ``icd_code_prefix_norm``, ``comorbidity`` and ``weight``.

    Returns:
        De-duplicated DataFrame with columns ``hadm_id``, ``comorbidity`` and
        ``weight``; empty (with those columns) when nothing can be matched.
    """
    out_cols = ["hadm_id", "comorbidity", "weight"]
    dx_work = dx_df[["hadm_id", "icd_version", "icd_code_norm"]]
    map_work = map_df[
        ["icd_version", "icd_code_prefix_norm", "comorbidity", "weight"]
    ]

    # Length of every mapping prefix; missing or empty prefixes are skipped
    # because an empty prefix would "match" every code.
    prefix_len = map_work["icd_code_prefix_norm"].str.len()
    distinct_lengths = sorted(
        {int(n) for n in prefix_len.dropna().unique() if n > 0},
        reverse=True,
    )

    results = []
    for plen in distinct_lengths:
        # Only the mapping rows whose prefix is exactly `plen` characters long.
        map_sub = map_work.loc[prefix_len == plen].rename(
            columns={"icd_code_prefix_norm": "prefix"}
        )
        # Codes shorter than `plen` yield a shorter string and cannot match.
        tmp = dx_work.assign(prefix=dx_work["icd_code_norm"].str[:plen])

        merged = tmp.merge(
            map_sub[["icd_version", "prefix", "comorbidity", "weight"]],
            on=["icd_version", "prefix"],
            how="inner",
        )
        results.append(merged[out_cols].drop_duplicates())

    if not results:
        return pd.DataFrame(columns=out_cols)

    # The same admission/comorbidity pair can be reached through several
    # codes or prefix lengths; keep a single row for each.
    mapped = pd.concat(results, ignore_index=True).drop_duplicates()
    return mapped


dx_charlson = map_icd_to_charlson(dx_sub, charlson_map)
print("Mapped Charlson rows:", len(dx_charlson))


# ----------------------------------------
# 5. Per-hadm_id comorbidity flags and CCI score
# ----------------------------------------

# 5-1. hadm_id x comorbidity indicator matrix (0/1): every mapped row gets a
#      flag of 1, and admission/comorbidity pairs without a row are filled
#      with 0 by the pivot.
cci_flags = pd.pivot_table(
    dx_charlson.assign(flag=1),
    values="flag",
    index="hadm_id",
    columns="comorbidity",
    aggfunc="max",
    fill_value=0,
).reset_index()

print("CCI flag table shape:", cci_flags.shape)

# 5-2. Weight lookup: a Series of weights indexed by comorbidity, built from
#      the distinct (comorbidity, weight) pairs of the mapping table.
weight_table = (
    charlson_map.loc[:, ["comorbidity", "weight"]]
    .drop_duplicates()
    .set_index("comorbidity")
    .loc[:, "weight"]
)


def compute_cci_score(row: pd.Series) -> int:
    """
    Compute the CCI score of a single admission.

    Walks over the module-level ``weight_table`` (comorbidity -> weight) and
    adds up the integer weight of every comorbidity that is present as a
    label in ``row`` with a flag value equal to 1.

    Args:
        row: One row of the flag table, labelled by comorbidity name.

    Returns:
        The summed weight; 0 when no comorbidity is flagged.
    """
    return sum(
        int(weight)
        for name, weight in weight_table.items()
        if name in row and row[name] == 1
    )


# Score every admission: compute_cci_score receives one flag-table row at a time.
cci_flags["cci_score"] = cci_flags.apply(compute_cci_score, axis="columns")


# 5-3. CCI categorisation (0 / 1-2 / >=3)
def categorize_cci(score: int) -> str:
    """
    Bucket a CCI score into one of three labels.

    Args:
        score: The CCI score; a missing value (NaN/None) is treated as 0.

    Returns:
        "0" for a score of 0 (or a missing score), "1-2" for a score of 1
        or 2, and "3+" for anything else.
    """
    if pd.isna(score):
        return "0"
    value = int(score)
    if value == 0:
        return "0"
    return "1-2" if 1 <= value <= 2 else "3+"

# Label every admission with its CCI bucket.
cci_flags["cci_category"] = cci_flags["cci_score"].map(categorize_cci)


# Quick look at the first few scored admissions.
print(cci_flags.loc[:, ["hadm_id", "cci_score", "cci_category"]].head())


# ----------------------------------------
# 6. Merge onto the cohort
# ----------------------------------------
# Left join so that every cohort row is kept, whether or not it was scored.
cohort_cci = pd.merge(
    cohort,
    cci_flags.loc[:, ["hadm_id", "cci_score", "cci_category"]],
    how="left",
    on="hadm_id",
)

# Admissions without any mapped diagnosis come back as NaN from the join:
# give them a score of 0 and the "0" category.
cohort_cci = cohort_cci.fillna({"cci_score": 0, "cci_category": "0"})
cohort_cci["cci_score"] = cohort_cci["cci_score"].astype(int)

print("Final cohort with CCI:", cohort_cci.shape)
print(cohort_cci[["hadm_id", "cci_score", "cci_category"]].head())


# ----------------------------------------
# 7. Save
# ----------------------------------------
# Make sure the output folder exists before writing the CSV.
OUTPUT_PATH.parent.mkdir(exist_ok=True, parents=True)
cohort_cci.to_csv(path_or_buf=OUTPUT_PATH, encoding="utf-8", index=False)
print("Saved:", OUTPUT_PATH)


Loading cohort...
Cohort rows: 1930
Loading diagnoses_icd...
Diagnoses rows: 6364488
Loading Charlson ICD map...


FileNotFoundError: [Errno 2] No such file or directory: 'ref\\charlson_icd_map.csv'