# Missing-as-Signal EDA for STEMI Cohort

본 노트북은 `cohort_ver11_fill_transfers.csv`를 기반으로 다음을 수행합니다.
1) 스키마(컬럼/타입) 요약
2) ECG/Lab/Cath 결측 여부 인디케이터 생성 (missing-as-signal)
3) 기본 아웃컴(`death_flag`, `los_days`) 파생 및 결측-아웃컴 연관 간단 점검
4) 결과 저장: `cohort_v11_with_missing_flags.csv`

> ⚠️ 주의: ECG 관련 컬럼이 코호트에 없을 수 있습니다. 이 경우 `ecg_missing_flag`는 전부 1로 생성되며, 실제 ECG 시행 여부를 구분하려면 MIMIC-IV-ECG 연계 또는 ED/chartevents 교차확인이 필요합니다.

In [3]:
# === Cell 0: Paths & Imports ===
import pandas as pd
import numpy as np
import re, datetime as dt

CSV_PATH = 'C:\25_DA\cohort\cohort_ver11_fill_transfers.csv'  # 필요 시 경로 수정
SAVE_CSV = 'C:\25_DA\cohort\cohort_ver80_missing_flags.csv'

pd.options.display.max_rows = 20
pd.options.display.max_columns = 200
print('Paths set.')

  CSV_PATH = 'C:\25_DA\cohort\cohort_ver11_fill_transfers.csv'  # 필요 시 경로 수정
  SAVE_CSV = 'C:\25_DA\cohort\cohort_ver80_missing_flags.csv'


ModuleNotFoundError: No module named 'pandas'

In [None]:
# === Cell 1: Load CSV & Schema peek ===
df = pd.read_csv(CSV_PATH)
print('Loaded:', df.shape)
display(df.head())

# 컬럼/타입 요약
schema = pd.DataFrame({'column': df.columns, 'dtype': df.dtypes.astype(str)})
display(schema)

In [None]:
# === Cell 2: Datetime normalization (best-effort) ===
time_cols_guess = [c for c in df.columns if re.search(r'(time|date)$', c, flags=re.I)]
for c in time_cols_guess:
    try:
        df[c] = pd.to_datetime(df[c], errors='coerce')
    except Exception:
        pass
print('Datetime normalized for:', len(time_cols_guess), 'columns')

In [None]:
# === Cell 3: Outcome derivation ===
# death_flag: deathtime 존재 여부로 1/0, 해당 컬럼 없으면 0
if 'deathtime' in df.columns:
    if not np.issubdtype(df['deathtime'].dtype, np.datetime64):
        df['deathtime'] = pd.to_datetime(df['deathtime'], errors='coerce')
    df['death_flag'] = df['deathtime'].notna().astype(int)
else:
    df['death_flag'] = 0

# los_days: 있으면 사용, 없으면 admittime/dischtime로 계산 시도
if 'los_days' in df.columns:
    # 음수/이상치 정리
    df['los_days_clean'] = pd.to_numeric(df['los_days'], errors='coerce')
else:
    if {'admittime','dischtime'}.issubset(df.columns):
        if not np.issubdtype(df['admittime'].dtype, np.datetime64):
            df['admittime'] = pd.to_datetime(df['admittime'], errors='coerce')
        if not np.issubdtype(df['dischtime'].dtype, np.datetime64):
            df['dischtime'] = pd.to_datetime(df['dischtime'], errors='coerce')
        delta = (df['dischtime'] - df['admittime']).dt.total_seconds()/86400.0
        df['los_days_clean'] = delta
    else:
        df['los_days_clean'] = np.nan

display(df[['death_flag','los_days_clean']].head())

In [None]:
# === Cell 4: Missing-as-signal flags ===
# ECG
ecg_cols = [c for c in df.columns if re.search(r'ecg', c, flags=re.I)]
if len(ecg_cols)==0:
    # ECG 관련 컬럼이 전혀 없다면 모두 missing으로 태깅 (추후 MIMIC-IV-ECG 연계 권장)
    df['ecg_missing_flag'] = 1
    ecg_note = 'No ECG-like columns found; set all ecg_missing_flag=1 (placeholder).'
else:
    # 어떤 ECG 타임/지표라도 관측되면 not-missing으로 간주
    ecg_obs = df[ecg_cols].apply(lambda s: s.notna() & (s.astype(str).str.len()>0))
    if isinstance(ecg_obs, pd.DataFrame):
        ecg_any = ecg_obs.any(axis=1)
    else:
        ecg_any = ecg_obs
    df['ecg_missing_flag'] = (~ecg_any).astype(int)
    ecg_note = f'ECG columns detected: {ecg_cols[:6]}...' if len(ecg_cols)>0 else 'None'

# Lab (troponin/CK-MB)
lab_cols_hint = [
    'lab_tat_min','first_troponin_time','troponin_value','ckmb_value','lab_time','troponin_time'
]
lab_cols = [c for c in df.columns if c in lab_cols_hint or re.search(r'(troponin|ck\s?-?mb|lab)', c, flags=re.I)]
lab_obs = df[lab_cols].apply(lambda s: s.notna() & (s.astype(str).str.len()>0)) if lab_cols else pd.Series([False]*len(df))
lab_any = lab_obs.any(axis=1) if isinstance(lab_obs, pd.DataFrame) else lab_obs
df['lab_missing_flag'] = (~lab_any).astype(int)

# Cath(PCI/angiography) proxy
cath_cols_hint = ['door_to_cath_min','door_to_cath_min_A','door_to_cath_min_B','cath_time','pci_time']
cath_cols = [c for c in df.columns if c in cath_cols_hint or re.search(r'(cath|pci|angiograph)', c, flags=re.I)]
cath_obs = df[cath_cols].apply(lambda s: s.notna() & (s.astype(str).str.len()>0)) if cath_cols else pd.Series([False]*len(df))
cath_any = cath_obs.any(axis=1) if isinstance(cath_obs, pd.DataFrame) else cath_obs
df['cath_missing_flag'] = (~cath_any).astype(int)

print('ECG note:', ecg_note)
print('ECG cols:', ecg_cols)
print('Lab cols (detected):', lab_cols[:12])
print('Cath cols (detected):', cath_cols[:12])
display(df[['ecg_missing_flag','lab_missing_flag','cath_missing_flag']].head())

In [None]:
# === Cell 5: Quick sanity checks ===
def rate(x):
    return f"{x.mean()*100:.1f}%" if len(x)>0 else 'n/a'

summary = {
    'n_rows': len(df),
    'ecg_missing_rate': rate(df['ecg_missing_flag']),
    'lab_missing_rate': rate(df['lab_missing_flag']),
    'cath_missing_rate': rate(df['cath_missing_flag']),
    'death_rate': rate(df['death_flag']),
    'los_days_median': float(np.nanmedian(df['los_days_clean'])) if 'los_days_clean' in df.columns else np.nan
}
summary

In [None]:
# === Cell 6: Missing vs Outcome (coarse) ===
def ctab(col):
    if col not in df.columns:
        return pd.DataFrame()
    return pd.crosstab(df[col], df['death_flag'], normalize='index')

ct_ecg = ctab('ecg_missing_flag'); print('ECG-missing vs death_flag'); display(ct_ecg)
ct_lab = ctab('lab_missing_flag'); print('Lab-missing vs death_flag'); display(ct_lab)
ct_cat = ctab('cath_missing_flag'); print('Cath-missing vs death_flag'); display(ct_cat)

In [None]:
# === Cell 7: Save enriched CSV ===
cols_keep = list(df.columns)
df.to_csv(SAVE_CSV, index=False)
print('Saved ->', SAVE_CSV)
df.shape