# Process Variation Analysis (Missing-as-Signal)
이 노트북은 `ecg_missing_flag`, `lab_missing_flag`, `cath_missing_flag` 조합으로 프로세스 변이를 분류하고, LOS(입원기간)과의 관계를 요약/시각화합니다.

**사용 전:** 아래 `CSV_PATH`만 자신의 경로로 수정하세요.

In [None]:
# === Cell 1: Paths & Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# >>> EDIT HERE: path to the cohort CSV >>>
# Example: '/content/drive/MyDrive/Colab Notebooks/cohort_ver80_missing_flags.csv'
CSV_PATH = 'cohort_ver80_missing_flags.csv'
# <<< EDIT HERE: path to the cohort CSV <<<

# Display limits applied when DataFrames are printed or rendered.
for option_name, option_value in (('display.max_rows', 20),
                                  ('display.max_columns', 200)):
    pd.set_option(option_name, option_value)

# Load the cohort table; the following cells all operate on this `df`.
df = pd.read_csv(CSV_PATH)

# Quick sanity output: source path, (rows, columns) and the first three rows.
print('Loaded:', CSV_PATH)
print('Shape:', df.shape)
display(df.head(3))

In [None]:
# === Cell 2: Basic Checks (columns & NA) ===
# Columns this analysis relies on: the cleaned LOS plus the three missing flags.
expected_cols = ['los_days_clean','ecg_missing_flag','lab_missing_flag','cath_missing_flag']
missing_cols = [c for c in expected_cols if c not in df.columns]
print('Missing expected cols:', missing_cols)

# Index only with the columns that actually exist. Selecting an absent column
# would raise KeyError immediately after it was reported as missing above.
present_cols = [c for c in expected_cols if c in df.columns]

# NA summary: fraction of NaN rows per available column.
na_summary = df[present_cols].isna().mean().to_frame('na_ratio')
display(na_summary)

# Force the flags to plain integers (True/False and floats -> int).
# Values are rounded to the nearest integer and NaN is filled with 0 before
# the final cast, because a NumPy int column cannot hold NaN.
for c in ['ecg_missing_flag','lab_missing_flag','cath_missing_flag']:
    if c in df.columns:
        df[c] = df[c].astype(float).round().fillna(0).astype(int)

# Convert LOS to numeric; values that cannot be parsed become NaN.
if 'los_days_clean' in df.columns:
    df['los_days_clean'] = pd.to_numeric(df['los_days_clean'], errors='coerce')

# Preview the cleaned columns (again restricted to those that exist).
print(df[present_cols].head())

In [None]:
# === Cell 3: Individual Flag Distribution & LOS Summary ===
def summarize_flag(flag_col, los_col='los_days_clean'):
    """Print the value counts of one flag column and display LOS statistics.

    Works on the notebook-level DataFrame ``df``. For flag values 0 and 1 it
    reports the number of non-NaN LOS rows together with the mean, median,
    75th and 90th percentile of LOS. A flag value without any LOS rows gets
    ``n = 0`` and NaN statistics.

    Args:
        flag_col: Name of the flag column in ``df``.
        los_col: Name of the length-of-stay column in ``df``.

    Returns:
        None. Results are emitted through ``print`` and ``display``.
    """
    print(f'\n=== {flag_col} (0 vs 1) ===')

    # Distribution of the flag values (NaN included), ordered by value.
    value_counts = df[flag_col].value_counts(dropna=False).sort_index()
    print('Counts:')
    print(value_counts)

    def _los_row(flag_value):
        # One summary row for the rows whose flag equals `flag_value`.
        los = df.loc[df[flag_col] == flag_value, los_col].dropna()
        if len(los) == 0:
            return (flag_value, 0, np.nan, np.nan, np.nan, np.nan)
        return (
            flag_value,
            len(los),
            float(np.nanmean(los)),
            float(np.nanmedian(los)),
            float(np.nanpercentile(los, 75)),
            float(np.nanpercentile(los, 90)),
        )

    res = pd.DataFrame(
        [_los_row(flag_value) for flag_value in (0, 1)],
        columns=[flag_col, 'n', 'mean_LOS', 'median_LOS', 'p75_LOS', 'p90_LOS'],
    )
    display(res)

# Run the per-flag summary for every flag column that is present in df.
for col in ('ecg_missing_flag', 'lab_missing_flag', 'cath_missing_flag'):
    if col not in df.columns:
        continue
    summarize_flag(col)

In [None]:
# === Cell 4: Flag Correlation (Pearson) + Heatmap (matplotlib only) ===
# Keep only the flag columns that exist in df.
flags = [
    c
    for c in ('ecg_missing_flag', 'lab_missing_flag', 'cath_missing_flag')
    if c in df.columns
]

# The Pearson matrix is only computed when at least two flags are available;
# otherwise an empty frame is used so the heatmap below is skipped.
if len(flags) >= 2:
    corr = df[flags].corr(method='pearson')
else:
    corr = pd.DataFrame()

print('Flag correlation matrix:')
display(corr)

if not corr.empty:
    fig, ax = plt.subplots(figsize=(4, 3))
    # Colour scale pinned to [-1, 1] via vmin/vmax.
    cax = ax.imshow(corr.values, vmin=-1, vmax=1)
    tick_positions = range(len(flags))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(flags, rotation=45, ha='right')
    ax.set_yticklabels(flags)
    fig.colorbar(cax, ax=ax)
    ax.set_title('Flag Correlation (Pearson)')
    plt.tight_layout()
    plt.show()

In [None]:
# === Cell 5: Build Pattern (ecg/lab/cath) ===
def safe_int(x):
    """Convert ``x`` to ``int``, falling back to 0 when conversion fails.

    Args:
        x: Any value that may be passed to ``int()``.

    Returns:
        ``int(x)``, or 0 when ``int`` raises ``TypeError`` (e.g. ``None``),
        ``ValueError`` (e.g. NaN or non-numeric text) or ``OverflowError``
        (e.g. infinity).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures map to 0. Anything else
        # (KeyboardInterrupt, SystemExit, unrelated errors) propagates.
        return 0

# Make sure all three flag columns exist; an absent flag is created as all 0.
pattern_flags = ('ecg_missing_flag', 'lab_missing_flag', 'cath_missing_flag')
for c in pattern_flags:
    if c not in df.columns:
        df[c] = 0

# One digit string per flag, concatenated in ecg/lab/cath order (e.g. '010').
ecg_digit, lab_digit, cath_digit = (
    df[name].apply(safe_int).astype(str) for name in pattern_flags
)
df['pattern'] = ecg_digit + lab_digit + cath_digit

print('Pattern unique values:', sorted(df['pattern'].unique()))
display(df[[*pattern_flags, 'pattern']].head(5))

In [None]:
# === Cell 6: Pattern Summary (count, ratio, LOS stats) ===
total = len(df)

# Per-pattern row count with mean/median LOS, most frequent pattern first.
grp = (
    df.groupby('pattern', as_index=False)
    .agg(
        count=('pattern', 'size'),
        mean_LOS=('los_days_clean', 'mean'),
        median_LOS=('los_days_clean', 'median'),
    )
    .sort_values('count', ascending=False)
)

# Share of each pattern in percent of all rows.
grp['ratio_%'] = grp['count'] / total * 100.0

# Non-NaN LOS values per pattern, collected in the same row order as grp.
los_per_pattern = [
    df.loc[df['pattern'] == pat, 'los_days_clean'].dropna()
    for pat in grp['pattern']
]

# 75th / 90th percentile of LOS; NaN when a pattern has no LOS values.
p75_list = [
    float(np.nanpercentile(los, 75)) if len(los) > 0 else np.nan
    for los in los_per_pattern
]
p90_list = [
    float(np.nanpercentile(los, 90)) if len(los) > 0 else np.nan
    for los in los_per_pattern
]

grp['p75_LOS'] = p75_list
grp['p90_LOS'] = p90_list

print('\n=== Pattern Summary ===')
display(grp)

In [None]:
# === Cell 7: Plot – Flag Count Bars (matplotlib) ===
# One bar chart per available flag column: number of rows per flag value.
for col in ('ecg_missing_flag', 'lab_missing_flag', 'cath_missing_flag'):
    if col not in df.columns:
        continue
    counts = df[col].value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(4, 3))
    # Flag values are converted to strings for the x axis.
    ax.bar(counts.index.astype(str), counts.values)
    ax.set(title=f'{col} counts', xlabel=col, ylabel='count')
    plt.tight_layout()
    plt.show()

In [None]:
# === Cell 8: Plot – Flag vs LOS Boxplot (matplotlib) ===
# One boxplot per available flag column: LOS for flag == 0 versus flag == 1.
for col in ['ecg_missing_flag','lab_missing_flag','cath_missing_flag']:
    if col in df.columns:
        # Non-NaN LOS values for each flag value.
        data0 = df.loc[df[col]==0, 'los_days_clean'].dropna().values
        data1 = df.loc[df[col]==1, 'los_days_clean'].dropna().values
        # Skip the figure only when neither group has any LOS values.
        if len(data0)>0 or len(data1)>0:
            fig, ax = plt.subplots(figsize=(5,3))
            # The `labels=` keyword of boxplot is deprecated since
            # Matplotlib 3.9 (renamed to `tick_labels`). Setting the tick
            # labels afterwards works on both old and new versions.
            ax.boxplot([data0, data1], showfliers=True)
            ax.set_xticklabels(['0','1'])
            ax.set_title(f'LOS by {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('LOS (days)')
            plt.tight_layout()
            plt.show()

In [None]:
# === Cell 9: Plot – Pattern Counts (matplotlib) ===
# Number of rows per pattern string, ordered by pattern.
pat_counts = df['pattern'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(pat_counts.index.astype(str), pat_counts.values)
ax.set(
    title='Pattern counts (ecg/lab/cath)',
    xlabel='pattern',
    ylabel='count',
)
plt.tight_layout()
plt.show()

In [None]:
# === Cell 10: Plot – Pattern vs LOS Boxplot (matplotlib) ===
# LOS distribution per pattern, one box per pattern in sorted order.
patterns = sorted(df['pattern'].unique())
# Non-NaN LOS values for each pattern, aligned with `patterns`.
data = [df.loc[df['pattern']==p, 'los_days_clean'].dropna().values for p in patterns]

if any(len(arr)>0 for arr in data):
    fig, ax = plt.subplots(figsize=(8,3))
    # The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9
    # (renamed to `tick_labels`). Setting the tick labels afterwards works
    # on both old and new versions.
    ax.boxplot(data, showfliers=True)
    ax.set_xticklabels(patterns)
    ax.set_title('LOS by pattern (ecg/lab/cath)')
    ax.set_xlabel('pattern')
    ax.set_ylabel('LOS (days)')
    plt.tight_layout()
    plt.show()
else:
    print('No LOS values available for boxplot.')

In [None]:
# === Cell 11: Classify Process Type (standard / variant / missing-heavy) ===
def classify_pattern(p):
    """Map a flag pattern string to a process class label.

    Args:
        p: Pattern string such as ``'010'``.

    Returns:
        ``'standard'`` when ``p`` equals ``'000'``, ``'missing-heavy'`` when
        ``p`` contains two or more ``'1'`` characters, ``'variant'`` otherwise.
    """
    one_count = p.count('1')
    if p == '000':
        return 'standard'
    return 'missing-heavy' if one_count >= 2 else 'variant'

# Label every row with the process class derived from its pattern string.
pattern_as_text = df['pattern'].astype(str)
df['process_class'] = pattern_as_text.apply(classify_pattern)

# Row count and mean/median LOS per process class, largest class first.
summary = (
    df.groupby('process_class', as_index=False)
    .agg(
        count=('process_class', 'size'),
        mean_LOS=('los_days_clean', 'mean'),
        median_LOS=('los_days_clean', 'median'),
    )
    .sort_values('count', ascending=False)
)

# Share of each class in percent of all rows.
summary['ratio_%'] = summary['count'] / len(df) * 100.0

print('=== Process Class Summary ===')
display(summary)

In [None]:
# === Cell 12: Save Summaries ===
# Output goes to the current working directory.
out_dir = Path('.')
sum_path = out_dir / 'process_pattern_summary.csv'

# Row-level export: the three flags, the derived pattern and class, and LOS.
export_cols = [
    'ecg_missing_flag',
    'lab_missing_flag',
    'cath_missing_flag',
    'pattern',
    'process_class',
    'los_days_clean',
]
df[export_cols].to_csv(sum_path, index=False)
print('Saved:', sum_path.resolve())