In [8]:
# Cell 1: 라이브러리 및 전역 설정
import warnings, time
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, brier_score_loss
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

warnings.filterwarnings("ignore")

try:
    from lightgbm import LGBMClassifier
    _HAS_LGBM = True
except Exception:
    _HAS_LGBM = False

try:
    from xgboost import XGBClassifier
    _HAS_XGB = True
except Exception:
    _HAS_XGB = False

# NOTEARS
_HAS_NOTEARS = True
try:
    from notears.linear import notears_linear
except Exception:
    _HAS_NOTEARS = False

DATA_PATH = "./training_data.csv"
LABEL_COL = "label"
ID_PREFIX = "Unnamed"
N_SPLITS = 10
RANDOM_STATE = 42
DAG_NAME = "NOTEARS"
CLASS_WEIGHT_BALANCED = True
VERBOSE_PRINT = True


In [9]:
# Cell 2: 하이퍼파라미터와 유틸 함수
LGBM_PARAMS = dict(
    n_estimators=300, learning_rate=0.05, num_leaves=63, max_depth=-1,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, reg_alpha=0.0,
    objective="binary", class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1
)

XGB_PARAMS = dict(
    n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8,
    reg_lambda=1.0, reg_alpha=0.0, objective="binary:logistic", eval_metric="auc",
    random_state=RANDOM_STATE, n_jobs=-1
)

NOTEARS_L1 = 0.10    # sparsity 제어(높을수록 간선 수 감소)
NOTEARS_TH = 1e-3    # B 임계치
W_TH = 1e-6          # OLS 재적합 후 W 소거 임계치

def ensure_dirs():
    Path("./logs").mkdir(parents=True, exist_ok=True)
    Path("./reports").mkdir(parents=True, exist_ok=True)
    Path("./reports/edges").mkdir(parents=True, exist_ok=True)
    Path("./reports/edges/by_model").mkdir(parents=True, exist_ok=True)

def expected_calibration_error(y_true, y_prob, n_bins=10):
    y_true = np.asarray(y_true); y_prob = np.asarray(y_prob)
    bins = np.linspace(0, 1, n_bins + 1)
    ece = 0.0; N = len(y_true)
    for b in range(n_bins):
        L, R = bins[b], bins[b+1]
        m = (y_prob >= L) & (y_prob < (R if b < n_bins - 1 else R + 1e-12))
        if not np.any(m):
            continue
        ece += (m.sum() / N) * abs(y_true[m].mean() - y_prob[m].mean())
    return float(ece)


In [10]:
# Cell 3: 모델 팩토리, 로깅, 교차검증
def make_models():
    ms = [
        ("Logit", LogisticRegression(
            solver="lbfgs", max_iter=200,
            class_weight=("balanced" if CLASS_WEIGHT_BALANCED else None))),
        ("RandomForest", RandomForestClassifier(
            n_estimators=400, n_jobs=-1, random_state=RANDOM_STATE,
            class_weight=("balanced" if CLASS_WEIGHT_BALANCED else None)))
    ]
    if _HAS_XGB:
        ms.append(("XGBoost", XGBClassifier(**XGB_PARAMS, verbosity=0)))
    if _HAS_LGBM:
        ms.append(("LightGBM", LGBMClassifier(**LGBM_PARAMS)))
    return ms

def append_log_csv(row: Dict):
    ensure_dirs()
    p = Path("./logs/run_log.csv")
    pd.DataFrame([row]).to_csv(
        p, mode="a", index=False, encoding="utf-8-sig", header=not p.exists())

def one_line(dag, k, model, set_name, met, secs):
    return (f"DAG={dag} | k={k} | model={model} | set={set_name} | "
            f"f1={met['f1']:.4f}, AUPRC={met['AUPRC']:.4f}, AUROC={met['AUROC']:.4f}, "
            f"Brier={met['Brier']:.4f}, ECE={met['ECE']:.4f} | time={secs:.2f}s")

def run_cv(model, X, y, n_splits=10, seed=42):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    f1s, auprcs, aurocs, briers, eces = [], [], [], [], []
    Xv = X.values.astype(np.float32); yv = y.values.astype(int)
    for tr, te in skf.split(Xv, yv):
        X_tr, X_te, y_tr, y_te = Xv[tr], Xv[te], yv[tr], yv[te]
        pipe = make_pipeline(StandardScaler(with_mean=True, with_std=True), model)
        is_lgbm = (model.__class__.__name__ == "LGBMClassifier")
        if is_lgbm:
            X_tr2, X_val, y_tr2, y_val = train_test_split(
                X_tr, y_tr, test_size=0.2, random_state=seed, stratify=y_tr)
            pipe.fit(X_tr2, y_tr2)
        else:
            pipe.fit(X_tr, y_tr)
        proba = pipe.predict_proba(X_te)[:, 1] if hasattr(pipe, "predict_proba") \
                else pipe.predict(X_te).astype(float)
        pred = (proba >= 0.5).astype(int)
        f1s.append(f1_score(y_te, pred, zero_division=0))
        auprcs.append(average_precision_score(y_te, proba) if len(np.unique(y_te)) > 1 else np.nan)
        aurocs.append(roc_auc_score(y_te, proba) if len(np.unique(y_te)) > 1 else np.nan)
        briers.append(brier_score_loss(y_te, proba))
        eces.append(expected_calibration_error(y_te, proba, 10))
    return dict(
        f1=float(np.nanmean(f1s)),
        AUPRC=float(np.nanmean(auprcs)),
        AUROC=float(np.nanmean(aurocs)),
        Brier=float(np.nanmean(briers)),
        ECE=float(np.nanmean(eces)))


In [11]:
# Cell 4: 엣지 처리와 저장 함수
def pick_all_edges(W: np.ndarray):
    n = W.shape[0]; out = []
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            w = W[i, j]
            if w != 0.0:
                out.append((i, j, w))
    out.sort(key=lambda t: (abs(t[2]), t[2]), reverse=True)
    return out

def pick_top_k_edges(edge_list, k):
    return edge_list[:k]

def build_feature_df_from_edges(
    X_base: pd.DataFrame, cols: List[str],
    edges_ijw: List[Tuple[int,int,float]], feature_type: str):
    feats = {}
    used_meta = []
    for rank, (i, j, w) in enumerate(edges_ijw, start=1):
        A, B = cols[j], cols[i]
        if feature_type == "m":
            fname = f"{A}_mul_{B}"
            feats[fname] = (X_base[A] * X_base[B]).values
        elif feature_type == "mw":
            fname = f"{A}_mulw_{B}"
            feats[fname] = (w * (X_base[A] * X_base[B])).values
        else:
            raise ValueError("feature_type must be 'm' or 'mw'")
        used_meta.append(dict(rank=rank, src=A, dst=B, weight=float(w), feature=fname))
    return pd.DataFrame(feats, index=X_base.index), used_meta

def save_edges_csv(used_meta: List[Dict], feature_type: str, k: int):
    ensure_dirs()
    path = Path(f"./reports/edges/{feature_type}_k{k}.csv")
    pd.DataFrame(used_meta).to_csv(path, index=False, encoding="utf-8-sig")

def save_master_edges(all_edges_meta: List[Dict]):
    ensure_dirs()
    path = Path("./reports/edges/master_edges.csv")
    pd.DataFrame(all_edges_meta).to_csv(path, index=False, encoding="utf-8-sig")

def save_edges_by_model(dag: str, k: int, model: str,
                        feature: str, set_name: str, used_meta: List[Dict]):
    ensure_dirs()
    df = pd.DataFrame(used_meta)
    if df.empty:
        df = pd.DataFrame(columns=["rank", "src", "dst", "weight", "feature"])
    df.insert(0, "Set", set_name)
    df.insert(0, "Feature", feature)
    df.insert(0, "Model", model)
    df.insert(0, "k", k)
    df.insert(0, "DAG", dag)
    per_run = Path(f"./reports/edges/by_model/{model}_{feature}_{set_name}_k{k}.csv")
    df.to_csv(per_run, index=False, encoding="utf-8-sig")

    summary_row = dict(
        DAG=dag, k=k, Model=model, Feature=feature, Set=set_name,
        edge_count=len(used_meta),
        features=";".join([m["feature"] for m in used_meta]) if used_meta else ""
    )
    summary_p = Path("./reports/edges/usage_summary.csv")
    pd.DataFrame([summary_row]).to_csv(
        summary_p, mode="a", index=False, encoding="utf-8-sig", header=not summary_p.exists())


In [12]:
# Cell 5: 데이터 로드 → NOTEARS 구조학습 → A → OLS로 W → 엣지 후보 생성
ensure_dirs()
path = Path(DATA_PATH); assert path.exists(), f"not found: {path}"
df = pd.read_csv(path); assert LABEL_COL in df.columns, f"'{LABEL_COL}' not found"

id_cols = [c for c in df.columns if c.startswith(ID_PREFIX)]
feature_cols = [c for c in df.columns if c not in id_cols + [LABEL_COL]]
X_base = df[feature_cols].copy()
y = df[LABEL_COL].astype(int).copy()

var0 = [c for c in feature_cols if np.isclose(X_base[c].var(ddof=0), 0.0)]
cols_used = [c for c in feature_cols if c not in var0]
X_base_used = X_base[cols_used].copy()

if not _HAS_NOTEARS:
    raise ImportError("notears 설치 필요: pip install notears")

Z = StandardScaler().fit_transform(X_base_used.values.astype(float))
p = len(cols_used)

# NOTEARS (linear)
B = notears_linear(Z, lambda1=NOTEARS_L1, loss_type='l2')  # B[i,j] : j -> i
B = np.asarray(B, dtype=float)

A = (np.abs(B) > NOTEARS_TH).astype(float)

# OLS 재적합으로 W 안정화
W = np.zeros_like(B)
for i in range(p):
    parents = np.where(A[i, :] == 1.0)[0]
    if parents.size == 0:
        continue
    yi = Z[:, i]
    Xi = Z[:, parents]
    coef, *_ = np.linalg.lstsq(Xi, yi, rcond=None)
    W[i, parents] = coef

W[np.abs(W) < W_TH] = 0.0

edge_candidates = pick_all_edges(W)
E = len(edge_candidates)
P = len(cols_used)
print(f"edges={E}, nodes={P}")

master_meta = []
for rank, (i, j, w) in enumerate(edge_candidates, start=1):
    master_meta.append(dict(rank=rank, src=cols_used[j], dst=cols_used[i], weight=float(w)))
save_master_edges(master_meta)


edges=9, nodes=13


In [6]:
# Cell 6: 학습 루프 (k는 '엣지 개수'로 통일)
models = make_models()
long_rows, wide_rows = [], []

# O–original baseline: k=0에서 모델별 1회
for mname, model in models:
    t0 = time.time()
    clf = (LGBMClassifier(**LGBM_PARAMS) if (_HAS_LGBM and mname == "LightGBM") else model)
    metrics = run_cv(clf, X_base_used, y, n_splits=N_SPLITS, seed=RANDOM_STATE)
    secs = time.time() - t0
    if VERBOSE_PRINT:
        print(one_line(DAG_NAME, 0, mname, "O", metrics, secs))
    append_log_csv(dict(dag=DAG_NAME, k=0, model=mname, set="O", feature="original",
                        **metrics, seconds=round(secs, 3)))
    long_rows.append(dict(DAG=DAG_NAME, k=0, Model=mname, Feature="original", Set="O", **metrics))
    wide_rows.append(dict(dag=DAG_NAME, k=0, model=mname, feature="original", set="O", **metrics))
    save_edges_by_model(DAG_NAME, 0, mname, "original", "O", [])

# k=1..E 각 단계마다 m, mw의 F/OF 모두 수행
def run_k_loop_for_feature(feature_type: str):
    if E == 0:
        return
    for k in range(1, E + 1):
        edges_k = pick_top_k_edges(edge_candidates, k)
        X_edges, used_meta = build_feature_df_from_edges(
            X_base_used, cols_used, edges_k, feature_type
        )
        save_edges_csv(used_meta, feature_type, k)

        for mname, model in models:
            clf = (LGBMClassifier(**LGBM_PARAMS) if (_HAS_LGBM and mname == "LightGBM") else model)

            # F: 엣지 파생만
            save_edges_by_model(DAG_NAME, k, mname, feature_type, "F", used_meta)
            t0 = time.time()
            metrics = run_cv(clf, X_edges, y, n_splits=N_SPLITS, seed=RANDOM_STATE)
            secs = time.time() - t0
            if VERBOSE_PRINT:
                print(one_line(DAG_NAME, k, mname, "F", metrics, secs))
            append_log_csv(dict(dag=DAG_NAME, k=k, model=mname, set="F", feature=feature_type,
                                **metrics, seconds=round(secs, 3)))
            long_rows.append(dict(DAG=DAG_NAME, k=k, Model=mname, Feature=feature_type, Set="F", **metrics))
            wide_rows.append(dict(dag=DAG_NAME, k=k, model=mname, feature=feature_type, set="F", **metrics))

            # OF: 원본 노드 전체 + 엣지 파생
            X_OF = pd.concat([X_base_used, X_edges], axis=1)
            save_edges_by_model(DAG_NAME, k, mname, feature_type, "OF", used_meta)
            t0 = time.time()
            metrics = run_cv(clf, X_OF, y, n_splits=N_SPLITS, seed=RANDOM_STATE)
            secs = time.time() - t0
            if VERBOSE_PRINT:
                print(one_line(DAG_NAME, k, mname, "OF", metrics, secs))
            append_log_csv(dict(dag=DAG_NAME, k=k, model=mname, set="OF", feature=feature_type,
                                **metrics, seconds=round(secs, 3)))
            long_rows.append(dict(DAG=DAG_NAME, k=k, Model=mname, Feature=feature_type, Set="OF", **metrics))
            wide_rows.append(dict(dag=DAG_NAME, k=k, model=mname, feature=feature_type, set="OF", **metrics))

run_k_loop_for_feature("m")
run_k_loop_for_feature("mw")


DAG=NOTEARS | k=0 | model=Logit | set=O | f1=0.1406, AUPRC=0.1913, AUROC=0.8816, Brier=0.1477, ECE=0.2809 | time=0.67s
DAG=NOTEARS | k=0 | model=RandomForest | set=O | f1=0.3142, AUPRC=0.4535, AUROC=0.9114, Brier=0.0159, ECE=0.0056 | time=17.92s
DAG=NOTEARS | k=0 | model=XGBoost | set=O | f1=0.4448, AUPRC=0.4926, AUROC=0.9227, Brier=0.0150, ECE=0.0087 | time=14.10s
[LightGBM] [Info] Number of positive: 285, number of negative: 12588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3042
[LightGBM] [Info] Number of data points in the train set: 12873, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 286, number of negative: 12588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of

In [7]:
# Cell 7: 리포트 저장
df_long = pd.DataFrame(long_rows)
df_long.to_csv("./reports/report_long.csv", index=False, encoding="utf-8-sig")
print("saved report_long.csv")

required = ["DAG","k","Model","Feature","Set","f1","AUPRC","AUROC","Brier","ECE"]
if not df_long.empty and all(col in df_long.columns for col in required):
    pv = df_long.pivot_table(
        index=["DAG","k","Model","Feature"],
        columns="Set",
        values=["f1","AUPRC","AUROC","Brier","ECE"],
        aggfunc="mean")
    pv.to_csv("./reports/report_pivot.csv", encoding="utf-8-sig")

    def save_user_style_wide_csv(all_rows: List[Dict], out_path: Path):
        groups = ["O","F","OF","F-O","OF-O"]
        mets = ["f1","AUPRC","AUROC","Brier","ECE"]
        key_map = {}
        for r in all_rows:
            if r["set"] not in {"O","F","OF"}:
                continue
            key = (r["dag"], r["model"], r["feature"], r["k"])
            if key not in key_map:
                base = dict(DAG=r["dag"], Model=r["model"], Feature=r["feature"], k=r["k"])
                for g in groups:
                    for m in mets:
                        base[f"{g}:{m}"] = np.nan
                key_map[key] = base
            for m in mets:
                key_map[key][f"{r['set']}:{m}"] = r[m]
        for rec in key_map.values():
            for m in mets:
                if pd.notna(rec[f"O:{m}"]) and pd.notna(rec[f"F:{m}"]):
                    rec[f"F-O:{m}"] = rec[f"F:{m}"] - rec[f"O:{m}"]
                if pd.notna(rec[f"O:{m}"]) and pd.notna(rec[f"OF:{m}"]):
                    rec[f"OF-O:{m}"] = rec[f"OF:{m}"] - rec[f"O:{m}"]
        wide = pd.DataFrame(list(key_map.values()))
        order = ["DAG","Model","Feature"] + [f"{g}:{m}" for g in groups for m in mets] + ["k"]
        wide = wide[order]
        out_path.parent.mkdir(parents=True, exist_ok=True)
        wide.to_csv(out_path, index=False, encoding="utf-8-sig")

    save_user_style_wide_csv(wide_rows, Path("./reports/report_all_like_user.csv"))
    print("saved report_pivot.csv and report_all_like_user.csv")

    order_cols = ["AUROC","AUPRC","f1","Brier","ECE"]
    ascending  = [False,   False,   False, True,   True]
    best_df = (df_long.sort_values(order_cols, ascending=ascending)
                        .groupby("Model", as_index=False).head(1).reset_index(drop=True))
    best_df.to_csv("./reports/best_per_model.csv", index=False, encoding="utf-8-sig")
    best_df
else:
    print("warning: df_long is empty or missing metric columns. Skipping pivot and best-per-model reports.")


saved report_long.csv
saved report_pivot.csv and report_all_like_user.csv
