In [None]:
# NOTEARS → Edge Sweep(k=1..Kmax) → F(m/mw) → O/F/OF × 10-fold CV
# Models: Logit, RF, XGBoost(있으면), LightGBM(있으면)
# Outputs:
#   - logs/run_log.csv
#   - reports/report_long.csv
#   - reports/report_pivot.csv
#   - reports/report_all_like_user.csv
#   - reports/best_per_model.csv
#   - reports/edges/master_edges.csv
#   - reports/edges/m_k{K}.csv, reports/edges/mw_k{K}.csv  (각 k마다 사용 엣지 기록)

import warnings, time
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, brier_score_loss
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Silence all warnings for the rest of the run (third-party libraries are noisy).
warnings.filterwarnings("ignore")

# Optional gradient-boosting back-ends: each flag records whether the import
# succeeded.  `Exception` (not just ImportError) is caught on purpose, because
# these packages can also fail at import time with loader errors.
try:
    from lightgbm import LGBMClassifier
    _HAS_LGBM = True
except Exception:
    _HAS_LGBM = False

try:
    from xgboost import XGBClassifier
    _HAS_XGB = True
except Exception:
    _HAS_XGB = False

# gCastle NOTEARS (optional; the pipeline proceeds with notears_linear without it)
try:
    from castle.algorithms import Notears  # noqa: F401
    _HAS_NOTEARS = True
except Exception:
    _HAS_NOTEARS = False
    # The global "ignore" filter installed above would swallow this notice,
    # so temporarily force it through and then restore the filters.
    with warnings.catch_warnings():
        warnings.simplefilter("always")
        warnings.warn("gCastle 미설치: notears_linear만 사용합니다.")

# 기본 설정
DATA_PATH = "./training_data.csv"   # input CSV
LABEL_COL = "label"                 # binary target column
ID_PREFIX = "Unnamed"               # columns starting with this are treated as ids
N_SPLITS = 10                       # number of CV folds
RANDOM_STATE = 42
DAG_NAME = "NOTEARS"
CLASS_WEIGHT_BALANCED = True
VERBOSE_PRINT = True  # print one summary line per training run

# Model hyper-parameters
LGBM_PARAMS = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "num_leaves": 63,
    "max_depth": -1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "objective": "binary",
    "class_weight": "balanced",
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
}

XGB_PARAMS = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
}

# ---------------------------
# 유틸/평가/모델팩토리/저장
# ---------------------------

def ensure_dirs():
    """Create the log and report output directories if they do not exist yet."""
    for folder in ("./logs", "./reports", "./reports/edges"):
        Path(folder).mkdir(parents=True, exist_ok=True)

def expected_calibration_error(y_true, y_prob, n_bins=10):
    """Return the expected calibration error over equal-width probability bins.

    Probabilities are bucketed into ``n_bins`` bins on [0, 1]; each non-empty
    bin contributes ``|mean(label) - mean(prob)|`` weighted by its share of
    the samples.  Empty input yields 0.0.
    """
    labels = np.asarray(y_true)
    probs = np.asarray(y_prob)
    edges = np.linspace(0, 1, n_bins + 1)
    total = len(labels)

    ece = 0.0
    for idx, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        # Widen the last bin slightly so that a probability of exactly 1.0 is counted.
        upper = hi + 1e-12 if idx == n_bins - 1 else hi
        in_bin = (probs >= lo) & (probs < upper)
        if in_bin.any():
            share = in_bin.sum() / total
            ece += share * abs(labels[in_bin].mean() - probs[in_bin].mean())
    return float(ece)

def run_cv(model, X, y, n_splits=10, seed=42):
    """Run shuffled stratified k-fold CV and return the fold-averaged metrics.

    Args:
        model: estimator exposing ``fit`` and ``predict_proba`` (or ``predict``).
        X: feature table; ``X.values`` is cast to float32.
        y: label vector; ``y.values`` is cast to int.
        n_splits: number of folds.
        seed: random state for the fold split (and the LightGBM sub-split).

    Returns:
        dict with keys ``f1``, ``AUPRC``, ``AUROC``, ``Brier``, ``ECE``
        (NaN-aware means over the folds).
    """
    features = X.values.astype(np.float32)
    labels = y.values.astype(int)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    uses_lgbm_path = type(model).__name__ == "LGBMClassifier"

    scores = {"f1": [], "AUPRC": [], "AUROC": [], "Brier": [], "ECE": []}
    for train_idx, test_idx in splitter.split(features, labels):
        X_tr, y_tr = features[train_idx], labels[train_idx]
        X_te, y_te = features[test_idx], labels[test_idx]

        if uses_lgbm_path:
            # NOTE(review): LightGBM is fit on 80% of the training fold only; the
            # remaining 20% is passed as eval_set, and no early stopping is
            # configured in this call.
            X_fit, X_val, y_fit, y_val = train_test_split(
                X_tr, y_tr, test_size=0.2, random_state=seed, stratify=y_tr
            )
            model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], eval_metric="auc")
        else:
            model.fit(X_tr, y_tr)

        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(X_te)[:, 1]
        else:
            proba = model.predict(X_te).astype(float)
        pred = (proba >= 0.5).astype(int)

        # Ranking metrics are undefined when the test fold holds a single class.
        both_classes = len(np.unique(y_te)) > 1
        scores["f1"].append(f1_score(y_te, pred, zero_division=0))
        scores["AUPRC"].append(average_precision_score(y_te, proba) if both_classes else np.nan)
        scores["AUROC"].append(roc_auc_score(y_te, proba) if both_classes else np.nan)
        scores["Brier"].append(brier_score_loss(y_te, proba))
        scores["ECE"].append(expected_calibration_error(y_te, proba, 10))

    return {name: float(np.nanmean(values)) for name, values in scores.items()}

def make_models():
    """Build the list of ``(name, estimator)`` pairs to evaluate.

    Logit and RandomForest are always included; XGBoost and LightGBM are
    appended only when their import succeeded.
    """
    weighting = "balanced" if CLASS_WEIGHT_BALANCED else None
    logit = LogisticRegression(solver="lbfgs", max_iter=200, class_weight=weighting)
    forest = RandomForestClassifier(
        n_estimators=400,
        class_weight=weighting,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

    registry = [("Logit", logit), ("RandomForest", forest)]
    if _HAS_XGB:
        registry.append(("XGBoost", XGBClassifier(**XGB_PARAMS, verbosity=0)))
    if _HAS_LGBM:
        registry.append(("LightGBM", LGBMClassifier(**LGBM_PARAMS)))
    return registry

def append_log_csv(row: Dict):
    """Append one record to ./logs/run_log.csv, writing the header only for a new file."""
    ensure_dirs()
    log_path = Path("./logs/run_log.csv")
    needs_header = not log_path.exists()
    record = pd.DataFrame([row])
    record.to_csv(log_path, mode="a", index=False, encoding="utf-8-sig", header=needs_header)

def one_line(dag,k,model,set_name,met,secs):
    """Format one CV run as a single human-readable summary line."""
    context = f"DAG={dag} | k={k} | model={model} | set={set_name}"
    scores = ", ".join(
        f"{name}={met[name]:.4f}" for name in ("f1", "AUPRC", "AUROC", "Brier", "ECE")
    )
    return f"{context} | {scores} | time={secs:.2f}s"

# ---------------------------
# 엣지 관련
# ---------------------------

def pick_all_edges(W: np.ndarray) -> List[Tuple[int,int,float]]:
    """Return every off-diagonal, non-zero entry of ``W`` as ``(i, j, weight)``.

    Each tuple is read as an edge i <- j.  The result is ordered by |weight|
    descending, then by signed weight descending; ties keep row-major order.
    """
    size = W.shape[0]
    found = [
        (row, col, W[row, col])
        for row in range(size)
        for col in range(size)
        if row != col and W[row, col] != 0.0
    ]
    return sorted(found, key=lambda edge: (abs(edge[2]), edge[2]), reverse=True)

def pick_top_k_edges(edge_list: List[Tuple[int,int,float]], k: int) -> List[Tuple[int,int,float]]:
    """Return the first ``k`` edges of an already-ranked edge list.

    A non-positive ``k`` yields an empty list.  (A plain ``edge_list[:k]``
    would treat a negative ``k`` as "all but the last |k|", which is never
    what a top-k selection means.)  ``k`` larger than the list returns all edges.
    """
    if k <= 0:
        return []
    return edge_list[:k]

def build_feature_df_from_edges(X_base: pd.DataFrame, cols: List[str],
                                edges_ijw: List[Tuple[int,int,float]],
                                feature_type: str) -> Tuple[pd.DataFrame, List[Dict]]:
    """Build one derived product feature per edge.

    Args:
        X_base: table holding at least the columns named in ``cols``.
        cols: column names; edge indices ``i``/``j`` index into this list.
        edges_ijw: ranked ``(i, j, w)`` tuples, read as edge j -> i.
        feature_type: ``"m"`` for the plain product ``src * dst`` or ``"mw"``
            for the weighted product ``w * src * dst``.

    Returns:
        ``(X_feat, used_edges_meta)`` where ``X_feat`` shares ``X_base``'s index
        and ``used_edges_meta`` is a list of dicts with keys
        ``rank``, ``src``, ``dst``, ``weight``, ``feature``.

    Raises:
        ValueError: if ``feature_type`` is neither ``"m"`` nor ``"mw"``.
    """
    # Validate up front so a bad feature_type is rejected even for an empty edge list.
    if feature_type not in ("m", "mw"):
        raise ValueError("feature_type must be 'm' or 'mw'")
    weighted = feature_type == "mw"
    tag = "mulw" if weighted else "mul"

    feats = {}
    used_meta = []
    for rank, (i, j, w) in enumerate(edges_ijw, start=1):
        src, dst = cols[j], cols[i]  # edge j -> i
        fname = f"{src}_{tag}_{dst}"
        product = X_base[src] * X_base[dst]
        feats[fname] = (w * product).values if weighted else product.values
        used_meta.append(dict(rank=rank, src=src, dst=dst, weight=float(w), feature=fname))
    return pd.DataFrame(feats, index=X_base.index), used_meta

def save_edges_csv(used_meta: List[Dict], feature_type: str, k: int):
    """Write the edges used for one (feature_type, k) combination to its own CSV."""
    ensure_dirs()
    target = Path(f"./reports/edges/{feature_type}_k{k}.csv")
    table = pd.DataFrame(used_meta)
    table.to_csv(target, index=False, encoding="utf-8-sig")

def save_master_edges(all_edges_meta: List[Dict]):
    """Write the full ranked candidate-edge list to the master edges CSV."""
    ensure_dirs()
    target = Path("./reports/edges/master_edges.csv")
    table = pd.DataFrame(all_edges_meta)
    table.to_csv(target, index=False, encoding="utf-8-sig")




In [None]:
# ---------------------------
# 사용자 스타일 Wide CSV
# ---------------------------

def save_user_style_wide_csv(all_rows: List[Dict], out_path: Path):
    """Write a wide report with one row per (dag, model, feature, k).

    Each row carries the metrics of the O, F and OF sets side by side, plus
    the deltas ``F-O`` and ``OF-O``.

    Baseline ("O") rows are recorded under their own ``feature`` value, so
    they never share a record with the derived "F"/"OF" rows.  To make the
    delta columns meaningful, the O metrics are looked up by
    ``(dag, model, k)`` and copied into every record of that group whose O
    columns are still empty before the deltas are computed.

    Args:
        all_rows: dicts with keys ``dag``, ``model``, ``feature``, ``k``,
            ``set`` and the five metric names; rows whose ``set`` is not
            O/F/OF are ignored.
        out_path: destination CSV; parent directories are created as needed.
            With no usable rows, a header-only CSV is written.
    """
    groups = ["O", "F", "OF", "F-O", "OF-O"]
    mets = ["f1", "AUPRC", "AUROC", "Brier", "ECE"]

    key_map = {}
    baselines = {}  # (dag, model, k) -> {metric: value} taken from "O" rows
    for r in all_rows:
        if r["set"] not in {"O", "F", "OF"}:
            continue
        key = (r["dag"], r["model"], r["feature"], r["k"])
        if key not in key_map:
            base = dict(DAG=r["dag"], Model=r["model"], Feature=r["feature"], k=r["k"])
            base.update({f"{g}:{m}": np.nan for g in groups for m in mets})
            key_map[key] = base
        for m in mets:
            key_map[key][f"{r['set']}:{m}"] = r[m]
        if r["set"] == "O":
            baselines[(r["dag"], r["model"], r["k"])] = {m: r[m] for m in mets}

    for (dag, model, _feature, k), rec in key_map.items():
        baseline = baselines.get((dag, model, k), {})
        for m in mets:
            # Borrow the group's baseline when this record has no O value of its own.
            if pd.isna(rec[f"O:{m}"]) and m in baseline:
                rec[f"O:{m}"] = baseline[m]
            if pd.notna(rec[f"F:{m}"]) and pd.notna(rec[f"O:{m}"]):
                rec[f"F-O:{m}"] = rec[f"F:{m}"] - rec[f"O:{m}"]
            if pd.notna(rec[f"OF:{m}"]) and pd.notna(rec[f"O:{m}"]):
                rec[f"OF-O:{m}"] = rec[f"OF:{m}"] - rec[f"O:{m}"]

    order = ["DAG", "Model", "Feature"] + [f"{g}:{m}" for g in groups for m in mets] + ["k"]
    # Passing `columns` fixes the column order and also yields a valid
    # (header-only) frame when there are no records.
    wide = pd.DataFrame(list(key_map.values()), columns=order)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    wide.to_csv(out_path, index=False, encoding="utf-8-sig")

In [None]:
# ---------------------------
# 데이터 로드 → NOTEARS → OLS W
# ---------------------------

ensure_dirs()

# Validate inputs with explicit exceptions (asserts are stripped under `python -O`).
path = Path(DATA_PATH)
if not path.exists():
    raise FileNotFoundError(f"not found: {path}")
df = pd.read_csv(path)
if LABEL_COL not in df.columns:
    raise ValueError(f"'{LABEL_COL}' not found")

# Everything that is neither an id-like column nor the label is a feature.
id_cols = [c for c in df.columns if c.startswith(ID_PREFIX)]
feature_cols = [c for c in df.columns if c not in id_cols + [LABEL_COL]]
X_base = df[feature_cols].copy()
y = df[LABEL_COL].astype(int).copy()

# Drop zero-variance columns from structure learning (X_base itself keeps them)
var0 = [c for c in feature_cols if np.isclose(X_base[c].var(ddof=0), 0.0)]
cols_used = [c for c in feature_cols if c not in var0]

# Standardize
Z = StandardScaler().fit_transform(X_base[cols_used].values.astype(float))

# NOTEARS (linear)
from notears.linear import notears_linear
NOTEARS_L1 = 0.10     # sparsity control (higher -> fewer edges)
NOTEARS_TH = 1e-3     # edge threshold

# notears_linear fits X ~ X @ W, so in its output W[i, j] is the weight of
# edge i -> j (column j lists the parents of node j).  The rest of this script
# reads row i as the child and column j as the parent (B[i, j]: j -> i), so
# transpose once here to match that convention.
B = notears_linear(Z, lambda1=NOTEARS_L1, loss_type='l2')
B = np.asarray(B, dtype=float).T  # B[i,j] : j->i

# Binary structure A (threshold applied)
A = (np.abs(B) > NOTEARS_TH).astype(float)

# Re-fit each node on its selected parents with OLS to stabilize W
p = B.shape[0]
W = np.zeros_like(B)
for i in range(p):
    parents = np.where(A[i, :] == 1.0)[0]
    if parents.size == 0:
        continue
    yi = Z[:, i]
    Xi = Z[:, parents]
    coef, *_ = np.linalg.lstsq(Xi, yi, rcond=None)
    W[i, parents] = coef

# Extract every candidate edge (|w| descending)
edge_candidates = pick_all_edges(W)  # [(i,j,w), ...]  i<-j
Kmax = len(edge_candidates)          # maximum number of derived edges (= derived columns)
print("candidate edges (derived features count) =", Kmax)

# Save the master edge list (for analysis)
master_meta = []
for rank,(i,j,w) in enumerate(edge_candidates, start=1):
    master_meta.append(dict(rank=rank, src=cols_used[j], dst=cols_used[i], weight=float(w)))
save_master_edges(master_meta)

# ---------------------------
# k=1..Kmax 스윕, 모델 학습/저장
# ---------------------------

models = make_models()
long_rows=[]; wide_rows=[]

# The baseline "O" evaluation uses only X_base, y, the model and the fixed seed,
# none of which depend on k.  Compute it once per model and reuse the metrics
# for every k instead of re-running the full CV each time.
baseline_cache = {}  # model name -> metrics dict

for k in range(1, Kmax+1):
    # Select the top-k edges
    edges_k = pick_top_k_edges(edge_candidates, k)

    # Derived feature sets: m (product) and mw (weighted product)
    Xm, edges_meta_m   = build_feature_df_from_edges(X_base[cols_used], cols_used, edges_k, feature_type="m")
    Xmw, edges_meta_mw = build_feature_df_from_edges(X_base[cols_used], cols_used, edges_k, feature_type="mw")

    # Record which edges were used for this k
    save_edges_csv(edges_meta_m,   "m",  k)
    save_edges_csv(edges_meta_mw,  "mw", k)

    feature_sets=[("original", X_base), ("m", Xm), ("mw", Xmw)]
    for mname, model in models:
        for fname, Xf in feature_sets:
            # "original" is evaluated as O only; derived sets as F and OF
            sets = [("O", X_base)] if fname=="original" else [("F", Xf), ("OF", pd.concat([X_base, Xf], axis=1))]
            for sname, Xset in sets:
                t0=time.time()
                if sname=="O" and mname in baseline_cache:
                    # Cache hit: the reported time is just the lookup, i.e. ~0s.
                    metrics=dict(baseline_cache[mname])
                else:
                    # LightGBM gets a fresh estimator per run; other models are refit in place.
                    clf = (LGBMClassifier(**LGBM_PARAMS) if (_HAS_LGBM and mname=="LightGBM") else model)
                    metrics=run_cv(clf, Xset, y, n_splits=N_SPLITS, seed=RANDOM_STATE)
                    if sname=="O":
                        baseline_cache[mname]=dict(metrics)
                secs=time.time()-t0
                if VERBOSE_PRINT: print(one_line(DAG_NAME,k,mname,sname,metrics,secs))
                append_log_csv(dict(dag=DAG_NAME,k=k,model=mname,set=sname,feature=fname,**metrics,seconds=round(secs,3)))
                long_rows.append(dict(DAG=DAG_NAME,k=k,Model=mname,Feature=fname,Set=sname,**metrics))
                wide_rows.append(dict(dag=DAG_NAME,k=k,model=mname,feature=fname,set=sname,**metrics))

# 리포트 저장
df_long=pd.DataFrame(long_rows)
# With no candidate edges the sweep records nothing; fail with a clear message
# instead of an opaque KeyError from pivot_table/sort_values further down.
if df_long.empty:
    raise RuntimeError("no CV results to report: the edge sweep produced no rows (Kmax == 0?)")

# Long format: one row per (DAG, k, Model, Feature, Set)
df_long.to_csv("./reports/report_long.csv", index=False, encoding="utf-8-sig")

# Pivot: metrics x Set as columns
pv=df_long.pivot_table(index=["DAG","k","Model","Feature"], columns="Set",
                       values=["f1","AUPRC","AUROC","Brier","ECE"], aggfunc="mean")
pv.to_csv("./reports/report_pivot.csv", encoding="utf-8-sig")
save_user_style_wide_csv(wide_rows, Path("./reports/report_all_like_user.csv"))
print("saved reports")

# Best row per model (priority: AUROC desc, AUPRC desc, f1 desc, Brier asc, ECE asc)
order_cols=["AUROC","AUPRC","f1","Brier","ECE"]
ascending =[False,   False,   False, True,   True]
best_df=(df_long.sort_values(order_cols, ascending=ascending)
               .groupby("Model", as_index=False).head(1).reset_index(drop=True))
best_df.to_csv("./reports/best_per_model.csv", index=False, encoding="utf-8-sig")
best_df