In [1]:
# Cell 1: 라이브러리 및 설정  (변경: GES 사용, DAG_NAME='GES')
import warnings, time
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, brier_score_loss
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings("ignore")

try:
    from lightgbm import LGBMClassifier
    _HAS_LGBM = True
except Exception:
    _HAS_LGBM = False

try:
    from xgboost import XGBClassifier
    _HAS_XGB = True
except Exception:
    _HAS_XGB = False

# GES 우선, 미설치 시 점수기반 대체(HillClimbSearch)
_HAS_GES = True
try:
    from causallearn.search.ScoreBased.GES import ges
except Exception:
    _HAS_GES = False

_HAS_HC = True
try:
    from pgmpy.estimators import HillClimbSearch, BicScore
except Exception:
    _HAS_HC = False

DATA_PATH = "./training_data.csv"
LABEL_COL = "label"
ID_PREFIX = "Unnamed"
N_SPLITS = 10
RANDOM_STATE = 42
DAG_NAME = "GES"
CLASS_WEIGHT_BALANCED = True
VERBOSE_PRINT = True


In [2]:
# Cell 2: 파라미터와 유틸
LGBM_PARAMS = dict(
    n_estimators=300, learning_rate=0.05, num_leaves=63, max_depth=-1,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, reg_alpha=0.0,
    objective="binary", class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1
)

XGB_PARAMS = dict(
    n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8,
    reg_lambda=1.0, reg_alpha=0.0, objective="binary:logistic", eval_metric="auc",
    random_state=RANDOM_STATE, n_jobs=-1
)

def ensure_dirs():
    Path("./logs").mkdir(parents=True, exist_ok=True)
    Path("./reports").mkdir(parents=True, exist_ok=True)
    Path("./reports/edges").mkdir(parents=True, exist_ok=True)
    Path("./reports/edges/by_model").mkdir(parents=True, exist_ok=True)

def expected_calibration_error(y_true, y_prob, n_bins=10):
    y_true = np.asarray(y_true); y_prob = np.asarray(y_prob)
    bins = np.linspace(0,1,n_bins+1)
    ece=0.0; N=len(y_true)
    for b in range(n_bins):
        L,R = bins[b], bins[b+1]
        m = (y_prob>=L)&(y_prob<(R if b<n_bins-1 else R+1e-12))
        if not np.any(m): continue
        ece += (m.sum()/N)*abs(y_true[m].mean()-y_prob[m].mean())
    return float(ece)


In [3]:
# Cell 3: 모델, 로깅, 평가
def make_models():
    ms=[("Logit", LogisticRegression(solver="lbfgs", max_iter=200,
                                     class_weight=("balanced" if CLASS_WEIGHT_BALANCED else None))),
        ("RandomForest", RandomForestClassifier(n_estimators=400,
                                               class_weight=("balanced" if CLASS_WEIGHT_BALANCED else None),
                                               n_jobs=-1, random_state=RANDOM_STATE))]
    if _HAS_XGB:
        ms.append(("XGBoost", XGBClassifier(**XGB_PARAMS, verbosity=0)))
    if _HAS_LGBM:
        ms.append(("LightGBM", LGBMClassifier(**LGBM_PARAMS)))
    return ms

def append_log_csv(row: Dict):
    ensure_dirs()
    p=Path("./logs/run_log.csv")
    pd.DataFrame([row]).to_csv(p, mode="a", index=False, encoding="utf-8-sig", header=not p.exists())

def one_line(dag,k,model,set_name,met,secs):
    return (f"DAG={dag} | k={k} | model={model} | set={set_name} | "
            f"f1={met['f1']:.4f}, AUPRC={met['AUPRC']:.4f}, AUROC={met['AUROC']:.4f}, "
            f"Brier={met['Brier']:.4f}, ECE={met['ECE']:.4f} | time={secs:.2f}s")

def run_cv(model, X, y, n_splits=10, seed=42):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    f1s, auprcs, aurocs, briers, eces = [], [], [], [], []
    Xv = X.values.astype(np.float32); yv = y.values.astype(int)
    for tr, te in skf.split(Xv, yv):
        X_tr, X_te, y_tr, y_te = Xv[tr], Xv[te], yv[tr], yv[te]
        is_lgbm = (model.__class__.__name__ == "LGBMClassifier")
        if is_lgbm:
            X_tr2, X_val, y_tr2, y_val = train_test_split(X_tr, y_tr, test_size=0.2, random_state=seed, stratify=y_tr)
            model.fit(X_tr2, y_tr2, eval_set=[(X_val, y_val)], eval_metric="auc")
        else:
            model.fit(X_tr, y_tr)
        proba = model.predict_proba(X_te)[:,1] if hasattr(model,"predict_proba") else model.predict(X_te).astype(float)
        pred = (proba>=0.5).astype(int)
        f1s.append(f1_score(y_te, pred, zero_division=0))
        auprcs.append(average_precision_score(y_te, proba) if len(np.unique(y_te))>1 else np.nan)
        aurocs.append(roc_auc_score(y_te, proba) if len(np.unique(y_te))>1 else np.nan)
        briers.append(brier_score_loss(y_te, proba))
        eces.append(expected_calibration_error(y_te, proba, 10))
    return dict(f1=float(np.nanmean(f1s)), AUPRC=float(np.nanmean(auprcs)), AUROC=float(np.nanmean(aurocs)),
                Brier=float(np.nanmean(briers)), ECE=float(np.nanmean(eces)))


In [4]:
# Cell 4: 엣지 관련 함수와 저장 함수
def pick_all_edges(W: np.ndarray):
    n=W.shape[0]; out=[]
    for i in range(n):
        for j in range(n):
            if i==j: continue
            w=W[i,j]
            if w!=0.0:
                out.append((i,j,w))
    out.sort(key=lambda t:(abs(t[2]), t[2]), reverse=True)
    return out

def pick_top_k_edges(edge_list, k):
    return edge_list[:k]

def build_feature_df_from_edges(X_base, cols, edges_ijw, feature_type):
    feats = {}
    used_meta = []
    for rank, (i, j, w) in enumerate(edges_ijw, start=1):
        A, B = cols[j], cols[i]
        if feature_type == "m":
            fname = f"{A}_mul_{B}"
            feats[fname] = (X_base[A] * X_base[B]).values
        elif feature_type == "mw":
            fname = f"{A}_mulw_{B}"
            feats[fname] = (w * (X_base[A] * X_base[B])).values
        used_meta.append(dict(rank=rank, src=A, dst=B, weight=float(w), feature=fname))
    return pd.DataFrame(feats, index=X_base.index), used_meta

def save_edges_csv(used_meta, feature_type, k):
    ensure_dirs()
    path = Path(f"./reports/edges/{feature_type}_k{k}.csv")
    pd.DataFrame(used_meta).to_csv(path, index=False, encoding="utf-8-sig")

def save_master_edges(all_edges_meta: List[Dict]):
    ensure_dirs()
    path = Path("./reports/edges/master_edges.csv")
    pd.DataFrame(all_edges_meta).to_csv(path, index=False, encoding="utf-8-sig")

def save_edges_by_model(dag: str, k: int, model: str, feature: str, set_name: str, used_meta: List[Dict]):
    ensure_dirs()
    df = pd.DataFrame(used_meta)
    if df.empty:
        df = pd.DataFrame(columns=["rank","src","dst","weight","feature"])
    df.insert(0, "Set", set_name)
    df.insert(0, "Feature", feature)
    df.insert(0, "Model", model)
    df.insert(0, "k", k)
    df.insert(0, "DAG", dag)
    per_run = Path(f"./reports/edges/by_model/{model}_{feature}_{set_name}_k{k}.csv")
    df.to_csv(per_run, index=False, encoding="utf-8-sig")

    summary_row = dict(
        DAG=dag, k=k, Model=model, Feature=feature, Set=set_name,
        edge_count=len(used_meta),
        features=";".join([m["feature"] for m in used_meta]) if used_meta else ""
    )
    summary_p = Path("./reports/edges/usage_summary.csv")
    pd.DataFrame([summary_row]).to_csv(summary_p, mode="a", index=False, encoding="utf-8-sig", header=not summary_p.exists())


In [5]:
# Cell 5: 구조학습을 GES로 수행 (미설치 시 HillClimbSearch로 대체), 이후 OLS로 W 추정
ensure_dirs()
path = Path(DATA_PATH); assert path.exists(), f"not found: {path}"
df = pd.read_csv(path); assert LABEL_COL in df.columns, f"'{LABEL_COL}' not found"
id_cols = [c for c in df.columns if c.startswith(ID_PREFIX)]
feature_cols = [c for c in df.columns if c not in id_cols + [LABEL_COL]]
X_base = df[feature_cols].copy()
y = df[LABEL_COL].astype(int).copy()

var0 = [c for c in feature_cols if np.isclose(X_base[c].var(ddof=0), 0.0)]
cols_used = [c for c in feature_cols if c not in var0]
Z = StandardScaler().fit_transform(X_base[cols_used].values.astype(float))
p = len(cols_used)

A = np.zeros((p, p), dtype=float)

if _HAS_GES:
    ges_res = ges(Z, score_func='local_score_BIC')
    G = getattr(ges_res, "G", ges_res)
    try:
        A_tmp = np.zeros((p, p), dtype=float)
        for i in range(p):
            for j in range(p):
                if i == j:
                    continue
                if G.is_directed_from_to(j, i):
                    A_tmp[i, j] = 1.0
        A = A_tmp
    except Exception:
        edges = getattr(G, "get_graph_edges", lambda: [])()
        for e in edges:
            try:
                u = int(str(e.get_node1()))
                v = int(str(e.get_node2()))
                ep1 = str(e.get_endpoint1())
                ep2 = str(e.get_endpoint2())
                if ep1 == 'TAIL' and ep2 == 'ARROW':
                    A[v, u] = 1.0
                elif ep1 == 'ARROW' and ep2 == 'TAIL':
                    A[u, v] = 1.0
            except Exception:
                continue
elif _HAS_HC:
    df_std = pd.DataFrame(Z, columns=cols_used)
    est = HillClimbSearch(df_std, scoring_method=BicScore(df_std))
    model = est.estimate(scoring_method=BicScore(df_std))
    for u, v in model.edges():
        iu = cols_used.index(u)
        iv = cols_used.index(v)
        A[iv, iu] = 1.0
else:
    raise ImportError("GES 실행에 필요한 라이브러리를 찾지 못했습니다. causal-learn 또는 pgmpy 설치 필요")

W = np.zeros((p, p), dtype=float)
for i in range(p):
    parents = np.where(A[i, :] == 1.0)[0]
    if parents.size == 0:
        continue
    yi = Z[:, i]
    Xi = Z[:, parents]
    coef, *_ = np.linalg.lstsq(Xi, yi, rcond=None)
    W[i, parents] = coef

edge_candidates = []
for i in range(p):
    for j in range(p):
        if i == j:
            continue
        w = W[i, j]
        if w != 0.0:
            edge_candidates.append((i, j, w))
edge_candidates.sort(key=lambda t: (abs(t[2]), t[2]), reverse=True)

Kmax = len(edge_candidates)
print("edge (derived feature) count =", Kmax)

master_meta = []
for rank, (i, j, w) in enumerate(edge_candidates, start=1):
    master_meta.append(dict(rank=rank, src=cols_used[j], dst=cols_used[i], weight=float(w)))
save_master_edges(master_meta)


edge (derived feature) count = 0


In [None]:
# Cell 6: 학습 루프와 리포트 (K=0 안전 처리 포함)
models = make_models()
long_rows=[]; wide_rows=[]

# K 리스트: 엣지가 없으면 [0], 있으면 0..Kmax
k_list = [0] if Kmax == 0 else list(range(0, Kmax+1))

for k in k_list:
    if k == 0:
        # 엣지 없음: baseline(O)만 수행
        edges_k = []
        edges_meta_m, edges_meta_mw = [], []
        feature_sets = [("original", X_base)]
    else:
        # 상위 k개 엣지 기반 파생특징 생성
        edges_k = pick_top_k_edges(edge_candidates, k)
        Xm,  edges_meta_m  = build_feature_df_from_edges(X_base[cols_used], cols_used, edges_k, "m")
        Xmw, edges_meta_mw = build_feature_df_from_edges(X_base[cols_used], cols_used, edges_k, "mw")
        save_edges_csv(edges_meta_m,  "m",  k)
        save_edges_csv(edges_meta_mw, "mw", k)
        feature_sets = [("original", X_base), ("m", Xm), ("mw", Xmw)]

    for mname, model in models:
        for fname, Xf in feature_sets:
            # original은 O만, 파생은 F/OF
            sets = [("O", X_base)] if fname=="original" else [("F", Xf), ("OF", pd.concat([X_base, Xf], axis=1))]

            # 이번 조합에서 사용된 엣지 메타(파일 기록용)
            if fname == "original":
                used_meta_for_this_run = []
            elif fname == "m":
                used_meta_for_this_run = edges_meta_m
            else:  # "mw"
                used_meta_for_this_run = edges_meta_mw

            for sname, Xset in sets:
                # 모델별·세트별·k별 엣지 사용 내역 저장
                save_edges_by_model(DAG_NAME, k, mname, fname, sname, used_meta_for_this_run)

                # 학습
                t0=time.time()
                clf = (LGBMClassifier(**LGBM_PARAMS) if (_HAS_LGBM and mname=="LightGBM") else model)
                metrics=run_cv(clf, Xset, y, n_splits=N_SPLITS, seed=RANDOM_STATE)
                secs=time.time()-t0

                if VERBOSE_PRINT:
                    print(one_line(DAG_NAME,k,mname,sname,metrics,secs))

                # 로그/롱 포맷 축적
                append_log_csv(dict(dag=DAG_NAME,k=k,model=mname,set=sname,feature=fname,**metrics,seconds=round(secs,3)))
                long_rows.append(dict(DAG=DAG_NAME,k=k,Model=mname,Feature=fname,Set=sname,**metrics))
                wide_rows.append(dict(dag=DAG_NAME,k=k,model=mname,feature=fname,set=sname,**metrics))

# report_long 저장
df_long=pd.DataFrame(long_rows)
df_long.to_csv("./reports/report_long.csv", index=False, encoding="utf-8-sig")
print("saved report_long.csv")

# df_long이 비어있지 않을 때만 피벗/리포트 생성
if not df_long.empty and all(col in df_long.columns for col in ["DAG","k","Model","Feature","Set","f1","AUPRC","AUROC","Brier","ECE"]):
    pv=df_long.pivot_table(index=["DAG","k","Model","Feature"], columns="Set",
                           values=["f1","AUPRC","AUROC","Brier","ECE"], aggfunc="mean")
    pv.to_csv("./reports/report_pivot.csv", encoding="utf-8-sig")

    def save_user_style_wide_csv(all_rows: List[Dict], out_path: Path):
        groups=["O","F","OF","F-O","OF-O"]; mets=["f1","AUPRC","AUROC","Brier","ECE"]
        key_map={}
        for r in all_rows:
            if r["set"] not in {"O","F","OF"}: continue
            key=(r["dag"], r["model"], r["feature"], r["k"])
            if key not in key_map:
                base=dict(DAG=r["dag"],Model=r["model"],Feature=r["feature"],k=r["k"])
                for g in groups:
                    for m in mets: base[f"{g}:{m}"]=np.nan
                key_map[key]=base
            for m in mets: key_map[key][f"{r['set']}:{m}"]=r[m]
        for rec in key_map.values():
            for m in mets:
                if pd.notna(rec[f"F:{m}"]) and pd.notna(rec[f"O:{m}"]):
                    rec[f"F-O:{m}"]=rec[f"F:{m}"]-rec[f"O:{m}"]
                if pd.notna(rec[f"OF:{m}"]) and pd.notna(rec[f"O:{m}"]):
                    rec[f"OF-O:{m}"]=rec[f"OF:{m}"]-rec[f"O:{m}"]
        wide=pd.DataFrame(list(key_map.values()))
        order=["DAG","Model","Feature"]+[f"{g}:{m}" for g in groups for m in mets]+["k"]
        wide=wide[order]
        out_path.parent.mkdir(parents=True, exist_ok=True)
        wide.to_csv(out_path, index=False, encoding="utf-8-sig")

    save_user_style_wide_csv(wide_rows, Path("./reports/report_all_like_user.csv"))
    print("saved report_pivot.csv and report_all_like_user.csv")

    order_cols=["AUROC","AUPRC","f1","Brier","ECE"]
    ascending =[False,   False,   False, True,   True]
    best_df=(df_long.sort_values(order_cols, ascending=ascending)
                    .groupby("Model", as_index=False).head(1).reset_index(drop=True))
    best_df.to_csv("./reports/best_per_model.csv", index=False, encoding="utf-8-sig")
    best_df
else:
    print("warning: df_long is empty or missing metric columns. Skipping pivot and best-per-model reports.")


saved report_long.csv


KeyError: 'f1'