In [None]:
# ================================= EXPERIMENT: 10-out-of-20 (ARM, macro-MLP) ================================
import os, sys, re, json, time, warnings
from typing import Dict, Any, List
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

# --- framework entrypoint ---
from rolling_framework import Machine   # 프로젝트의 핵심 API

# ============================ Configs =======================================================================
# Input locations: every CSV is resolved relative to DATA_DIR.
DATA_DIR = "data/"
Y_FILE = os.path.join(DATA_DIR, "exrets.csv")
SLOPE_FILE = os.path.join(DATA_DIR, "slope.csv")
YL_FILE = os.path.join(DATA_DIR, "yl_all.csv")
MACRO_FILE = os.path.join(DATA_DIR, "MacroFactors.csv")

# Output root; created as soon as this section runs.
OUT_DIR = "./output"
os.makedirs(OUT_DIR, exist_ok=True)

# Sample / forecast windows (strings; presumably YYYYMM — confirm against the framework).
BURN_START = "197108"
BURN_END = "199001"
PERIOD_START = "197108"
PERIOD_END = "202312"
HORIZON = 12  # months ahead

# Target columns to keep from the table loaded from Y_FILE.
MATURITIES = ["xr_2", "xr_3", "xr_5", "xr_7", "xr_10"]

# ============================ Robust scalar helper ==========================================================
def _to_scalar(x, agg: str = "mean") -> float:
    """
    Robustly convert x (scalar / Series / DataFrame / array / list) to float.

    Args:
        x: Value to collapse. ``None`` maps to NaN.
        agg: ``"mean"`` (default) aggregates with ``np.nanmean``; any other
            value aggregates with ``np.nansum``.

    Returns:
        - Single element (scalar, or any container of size 1): that element
          as float; a NaN element stays NaN for both aggregations.
        - Several elements: the NaN-ignoring mean or sum.
        - Anything that cannot be converted to float (e.g. text): NaN.
          This function does not raise on bad input.
    """
    nan = float("nan")
    if x is None:
        return nan

    if np.isscalar(x):
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            return nan

    # One guarded conversion for every container type, so that non-numeric
    # pandas objects yield NaN instead of raising out of the helper.
    try:
        if isinstance(x, (pd.Series, pd.DataFrame)):
            arr = x.to_numpy(dtype=float, copy=False)
        else:
            arr = np.asarray(x, dtype=float)
    except Exception:  # best-effort contract: any conversion failure -> fallback
        arr = None

    if arr is not None:
        if arr.size == 1:
            return float(arr.item())
        reduce_fn = np.nanmean if agg == "mean" else np.nansum
        return float(reduce_fn(arr))

    # Last resort for objects that only implement __float__.
    try:
        return float(x)
    except Exception:  # best-effort contract: report NaN rather than raise
        return nan

# ============================ IO helpers ===================================================================
def _load_csv(path, name):
    """
    Read the CSV at `path`, using its "Time" column as the index.

    If the file does not exist, the process exits via `sys.exit` with a
    message that mentions `name` and the missing filename. Other read
    errors are not handled here and propagate to the caller.
    """
    try:
        frame = pd.read_csv(path, index_col="Time")
    except FileNotFoundError as err:
        sys.exit(f"[ERROR] missing {name} → {err.filename}")
    else:
        return frame

def _align_time(*dfs):
    """
    Restrict every frame to the index labels shared by all of them.

    Returns a list, in argument order, of the frames selected on the common
    index and sorted by index. With no arguments the result is an empty list.
    """
    if not dfs:
        return []

    shared = dfs[0].index
    for frame in dfs[1:]:
        shared = shared.intersection(frame.index)

    aligned = []
    for frame in dfs:
        aligned.append(frame.loc[shared].sort_index())
    return aligned

def _direct_pairs(slope_cols, y_cols):
    """
    Pair slope columns with target columns that carry the same number.

    The matching key is the first run of digits in a column name, so
    ``slope_2`` pairs with ``xr_2``. Columns without any digits have no key
    and are never paired (previously they all shared the key ``None`` and
    could be matched to each other by accident).

    Args:
        slope_cols: Iterable of slope column names.
        y_cols: Iterable of target column names. If two targets share a key,
            the later one wins.

    Returns:
        List of ``(slope_col, y_col)`` tuples, in ``slope_cols`` order.
    """
    def _key(col):
        # str() lets non-string column labels (e.g. ints) be matched too.
        match = re.search(r"(\d+)", str(col))
        return match.group(1) if match else None

    keyed_targets = ((_key(col), col) for col in y_cols)
    y_by_key = {key: col for key, col in keyed_targets if key is not None}

    pairs = []
    for col in slope_cols:
        key = _key(col)
        # None is never a key of y_by_key, so digit-less columns drop out here.
        if key in y_by_key:
            pairs.append((col, y_by_key[key]))
    return pairs

# ============================ Data load =====================================================================
# Read the four input tables (each one is indexed by its "Time" column).
y = _load_csv(Y_FILE, "exrets")
slope = _load_csv(SLOPE_FILE, "slope")
yl = _load_csv(YL_FILE, "yl_all")
macro = _load_csv(MACRO_FILE, "MacroFactors")

# Keep only the configured target maturities that are present in the file.
y_cols = [col for col in MATURITIES if col in y.columns]
if len(y_cols) == 0:
    sys.exit("[ERROR] MATURITIES not in exrets")
y = y[y_cols]

# Put all tables on their common time index.
y, slope, yl, macro = _align_time(y, slope, yl, macro)

# Automatic slope -> y mapping, e.g. slope_2 -> xr_2
DIRECT_PAIRS = _direct_pairs(slope.columns, y_cols)

print(
    "✓ Loaded data shapes:",
    {"y": y.shape, "slope": slope.shape, "yl": yl.shape, "macro": macro.shape},
)
print("✓ direct map pairs :", DIRECT_PAIRS)

# ============================ Experiment setup ==============================================================
# CASE: Base = slope (OLS), Residual = macro (MLP)
# X holds both slope and macro columns; the base and residual parts are
# pointed at different column subsets of the same X via the options below.
X_macro = pd.concat((slope, macro), axis="columns")

# Fixed default options — edit them here only.
BASE_OPT = {
    # Base model switched on (the original note calls this the "CS-Resi" form).
    "base_on": True,
    # The base uses the slope columns only.
    "base_cols": slope.columns.tolist(),
    # Targets, e.g. ['xr_2','xr_3','xr_5','xr_7','xr_10'].
    "target_cols": y.columns.tolist(),
    # Residual learner is a single MLP.
    "residual_kind": "mlp",
    # Residual inputs are the macro columns only.
    "feature_cols": macro.columns.tolist(),
    # Standardize the residual inputs only.
    "standardize_res": True,
    "mlp_hidden": (64, 32),
    "mlp_dropout": 0.1,
    "mlp_lr": 1e-3,
    # L2 penalty (weight decay).
    "mlp_wd": 1e-4,
    "mlp_epochs": 200,
    "mlp_patience": 20,
    # Overwritten with a different seed for every run.
    "seed": 0,
}

# Lightweight grid. Per the original note, the strategy reads this single
# grid to set its defaults — NOTE(review): confirm against the framework.
BASE_GRID = {
    "arm__residual_model__module__hidden": [(64, 32), (128, 64)],
    "arm__residual_model__module__dropout": [0.0, 0.2],
    "arm__residual_model__optimizer__lr": [1e-3, 5e-4],
    # Weight decay (L2) must stay in the grid.
    "arm__residual_model__optimizer__weight_decay": [0.0, 1e-4],
}

# ============================ Core run function =============================================================
def run_once_and_save(
    run_id: str,
    X: pd.DataFrame,
    y_: pd.DataFrame,
    option: Dict[str, Any],
    params_grid: Dict[str, List],
    burn_in=(BURN_START, BURN_END),
    period=(PERIOD_START, PERIOD_END),
    horizon=HORIZON,
    out_base=OUT_DIR,
) -> Dict[str, Any]:
    """
    Run one rolling training pass, save a summary, and return it.

    The summary is written to ``{out_base}/runs/{run_id}/summary.json``.

    Args:
        run_id: Name of the run; also used as the output sub-directory name.
        X: Feature table passed to ``Machine``.
        y_: Target table passed to ``Machine``.
        option: Strategy options. Copied before use; its ``"seed"`` entry
            (default 0) seeds ``random``, NumPy and, when installed, torch.
        params_grid: Hyper-parameter grid passed to ``Machine``.
        burn_in: ``(start, end)`` forwarded as ``burn_in_start``/``burn_in_end``.
        period: ``(start, end)`` forwarded as ``period=[start, end]``.
        horizon: Forwarded as ``forecast_horizon``.
        out_base: Root directory under which ``runs/{run_id}`` is created.

    Returns:
        Dict with keys ``run_id``, ``elapsed_sec``, ``R2OOS``, ``MSEOOS`` and
        ``best_params``. The two metrics are collapsed to floats with
        ``_to_scalar(..., agg="mean")``.
    """
    import random  # local import: the module header does not import it

    run_dir = os.path.join(out_base, "runs", run_id)
    os.makedirs(run_dir, exist_ok=True)

    # Copy the options so the caller's dict is never mutated, then seed.
    opt = dict(option)
    seed = int(opt.get("seed", 0))
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch
    except ImportError:
        # torch is optional here; only its absence is tolerated, so that a
        # genuine seeding error is not silently swallowed.
        torch = None
    if torch is not None:
        torch.manual_seed(seed)

    # Build the Machine and train.
    m = Machine(
        X, y_, "ARM",
        option=opt, params_grid=params_grid,
        burn_in_start=burn_in[0], burn_in_end=burn_in[1],
        period=[period[0], period[1]], forecast_horizon=horizon
    )
    print(f"  - start training (run_id={run_id})")
    # perf_counter is monotonic, so the duration is immune to clock changes.
    t0 = time.perf_counter()
    m.training()
    elapsed = time.perf_counter() - t0

    # Collapse the metrics to plain scalars.
    r2oos = _to_scalar(m.R2OOS(), agg="mean")
    mseoos = _to_scalar(m.MSEOOS(), agg="mean")

    # Best grid parameters, if the strategy exposes them. This stays
    # best-effort: a failure here must not discard a finished training run.
    try:
        strategy = getattr(m, "strategy", None)
        best_params = getattr(strategy, "best_params_", None)
        if best_params is None:
            best_params = getattr(strategy, "last_best_params_", None)
    except Exception as err:
        print(f"  - could not read best params (run_id={run_id}): {err!r}")
        best_params = None

    def _jsonable(v):
        """Recursively turn NumPy scalars/arrays into plain Python values."""
        if isinstance(v, np.generic):  # np.floating, np.integer, np.bool_, ...
            return v.item()
        if isinstance(v, np.ndarray):
            return v.tolist()
        if isinstance(v, dict):
            return {k: _jsonable(item) for k, item in v.items()}
        if isinstance(v, list):
            return [_jsonable(item) for item in v]
        if isinstance(v, tuple):
            return tuple(_jsonable(item) for item in v)
        return v

    if best_params is None:
        best_params_out = {}
    elif hasattr(best_params, "items"):
        best_params_out = {k: _jsonable(v) for k, v in best_params.items()}
    else:
        # Non-mapping result (e.g. a list): keep it instead of crashing.
        best_params_out = _jsonable(best_params)

    summary = {
        "run_id": run_id,
        "elapsed_sec": round(float(elapsed), 2),
        "R2OOS": float(r2oos),
        "MSEOOS": float(mseoos),
        "best_params": best_params_out,
    }
    with open(os.path.join(run_dir, "summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    return summary

# ============================ 20 runs + pick top-10 =========================================================
def run_20_and_ensemble_top10(n_runs: int = 20, top_k: int = 10):
    """
    Train the ARM setup once per seed and return the best run ids.

    Seeds ``1..n_runs`` are each run through ``run_once_and_save`` with
    ``BASE_OPT`` (seed overridden) and ``BASE_GRID``. The per-run summaries
    are sorted by ``R2OOS`` in descending order and written to
    ``{OUT_DIR}/runs_summary.csv``. Despite the function name, no ensembling
    happens here: only the ids of the top ``top_k`` runs are returned.

    Args:
        n_runs: Number of seeds/runs (default 20, the original fixed value).
        top_k: Number of best runs to report and return (default 10).

    Returns:
        List of ``run_id`` strings for the ``top_k`` runs with highest R2OOS.

    Raises:
        ValueError: If ``n_runs < 1`` or ``top_k < 0``.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be >= 1")
    if top_k < 0:
        raise ValueError("top_k must be >= 0")

    print(f"\n▶ {n_runs} runs start (ARM: base=slope OLS, residual=macro-MLP)")
    summaries = []

    # The run counter and the seed are the same number (1..n_runs).
    for seed in range(1, n_runs + 1):
        opt = dict(BASE_OPT)
        opt["seed"] = seed
        run_id = f"ARM_macroMLP_seed{seed}"

        print(f"  - run {seed:02d}/{n_runs} (seed={seed})")
        summ = run_once_and_save(
            run_id=run_id,
            X=X_macro,
            y_=y,
            option=opt,
            params_grid=BASE_GRID,
            burn_in=(BURN_START, BURN_END),
            period=(PERIOD_START, PERIOD_END),
            horizon=HORIZON,
            out_base=OUT_DIR,
        )
        print(f"    R2OOS={summ['R2OOS']:.6f}  MSEOOS={summ['MSEOOS']:.6f}")
        summaries.append(summ)

    # Aggregate and persist the results.
    df = pd.DataFrame(summaries)
    for col in ["R2OOS", "MSEOOS", "elapsed_sec"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df_sorted = df.sort_values("R2OOS", ascending=False).reset_index(drop=True)

    csv_path = os.path.join(OUT_DIR, "runs_summary.csv")
    df_sorted.to_csv(csv_path, index=False)
    print(f"\n✓ Saved summary table → {csv_path}")

    # Report the best runs.
    top = df_sorted.head(top_k)
    print(f"\nTop-{top_k} by R2OOS:")
    print(top[["run_id", "R2OOS", "MSEOOS", "elapsed_sec"]])

    return top["run_id"].tolist()

# Entry point: start the multi-seed experiment only when this module is the
# program being executed, not when it is imported by another module.
if __name__ == "__main__":
    run_20_and_ensemble_top10()

DNN_DUAL rolling:   4%|▍         | 20/520 [03:47<1:34:56, 11.39s/it]


