In [None]:
# ---------- QUIET MODE (put these lines at very top) ----------
# Import-time preamble that silences progress-bar and scikit-learn warnings.
# It must run before any module that imports tqdm.auto, because it replaces
# the "tqdm.auto" entry in sys.modules.
import os, sys, types, warnings
os.environ["TQDM_NOTEBOOK"] = "0"  # meant to keep tqdm.auto out of notebook mode (NOTE(review): confirm tqdm reads this variable)
# To suppress tqdm's IProgress warning: bind tqdm.auto to the standard tqdm.
# The stand-in module exposes only the `tqdm` attribute.
try:
    import tqdm as _tqdm
    _auto = types.ModuleType("tqdm.auto"); _auto.tqdm = _tqdm.tqdm
    sys.modules["tqdm.auto"] = _auto
except Exception:
    # Best effort: if tqdm is missing or the patch fails, continue unpatched.
    pass
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning
# Ignore these scikit-learn warning categories for the whole process.
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Ignore UserWarnings whose message matches the patterns below.
warnings.filterwarnings("ignore", message=".*IProgress not found.*", category=UserWarning)
warnings.filterwarnings("ignore", message=".*X does not have valid feature names.*", category=UserWarning)
# -------------------------------------------------------------

import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from rolling_framework import ExpandingRunner, make_strategy

# ---------------- CONFIG ----------------
# Input data: a CSV file read with its 'Time' column as the index.
DATA_FILE = os.path.join("", "dataset.csv")

# Target columns, and the single target tracked for the per-time squared error.
Y_COLS = ["xr_2", "xr_3", "xr_5", "xr_7", "xr_10"]
Y_10_COL = "xr_10"

# Column-name prefixes used to select the slope / forward / macro columns.
SLOPE_PREFIX = "s_"
FWD_PREFIX = "fwd_"
MACRO_PREFIX = "F"

# Settings handed to ExpandingRunner and fit_walk.
PERIOD = ["197108", "202312"]
BURN_IN_END = "200609"
HORIZON = 12
SHOW_PROGRESS = True

# Number of repeated runs and the seed of the first run.
N_RUNS = 20
BASE_SEED = 42

# Strong L2 regularisation.
RIDGE_ALPHA = 10.0
MLP_WDECAY = 1e-2

# Ridge base params (random_state is overwritten per run).
RIDGE_PARAMS_BASE = {
    "alpha": RIDGE_ALPHA,
    "random_state": 0,
}

# Plain MLP base params (input scaling is handled outside the model).
# The seed is added per run.
MLP_PARAMS_BASE = {
    "hidden_sizes": [64, 64],
    "lr": 1e-3,
    "weight_decay": MLP_WDECAY,
    "max_epochs": 200,
    "batch_size": 64,
    "dropout": 0.0,
    "device": "auto",
}

# SDMLP base params (slope + residual MLP). The seed is added per run.
SDMLP_PARAMS_BASE = {
    "hidden_sizes": [64, 64],
    "lr": 1e-3,        # learning rate of the residual MLP
    "slope_lr": 1e-4,  # learning rate for a, b (so the CS structure moves only slightly)
    "weight_decay": MLP_WDECAY,
    "max_epochs": 200,
    "batch_size": 64,
    "dropout": 0.0,
    "slope_scale": 1.0,
    "device": "auto",
}

# ---------------- mini utils ----------------
def read_df(path: str) -> pd.DataFrame:
    """Read the CSV at ``path`` using its ``Time`` column as the index.

    The index labels are converted to strings before the frame is returned.
    """
    table = pd.read_csv(path, index_col="Time")
    string_labels = table.index.astype(str)
    table.index = string_labels
    return table

def features(df: pd.DataFrame, *, use_fwd: bool, use_macro: bool) -> pd.DataFrame:
    """Select feature columns of ``df`` by name prefix.

    Args:
        df: Source frame whose column names are matched against the prefixes.
        use_fwd: Include the columns starting with ``FWD_PREFIX``.
        use_macro: Include the columns starting with ``MACRO_PREFIX``.

    Returns:
        The selected column groups concatenated side by side, forward
        columns first.

    Raises:
        ValueError: If neither group is requested.
    """
    if not (use_fwd or use_macro):
        raise ValueError("No features selected (fwd/macro).")

    wanted_prefixes = []
    if use_fwd:
        wanted_prefixes.append(FWD_PREFIX)
    if use_macro:
        wanted_prefixes.append(MACRO_PREFIX)

    selected = [
        df.loc[:, df.columns.str.startswith(prefix)]
        for prefix in wanted_prefixes
    ]
    return pd.concat(selected, axis=1)

def cs_baseline(runner: ExpandingRunner, df: pd.DataFrame, y: pd.DataFrame) -> pd.DataFrame:
    """
    Per-maturity Campbell–Shiller OLS baseline:
        xr_j ~ s_j

    For every test time ``t`` in ``runner.test_times`` a separate univariate
    regression is fitted for each column of ``Y_COLS`` on all entries of
    ``runner.times`` strictly before ``t`` and then evaluated at ``t``.
    Test times without any earlier entry produce no forecast row; the final
    reindex leaves them as missing values.

    Returns:
        DataFrame indexed by runner.test_times, columns=Y_COLS
    """

    def _forecast(target, history, when):
        # The slope column shares the maturity suffix of the target column.
        slope_col = f"{SLOPE_PREFIX}{target.split('_', 1)[1]}"
        train_x = df.loc[history, [slope_col]].astype(float)
        train_y = y.loc[history, target].astype(float).values
        model = LinearRegression().fit(train_x, train_y)
        query_x = df.loc[[when], [slope_col]].astype(float)
        return float(model.predict(query_x)[0])

    records = []
    for when in runner.test_times:
        history = [stamp for stamp in runner.times if stamp < when]
        if history:
            forecasts = {target: _forecast(target, history, when) for target in Y_COLS}
            records.append(pd.Series(forecasts, name=when))

    table = pd.DataFrame(records)
    return table.reindex(index=runner.test_times, columns=Y_COLS)

def mse10_from_runner(runner: ExpandingRunner, y_col: str = Y_10_COL) -> pd.Series:
    """
    Return the per-time squared error SE(t) = (y_true - y_pred)^2 for ``y_col``.

    True and predicted values come from ``runner.collect_frames()``; the
    result is named "MSE_10".
    """
    actual, forecast = runner.collect_frames()
    deviation = forecast[y_col] - actual[y_col]
    squared_error = deviation ** 2
    squared_error.name = "MSE_10"
    return squared_error

def save_mse10_mean(se_list, csv_path: str) -> None:
    """
    Average the SE(t) series of several runs and write the result to CSV.

    The series in ``se_list`` are placed side by side, averaged across runs
    for each time label, and saved to ``csv_path`` as one column named
    "MSE_10" with a header row.
    """
    per_run = pd.concat(se_list, axis=1)
    averaged = per_run.mean(axis=1).rename("MSE_10")
    averaged.to_csv(csv_path, header=True)

# ---------------- RUN EXPERIMENTS -------------------
def _fit_runner(kind, cfg, X, y, desc):
    """Build the ``kind`` strategy from ``cfg`` and walk it through a runner.

    Args:
        kind: Strategy name passed to ``make_strategy`` (e.g. "RIDGE").
        cfg: Strategy configuration dict passed to ``make_strategy``.
        X: Feature frame given to ``ExpandingRunner``.
        y: Target frame given to ``ExpandingRunner``.
        desc: Progress-bar description passed to ``fit_walk``.

    Returns:
        The ``ExpandingRunner`` after ``fit_walk`` has been called on it. It
        is configured with the module-level PERIOD, BURN_IN_END and HORIZON.
    """
    strategy = make_strategy(kind, cfg, target_cols=Y_COLS)
    runner = ExpandingRunner(
        X=X,
        y=y,
        strategy=strategy,
        period=PERIOD,
        burn_in_end=BURN_IN_END,
        horizon=HORIZON,
    )
    runner.fit_walk(progress=SHOW_PROGRESS, desc=desc)
    return runner


def _report_first_run(runner, label, mat_path, benchmark):
    """Print the first run's R2OOS figures for ``runner`` and export it.

    R2OOS is printed against the "naive", "condmean" and custom
    (``benchmark``) baselines, then ``runner.to_mat`` writes ``mat_path``
    using the same custom benchmark.
    """
    print(f"\n=== {label} (run=1) ===")
    print("R2OOS vs naive:\n",    runner.R2OOS(baseline="naive").round(4))
    print("R2OOS vs condmean:\n", runner.R2OOS(baseline="condmean").round(4))
    print("R2OOS vs CS OLS:\n",   runner.R2OOS(baseline="custom", benchmark=benchmark).round(4))
    runner.to_mat(mat_path, baseline="custom", benchmark=benchmark)


if __name__ == "__main__":
    # Fail fast: with no runs there is nothing to average, and the CSV export
    # below would otherwise fail inside pd.concat with an opaque message.
    if N_RUNS < 1:
        raise ValueError("N_RUNS must be at least 1.")

    df = read_df(DATA_FILE)
    y = df[Y_COLS].copy()

    # Slope columns (used by the CS baseline, CSARM and SDMLP): each target
    # column maps to the slope column with the same maturity suffix.
    slope_map = {ycol: f"{SLOPE_PREFIX}{ycol.split('_', 1)[1]}" for ycol in Y_COLS}
    slope_cols = [slope_map[ycol] for ycol in Y_COLS]
    X_slope = df[slope_cols].copy()

    # Feature-set definitions.
    feature_sets = {
        "fwd":       dict(use_fwd=True,  use_macro=False),
        "fwd_macro": dict(use_fwd=True,  use_macro=True),
    }

    for tag, fspec in feature_sets.items():
        print(f"\n================ Feature set: {tag} ================")
        X_feat = features(df, **fspec)
        feat_cols = list(X_feat.columns)

        # Inputs for the slope-aware models (slope + features).
        X_cs = pd.concat([X_slope, X_feat], axis=1)

        # Per-model accumulators of the SE(t) series, one entry per run.
        mse_runs_ridge = []
        mse_runs_csarm = []
        mse_runs_mlp = []
        mse_runs_sdmlp = []

        # CS OLS benchmark; computed once, right after the first Ridge fit.
        y_cs_hat = None

        for run in range(N_RUNS):
            seed = BASE_SEED + run
            first_run = run == 0
            print(f"\n--- {tag} | run {run+1}/{N_RUNS} | seed={seed} ---")

            # ---------- 1) Ridge ----------
            # random_state is varied per run in case the solver uses randomness.
            ridge_cfg = {
                "feature_cols": feat_cols,
                "params": {**RIDGE_PARAMS_BASE, "random_state": seed},
            }
            ridge_label = f"Ridge [{tag}]"
            runner_ridge = _fit_runner(
                "RIDGE", ridge_cfg, X_feat, y, f"{ridge_label} (run={run+1})"
            )
            mse_runs_ridge.append(mse10_from_runner(runner_ridge))

            if first_run:
                # The benchmark is reused by every model's first-run report.
                y_cs_hat = cs_baseline(runner_ridge, df, y)
                _report_first_run(runner_ridge, ridge_label, f"ridge_{tag}.mat", y_cs_hat)

            # ---------- 2) CSARM (residual = ridge) ----------
            csarm_cfg = {
                "slope_map": slope_map,
                "feature_cols": feat_cols,   # features seen by the residual ridge
                "residual_kind": "ridge",
                "residual_params": {
                    "alpha": RIDGE_ALPHA,
                    "random_state": seed,
                },
            }
            csarm_label = f"CSARM(ridge) [{tag}]"
            runner_csarm = _fit_runner(
                "CSARM", csarm_cfg, X_cs, y, f"{csarm_label} (run={run+1})"
            )
            mse_runs_csarm.append(mse10_from_runner(runner_csarm))

            if first_run:
                _report_first_run(runner_csarm, csarm_label, f"csarm_ridge_{tag}.mat", y_cs_hat)

            # ---------- 3) Plain MLP ----------
            mlp_cfg = {
                "feature_cols": feat_cols,
                "params": {**MLP_PARAMS_BASE, "seed": seed},   # seed passed to the MLP strategy
            }
            mlp_label = f"Plain MLP [{tag}]"
            runner_mlp = _fit_runner(
                "MLP", mlp_cfg, X_feat, y, f"{mlp_label} (run={run+1})"
            )
            mse_runs_mlp.append(mse10_from_runner(runner_mlp))

            if first_run:
                _report_first_run(runner_mlp, mlp_label, f"mlp_{tag}.mat", y_cs_hat)

            # ---------- 4) SDMLP ----------
            sdmlp_cfg = {
                "slope_map": slope_map,
                "feature_cols": feat_cols,   # the residual MLP uses fwd/macro only
                "params": {**SDMLP_PARAMS_BASE, "seed": seed},   # seed passed to the SDMLP strategy
            }
            sdmlp_label = f"SDMLP [{tag}]"
            runner_sdmlp = _fit_runner(
                "SDMLP", sdmlp_cfg, X_cs, y, f"{sdmlp_label} (run={run+1})"
            )
            mse_runs_sdmlp.append(mse10_from_runner(runner_sdmlp))

            if first_run:
                _report_first_run(runner_sdmlp, sdmlp_label, f"sdmlp_{tag}.mat", y_cs_hat)

        # ------ save MSE_10(t) averaged across runs ------
        save_mse10_mean(mse_runs_ridge, f"ridge_{tag}_mse10_avg.csv")
        save_mse10_mean(mse_runs_csarm, f"csarm_ridge_{tag}_mse10_avg.csv")
        save_mse10_mean(mse_runs_mlp,   f"mlp_{tag}_mse10_avg.csv")
        save_mse10_mean(mse_runs_sdmlp, f"sdmlp_{tag}_mse10_avg.csv")

        print(f"\n>>> [{tag}] 평균 MSE_10(t) CSV 저장 완료.")