In [None]:
# ============================================================
# Ridge / Lasso / ElasticNet / RF / ExtraTrees / XGB
# X = fwds
# Baselines: naive / condmean / cs_yhat
# Console output only
# ============================================================

import os, sys, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

from rolling_framework import Machine

# ---------------- Paths ----------------
# Every input CSV is resolved relative to DATA_DIR.
DATA_DIR = "data"
Y_FILE, FWD_FILE, CS_YHAT_FILE = (
    os.path.join(DATA_DIR, fname)
    for fname in ("exrets.csv", "fwds.csv", "cs_yhat.csv")
)

# Window settings; run_model forwards them to Machine as
# burn_in_start / burn_in_end / period / forecast_horizon.
# NOTE(review): the date strings look like YYYYMM -- confirm against Machine.
BURN_START = "197108"
BURN_END = "199001"
PERIOD_START = "197108"
PERIOD_END = "202312"
HORIZON = 12

# Target columns to keep from the excess-return file (when present).
MATURITIES = [f"xr_{n}" for n in (2, 3, 5, 7, 10)]

# ---------------- Helpers ----------------
def _load_csv(path):
    """Read ``path`` as a CSV indexed by its ``Time`` column.

    Returns the parsed DataFrame. If the file does not exist, the process
    is terminated through ``sys.exit`` with a "Missing file" message.
    """
    try:
        frame = pd.read_csv(path, index_col="Time")
    except FileNotFoundError:
        sys.exit(f"Missing file: {path}")
    return frame

def _align_time(*dfs):
    """Restrict every frame to the index labels shared by all of them.

    Returns a list with one frame per input, each selected on the common
    labels and sorted by index. With no inputs an empty list is returned.
    """
    if not dfs:
        return []

    remaining = iter(dfs)
    common = next(remaining).index
    for frame in remaining:
        common = common.intersection(frame.index)

    aligned = []
    for frame in dfs:
        aligned.append(frame.loc[common].sort_index())
    return aligned

def _to_series(x, index):
    """Coerce ``x`` into a pandas object aligned on ``index``.

    Parameters
    ----------
    x : scalar, pd.Series or pd.DataFrame
        Value to convert.
    index : list-like
        Labels the result is indexed by.

    Returns
    -------
    pd.Series
        * Series input: reindexed to ``index``.
        * DataFrame input: squeezed first. A 1xN / Nx1 frame becomes a
          Series and is reindexed; a 1x1 frame becomes a scalar and is
          broadcast. A frame that stays two-dimensional after squeezing
          is returned reindexed on its rows, as before.
        * Anything ``float()`` accepts: broadcast over ``index``.
        * Anything else (e.g. ``None``): all-NaN Series.
    """
    if isinstance(x, pd.DataFrame):
        # squeeze() returns a bare scalar for a 1x1 frame, which has no
        # .reindex -- so squeeze first and re-dispatch on the result type.
        x = x.squeeze()
    if isinstance(x, (pd.Series, pd.DataFrame)):
        return x.reindex(index)
    try:
        return pd.Series(float(x), index=index)
    except (TypeError, ValueError):
        # Not convertible to a number: report "no value" for every label.
        return pd.Series(np.nan, index=index)

def _print_r2_block(title, m, cs_path=None):
    """Print a table of ``m.R2OOS`` results for three baselines, plus column means.

    Parameters
    ----------
    title : str
        Label shown in the ``==== ... ====`` banner.
    m : object
        Trained model exposing ``R2OOS(baseline=..., cs_path=...)`` and
        either a ``targets`` attribute or a ``y`` frame with ``columns``.
    cs_path : str, optional
        Path forwarded to the ``cs_yhat`` baseline. When falsy, that column
        is filled with NaN instead of being computed.
    """
    print(f"\n==== {title} ====")

    # Baselines are queried in a fixed order: naive, condmean, cs_yhat.
    scores = {
        "R2OOS_naive": m.R2OOS(baseline="naive"),
        "R2OOS_condmean": m.R2OOS(baseline="condmean"),
    }
    if cs_path:
        scores["R2OOS_cs_yhat"] = m.R2OOS(baseline="cs_yhat", cs_path=cs_path)
    else:
        scores["R2OOS_cs_yhat"] = np.nan

    # Row labels: prefer m.targets, otherwise the columns of m.y.
    y_columns = getattr(m, "y", pd.DataFrame()).columns
    cols = getattr(m, "targets", y_columns)

    tbl = pd.DataFrame(
        {label: _to_series(value, cols) for label, value in scores.items()}
    )
    print(tbl.round(4))
    print("Mean:", tbl.mean(axis=0, skipna=True).round(4).to_dict())

def run_model(model_type, option, grid, X, y, cs_path, title):
    """Build a ``Machine``, train it, and print its R2OOS summary.

    Parameters
    ----------
    model_type, option : str
        Forwarded to ``Machine`` as ``model_type`` / ``option``.
    grid : dict
        Forwarded to ``Machine`` as ``params_grid``.
    X, y : pd.DataFrame
        Features and targets handed to ``Machine``.
    cs_path : str or None
        Path passed on to ``_print_r2_block`` for the ``cs_yhat`` baseline.
    title : str
        Label printed before training and in the results banner.
    """
    # Window settings come from the module-level constants.
    window_kwargs = {
        "burn_in_start": BURN_START,
        "burn_in_end": BURN_END,
        "period": [PERIOD_START, PERIOD_END],
        "forecast_horizon": HORIZON,
    }
    machine = Machine(
        X,
        y,
        model_type=model_type,
        option=option,
        params_grid=grid,
        **window_kwargs,
    )

    print(f"\n▶ {title}")
    machine.training()
    _print_r2_block(title, machine, cs_path=cs_path)

# ---------------- Data ----------------
# Targets are loaded first, then the features; _load_csv aborts on a missing file.
y, fwd = (_load_csv(path) for path in (Y_FILE, FWD_FILE))

# Keep only the requested maturities that are actually present in the targets.
y_cols = [name for name in MATURITIES if name in y.columns]
if len(y_cols) == 0:
    sys.exit("MATURITIES not found in exrets.csv")
y = y.loc[:, y_cols]

# Restrict targets and features to their shared timestamps.
y, fwd = _align_time(y, fwd)
X = fwd.copy()

# Clean cs_yhat: keep the last row of any duplicated timestamp, give it the
# same columns as y, and write it to a scratch CSV whose path is later
# passed to the models as cs_path.
cs = _load_csv(CS_YHAT_FILE)
cs = cs.loc[~cs.index.duplicated(keep="last")].reindex(columns=y.columns)
CS_CLEAN_FILE = os.path.join(DATA_DIR, "_cs_yhat_clean_tmp.csv")
cs.to_csv(CS_CLEAN_FILE)

print("✓ data:", {"X": X.shape, "y": y.shape, "cs_yhat": cs.shape})

# ---------------- Penalized Models ----------------
# One hyper-parameter grid per penalized option.
grid_lasso = dict(reg__alpha=[0.001, 1.0])
grid_ridge = dict(reg__alpha=[0.001, 10])
grid_enet = dict(reg__alpha=[0.01, 0.1, 1, 10], reg__l1_ratio=[0.1, 0.3, 0.5])

# Run order: Lasso, Ridge, ElasticNet.
for _option, _grid, _title in (
    ("lasso", grid_lasso, "Lasso"),
    ("ridge", grid_ridge, "Ridge"),
    ("elasticnet", grid_enet, "ElasticNet"),
):
    run_model("Penalized", _option, _grid, X, y, CS_CLEAN_FILE, _title)

# ---------------- Tree Models ----------------
# Hyper-parameter grids for the "Tree" model type; keys use the
# "model__estimator__<param>" naming that Machine expects for params_grid.
#
# max_features: scikit-learn treats a float as a fraction of the features
# and an int as an absolute count. The last grid value is therefore written
# as 1.0 (all features); the integer 1 would mean "one feature per split",
# which does not fit a grid whose other values are the fractions 0.25 / 0.5.
grid_rf = {
    "model__estimator__n_estimators": [300],
    "model__estimator__max_depth": [2, 8],
    "model__estimator__min_samples_split": [2, 4],
    "model__estimator__min_samples_leaf": [1, 2, 4],
    "model__estimator__max_features": [0.25, 0.5, 1.0],
}
# ExtraTrees is searched over the same grid as RandomForest.
grid_et = {
    "model__estimator__n_estimators": [300],
    "model__estimator__max_depth": [2, 8],
    "model__estimator__min_samples_split": [2, 4],
    "model__estimator__min_samples_leaf": [1, 2, 4],
    "model__estimator__max_features": [0.25, 0.5, 1.0],
}
grid_xgb = {
    "model__estimator__n_estimators": [300],
    "model__estimator__max_depth": [2, 4],
    "model__estimator__learning_rate": [0.01],
    "model__estimator__subsample": [0.7, 0.5],
    "model__estimator__reg_lambda": [0.1, 1.0],
}

run_model("Tree", "rf",  grid_rf,  X, y, CS_CLEAN_FILE, "RandomForest")
run_model("Tree", "et",  grid_et,  X, y, CS_CLEAN_FILE, "ExtraTrees")
run_model("Tree", "xgb", grid_xgb, X, y, CS_CLEAN_FILE, "XGBoost")

print("\n✓ done")

▶ OLS-SL_nonDNN


OLS rolling:  93%|█████████▎| 485/520 [00:08<00:00, 59.08it/s]

In [None]:
# ============================================================
# Ridge / Lasso / Two Trees with R2OOS baselines
# baselines: naive / condmean(historical mean) / cs_yhat
# - No results are saved; console output only
# - Uses only the simple Machine.R2OOS signature (baseline, cs_path)
# ============================================================
import os, sys, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

from rolling_framework import Machine  # <- your framework

# --------------------- Paths & windows ---------------------
# Every input CSV is resolved relative to DATA_DIR.
# CS_YHAT_FILE is the precomputed CS baseline.
DATA_DIR = "data"
Y_FILE, SLOPE_FILE, MACRO_FILE, FWD_FILE, CS_YHAT_FILE = (
    os.path.join(DATA_DIR, fname)
    for fname in (
        "exrets.csv",
        "slope.csv",
        "MacroFactors.csv",
        "fwds.csv",
        "cs_yhat.csv",
    )
)

# Window settings, forwarded to Machine through COMMON_KW below.
# NOTE(review): the date strings look like YYYYMM -- confirm against Machine.
BURN_START = "197108"
BURN_END = "199001"
PERIOD_START = "197108"
PERIOD_END = "202312"
HORIZON = 12

# Target columns to keep from the excess-return file (when present).
MATURITIES = [f"xr_{n}" for n in (2, 3, 5, 7, 10)]

# --------------------- Helpers ---------------------
def _load_csv(path, name):
    """Read ``path`` as a CSV indexed by its ``Time`` column.

    ``name`` is a short human-readable label used only in the error
    message. If the file does not exist, the process is terminated through
    ``sys.exit`` with an ``[ERROR] missing ...`` message.
    """
    try:
        frame = pd.read_csv(path, index_col="Time")
    except FileNotFoundError as err:
        sys.exit(f"[ERROR] missing {name} → {err.filename}")
    return frame

def _align_time(*dfs):
    """Restrict every frame to the index labels shared by all of them.

    Returns a list with one frame per input, each selected on the common
    labels and sorted by index. With no inputs an empty list is returned.
    """
    if not dfs:
        return []

    first, *others = dfs
    shared = first.index
    for frame in others:
        shared = shared.intersection(frame.index)

    return [frame.loc[shared].sort_index() for frame in dfs]

def _to_series(x, index):
    """Convert an R2OOS return value (scalar / Series / DataFrame) to a Series.

    Parameters
    ----------
    x : scalar, pd.Series or pd.DataFrame
        Value to convert.
    index : list-like
        Labels the result is indexed by.

    Returns
    -------
    pd.Series
        * Series input: reindexed to ``index``.
        * DataFrame input: squeezed first. A 1xN / Nx1 frame becomes a
          Series and is reindexed; a 1x1 frame becomes a scalar and is
          broadcast. A frame that stays two-dimensional after squeezing
          is returned reindexed on its rows, as before.
        * Anything ``float()`` accepts: broadcast over ``index``.
        * Anything else (e.g. ``None``): all-NaN Series.
    """
    if isinstance(x, pd.DataFrame):
        # squeeze() returns a bare scalar for a 1x1 frame, which has no
        # .reindex -- so squeeze first and re-dispatch on the result type.
        x = x.squeeze()
    if isinstance(x, (pd.Series, pd.DataFrame)):
        return x.reindex(index)
    try:
        v = float(x)
    except (TypeError, ValueError):
        # Not convertible to a number: report "no value" for every label.
        return pd.Series(np.nan, index=index)
    return pd.Series(v, index=index)

def _print_r2_block(title, m, cs_clean_path=None):
    """Print per-target ``m.R2OOS`` results for three baselines and their averages.

    Parameters
    ----------
    title : str
        Label shown in the ``==== ... ====`` banner.
    m : object
        Trained model exposing ``R2OOS(baseline=..., cs_path=...)`` and
        either a ``targets`` attribute or a ``y`` frame with ``columns``.
    cs_clean_path : str, optional
        Path forwarded as ``cs_path`` to the ``cs_yhat`` baseline. When
        falsy, that column is filled with NaN instead of being computed.

    Notes
    -----
    Per the original author's notes, the ``naive`` baseline benchmarks
    against ||y||^2 and ``condmean`` against the historical mean up to
    t-1 (not verifiable from this file -- confirm in ``Machine.R2OOS``).
    """
    print(f"\n==== {title} ====")

    # Baselines are queried in a fixed order: naive, condmean, cs_yhat.
    scores = {
        "R2OOS_naive": m.R2OOS(baseline="naive"),
        "R2OOS_condmean": m.R2OOS(baseline="condmean"),
        "R2OOS_cs_yhat": (
            m.R2OOS(baseline="cs_yhat", cs_path=cs_clean_path)
            if cs_clean_path
            else np.nan
        ),
    }

    # Row labels: prefer m.targets, otherwise the columns of m.y.
    y_columns = getattr(m, "y", pd.DataFrame()).columns
    targets = getattr(m, "targets", y_columns)

    tbl = pd.DataFrame(
        {label: _to_series(value, targets) for label, value in scores.items()}
    )

    print("\nR2OOS (by maturity)")
    print(tbl.round(4))

    print("\nAverages:")
    print(tbl.mean(axis=0, skipna=True).round(4))

# --------------------- Load data ---------------------
y     = _load_csv(Y_FILE, "exrets")
slope = _load_csv(SLOPE_FILE, "slope")
macro = _load_csv(MACRO_FILE, "macro")
fwd   = _load_csv(FWD_FILE, "fwds")  # for features

# Target subset: keep only the requested maturities present in the file.
y_cols = [c for c in MATURITIES if c in y.columns]
if not y_cols:
    sys.exit("[ERROR] MATURITIES not found in exrets.csv")
y = y[y_cols]

# Align on the timestamps shared by ALL frames. fwd has to take part in the
# alignment because it is the feature matrix: otherwise X keeps its raw
# index while y is cut down to the slope/macro overlap, and the two are not
# guaranteed to cover the same rows.
# NOTE(review): slope and macro restrict the sample but are not used as
# features below -- confirm that this is intended.
y, slope, macro, fwd = _align_time(y, slope, macro, fwd)
X = pd.concat([fwd], axis=1)

# ---- cs_yhat: drop duplicated timestamps (keeping the last row) and save
# ---- to a temporary path, because Machine only accepts a cs_path.
cs = _load_csv(CS_YHAT_FILE, "cs_yhat")
if cs.index.duplicated().any():
    cs = cs[~cs.index.duplicated(keep="last")]
# Matching the columns to y as well is safer.
cs = cs.reindex(columns=y.columns)
CS_CLEAN_FILE = os.path.join(DATA_DIR, "_cs_yhat_clean___tmp.csv")
cs.to_csv(CS_CLEAN_FILE)

print("✓ data shapes:", {"X": X.shape, "y": y.shape, "cs_yhat(clean)": cs.shape})

# --------------------- Common kwargs for Machine ---------------------
# Window settings shared by every Machine built below (unpacked with **).
COMMON_KW = {
    "burn_in_start": BURN_START,
    "burn_in_end": BURN_END,
    "period": [PERIOD_START, PERIOD_END],
    "forecast_horizon": HORIZON,
}

# ======================================================================
# 1) Ridge  (STRATEGY: 'Penalized' + option="ridge")
# ======================================================================
# Builds a Machine on (X, y) with the shared window settings, trains it,
# and prints its R2OOS table against the three baselines.
opt_ridge  = "ridge"                        # ← plain string key only!
grid_ridge = {"reg__alpha": [10, 100]}      # candidate values for reg__alpha

m_ridge = Machine(X, y, model_type="Penalized", option=opt_ridge, params_grid=grid_ridge, **COMMON_KW)
print("\n▶ Training: Ridge")
m_ridge.training()
_print_r2_block("Ridge (alpha grid)", m_ridge, cs_clean_path=CS_CLEAN_FILE)

# ======================================================================
# 2) Lasso  (STRATEGY: 'Penalized' + option="lasso")
# ======================================================================
# Same procedure as Ridge, with the "lasso" option and the same alpha grid.
opt_lasso  = "lasso"
grid_lasso = {"reg__alpha": [10, 100]}      # candidate values for reg__alpha

m_lasso = Machine(X, y, model_type="Penalized", option=opt_lasso, params_grid=grid_lasso, **COMMON_KW)
print("\n▶ Training: Lasso")
m_lasso.training()
_print_r2_block("Lasso (alpha grid)", m_lasso, cs_clean_path=CS_CLEAN_FILE)

# ======================================================================
# 3) Two Tree models  (STRATEGY: 'Tree' + option keys 'rf' / 'xgb')
# ======================================================================
# Each run prints a "▶ Training: ..." banner before training, as the
# Ridge / Lasso runs do, so the progress bar that follows is labeled.

# 3-1) Random Forest
opt_rf  = "rf"
grid_rf = {
    "model__estimator__n_estimators":    [200, 400],
    "model__estimator__max_depth":       [2,6],
    "model__estimator__min_samples_leaf":[2, 4],
    "model__estimator__n_jobs":          [-1],   # held fixed (single value) when supported
    "model__estimator__random_state":    [42],   # held fixed for reproducibility
}
m_rf = Machine(X, y, model_type="Tree", option=opt_rf, params_grid=grid_rf, **COMMON_KW)
print("\n▶ Training: RandomForest")
m_rf.training()
_print_r2_block("RandomForest (small grid)", m_rf, cs_clean_path=CS_CLEAN_FILE)

# 3-2) Gradient Boosting (selected through the "xgb" option key)
opt_gbr  = "xgb"
grid_gbr = {
    "model__estimator__n_estimators": [300, 500],
    "model__estimator__learning_rate":[0.05, 0.1],
    "model__estimator__max_depth":    [2, 4],   # depth of each individual tree
}
m_gbr = Machine(X, y, model_type="Tree", option=opt_gbr, params_grid=grid_gbr, **COMMON_KW)
print("\n▶ Training: GradientBoosting")
m_gbr.training()
_print_r2_block("GradientBoosting (small grid)", m_gbr, cs_clean_path=CS_CLEAN_FILE)

print("\n✓ done")

✓ data shapes: {'X': (629, 10), 'y': (629, 5), 'cs_yhat(clean)': (612, 5)}

▶ Training: Ridge


Penalized rolling: 100%|██████████| 408/408 [00:10<00:00, 39.06it/s]



==== Ridge (alpha grid) ====

R2OOS (by maturity)
       R2OOS_naive  R2OOS_condmean  R2OOS_cs_yhat
xr_2       -0.1047         -0.2547        -0.1376
xr_3       -0.0758         -0.2432        -0.1902
xr_5       -0.0342         -0.2154        -0.2061
xr_7       -0.0275         -0.2047        -0.2253
xr_10      -0.0115         -0.1743        -0.2496

Averages:
R2OOS_naive      -0.0507
R2OOS_condmean   -0.2185
R2OOS_cs_yhat    -0.2018
dtype: float64

▶ Training: Lasso


Penalized rolling: 100%|██████████| 408/408 [00:07<00:00, 54.24it/s]



==== Lasso (alpha grid) ====

R2OOS (by maturity)
       R2OOS_naive  R2OOS_condmean  R2OOS_cs_yhat
xr_2        0.0809         -0.0448         0.0535
xr_3        0.0805         -0.0634        -0.0172
xr_5        0.0776         -0.0848        -0.0757
xr_7        0.0669         -0.0947        -0.1127
xr_10       0.0358         -0.1199        -0.1912

Averages:
R2OOS_naive       0.0684
R2OOS_condmean   -0.0815
R2OOS_cs_yhat    -0.0687
dtype: float64


Tree rolling:  27%|██▋       | 109/408 [05:32<17:10,  3.45s/it]