# 05 â€” Tree-based Direction Model (USD/CAD)

Goal: Evaluate a simple, controlled tree-based classifier for 7-business-day direction
using the same expanding-window rolling backtest as Notebook 04.

Input:
- outputs/usdcad_features_h7.parquet (from Notebook 03)

Outputs (written to outputs/, ignored by git):
- tree_backtest_rows.csv
- tree_metrics_overall.csv
- tree_metrics_by_confidence.csv
- tree_feature_importance.csv


In [25]:
from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from sklearn.inspection import permutation_importance

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

RANDOM_SEED = 7
np.random.seed(RANDOM_SEED)

def find_repo_root(start: Path | None = None) -> Path:
    start = start or Path.cwd()
    for p in [start, *start.parents]:
        if (p / "data").is_dir() and (p / "src").is_dir():
            return p
    raise RuntimeError(f"Repo root not found from: {start}. Run notebook from inside repo.")

REPO_ROOT = find_repo_root()
OUT_DIR = REPO_ROOT / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

H = 7
FEATURE_PATH = OUT_DIR / f"usdcad_features_h{H}.parquet"
if not FEATURE_PATH.exists():
    raise FileNotFoundError(f"Feature parquet not found: {FEATURE_PATH}. Run Notebook 03 first.")

BACKTEST_ROWS_CSV = OUT_DIR / "tree_backtest_rows.csv"
METRICS_OVERALL_CSV = OUT_DIR / "tree_metrics_overall.csv"
METRICS_BUCKETS_CSV = OUT_DIR / "tree_metrics_by_confidence.csv"
IMPORTANCE_CSV = OUT_DIR / "tree_feature_importance.csv"


In [26]:
df_feat = pd.read_parquet(FEATURE_PATH).sort_index()

target_col = f"direction_{H}d"
excluded = {target_col, f"fwd_return_{H}d"}
feature_cols = [c for c in df_feat.columns if c not in excluded]

X_all = df_feat[feature_cols].copy()
y_all = df_feat[target_col].astype(int).copy()

df_feat.shape, len(feature_cols), feature_cols[:10]


((1960, 23),
 21,
 ['value',
  'ret_1d',
  'ret_3d',
  'ret_5d',
  'ret_10d',
  'ret_21d',
  'vol_5d',
  'vol_10d',
  'vol_21d',
  'vol_63d'])

In [27]:
def assign_confidence_bucket(p: pd.Series, edges=(0.0, 0.4, 0.45, 0.55, 0.6, 1.0)) -> pd.Series:
    labels = []
    for i in range(len(edges) - 1):
        labels.append(f"[{edges[i]:.2f},{edges[i+1]:.2f})" if i < len(edges) - 2 else f"[{edges[i]:.2f},{edges[i+1]:.2f}]")
    return pd.cut(p, bins=list(edges), labels=labels, include_lowest=True, right=False)


## Model choice

XGBoost is optional. If it is not installed, this notebook falls back to
`sklearn.ensemble.HistGradientBoostingClassifier` to keep the pipeline runnable.

To use XGBoost:
`pip install xgboost`


In [28]:
USE_XGBOOST = True

try:
    import xgboost as xgb
    xgb_available = True
except Exception as e:
    xgb_available = False
    xgb_err = str(e)

if USE_XGBOOST and xgb_available:
    MODEL_NAME = "xgboost"
    # Controlled, conservative defaults:
    # - shallow trees
    # - modest learning rate
    # - no hyperparameter search
    # - early stopping is allowed but kept simple
    base_model = xgb.XGBClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.0,
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=RANDOM_SEED,
        n_jobs=-1,
    )
else:
    MODEL_NAME = "sklearn_histgb"
    from sklearn.ensemble import HistGradientBoostingClassifier
    # Controlled tree boosting built into sklearn:
    base_model = HistGradientBoostingClassifier(
        max_depth=3,
        learning_rate=0.05,
        max_iter=300,
        min_samples_leaf=20,
        random_state=RANDOM_SEED
    )

print("MODEL_NAME:", MODEL_NAME)
if USE_XGBOOST and not xgb_available:
    print("XGBoost not available, fallback to sklearn. Import error:", xgb_err[:200])


MODEL_NAME: xgboost


In [29]:
def rolling_monthly_backtest_tree(
    X: pd.DataFrame,
    y: pd.Series,
    model,
    min_train_size: int = 252 * 2,
    use_early_stopping: bool = False,
    eval_slice: int = 252,
    early_stopping_rounds: int = 25,
):
    """
    Expanding-window monthly refit:
    - Fit at the start of each month using all prior data (strictly before the month)
    - Predict probabilities for all dates in that month

    Notes:
    - For XGBoost >= 3.x, early stopping via `early_stopping_rounds=` is not supported
      in the sklearn wrapper. This function defaults to NO early stopping to keep the
      behavior stable across versions.
    """
    idx = X.index
    months = pd.PeriodIndex(idx, freq="M")
    unique_months = months.unique().sort_values()

    rows = []
    last_fitted_model = None
    last_train_mask = None

    for m in unique_months:
        in_month = (months == m)
        month_idx = idx[in_month]
        if len(month_idx) == 0:
            continue

        # Train strictly before this month
        train_mask = idx < month_idx.min()
        if train_mask.sum() < min_train_size:
            continue

        X_train, y_train = X.loc[train_mask], y.loc[train_mask]
        X_test, y_test = X.loc[in_month], y.loc[in_month]

        # Fit model (no early stopping by default for version robustness)
        if MODEL_NAME == "xgboost" and use_early_stopping:
            # Optional: callback-based early stopping (works with newer XGBoost versions)
            # Keeps deterministic split: last eval_slice points of training used for eval
            try:
                from xgboost.callback import EarlyStopping

                if len(X_train) >= (eval_slice * 3):
                    X_tr, y_tr = X_train.iloc[:-eval_slice], y_train.iloc[:-eval_slice]
                    X_ev, y_ev = X_train.iloc[-eval_slice:], y_train.iloc[-eval_slice:]
                    model.fit(
                        X_tr,
                        y_tr,
                        eval_set=[(X_ev, y_ev)],
                        verbose=False,
                        callbacks=[EarlyStopping(rounds=early_stopping_rounds, save_best=True)],
                    )
                else:
                    model.fit(X_train, y_train)
            except Exception:
                # If callbacks are unavailable for any reason, fall back to plain fit
                model.fit(X_train, y_train)
        else:
            model.fit(X_train, y_train)

        last_fitted_model = model
        last_train_mask = train_mask

        # Predict probabilities
        if hasattr(model, "predict_proba"):
            p_up = model.predict_proba(X_test)[:, 1]
        else:
            # Safety fallback (should not happen for classifiers here)
            p_up = model.predict(X_test)

        y_pred = (p_up >= 0.5).astype(int)

        out = pd.DataFrame(
            {
                "date": X_test.index,
                "y_true": y_test.values,
                "p_up": p_up,
                "y_pred": y_pred,
                "month": str(m),
                "train_end": month_idx.min() - pd.Timedelta(days=1),
                "n_train": len(X_train),
                "n_test": len(X_test),
            }
        ).set_index("date")

        rows.append(out)

    bt = pd.concat(rows).sort_index() if rows else pd.DataFrame()
    return bt, last_fitted_model, last_train_mask


# Run backtest
bt_rows, last_model, last_train_mask = rolling_monthly_backtest_tree(
    X_all, y_all, model=base_model, use_early_stopping=False
)

bt_rows.head(), bt_rows.tail(), bt_rows.shape


(            y_true      p_up  y_pred    month  train_end  n_train  n_test
 date                                                                     
 2020-03-02       1  0.222219       0  2020-03 2020-03-01      518      22
 2020-03-03       1  0.270458       0  2020-03 2020-03-01      518      22
 2020-03-04       1  0.267606       0  2020-03 2020-03-01      518      22
 2020-03-05       1  0.301312       0  2020-03 2020-03-01      518      22
 2020-03-06       1  0.157834       0  2020-03 2020-03-01      518      22,
             y_true      p_up  y_pred    month  train_end  n_train  n_test
 date                                                                     
 2025-12-05       0  0.651157       1  2025-12 2025-11-30     1951       9
 2025-12-08       0  0.866706       1  2025-12 2025-11-30     1951       9
 2025-12-09       0  0.811794       1  2025-12 2025-11-30     1951       9
 2025-12-10       0  0.799210       1  2025-12 2025-11-30     1951       9
 2025-12-11       0  0.6

In [30]:
y_true = bt_rows["y_true"].astype(int).values
p_up = bt_rows["p_up"].astype(float).values
y_pred = bt_rows["y_pred"].astype(int).values

overall = {
    "model": MODEL_NAME,
    "n": len(bt_rows),
    "accuracy": float(accuracy_score(y_true, y_pred)),
    "log_loss": float(log_loss(y_true, p_up, labels=[0, 1])),
    "brier": float(brier_score_loss(y_true, p_up)),
    "mean_p_up": float(np.mean(p_up)),
    "pos_rate": float(np.mean(y_true)),
}

metrics_overall = pd.DataFrame([overall])
metrics_overall


Unnamed: 0,model,n,accuracy,log_loss,brier,mean_p_up,pos_rate
0,xgboost,1442,0.486824,0.928234,0.323778,0.469575,0.496533


In [31]:
bt = bt_rows.copy()
bt["bucket"] = assign_confidence_bucket(bt["p_up"])

bucket_metrics = (
    bt.groupby("bucket", observed=True)
      .apply(lambda g: pd.Series({
          "n": len(g),
          "coverage": len(g) / len(bt),
          "accuracy": (g["y_pred"] == g["y_true"]).mean(),
          "avg_p_up": g["p_up"].mean(),
          "pos_rate": g["y_true"].mean(),
          "avg_confidence": np.maximum(g["p_up"], 1 - g["p_up"]).mean(),
      }))
      .reset_index()
)

bucket_metrics


  .apply(lambda g: pd.Series({


Unnamed: 0,bucket,n,coverage,accuracy,avg_p_up,pos_rate,avg_confidence
0,"[0.00,0.40)",637.0,0.441748,0.516484,0.212887,0.483516,0.787113
1,"[0.40,0.45)",89.0,0.06172,0.404494,0.424674,0.595506,0.575326
2,"[0.45,0.55)",140.0,0.097087,0.421429,0.50314,0.55,0.524958
3,"[0.55,0.60)",76.0,0.052705,0.434211,0.574249,0.434211,0.574249
4,"[0.60,1.00]",500.0,0.346741,0.49,0.77928,0.49,0.77928


In [32]:
importance = None

if MODEL_NAME == "xgboost":
    # Use built-in importance (gain) from the last fitted model
    booster = last_model.get_booster()
    score = booster.get_score(importance_type="gain")
    imp = pd.Series(score, name="gain").sort_values(ascending=False)

    # XGBoost feature names sometimes come as "f0, f1..." if not passed
    # Ensure mapping to actual names if needed
    if imp.index.str.match(r"f\d+").all():
        mapping = {f"f{i}": col for i, col in enumerate(feature_cols)}
        imp.index = imp.index.map(mapping)

    importance = imp.reset_index().rename(columns={"index": "feature"})
else:
    # Controlled permutation importance on a small recent subset of training data (deterministic slice)
    # to avoid heavy runtime.
    train_X = X_all.loc[last_train_mask]
    train_y = y_all.loc[last_train_mask]

    # Use the last 252 observations for importance (recent regime)
    if len(train_X) >= 252:
        X_imp = train_X.iloc[-252:]
        y_imp = train_y.iloc[-252:]
    else:
        X_imp, y_imp = train_X, train_y

    r = permutation_importance(
        last_model, X_imp, y_imp,
        n_repeats=5,
        random_state=RANDOM_SEED,
        scoring="neg_log_loss"
    )
    importance = (
        pd.DataFrame({"feature": X_imp.columns, "importance": r.importances_mean})
          .sort_values("importance", ascending=False)
          .reset_index(drop=True)
    )

importance.head(20)


Unnamed: 0,feature,gain
0,value,6.758736
1,is_high_vol,6.672
2,mom_21d,6.606195
3,vol_21d,6.278761
4,vol_63d,6.090878
5,vol_21_med_252,5.958591
6,ret_21d,5.791118
7,month,5.724078
8,vol_10d,5.240837
9,vol_ratio_21_63,5.106203


In [33]:
bt_rows.to_csv(BACKTEST_ROWS_CSV, index=True)
metrics_overall.to_csv(METRICS_OVERALL_CSV, index=False)
bucket_metrics.to_csv(METRICS_BUCKETS_CSV, index=False)

if importance is not None:
    importance.to_csv(IMPORTANCE_CSV, index=False)
