# 04 â€” Logistic Regression Direction (USD/CAD)

Goal: Train an interpretable probabilistic direction model and evaluate it via an
expanding-window rolling backtest.

Inputs:
- outputs/usdcad_features_h7.parquet (from Notebook 03)

Outputs (written to outputs/, ignored by git):
- logreg_backtest_rows.csv
- logreg_metrics_overall.csv
- logreg_metrics_by_confidence.csv
- logreg_coefficients_over_time.csv


In [13]:
from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

RANDOM_SEED = 7
np.random.seed(RANDOM_SEED)

def find_repo_root(start: Path | None = None) -> Path:
    start = start or Path.cwd()
    for p in [start, *start.parents]:
        if (p / "data").is_dir() and (p / "src").is_dir():
            return p
    raise RuntimeError(f"Repo root not found from: {start}. Run notebook from inside repo.")

REPO_ROOT = find_repo_root()
OUT_DIR = REPO_ROOT / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

H = 7
FEATURE_PATH = OUT_DIR / f"usdcad_features_h{H}.parquet"
if not FEATURE_PATH.exists():
    raise FileNotFoundError(
        f"Feature parquet not found: {FEATURE_PATH}\n"
        f"Run notebooks/03_direction_feature_engineering.ipynb first."
    )

BACKTEST_ROWS_CSV = OUT_DIR / "logreg_backtest_rows.csv"
METRICS_OVERALL_CSV = OUT_DIR / "logreg_metrics_overall.csv"
METRICS_BUCKETS_CSV = OUT_DIR / "logreg_metrics_by_confidence.csv"
COEF_TIME_CSV = OUT_DIR / "logreg_coefficients_over_time.csv"




In [3]:
df_feat = pd.read_parquet(FEATURE_PATH).sort_index()

target_col = f"direction_{H}d"

# Features: everything except targets
excluded = {target_col, f"fwd_return_{H}d"}
feature_cols = [c for c in df_feat.columns if c not in excluded]

X_all = df_feat[feature_cols].copy()
y_all = df_feat[target_col].astype(int).copy()

df_feat.shape, len(feature_cols), feature_cols[:10]


((1960, 23),
 21,
 ['value',
  'ret_1d',
  'ret_3d',
  'ret_5d',
  'ret_10d',
  'ret_21d',
  'vol_5d',
  'vol_10d',
  'vol_21d',
  'vol_63d'])

## Rolling Backtest Design

We use an expanding-window backtest to simulate sequential model updates:

- train on data up to time t (inclusive)
- predict probability for time t (or a block ending at t)
- step forward and repeat

To keep runtime reasonable, we predict in monthly blocks:
- re-fit once per month
- generate predictions for that month using the fit from data strictly before it

This preserves temporal integrity and avoids look-ahead bias.


In [4]:
def assign_confidence_bucket(p: pd.Series, edges=(0.0, 0.4, 0.45, 0.55, 0.6, 1.0)) -> pd.Series:
    """
    Bucket predicted probabilities.
    Default edges create:
    - very confident DOWN (0.0-0.4)
    - low/moderate (0.4-0.45)
    - uncertain (0.45-0.55)
    - low/moderate (0.55-0.6)
    - very confident UP (0.6-1.0)
    """
    labels = []
    for i in range(len(edges) - 1):
        labels.append(f"[{edges[i]:.2f},{edges[i+1]:.2f})" if i < len(edges) - 2 else f"[{edges[i]:.2f},{edges[i+1]:.2f}]")
    return pd.cut(p, bins=list(edges), labels=labels, include_lowest=True, right=False)


In [5]:
# Simple, defensible model:
# - standardize features
# - L2-regularized logistic regression
# - no hyperparameter tuning in this PR
model = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        penalty="l2",
        C=1.0,
        solver="lbfgs",
        max_iter=2000,
        random_state=RANDOM_SEED
    ))
])


In [6]:
def rolling_monthly_backtest(X: pd.DataFrame, y: pd.Series, model: Pipeline, min_train_size: int = 252*2):
    """
    Expanding-window monthly refit:
    - Fit at the start of each month using all prior data
    - Predict for all dates in that month
    """
    idx = X.index
    months = pd.PeriodIndex(idx, freq="M")

    rows = []
    coef_rows = []

    unique_months = months.unique().sort_values()
    for m in unique_months:
        in_month = (months == m)
        month_idx = idx[in_month]

        # train strictly before this month
        train_mask = idx < month_idx.min()
        if train_mask.sum() < min_train_size:
            continue

        X_train, y_train = X.loc[train_mask], y.loc[train_mask]
        X_test, y_test = X.loc[in_month], y.loc[in_month]

        # fit + predict probs
        model.fit(X_train, y_train)
        p_up = model.predict_proba(X_test)[:, 1]
        pred = (p_up >= 0.5).astype(int)

        # store prediction rows
        out = pd.DataFrame({
            "date": X_test.index,
            "y_true": y_test.values,
            "p_up": p_up,
            "y_pred": pred,
            "month": str(m),
            "train_end": month_idx.min() - pd.Timedelta(days=1),
            "n_train": len(X_train),
            "n_test": len(X_test),
        }).set_index("date")
        rows.append(out)

        # store coefficients (for interpretability / stability)
        clf = model.named_steps["clf"]
        coefs = pd.Series(clf.coef_.ravel(), index=X.columns, name="coef")
        coef_rows.append(pd.DataFrame({
            "month": str(m),
            "n_train": len(X_train),
            "intercept": float(clf.intercept_[0]),
            **coefs.to_dict()
        }, index=[0]))

    bt = pd.concat(rows).sort_index()
    coef_time = pd.concat(coef_rows, ignore_index=True) if coef_rows else pd.DataFrame()
    return bt, coef_time

bt_rows, coef_time = rolling_monthly_backtest(X_all, y_all, model=model)
bt_rows.head(), bt_rows.tail(), coef_time.shape




(            y_true      p_up  y_pred    month  train_end  n_train  n_test
 date                                                                     
 2020-03-02       1  0.304194       0  2020-03 2020-03-01      518      22
 2020-03-03       1  0.395651       0  2020-03 2020-03-01      518      22
 2020-03-04       1  0.419486       0  2020-03 2020-03-01      518      22
 2020-03-05       1  0.361288       0  2020-03 2020-03-01      518      22
 2020-03-06       1  0.310306       0  2020-03 2020-03-01      518      22,
             y_true      p_up  y_pred    month  train_end  n_train  n_test
 date                                                                     
 2025-12-05       0  0.423402       0  2025-12 2025-11-30     1951       9
 2025-12-08       0  0.526680       1  2025-12 2025-11-30     1951       9
 2025-12-09       0  0.545383       1  2025-12 2025-11-30     1951       9
 2025-12-10       0  0.516821       1  2025-12 2025-11-30     1951       9
 2025-12-11       0  0.4

In [7]:
y_true = bt_rows["y_true"].astype(int).values
p_up = bt_rows["p_up"].astype(float).values
y_pred = bt_rows["y_pred"].astype(int).values

overall = {
    "n": len(bt_rows),
    "accuracy": float(accuracy_score(y_true, y_pred)),
    "log_loss": float(log_loss(y_true, p_up, labels=[0, 1])),
    "brier": float(brier_score_loss(y_true, p_up)),
    "mean_p_up": float(np.mean(p_up)),
    "pos_rate": float(np.mean(y_true)),
}

metrics_overall = pd.DataFrame([overall])
metrics_overall


Unnamed: 0,n,accuracy,log_loss,brier,mean_p_up,pos_rate
0,1442,0.528433,0.733935,0.264894,0.495735,0.496533


In [8]:
bt = bt_rows.copy()
bt["bucket"] = assign_confidence_bucket(bt["p_up"])

bucket_metrics = (
    bt.groupby("bucket", observed=True)
      .apply(lambda g: pd.Series({
          "n": len(g),
          "coverage": len(g) / len(bt),
          "accuracy": (g["y_pred"] == g["y_true"]).mean(),
          "avg_p_up": g["p_up"].mean(),
          "pos_rate": g["y_true"].mean(),
          "avg_confidence": np.maximum(g["p_up"], 1 - g["p_up"]).mean(),
      }))
      .reset_index()
)

bucket_metrics


  .apply(lambda g: pd.Series({


Unnamed: 0,bucket,n,coverage,accuracy,avg_p_up,pos_rate,avg_confidence
0,"[0.00,0.40)",317.0,0.219834,0.580442,0.319699,0.419558,0.680301
1,"[0.40,0.45)",193.0,0.133842,0.544041,0.427424,0.455959,0.572576
2,"[0.45,0.55)",527.0,0.365465,0.497154,0.500217,0.538899,0.525254
3,"[0.55,0.60)",150.0,0.104022,0.573333,0.571861,0.573333,0.571861
4,"[0.60,1.00]",255.0,0.176838,0.490196,0.712233,0.490196,0.712233


In [9]:
# Simple calibration diagnostics:
# Fit y = a + b * p_up (OLS) to see if probabilities are compressed/overconfident.
p = bt["p_up"].values
y = bt["y_true"].values

Xc = np.column_stack([np.ones_like(p), p])
beta = np.linalg.lstsq(Xc, y, rcond=None)[0]
cal_intercept, cal_slope = float(beta[0]), float(beta[1])

cal = pd.DataFrame([{
    "cal_intercept": cal_intercept,
    "cal_slope": cal_slope
}])
cal


Unnamed: 0,cal_intercept,cal_slope
0,0.4378,0.118475


In [10]:
# Show most stable / largest magnitude coefficients (last fit month)
if len(coef_time) > 0:
    last = coef_time.iloc[-1].drop(labels=["month", "n_train", "intercept"])
    last.sort_values().head(10), last.sort_values(ascending=False).head(10)
else:
    "No coefficient history (train window too short?)"


In [12]:
bt_rows.to_csv(BACKTEST_ROWS_CSV, index=True)
metrics_overall.to_csv(METRICS_OVERALL_CSV, index=False)
bucket_metrics.to_csv(METRICS_BUCKETS_CSV, index=False)

if len(coef_time) > 0:
    coef_time.to_csv(COEF_TIME_CSV, index=False)