# XGBoost regressor (time series aware)
Load splits, run baseline XGB, tune with TimeSeriesSplit, compare to baseline, and save artifacts under `artifacts/xgboost`. Early stopping is applied on the validation split after selecting the best params.

Load splits from artifacts/data, run a strong XGB baseline, tune with TimeSeriesSplit, compare on val/test, and save outputs under `artifacts/xgboost`.

In [1]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, ParameterGrid
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# paths to data and artifacts
_start = Path.cwd().resolve()
_candidates = [_start] + list(_start.parents)
_repo = None
for p in _candidates:
    if (p / 'notebooks/model_evaluation_final/artifacts/data/split_config.json').exists():
        _repo = p
        break
    if (p / 'data/master_dataset.parquet').exists():
        _repo = p
        break
REPO_ROOT = _repo if _repo else _start
ARTIFACTS_DIR = REPO_ROOT / 'notebooks/model_evaluation_final/artifacts'
DATA_DIR = ARTIFACTS_DIR / 'data'
MODEL_DIR = ARTIFACTS_DIR / 'xgboost'
for d in [ARTIFACTS_DIR, DATA_DIR, MODEL_DIR]:
    d.mkdir(parents=True, exist_ok=True)

DATA_DIR, MODEL_DIR

(PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/data'),
 PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/xgboost'))

In [3]:
# load artifacts
with open(DATA_DIR / 'feature_columns.json') as f:
    feature_cols = json.load(f)
with open(DATA_DIR / 'split_config.json') as f:
    CONFIG = json.load(f)
target_col = CONFIG['target_col']
date_col = CONFIG['date_col']

X_train = pd.read_parquet(DATA_DIR / 'X_train.parquet')
X_val = pd.read_parquet(DATA_DIR / 'X_val.parquet')
X_test = pd.read_parquet(DATA_DIR / 'X_test.parquet') if (DATA_DIR / 'X_test.parquet').exists() else pd.DataFrame(columns=feature_cols)
y_train = pd.read_parquet(DATA_DIR / 'y_train.parquet')[target_col].values
y_val = pd.read_parquet(DATA_DIR / 'y_val.parquet')[target_col].values
y_test = pd.read_parquet(DATA_DIR / 'y_test.parquet')[target_col].values if (DATA_DIR / 'y_test.parquet').exists() else np.array([])

len(X_train), len(X_val), len(X_test), len(feature_cols)

(3154, 788, 30, 20)

In [4]:
# keep raw features (XGB handles scaling internally)
X_train_raw = X_train.copy()
X_val_raw = X_val.copy()
X_test_raw = X_test.copy()
has_test = len(X_test_raw) > 0

len(X_train_raw), len(X_val_raw), len(X_test_raw)

(3154, 788, 30)

In [5]:
# helper functions
def metrics(y_true, y_pred):
    return {
        'rmse': float(np.sqrt(mean_squared_error(y_true, y_pred))),
        'mae': float(mean_absolute_error(y_true, y_pred)),
        'r2': float(r2_score(y_true, y_pred))
    }

def eval_model(model, name, X_tr, y_tr, X_v, y_v, X_te=None, y_te=None):
    model.fit(X_tr, y_tr)
    pred_tr = model.predict(X_tr)
    pred_v = model.predict(X_v)
    out = {
        'name': name,
        'train': metrics(y_tr, pred_tr),
        'val': metrics(y_v, pred_v)
    }
    preds = {'train': pred_tr, 'val': pred_v, 'test': None}
    if X_te is not None and len(X_te):
        pred_te = model.predict(X_te)
        out['test'] = metrics(y_te, pred_te)
        preds['test'] = pred_te
    return out, model, preds

def ts_cv_metrics(model, X, y, cv):
    fold_metrics = []
    for tr_idx, va_idx in cv.split(X):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        m = clone(model)
        m.fit(X_tr, y_tr)
        preds = m.predict(X_va)
        fold_metrics.append(metrics(y_va, preds))
    agg = {}
    for key in fold_metrics[0].keys():
        vals = [fm[key] for fm in fold_metrics]
        agg[key] = {'mean': float(np.mean(vals)), 'std': float(np.std(vals))}
    return agg

## Baseline XGBoost

In [6]:
baseline_xgb = XGBRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    gamma=0,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    eval_metric='rmse'
)
baseline_xgb_out, baseline_xgb_model, baseline_xgb_preds = eval_model(
    baseline_xgb,
    'xgb_baseline',
    X_train_raw, y_train,
    X_val_raw, y_val,
    X_test_raw if has_test else None,
    y_test if has_test else None
)
baseline_xgb_out

{'name': 'xgb_baseline',
 'train': {'rmse': 0.0039009516239377076,
  'mae': 0.002953169563375308,
  'r2': 0.9334771106428861},
 'val': {'rmse': 0.00943672523159062,
  'mae': 0.006087905789721397,
  'r2': 0.40912466584460294},
 'test': {'rmse': 0.005408679875451533,
  'mae': 0.004782939888385149,
  'r2': -1.1589487904055042}}

## Hyperparameter tuning (TimeSeriesSplit)

In [12]:
tscv = TimeSeriesSplit(n_splits=3)


param_grid = {
    'n_estimators': [300, 500],
    'learning_rate': [0.03, 0.05, 0.08],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.8, 0.95],
    'colsample_bytree': [0.7, 0.85, 1.0],
    'min_child_weight': [1, 2, 4],
    'gamma': [0, 0.05],
    'reg_lambda': [0.8, 1.0, 1.5]
}


grid = list(ParameterGrid(param_grid))
print(f"Hyperparameter configs: {len(grid)}")


cv_records = []
best_entry = None
for params in grid:
    candidate = XGBRegressor(
        **params,
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
        eval_metric='rmse'
    )
    cv_stats = ts_cv_metrics(candidate, X_train_raw, y_train, tscv)
    fitted = clone(candidate)
    fitted.fit(X_train_raw, y_train)
    entry = {
        'params': params,
        'cv': cv_stats,
        'val_rmse_mean': cv_stats['rmse']['mean'],
        'val_rmse_std': cv_stats['rmse']['std']
    }
    cv_records.append(entry)
    if (best_entry is None) or (entry['val_rmse_mean'] < best_entry['val_rmse_mean']):
        best_entry = {**entry, 'model': fitted}


cv_results_df = pd.DataFrame([{**{k: v for k, v in rec.items() if k != 'cv'}, 'cv_rmse_mean': rec['cv']['rmse']['mean'], 'cv_rmse_std': rec['cv']['rmse']['std']} for rec in cv_records])
cv_results_df_serializable = cv_results_df.copy()
cv_results_df_serializable['params'] = cv_results_df_serializable['params'].apply(json.dumps)


best_params = best_entry['params']
best_cv = best_entry['cv']


# refit best with early stopping on validation
best_model = XGBRegressor(
    **best_params,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    eval_metric='rmse'
 )
best_model.fit(
    X_train_raw,
    y_train,
    eval_set=[(X_val_raw, y_val)],
    early_stopping_rounds=50,
    verbose=False
 )


best_model

Hyperparameter configs: 2592




## Evaluate tuned model and select best

In [13]:
best_pred_train = best_model.predict(X_train_raw)
best_pred_val = best_model.predict(X_val_raw)
best_pred_test = best_model.predict(X_test_raw) if has_test else None

best_out = {
    'name': 'xgb_tuned',
    'train': metrics(y_train, best_pred_train),
    'val': metrics(y_val, best_pred_val)
}
best_preds = {'train': best_pred_train, 'val': best_pred_val, 'test': best_pred_test}
if has_test:
    best_out['test'] = metrics(y_test, best_pred_test)

# choose best overall vs baseline XGB
baseline_val_rmse = baseline_xgb_out['val']['rmse']
tuned_val_rmse = best_out['val']['rmse']
best_overall_name = 'xgb_tuned' if tuned_val_rmse <= baseline_val_rmse else 'xgb_baseline'
if best_overall_name == 'xgb_tuned':
    best_overall_model = best_model
    best_overall_preds = best_preds
    best_overall_metrics = best_out
    best_overall_params = best_params
    best_overall_cv = best_cv
else:
    best_overall_model = baseline_xgb_model
    best_overall_preds = baseline_xgb_preds
    best_overall_metrics = baseline_xgb_out
    best_overall_params = baseline_xgb_model.get_params()
    best_overall_cv = None

print(f"Best model selected: {best_overall_name} (by val RMSE)")
best_overall_metrics

Best model selected: xgb_tuned (by val RMSE)


{'name': 'xgb_tuned',
 'train': {'rmse': 0.008045408197754638,
  'mae': 0.005604441560017746,
  'r2': 0.7170392862840006},
 'val': {'rmse': 0.00907279671175083,
  'mae': 0.00558345122421248,
  'r2': 0.453820241414309},
 'test': {'rmse': 0.004869681452443808,
  'mae': 0.0041195188750506005,
  'r2': -0.7500920503923685}}

## Compare baseline vs tuned

In [14]:
rows = []
for label, res in [('xgb_base', baseline_xgb_out), ('xgb_tuned', best_out)]:
    row = {
        'model': label,
        'val_rmse': res['val']['rmse'],
        'val_mae': res['val']['mae'],
        'val_r2': res['val']['r2']
    }
    if has_test and 'test' in res:
        row['test_rmse'] = res['test']['rmse']
        row['test_mae'] = res['test']['mae']
        row['test_r2'] = res['test']['r2']
    rows.append(row)
comparison_df = pd.DataFrame(rows).set_index('model')
print('Validation/Test comparison:')
display(comparison_df)

if 'xgb_base' in comparison_df.index and 'xgb_tuned' in comparison_df.index:
    diff_df = comparison_df.loc['xgb_tuned'] - comparison_df.loc['xgb_base']
    print('Tuned minus baseline (negative rmse/mae is better):')
    display(diff_df)
else:
    print('Cannot compute diff; missing one of xgb_base or xgb_tuned.')

Validation/Test comparison:


Unnamed: 0_level_0,val_rmse,val_mae,val_r2,test_rmse,test_mae,test_r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
xgb_base,0.009437,0.006088,0.409125,0.005409,0.004783,-1.158949
xgb_tuned,0.009073,0.005583,0.45382,0.00487,0.00412,-0.750092


Tuned minus baseline (negative rmse/mae is better):


val_rmse    -0.000364
val_mae     -0.000504
val_r2       0.044696
test_rmse   -0.000539
test_mae    -0.000663
test_r2      0.408857
dtype: float64

## Feature importance

In [15]:
importance_df = None
if hasattr(best_overall_model, 'get_booster'):
    booster = best_overall_model.get_booster()
    score_dict = booster.get_score(importance_type='gain')
    # map scores back to feature order; missing keys get 0
    imp_values = [score_dict.get(f'f{i}', 0.0) for i in range(len(feature_cols))]
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': imp_values
    }).sort_values(by='importance', ascending=False).reset_index(drop=True)
elif hasattr(best_overall_model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': best_overall_model.feature_importances_
    }).sort_values(by='importance', ascending=False).reset_index(drop=True)
importance_df.head(15) if importance_df is not None else None

Unnamed: 0,feature,importance
0,spy_ret_1d,0.0
1,spy_ret_5d,0.0
2,hyg_tlt_spread,0.0
3,corr_spy_hyg_60d,0.0
4,corr_spy_tlt_60d,0.0
5,corr_spy_hyg_20d,0.0
6,corr_spy_tlt_20d,0.0
7,rsi_spy_14,0.0
8,vix_term,0.0
9,vix3m,0.0


## Save artifacts

In [None]:
print(f"Saving best model: {best_overall_name}")
print("Hyperparameters:")
print(json.dumps(best_overall_params, indent=2))

metrics_payload = {
    'base': baseline_xgb_out,
    'tuned': best_out,
    'best': best_overall_metrics,
    'best_model': best_overall_name,
    'best_cv': best_overall_cv
}
hyperparams_payload = {
    'best_model_name': best_overall_name,
    'best_params': best_overall_params
}

with open(MODEL_DIR / 'metrics.json', 'w') as f:
    json.dump(metrics_payload, f, indent=2)
with open(MODEL_DIR / 'hyperparams.json', 'w') as f:
    json.dump(hyperparams_payload, f, indent=2)

if importance_df is not None:
    importance_df.to_parquet(MODEL_DIR / 'feature_importance.parquet', index=False)
    importance_df.to_csv(MODEL_DIR / 'feature_importance.csv', index=False)

joblib.dump(best_overall_model, MODEL_DIR / 'model.joblib')

pred_frames = [
    pd.DataFrame({'split': 'train', 'y_true': y_train, 'y_pred': best_overall_preds['train']}),
    pd.DataFrame({'split': 'val', 'y_true': y_val, 'y_pred': best_overall_preds['val']})
]
if has_test and best_overall_preds['test'] is not None:
    pred_frames.append(pd.DataFrame({'split': 'test', 'y_true': y_test, 'y_pred': best_overall_preds['test']}))
pd.concat(pred_frames, ignore_index=True).to_parquet(MODEL_DIR / 'preds.parquet', index=False)

cv_results_df_serializable.to_parquet(MODEL_DIR / 'cv_results.parquet', index=False)

best_overall_params

Saving best model: xgb_tuned
Hyperparameters:
{
  "colsample_bytree": 0.7,
  "gamma": 0,
  "learning_rate": 0.03,
  "max_depth": 3,
  "min_child_weight": 4,
  "n_estimators": 300,
  "reg_lambda": 1.5,
  "subsample": 0.8
}


{'colsample_bytree': 0.7,
 'gamma': 0,
 'learning_rate': 0.03,
 'max_depth': 3,
 'min_child_weight': 4,
 'n_estimators': 300,
 'reg_lambda': 1.5,
 'subsample': 0.8}