# Random forest regressor (tuned with OOB)

Load splits from artifacts/data, run a solid RF baseline with OOB, tune with TimeSeriesSplit + OOB-aware settings, compare on val/test, and save outputs under artifacts/random_forest.


In [3]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, ParameterGrid
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [4]:

# Resolve the repository root by walking up from the working directory, then
# derive the artifact folders used throughout the notebook.
_start = Path.cwd().resolve()
_candidates = [_start, *_start.parents]
_repo = None
for p in _candidates:
    # A directory counts as the repo root when it holds either marker file.
    has_split_config = (p / 'notebooks/model_evaluation_final/artifacts/data/split_config.json').exists()
    if has_split_config or (p / 'data/master_dataset.parquet').exists():
        _repo = p
        break

# Fall back to the working directory when no marker was found.
REPO_ROOT = _start if not _repo else _repo
ARTIFACTS_DIR = REPO_ROOT / 'notebooks/model_evaluation_final/artifacts'
DATA_DIR = ARTIFACTS_DIR / 'data'
MODEL_DIR = ARTIFACTS_DIR / 'random_forest'

# Create the folders up front so later reads/writes do not fail on a missing directory.
for d in (ARTIFACTS_DIR, DATA_DIR, MODEL_DIR):
    d.mkdir(parents=True, exist_ok=True)

DATA_DIR, MODEL_DIR

(PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/data'),
 PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/random_forest'))

In [5]:
# Read the feature list and the split configuration stored in DATA_DIR.
with open(DATA_DIR / 'feature_columns.json') as f:
    feature_cols = json.load(f)
with open(DATA_DIR / 'split_config.json') as f:
    CONFIG = json.load(f)
target_col = CONFIG['target_col']
date_col = CONFIG['date_col']


def _load_target(filename):
    """Return the target column of a parquet file in DATA_DIR as a numpy array."""
    return pd.read_parquet(DATA_DIR / filename)[target_col].values


# Feature matrices stay as DataFrames; targets are reduced to numpy arrays.
X_train = pd.read_parquet(DATA_DIR / 'X_train.parquet')
X_val = pd.read_parquet(DATA_DIR / 'X_val.parquet')
X_test = pd.read_parquet(DATA_DIR / 'X_test.parquet')
y_train = _load_target('y_train.parquet')
y_val = _load_target('y_val.parquet')
# Test labels are optional: use an empty array when the file is absent.
if (DATA_DIR / 'y_test.parquet').exists():
    y_test = _load_target('y_test.parquet')
else:
    y_test = np.array([])

len(X_train), len(X_val), len(X_test), len(feature_cols)

(3154, 788, 30, 20)

In [6]:
# Keep the raw, unscaled features: tree splits depend only on the ordering of
# feature values, so a random forest does not need standardised inputs (it
# performs no scaling of its own).
X_train_raw = X_train.copy()
X_val_raw = X_val.copy()
X_test_raw = X_test.copy()
# Test metrics need labels as well as features. y_test is an empty array when
# y_test.parquet is missing, so require matching lengths here instead of
# failing later inside the metric functions with a length mismatch.
has_test = len(X_test_raw) > 0 and len(y_test) == len(X_test_raw)

len(X_train_raw), len(X_val_raw), len(X_test_raw)

(3154, 788, 30)

In [7]:
# helper functions
def metrics(y_true, y_pred):
    """Score predictions against the true targets.

    Returns a dict with the keys 'rmse', 'mae' and 'r2', each a plain Python float.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }
    # Cast to built-in floats so the result can be written with json.dump.
    return {label: float(value) for label, value in scores.items()}

def eval_model(model, name, X_tr, y_tr, X_v, y_v, X_te=None, y_te=None):
    """Fit `model` on the training split and score it on every available split.

    The estimator is fitted in place. Train and validation metrics are always
    computed; test metrics are added only when `X_te` is given and non-empty,
    and OOB metrics only when the fitted model exposes `oob_prediction_`.

    Returns a tuple `(out, model, preds)`:
      * `out`   - dict with 'name' plus one metrics dict per scored split,
      * `model` - the fitted estimator,
      * `preds` - dict mapping 'train', 'val' and 'test' to predictions
                  ('test' is None when no test split was scored).
    """
    model.fit(X_tr, y_tr)

    preds = {'train': model.predict(X_tr), 'val': model.predict(X_v), 'test': None}
    out = {
        'name': name,
        'train': metrics(y_tr, preds['train']),
        'val': metrics(y_v, preds['val']),
    }

    test_available = X_te is not None and len(X_te)
    if test_available:
        preds['test'] = model.predict(X_te)
        out['test'] = metrics(y_te, preds['test'])

    oob_pred = getattr(model, 'oob_prediction_', None)
    if oob_pred is not None:
        out['oob'] = metrics(y_tr, oob_pred)

    return out, model, preds

def ts_cv_metrics(model, X, y, cv):
    """Cross-validate `model` over the folds produced by `cv`.

    A fresh clone of `model` is fitted on each training fold and scored on the
    matching held-out fold with `metrics`; the estimator passed in is never
    fitted.

    Parameters
    ----------
    model : estimator accepted by `clone`, with `fit` and `predict`.
    X : pandas DataFrame; rows are selected positionally with `.iloc`.
    y : array-like of targets aligned row-for-row with `X`.
    cv : splitter whose `split(X)` yields `(train_idx, val_idx)` index arrays.

    Returns
    -------
    dict mapping each metric name to `{'mean': ..., 'std': ...}` across folds
    (`std` uses numpy's default, i.e. the population standard deviation).
    """
    # The fold indices are positions. Converting once to an ndarray guarantees
    # positional selection even when `y` is a pandas Series, whose `[]` indexing
    # is label-based; for an ndarray input this is a no-op.
    y_arr = np.asarray(y)

    fold_metrics = []
    for tr_idx, va_idx in cv.split(X):
        fold_model = clone(model)
        fold_model.fit(X.iloc[tr_idx], y_arr[tr_idx])
        fold_preds = fold_model.predict(X.iloc[va_idx])
        fold_metrics.append(metrics(y_arr[va_idx], fold_preds))

    agg = {}
    for key in fold_metrics[0]:
        vals = [fm[key] for fm in fold_metrics]
        agg[key] = {'mean': float(np.mean(vals)), 'std': float(np.std(vals))}
    return agg

## Baseline (untuned RF + OOB)

In [8]:
# Untuned reference forest. bootstrap=True together with oob_score=True lets
# eval_model report out-of-bag metrics alongside train/val/test.
baseline_params = dict(
    n_estimators=200,
    max_depth=None,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)
baseline_model = RandomForestRegressor(**baseline_params)

# The test split is passed only when it is available.
baseline_out, baseline_model, baseline_preds = eval_model(
    baseline_model,
    'rf_baseline',
    X_train_raw, y_train,
    X_val_raw, y_val,
    X_te=X_test_raw if has_test else None,
    y_te=y_test if has_test else None,
)
baseline_out

{'name': 'rf_baseline',
 'train': {'rmse': 0.0028086877020911945,
  'mae': 0.001708142058771989,
  'r2': 0.9655144719596567},
 'val': {'rmse': 0.009302375771876566,
  'mae': 0.00595361555718338,
  'r2': 0.4258293364909922},
 'test': {'rmse': 0.005005817616055346,
  'mae': 0.00440487140243293,
  'r2': -0.8493104793880328},
 'oob': {'rmse': 0.007657993075988952,
  'mae': 0.004652571885568823,
  'r2': 0.7436343032470809}}

## Hyperparameter tuning (TimeSeriesSplit + OOB)

In [26]:
# Time-ordered folds: every validation fold lies after the data it is trained on.
tscv = TimeSeriesSplit(n_splits=3)

# Narrowed grid; an earlier, broader search informed these ranges.
param_grid = {
    'n_estimators': [280, 300],
    'max_depth': [16, 18],
    'max_features': ['sqrt', 0.65],
    'min_samples_split': [4, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True],
    'max_samples': [0.55, 0.65]
}

cv_records = []
best_entry = None
for params in ParameterGrid(param_grid):
    candidate = RandomForestRegressor(random_state=42, n_jobs=-1, oob_score=True, **params)
    cv_stats = ts_cv_metrics(candidate, X_train_raw, y_train, tscv)

    # Refit on the full training split: this provides the OOB predictions and
    # the fitted model that is kept if this candidate turns out to be the best.
    fitted = clone(candidate)
    fitted.fit(X_train_raw, y_train)
    oob_pred = getattr(fitted, 'oob_prediction_', None)
    oob_stats = metrics(y_train, oob_pred) if oob_pred is not None else None

    rmse_stats = cv_stats['rmse']
    entry = {
        'params': params,
        'cv': cv_stats,
        'oob': oob_stats,
        'val_rmse_mean': rmse_stats['mean'],
        'val_rmse_std': rmse_stats['std'],
    }
    cv_records.append(entry)

    # Strict '<' means the earliest candidate is kept when scores tie.
    improves = best_entry is None or entry['val_rmse_mean'] < best_entry['val_rmse_mean']
    if improves:
        best_entry = dict(entry, model=fitted)

# Flatten each record into one table row: drop the nested 'cv' dict and expose
# its RMSE mean/std as plain columns.
flat_rows = []
for rec in cv_records:
    flat = {key: value for key, value in rec.items() if key != 'cv'}
    flat['cv_rmse_mean'] = rec['cv']['rmse']['mean']
    flat['cv_rmse_std'] = rec['cv']['rmse']['std']
    flat_rows.append(flat)
cv_results_df = pd.DataFrame(flat_rows)

# Store params as JSON strings so the table is parquet-friendly.
cv_results_df_serializable = cv_results_df.assign(
    params=cv_results_df['params'].apply(json.dumps)
)

best_params = best_entry['params']
best_cv = best_entry['cv']
best_oob = best_entry['oob']
best_model = best_entry['model']
cv_results_df_serializable.head(), best_params, best_cv['rmse']

(                                              params  \
 0  {"bootstrap": true, "max_depth": 16, "max_feat...   
 1  {"bootstrap": true, "max_depth": 16, "max_feat...   
 2  {"bootstrap": true, "max_depth": 16, "max_feat...   
 3  {"bootstrap": true, "max_depth": 16, "max_feat...   
 4  {"bootstrap": true, "max_depth": 16, "max_feat...   
 
                                                  oob  val_rmse_mean  \
 0  {'rmse': 0.00806107222567294, 'mae': 0.0049802...       0.010953   
 1  {'rmse': 0.008069921470353736, 'mae': 0.004981...       0.010926   
 2  {'rmse': 0.008108940226198716, 'mae': 0.005008...       0.010976   
 3  {'rmse': 0.00811394775541727, 'mae': 0.0050051...       0.010953   
 4  {'rmse': 0.008156604678132135, 'mae': 0.005007...       0.010915   
 
    val_rmse_std  cv_rmse_mean  cv_rmse_std  
 0      0.003407      0.010953     0.003407  
 1      0.003416      0.010926     0.003416  
 2      0.003315      0.010976     0.003315  
 3      0.003319      0.010953     0.0

## Evaluate tuned model

In [27]:
# Score the tuned forest on every split. It was already fitted on the full
# training split during the grid search, so no refit happens here.
best_pred_train = best_model.predict(X_train_raw)
best_pred_val = best_model.predict(X_val_raw)
best_pred_test = best_model.predict(X_test_raw) if has_test else None

best_out = {
    'name': 'rf_tuned',
    'train': metrics(y_train, best_pred_train),
    'val': metrics(y_val, best_pred_val)
}
best_preds = {'train': best_pred_train, 'val': best_pred_val, 'test': best_pred_test}
if has_test:
    best_out['test'] = metrics(y_test, best_pred_test)
if best_oob is not None:
    best_out['oob'] = best_oob

# select best overall model vs baseline based on validation RMSE
# ('<=' means the tuned model wins an exact tie)
baseline_val_rmse = baseline_out['val']['rmse']
tuned_val_rmse = best_out['val']['rmse']
best_overall_name = 'rf_tuned' if tuned_val_rmse <= baseline_val_rmse else 'rf_baseline'
if best_overall_name == 'rf_tuned':
    best_overall_model = best_model
    best_overall_preds = best_preds
    best_overall_metrics = best_out
    # Record the estimator's full parameter set, exactly as the baseline branch
    # does. The grid-only `best_params` dict omits the fixed settings
    # (random_state, oob_score, n_jobs) needed to rebuild the saved model.
    best_overall_params = best_model.get_params()
    best_overall_cv = best_cv
else:
    best_overall_model = baseline_model
    best_overall_preds = baseline_preds
    best_overall_metrics = baseline_out
    best_overall_params = baseline_model.get_params()
    # The baseline was not cross-validated, so there are no CV stats to report.
    best_overall_cv = None

print(f"Best model selected: {best_overall_name} (by val RMSE)")
best_overall_metrics

Best model selected: rf_baseline (by val RMSE)


{'name': 'rf_baseline',
 'train': {'rmse': 0.0028086877020911945,
  'mae': 0.001708142058771989,
  'r2': 0.9655144719596567},
 'val': {'rmse': 0.009302375771876566,
  'mae': 0.00595361555718338,
  'r2': 0.4258293364909922},
 'test': {'rmse': 0.005005817616055346,
  'mae': 0.00440487140243293,
  'r2': -0.8493104793880328},
 'oob': {'rmse': 0.007657993075988952,
  'mae': 0.004652571885568823,
  'r2': 0.7436343032470809}}

## Compare baseline vs tuned

In [28]:
# One summary row per model: validation scores always, test scores when present.
rows = []
for label, res in (('rf_base', baseline_out), ('rf_tuned', best_out)):
    val_scores = res['val']
    row = {
        'model': label,
        'val_rmse': val_scores['rmse'],
        'val_mae': val_scores['mae'],
        'val_r2': val_scores['r2'],
    }
    if has_test and 'test' in res:
        test_scores = res['test']
        row.update(
            test_rmse=test_scores['rmse'],
            test_mae=test_scores['mae'],
            test_r2=test_scores['r2'],
        )
    rows.append(row)

comparison_df = pd.DataFrame(rows)
comparison_df = comparison_df.set_index('model')
print('Validation/Test comparison:')
display(comparison_df)

# Row-wise difference between the two models, shown only when both rows exist.
if {'rf_base', 'rf_tuned'}.issubset(comparison_df.index):
    diff_df = comparison_df.loc['rf_tuned'] - comparison_df.loc['rf_base']
    print('Tuned minus baseline (negative rmse/mae is better):')
    display(diff_df)
else:
    print('Cannot compute diff; missing one of rf_base or rf_tuned.')

Validation/Test comparison:


Unnamed: 0_level_0,val_rmse,val_mae,val_r2,test_rmse,test_mae,test_r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rf_base,0.009302,0.005954,0.425829,0.005006,0.004405,-0.84931
rf_tuned,0.009341,0.005924,0.421096,0.005098,0.004513,-0.917927


Tuned minus baseline (negative rmse/mae is better):


val_rmse     0.000038
val_mae     -0.000030
val_r2      -0.004733
test_rmse    0.000092
test_mae     0.000108
test_r2     -0.068617
dtype: float64

## Feature importance

In [29]:
# Rank features by the selected model's `feature_importances_`.
importance_df = None
if hasattr(best_overall_model, 'feature_importances_'):
    importances = best_overall_model.feature_importances_

    # Prefer the column names the model recorded at fit time: they are in the
    # exact order of `feature_importances_`. `feature_cols` comes from a separate
    # JSON file and is only a fallback, because nothing guarantees that its
    # order matches the columns of the training frame.
    fitted_names = getattr(best_overall_model, 'feature_names_in_', None)
    if fitted_names is None:
        fitted_names = feature_cols
    feature_names = [str(col) for col in fitted_names]

    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for {len(importances)} importances; "
            "cannot label feature importances reliably."
        )

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values(by='importance', ascending=False).reset_index(drop=True)
importance_df.head(15) if importance_df is not None else None

Unnamed: 0,feature,importance
0,vix,0.12004
1,rv_vix_spread_20d,0.116001
2,spy_vol_5d,0.099204
3,drawdown_60d,0.075776
4,vix_term,0.06857
5,spy_vol_10d,0.068437
6,vix3m,0.058747
7,spy_ret_10d,0.05853
8,spy_vol_20d,0.045328
9,spy_ret_20d,0.042826


## Save artifacts

In [31]:
# Report which model is being persisted and with which hyperparameters.
print(f"Saving best model: {best_overall_name}")
print("Hyperparameters:")
print(json.dumps(best_overall_params, indent=2))

metrics_payload = {
    'base': baseline_out,
    'tuned': best_out,
    'best': best_overall_metrics,
    'best_model': best_overall_name,
    'best_cv': best_overall_cv
}
hyperparams_payload = {
    'best_model_name': best_overall_name,
    'best_params': best_overall_params
}

# Write both JSON payloads into MODEL_DIR.
for json_name, payload in (('metrics.json', metrics_payload), ('hyperparams.json', hyperparams_payload)):
    with open(MODEL_DIR / json_name, 'w') as f:
        json.dump(payload, f, indent=2)

# Feature importances are saved in both parquet and CSV form when available.
if importance_df is not None:
    importance_df.to_parquet(MODEL_DIR / 'feature_importance.parquet', index=False)
    importance_df.to_csv(MODEL_DIR / 'feature_importance.csv', index=False)

joblib.dump(best_overall_model, MODEL_DIR / 'model.joblib')

# Long-format predictions: one frame per split, stacked into a single table.
split_targets = [('train', y_train), ('val', y_val)]
if has_test and best_overall_preds['test'] is not None:
    split_targets.append(('test', y_test))
pred_frames = [
    pd.DataFrame({'split': split_name, 'y_true': truth, 'y_pred': best_overall_preds[split_name]})
    for split_name, truth in split_targets
]
pd.concat(pred_frames, ignore_index=True).to_parquet(MODEL_DIR / 'preds.parquet', index=False)

# keep saving CV results from tuning run
cv_results_df_serializable.to_parquet(MODEL_DIR / 'cv_results.parquet', index=False)

best_overall_params


Saving best model: rf_baseline
Hyperparameters:
{
  "bootstrap": true,
  "ccp_alpha": 0.0,
  "criterion": "squared_error",
  "max_depth": null,
  "max_features": "sqrt",
  "max_leaf_nodes": null,
  "max_samples": null,
  "min_impurity_decrease": 0.0,
  "min_samples_leaf": 1,
  "min_samples_split": 2,
  "min_weight_fraction_leaf": 0.0,
  "monotonic_cst": null,
  "n_estimators": 200,
  "n_jobs": -1,
  "oob_score": true,
  "random_state": 42,
  "verbose": 0,
  "warm_start": false
}


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 200,
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}