# Linear regression (ridge / lasso / elastic net)

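Load the train/validation/test splits from artifacts/data, fit baseline ridge and lasso pipelines (features are standardized inside each pipeline), run a time-series cross-validated hyperparameter sweep over ridge, lasso, and elastic net, compare all candidates on the validation split, and save the outputs under artifacts/linear_regression.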


In [9]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, ParameterGrid
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# locate the repository root, then derive the data and model artifact folders
_start = Path.cwd().resolve()
_candidates = [_start, *_start.parents]
_markers = (
    'notebooks/model_evaluation_final/artifacts/data/split_config.json',
    'data/master_dataset.parquet',
)
# first directory (working directory upward) holding either marker file
_repo = next(
    (
        folder
        for folder in _candidates
        if any((folder / marker).exists() for marker in _markers)
    ),
    None,
)
# no marker found anywhere: fall back to the working directory
REPO_ROOT = _repo or _start
ARTIFACTS_DIR = REPO_ROOT / 'notebooks/model_evaluation_final/artifacts'
DATA_DIR = ARTIFACTS_DIR / 'data'
MODEL_DIR = ARTIFACTS_DIR / 'linear_regression'
for _folder in (ARTIFACTS_DIR, DATA_DIR, MODEL_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

DATA_DIR, MODEL_DIR

(PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/data'),
 PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/linear_regression'))

In [10]:
# read the saved feature list, split settings, and the train/val/test frames
def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path) as handle:
        return json.load(handle)


def _read_target(path):
    """Return the `target_col` column of the parquet file at `path` as an array."""
    return pd.read_parquet(path)[target_col].values


feature_cols = _read_json(DATA_DIR / 'feature_columns.json')
CONFIG = _read_json(DATA_DIR / 'split_config.json')
target_col = CONFIG['target_col']
date_col = CONFIG['date_col']

X_train, X_val, X_test = (
    pd.read_parquet(DATA_DIR / fname)
    for fname in ('X_train.parquet', 'X_val.parquet', 'X_test.parquet')
)
y_train = _read_target(DATA_DIR / 'y_train.parquet')
y_val = _read_target(DATA_DIR / 'y_val.parquet')
# test labels are optional; an empty array stands in when the file is absent
_y_test_path = DATA_DIR / 'y_test.parquet'
if _y_test_path.exists():
    y_test = _read_target(_y_test_path)
else:
    y_test = np.array([])

len(X_train), len(X_val), len(X_test), len(feature_cols)

(3154, 788, 30, 20)

In [11]:
# work on untouched copies of each split; the pipelines do their own scaling
X_train_raw, X_val_raw, X_test_raw = (
    frame.copy() for frame in (X_train, X_val, X_test)
)

tuple(len(frame) for frame in (X_train_raw, X_val_raw, X_test_raw))


(3154, 788, 30)

In [12]:
# small helper functions

def metrics(y_true, y_pred):
    """Return RMSE, MAE and R^2 of `y_pred` against `y_true` as plain floats.

    The result is a dict with the keys 'rmse', 'mae' and 'r2'.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }
    # cast to built-in floats so the dict can be written with json.dump
    return {label: float(value) for label, value in scores.items()}

def eval_model(model, name, X_tr, y_tr, X_v, y_v, X_te=None, y_te=None):
    """Fit `model` on the training split and score it on train, val and test.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    name : label stored under the 'name' key of the returned metrics dict.
    X_tr, y_tr : training features and targets.
    X_v, y_v : validation features and targets.
    X_te, y_te : optional test features and targets. The test split is only
        scored when BOTH are present and non-empty, so a missing or empty
        label array no longer makes the metric computation fail.

    Returns
    -------
    (out, model, preds)
        ``out`` holds 'name', 'train' and 'val' metric dicts, plus 'test' when
        the test split was scored. ``model`` is the fitted estimator.
        ``preds`` maps 'train'/'val'/'test' to predictions; 'test' stays
        ``None`` when the test split was skipped.
    """
    model.fit(X_tr, y_tr)
    pred_tr = model.predict(X_tr)
    pred_v = model.predict(X_v)
    out = {
        'name': name,
        'train': metrics(y_tr, pred_tr),
        'val': metrics(y_v, pred_v)
    }
    preds = {'train': pred_tr, 'val': pred_v, 'test': None}

    has_test_features = X_te is not None and len(X_te) > 0
    has_test_labels = y_te is not None and len(y_te) > 0
    # without labels there is nothing to score against, so skip the test split
    if has_test_features and has_test_labels:
        pred_te = model.predict(X_te)
        out['test'] = metrics(y_te, pred_te)
        preds['test'] = pred_te
    return out, model, preds

def ts_cv_score(model, X, y, n_splits=5):
    """Cross-validate `model` with ``TimeSeriesSplit`` and summarize fold RMSE.

    Parameters
    ----------
    model : unfitted estimator; a fresh ``clone`` is fitted for every fold, so
        the object passed in is never fitted here.
    X : feature table, either a pandas object (indexed with ``.iloc``) or
        anything ``np.asarray`` can turn into an array.
    y : targets aligned row-by-row with `X` (ndarray, list or pandas Series).
    n_splits : number of folds handed to ``TimeSeriesSplit``.

    Returns
    -------
    (mean_rmse, std_rmse) : floats; the std is ``np.std`` with its default
        (population) normalization.
    """
    # The splitter yields integer positions. Converting y to an array keeps the
    # lookup positional even when y is a Series with a non-default index.
    y_arr = np.asarray(y)
    rows = X.iloc if hasattr(X, 'iloc') else np.asarray(X)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []
    for train_idx, val_idx in tscv.split(X):
        fold_model = clone(model)
        fold_model.fit(rows[train_idx], y_arr[train_idx])
        fold_preds = fold_model.predict(rows[val_idx])
        scores.append(np.sqrt(mean_squared_error(y_arr[val_idx], fold_preds)))
    return float(np.mean(scores)), float(np.std(scores))

## Baseline (default ridge/lasso)


In [13]:
# fit the default-settings ridge and lasso pipelines as reference points
baseline_candidates = {
    'ridge_base': Pipeline([('scaler', StandardScaler()), ('model', Ridge())]),
    'lasso_base': Pipeline([('scaler', StandardScaler()), ('model', Lasso(max_iter=5000))]),
}
# one (metrics, fitted pipeline, predictions) tuple per candidate
baseline_models = [
    eval_model(pipeline, label, X_train_raw, y_train, X_val_raw, y_val, X_test_raw, y_test)
    for label, pipeline in baseline_candidates.items()
]
baseline_results = [entry[0] for entry in baseline_models]

# lowest validation RMSE wins; on a tie the earlier candidate is kept
baseline_best_tuple = min(baseline_models, key=lambda entry: entry[0]['val']['rmse'])
baseline_best, baseline_best_model, baseline_best_preds = baseline_best_tuple
baseline_best_name = baseline_best['name']
baseline_best

{'name': 'ridge_base',
 'train': {'rmse': 0.009478733598480976,
  'mae': 0.006036318876028565,
  'r2': 0.6072369626380871},
 'val': {'rmse': 0.00937056442784859,
  'mae': 0.005723706201454683,
  'r2': 0.41738086626526394},
 'test': {'rmse': 0.004635058164696176,
  'mae': 0.003977044265744952,
  'r2': -0.5855142983167421}}

## Hyperparameter tuning (TimeSeriesSplit + pipelines)


In [14]:
# search space for each model family (ridge, lasso, elastic net);
# keys carry the "model__" prefix because they address the pipeline's 'model' step
param_grids = dict(
    ridge={
        'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 50.0, 100.0],
    },
    lasso={
        'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 50.0],
        'model__max_iter': [5000],
    },
    elasticnet={
        'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
        'model__l1_ratio': [0.1, 0.5, 0.9],
        'model__max_iter': [5000],
    },
)

# one untuned scaler + estimator pipeline per family, used as a clone template
base_models = {
    family: Pipeline([('scaler', StandardScaler()), ('model', estimator_cls())])
    for family, estimator_cls in (
        ('ridge', Ridge),
        ('lasso', Lasso),
        ('elasticnet', ElasticNet),
    )
}

# score every (family, parameter combination) pair with time-series CV on the training split
cv_records = []
for family, base_model in base_models.items():
    for params in ParameterGrid(param_grids[family]):
        pipeline = clone(base_model).set_params(**params)
        fold_mean, fold_std = ts_cv_score(pipeline, X_train_raw, y_train, n_splits=5)
        cv_records.append(
            dict(
                family=family,
                params=params,
                cv_rmse_mean=fold_mean,
                cv_rmse_std=fold_std,
            )
        )

# within each family keep the record with the lowest mean CV RMSE (first one on ties)
best_cv_per_family = {
    family: min(
        (row for row in cv_records if row['family'] == family),
        key=lambda row: row['cv_rmse_mean'],
    )
    for family in base_models
}

# refit each family's CV winner on the whole training split and score it on val/test
family_tuned = {}
for family, cv_info in best_cv_per_family.items():
    pipeline = clone(base_models[family])
    pipeline.set_params(**cv_info['params'])
    scores, fitted, predictions = eval_model(
        pipeline, f"{family}_tuned",
        X_train_raw, y_train,
        X_val_raw, y_val,
        X_test_raw, y_test,
    )
    family_tuned[family] = dict(
        cv=cv_info,
        metrics=scores,
        model=fitted,
        preds=predictions,
    )

# decide the best overall model by validation RMSE (baseline vs tuned)
# every pool entry is a (metrics dict, fitted pipeline, predictions dict) tuple
tuned_pool = [(v['metrics'], v['model'], v['preds']) for v in family_tuned.values()]
model_pool = baseline_models + tuned_pool
tuned_metrics_list = [entry[0] for entry in tuned_pool]
all_candidates = baseline_results + tuned_metrics_list

# Take the winning tuple directly instead of sorting the metrics and then
# searching the pool again by name. Baselines come first, so they win ties.
best_tuple = min(model_pool, key=lambda entry: entry[0]['val']['rmse'])
best_overall, best_model, best_preds = best_tuple
best_name = best_overall['name']

# CV record of the winner (tuned models only). Strip only the trailing
# "_tuned" so a family key that itself contains an underscore still resolves.
if best_name.endswith('tuned'):
    best_cv_overall = best_cv_per_family[best_name.rsplit('_', 1)[0]]
else:
    best_cv_overall = None

family_tuned

{'ridge': {'cv': {'family': 'ridge',
   'params': {'model__alpha': 100.0},
   'cv_rmse_mean': 0.00939326355004182,
   'cv_rmse_std': 0.003798918829684565},
  'metrics': {'name': 'ridge_tuned',
   'train': {'rmse': 0.009558150323875733,
    'mae': 0.006031938316587894,
    'r2': 0.6006279312809666},
   'val': {'rmse': 0.00943853028065473,
    'mae': 0.005782997477662603,
    'r2': 0.40889859992702926},
   'test': {'rmse': 0.004489562349565113,
    'mae': 0.0037324392363447807,
    'r2': -0.48753708126735384}},
  'model': Pipeline(steps=[('scaler', StandardScaler()), ('model', Ridge(alpha=100.0))]),
  'preds': {'train': array([0.01695871, 0.0164928 , 0.01606074, ..., 0.03332624, 0.03511177,
          0.0380785 ]),
   'val': array([0.04042721, 0.04218448, 0.04352601, 0.03860229, 0.03863921,
          0.03721972, 0.03593324, 0.03642944, 0.03627617, 0.03035978,
          0.0293626 , 0.02731585, 0.02643531, 0.02781402, 0.02291757,
          0.02581306, 0.0277659 , 0.03040584, 0.03061804, 0.0

## Compare baseline vs tuned


> Model selection uses validation RMSE only (time-aware splits). Test set is held out strictly for final reporting, not for choosing the model.

In [15]:
# compare baseline vs tuned (easy to read)
families = ['lasso', 'ridge']
# Named `metric_names` rather than `metrics` so the `metrics()` helper used by
# eval_model is not overwritten; otherwise re-running a model-fitting cell
# after this one fails with "'list' object is not callable".
metric_names = ['rmse', 'mae', 'r2']

# maps for quick lookup: model name -> metrics dict
baseline_map = {m['name']: m for m in baseline_results}
tuned_map = {v['metrics']['name']: v['metrics'] for v in family_tuned.values()}


def build_split_table(split_name):
    """Build a baseline-vs-tuned metric table for one split.

    Parameters
    ----------
    split_name : key of the split inside each metrics dict, e.g. 'val' or 'test'.

    Returns
    -------
    pandas.DataFrame indexed by metric name, with three columns per family in
    `families`: '<family>_base', '<family>_tuned' and '<family>_diff', where
    the diff is tuned minus baseline.
    """
    table = {}
    for family in families:
        base_vals = [baseline_map[f"{family}_base"][split_name][m] for m in metric_names]
        tuned_vals = [tuned_map[f"{family}_tuned"][split_name][m] for m in metric_names]
        table[f"{family}_base"] = base_vals
        table[f"{family}_tuned"] = tuned_vals
        table[f"{family}_diff"] = [t - b for t, b in zip(tuned_vals, base_vals)]
    return pd.DataFrame(table, index=metric_names)


val_comparison_df = build_split_table('val')
# the test table only exists when test labels were loaded
test_comparison_df = build_split_table('test') if len(y_test) else None

val_comparison_df

Unnamed: 0,lasso_base,lasso_tuned,lasso_diff,ridge_base,ridge_tuned,ridge_diff
rmse,0.012286,0.009395,-0.002891,0.009371,0.009439,6.8e-05
mae,0.00761,0.005664,-0.001946,0.005724,0.005783,5.9e-05
r2,-0.001554,0.414331,0.415885,0.417381,0.408899,-0.008482


In [16]:
# show the baseline-vs-tuned table for the test split (None when y_test is empty)
test_comparison_df

Unnamed: 0,lasso_base,lasso_tuned,lasso_diff,ridge_base,ridge_tuned,ridge_diff
rmse,0.00962,0.004834,-0.004785,0.004635,0.00449,-0.000145
mae,0.008887,0.003967,-0.004921,0.003977,0.003732,-0.000245
r2,-5.829308,-0.724685,5.104624,-0.585514,-0.487537,0.097977


In [17]:
# show the metrics of the candidate with the lowest validation RMSE (baseline or tuned)
best_overall

{'name': 'ridge_base',
 'train': {'rmse': 0.009478733598480976,
  'mae': 0.006036318876028565,
  'r2': 0.6072369626380871},
 'val': {'rmse': 0.00937056442784859,
  'mae': 0.005723706201454683,
  'r2': 0.41738086626526394},
 'test': {'rmse': 0.004635058164696176,
  'mae': 0.003977044265744952,
  'r2': -0.5855142983167421}}

## Feature importance from best model


In [18]:
# rank features by the magnitude of the selected model's coefficients;
# the estimator sits behind a StandardScaler step, so coefficients refer to scaled inputs
feature_names = list(X_train_raw.columns)
importance_df = None

final_estimator = best_model.named_steps['model']
model_has_coefs = hasattr(final_estimator, 'coef_')
if not model_has_coefs:
    # keep a frame of the same length so downstream saving still works
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'note': 'no coefficients available for this model'
    })
else:
    coefs = final_estimator.coef_
    importance_df = (
        pd.DataFrame({'feature': feature_names, 'coefficient': coefs})
        .assign(abs_coefficient=lambda frame: frame['coefficient'].abs())
        .sort_values('abs_coefficient', ascending=False)
        .reset_index(drop=True)
    )

importance_df.head(10)

Unnamed: 0,feature,coefficient,abs_coefficient
0,vix3m,-0.011209,0.011209
1,rv_vix_spread_20d,-0.009704,0.009704
2,vix,0.009671,0.009671
3,spy_vol_10d,0.001641,0.001641
4,spy_ret_10d,-0.001495,0.001495
5,spy_vol_60d,0.001015,0.001015
6,corr_spy_hyg_60d,-0.000773,0.000773
7,spy_vol_20d,-0.000767,0.000767
8,corr_spy_tlt_20d,0.000716,0.000716
9,corr_spy_tlt_60d,-0.000693,0.000693


## Save results and artifacts


In [19]:
# gather everything that goes into metrics.json (all values are JSON-friendly)
metrics_payload = dict(
    base={entry['name']: entry for entry in baseline_results},
    tuned={info['metrics']['name']: info['metrics'] for info in family_tuned.values()},
    best=best_overall,
    best_model=best_name,
    best_cv=best_cv_overall,
    val_comparison=val_comparison_df.to_dict(),
    test_comparison=None if test_comparison_df is None else test_comparison_df.to_dict(),
)

def pick_best_params(model_name):
    """Return the hyperparameters to record for `model_name`.

    Parameters
    ----------
    model_name : a '<family>_tuned' or '<family>_base' label as produced by
        the model-fitting cells.

    Returns
    -------
    dict mapping bare parameter names (pipeline step prefix removed) to their
    values, or ``None`` when there is nothing to report.

    Raises
    ------
    IndexError : if the name is not a known tuned model and no fitted baseline
        carries that name.
    """
    # Strip only the trailing suffix so a family key that itself contains an
    # underscore still resolves.
    family = model_name.rsplit('_', 1)[0]
    if model_name.endswith('tuned') and family in best_cv_per_family:
        # tuned models report the parameters chosen by cross-validation
        raw_params = best_cv_per_family[family]['params']
    else:
        # baselines report their current values for the parameters that the
        # family's grid tunes
        grid_keys = list(param_grids.get(family, {}))
        fitted = next(
            (mdl for info, mdl, _ in baseline_models if info['name'] == model_name),
            None,
        )
        if fitted is None:
            raise IndexError(f"no fitted baseline model named {model_name!r}")
        current = fitted.get_params()
        raw_params = {key: current.get(key) for key in grid_keys}
    if not raw_params:
        return None
    # 'model__alpha' -> 'alpha'
    return {key.split('__')[-1]: value for key, value in raw_params.items()}

best_params = pick_best_params(best_name)

hyperparams_payload = dict(best_model_name=best_name, best_params=best_params)

# write the two JSON summaries
for _payload, _fname in (
    (metrics_payload, 'metrics.json'),
    (hyperparams_payload, 'hyperparams.json'),
):
    with open(MODEL_DIR / _fname, 'w') as f:
        json.dump(_payload, f, indent=2)

# save feature importance in both parquet and csv form
if importance_df is not None:
    for _writer, _fname in (
        (importance_df.to_parquet, 'feature_importance.parquet'),
        (importance_df.to_csv, 'feature_importance.csv'),
    ):
        _writer(MODEL_DIR / _fname, index=False)

# persist the fitted pipeline (scaler + estimator)
joblib.dump(best_model, MODEL_DIR / 'model.joblib')

# predictions next to the true targets, one file per split
_train_frame = pd.DataFrame({'pred_train': best_preds['train'], 'y_train': y_train})
_train_frame.to_parquet(MODEL_DIR / 'pred_train.parquet', index=False)

_val_frame = pd.DataFrame({'pred_val': best_preds['val'], 'y_val': y_val})
_val_frame.to_parquet(MODEL_DIR / 'pred_val.parquet', index=False)

# test predictions exist only when the test split was scored
_test_preds = best_preds['test']
if _test_preds is not None and len(_test_preds):
    _test_frame = pd.DataFrame({'pred_test': _test_preds, 'y_test': y_test})
    _test_frame.to_parquet(MODEL_DIR / 'pred_test.parquet', index=False)

best_name, best_params

('ridge_base', {'alpha': 1.0})