In [1]:
from better_regressions import AdaptiveLinear
from sklearn.linear_model import ARDRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.linear_model import Ridge, RidgeCV
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pprint import pprint


def stats(fn, *args, runs: int = 100, **kwargs) -> dict[str, str]:
    """Call ``fn`` repeatedly and summarise each metric it reports.

    Args:
        fn: Callable invoked as ``fn(*args, **kwargs)``; every call must return
            a mapping from metric name to a number.
        *args: Positional arguments forwarded to ``fn``.
        runs: Number of repetitions. Must be at least 1, because the metric
            names are read from the first result.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Mapping from metric name to a ``"mean ± spread"`` string with four
        decimals, where the spread is the median absolute deviation of the
        samples measured from their mean (not from their median).
    """
    outcomes = []
    for _ in range(runs):
        outcomes.append(fn(*args, **kwargs))

    summary = {}
    # Metric names come from the first run; later runs are indexed by the same names.
    for name in outcomes[0]:
        samples = np.array([outcome[name] for outcome in outcomes])
        center = np.mean(samples)
        spread = np.median(np.abs(samples - center))
        summary[name] = f"{center:.4f} ± {spread:.4f}"

    return summary

## Models
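1. `ARDRegression` (aka RVR)
2. Ridge with adaptive shrinkage (`AdaptiveLinear`)

Plain `Ridge` (fixed, very small penalty) and `RidgeCV` (cross-validated penalty) are included as baselines.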

In [2]:
# Candidate penalties handed to RidgeCV.
alphas = [1e-9, 1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1.0]

# Every estimator is wrapped in the same two-step pipeline (standardise, then fit),
# so the models differ only in the final regression step.
# NOTE(review): the meaning of method="none" is defined by better_regressions — confirm.
models = {
    label: Pipeline([("scaler", StandardScaler()), ("model", estimator)])
    for label, estimator in (
        ("Ridge", Ridge(alpha=1e-9)),
        ("RidgeCV", RidgeCV(alphas, cv=3)),
        ("ARD", ARDRegression()),
        ("AdaptiveRidge", AdaptiveLinear(method="none")),
    )
}

## Case 1: Multiple noisy observations of target

$y \sim \mathcal{N}(0, \sigma_y^2), \quad X_i \mid y \sim \mathcal{N}(y, \sigma_i^2)$

In [3]:
def experiment(model: BaseEstimator, target_noise: float, noise_levels: list[float], n_train: int = 100, n_test: int = 1000) -> dict[str, float]:
    """Fit ``model`` on several noisy copies of the target and score it.

    The target is drawn as ``randn * target_noise``. Feature ``i`` equals the
    target plus independent Gaussian noise with standard deviation
    ``noise_levels[i]``.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
            in place on the simulated training set.
        target_noise: Standard deviation of the simulated target.
        noise_levels: Per-feature noise standard deviations. Integers are
            accepted and converted to float.
        n_train: Number of training samples.
        n_test: Number of test samples.

    Returns:
        Dict with ``"weight_rmse"`` (RMSE between the weights probed from the
        fitted model and the normalised inverse-variance weights) and
        ``"prediction_rmse"`` (RMSE of the predictions on the test set).
    """
    # Force a float dtype: NumPy refuses to raise an integer array to a negative
    # power, so integer noise levels such as [1, 2] would otherwise crash below.
    sigmas = np.asarray(noise_levels, dtype=float)
    d = len(noise_levels)

    y_train = np.random.randn(n_train) * target_noise
    y_test = np.random.randn(n_test) * target_noise

    X_train = y_train[:, None] + np.random.randn(n_train, d) * sigmas[None, :]
    X_test = y_test[:, None] + np.random.randn(n_test, d) * sigmas[None, :]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Probe the fitted model with the d unit vectors plus the origin. For a model
    # that is affine in its inputs, prediction(unit_i) - prediction(origin) is the
    # coefficient of feature i in the original (unscaled) feature space.
    inputs = np.vstack([np.eye(d), np.zeros((1, d))])
    outputs = model.predict(inputs)
    w_model = outputs[:d] - outputs[-1]

    # Reference weights: inverse noise variances, normalised to sum to one.
    # NOTE(review): this ignores the prior variance of the target, so it matches the
    # MSE-optimal linear weights only when target_noise dwarfs the noise levels — confirm
    # this is the intended reference for small target_noise.
    w_optimal = sigmas**-2
    w_optimal = w_optimal / np.sum(w_optimal)

    weight_rmse = np.sqrt(np.mean((w_model - w_optimal) ** 2)).item()
    prediction_rmse = np.sqrt(np.mean((y_pred - y_test) ** 2)).item()
    return {
        "weight_rmse": weight_rmse,
        "prediction_rmse": prediction_rmse,
    }


# Case 1 benchmark: a very wide target (std 1e6) observed through three noisy
# copies with noise std 1/2, 1/3 and 1/4.
for model_name, model in models.items():
    # Restart the global NumPy stream before each model so that every model starts
    # from the same random state and the table survives Restart & Run All.
    # The seed value itself is arbitrary.
    np.random.seed(0)
    result = stats(experiment, model, 1e6, [1 / 2, 1 / 3, 1 / 4], n_train=10000, n_test=1000)
    print(f"{model_name:<20} prediction RMSE = {result['prediction_rmse']:<30} weight RMSE = {result['weight_rmse']:<30}")

Ridge                prediction RMSE = 0.1914 ± 0.0029                weight RMSE = 0.0751 ± 0.0036               
RidgeCV              prediction RMSE = 0.1913 ± 0.0032                weight RMSE = 0.0740 ± 0.0029               
ARD                  prediction RMSE = 463.3696 ± 463.0295            weight RMSE = 0.0764 ± 0.0540               
AdaptiveRidge        prediction RMSE = 0.1853 ± 0.0030                weight RMSE = 0.0093 ± 0.0039               


## Case 2: Multiple independent weak signals, of different strengths

$y \sim \mathcal{N}(Xw, \sigma^2), \quad X \sim \mathcal{N}(0, I), \quad \sum w_i^2 \ll \sigma^2, \quad \min |w_i| \ll \max |w_i|$

In [None]:
def experiment(model: BaseEstimator, target_noise: float, weights: np.ndarray, n_train: int = 100, n_test: int = 1000) -> dict[str, float]:
    """Fit ``model`` on a noisy linear signal and score it.

    Features are standard-normal draws; the target is ``X @ weights`` plus
    Gaussian noise with standard deviation ``target_noise``.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
            in place on the simulated training set.
        target_noise: Standard deviation of the additive target noise.
        weights: True coefficient vector; its length sets the feature count.
        n_train: Number of training samples.
        n_test: Number of test samples.

    Returns:
        Dict with ``"weight_rmse"`` (RMSE between the weights probed from the
        fitted model and ``weights``) and ``"prediction_rmse"`` (RMSE of the
        predictions against the noisy test targets).
    """
    n_features = len(weights)

    def rmse(estimate, reference) -> float:
        # Root-mean-square difference, returned as a plain Python float.
        return np.sqrt(np.mean((estimate - reference) ** 2)).item()

    # Random draws happen in a fixed order: both design matrices first,
    # then the training noise, then the test noise.
    X_train = np.random.randn(n_train, n_features)
    X_test = np.random.randn(n_test, n_features)
    y_train = X_train @ weights + np.random.randn(n_train) * target_noise
    y_test = X_test @ weights + np.random.randn(n_test) * target_noise

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Rows 0..n_features-1 are the unit vectors and the last row is the origin.
    # For a model that is affine in its inputs, subtracting the prediction at the
    # origin from the unit-vector predictions leaves the coefficients.
    probes = np.eye(n_features + 1, n_features)
    responses = model.predict(probes)
    offset = responses[-1]
    w_model = responses[:-1] - offset

    return {
        "weight_rmse": rmse(w_model, weights),
        "prediction_rmse": rmse(y_pred, y_test),
    }


# Case 2 benchmark: three comparatively large coefficients followed by five
# near-zero ones, observed under additive noise with standard deviation `sigma`.
w = np.array([0.6, 0.5, 0.4, 0.02, 0.01, 0.01, 0.005, 0.001])
sigma = 2
for model_name, model in models.items():
    # Restart the global NumPy stream before each model so that every model starts
    # from the same random state and the table survives Restart & Run All.
    # The seed value itself is arbitrary.
    np.random.seed(0)
    result = stats(experiment, model, sigma, w, n_train=100, n_test=1000)
    print(f"{model_name:<20} prediction RMSE = {result['prediction_rmse']:<30} weight RMSE = {result['weight_rmse']:<30}")

0.7706259999999999
Ridge                prediction RMSE = 2.1073 ± 0.0461                weight RMSE = 0.2025 ± 0.0388               
RidgeCV              prediction RMSE = 2.1012 ± 0.0503                weight RMSE = 0.2058 ± 0.0318               
ARD                  prediction RMSE = 2.0716 ± 0.0381                weight RMSE = 0.1764 ± 0.0358               
AdaptiveRidge        prediction RMSE = 2.0689 ± 0.0374                weight RMSE = 0.1745 ± 0.0316               
