In [1]:
# Source: Alexandru Tifrea and Fanny Yang, 2021.

# Python Notebook Commands
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

from copy import deepcopy
import numpy as np
import time

import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

import ipywidgets
from ipywidgets import interact, interactive, interact_manual

import sklearn
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV

from utils import generate_data, generate_additional_data, compute_population_risk, compute_empirical_risk, repeat_experiment, get_risk_vs_overparametrization

# Change these values if the images don't fit for your screen.
figure_width = 1200
figure_height = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Bias-variance trade-off for ridge regression

We can estimate the bias and the variance of an estimator by sampling different training sets, and using a hold-out validation set to compute its empirical error.

The stacked area plot below illustrates the decomposition of the risk into three terms: the squared bias, the variance, and irreducible noise.

In [2]:
validation_size = 1000
num_trials = 5
snr = 1

all_noise_sigmas = [0, 0.1, 0.5, 1]


def plot_bias_variance_for_ridge(n, d, noise_sigma):
    ridge_coefficients = np.arange(0, 20, 1)
    risks, squared_biases, variances = [], [], []
    n, d = int(n), int(d)

    # Sample the validation set and one traing set for each of the trials.
    X_validation, y_validation, beta_star, Sigma = generate_data(
        n=validation_size, d=d, snr=snr, noise_sigma=noise_sigma)
    all_X, all_y = generate_additional_data(num_samples=n * num_trials,
                                            d=d,
                                            Sigma=Sigma,
                                            beta_star=beta_star,
                                            noise_sigma=noise_sigma)
    for ridge_coef in ridge_coefficients:
        validation_predictions, validation_bayes_predictions = [], []

        # Train num_trials estimators and use the validation set to estimate the bias and variance.
        for i in range(num_trials):
            start, end = i * n, (i + 1) * n
            # Compute closed form solution of the ridge regression optimization problem.
            beta_hat = 1 / (1 + ridge_coef) * np.linalg.inv(
                all_X[start:end].T
                @ all_X[start:end]) @ all_X[start:end].T @ all_y[start:end]
            validation_predictions.append(
                (X_validation @ beta_hat).reshape(-1, 1))
            validation_bayes_predictions.append(
                (X_validation @ beta_star).reshape(-1, 1))

        validation_predictions, validation_bayes_predictions = np.array(
            validation_predictions), np.array(validation_bayes_predictions)
        risks.append(np.power(validation_predictions - y_validation, 2).mean())
        variances.append(
            np.power(
                validation_predictions - validation_predictions.mean(axis=0),
                2).mean())
        squared_biases.append(
            max(0, risks[-1] - variances[-1] - noise_sigma**2))

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=ridge_coefficients,
                   y=np.ones_like(ridge_coefficients) * noise_sigma**2,
                   name="Irreducible noise",
                   marker_color="gray"))
    fig.add_trace(
        go.Scatter(x=ridge_coefficients, y=variances, name="Variance"))
    fig.add_trace(
        go.Scatter(x=ridge_coefficients,
                   y=squared_biases,
                   name="Bias<sup>2</sup>"))
    fig.add_trace(go.Scatter(x=ridge_coefficients, y=risks, name="Risk"))

    if noise_sigma == 0.5:
        yaxis_range = [0, 1.5]
    elif noise_sigma == 1:
        yaxis_range = [0, 2.2]
    else:
        yaxis_range = [0, 1.05]

    fig.update_layout(height=figure_height,
                      width=figure_width,
                      yaxis_range=yaxis_range,
                      yaxis_title="Risk / Bias / Variance",
                      xaxis_title="Ridge coefficient",
                      hovermode='x')
    fig.show()


_ = interact(
    plot_bias_variance_for_ridge,
    n=ipywidgets.FloatSlider(value=200,
                             min=100,
                             max=500,
                             step=10,
                             readout_format='d',
                             description='Number of samples:',
                             style={'description_width': 'initial'},
                             continuous_update=False),
    d=ipywidgets.FloatSlider(value=100,
                             min=10,
                             max=100,
                             step=10,
                             readout_format='d',
                             description='Data dimension:',
                             style={'description_width': 'initial'},
                             continuous_update=False),
    noise_sigma=ipywidgets.Dropdown(options=all_noise_sigmas,
                                    value=0.5,
                                    description='Noise level:',
                                    disabled=False,
                                    style={'description_width': 'initial'},
                                    continuous_update=True),
)

interactive(children=(FloatSlider(value=200.0, continuous_update=False, description='Number of samples:', max=…

# Effect of overparametrization for ridge regression

In [3]:
def plot_risk_vs_overparametrization(n=None,
                                     d=None,
                                     cov_type="isotropic",
                                     num_runs=1):
    assert (n is not None) or (d is not None)

    all_snr = [1]
    all_gammas = np.concatenate((
        np.arange(0.1, 2, 0.1),
        # Uncomment this line for higher overparameterization, but at the cost of
        # longer computation time.
        # np.arange(3, 10)
    ))
    params = {
        "all_gammas": all_gammas,
        "all_snr": all_snr,
        "fix_n_vary_d": (n is not None),
        "cov_type": cov_type,
        "use_ridge": True,
    }
    if n is not None:
        params["n"] = n
    else:
        params["d"] = d
    ridge_aggregated_risks = repeat_experiment(
        num_runs, get_risk_vs_overparametrization, params)
    params["use_ridge"] = False
    ridgeless_aggregated_risks = repeat_experiment(
        num_runs, get_risk_vs_overparametrization, params)

    fig = go.Figure()
    for snr in all_snr:
        fig.add_trace(
            go.Scatter(x=all_gammas,
                       y=ridge_aggregated_risks[snr],
                       name="Ridge regression"))
        fig.add_trace(
            go.Scatter(x=all_gammas,
                       y=ridgeless_aggregated_risks[snr],
                       name="Ridgeless regression"))
    fig.update_layout(
        height=figure_height,
        width=figure_width,
        yaxis_type="log",
        yaxis_range=[0, 1],
        yaxis_title="Population risk",
        xaxis_type="log",
        xaxis_title="$\Large\gamma=d/n$",
        title=f"Fixed n={int(n)}"
        if params["fix_n_vary_d"] else f"Fixed d={int(d)}",
    )
    fig.show()

### Select the number of samples

In [4]:
_ = interact(
    lambda n, cov_type, num_runs: plot_risk_vs_overparametrization(
        n=n, cov_type=cov_type, num_runs=num_runs),
    n=ipywidgets.FloatSlider(value=100,
                             min=100,
                             max=200,
                             step=10,
                             readout_format='d',
                             description='Number of samples:',
                             style={'description_width': 'initial'},
                             continuous_update=False),
    cov_type=ipywidgets.Dropdown(options=["isotropic", "misspecified"],
                                 value="isotropic",
                                 description='Covariance model:',
                                 disabled=False,
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
    num_runs=ipywidgets.Dropdown(options=[1, 3, 5, 10],
                                 value=1,
                                 description='Number of experiments:',
                                 disabled=False,
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
)

interactive(children=(FloatSlider(value=100.0, continuous_update=False, description='Number of samples:', max=…

<function __main__.<lambda>(n, cov_type, num_runs)>

### Select the dimension of the data

In [5]:
_ = interact(
    lambda d, cov_type, num_runs: plot_risk_vs_overparametrization(
        d=d, cov_type=cov_type, num_runs=num_runs),
    d=ipywidgets.FloatSlider(value=20,
                             min=20,
                             max=1000,
                             step=10,
                             readout_format='d',
                             description='Data dimension:',
                             style={'description_width': 'initial'},
                             continuous_update=False),
    cov_type=ipywidgets.Dropdown(options=["isotropic", "misspecified"],
                                 value="isotropic",
                                 description='Covariance model:',
                                 disabled=False,
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
    num_runs=ipywidgets.Dropdown(options=[1, 3, 5, 10],
                                 value=1,
                                 description='Number of experiments:',
                                 disabled=False,
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
)

interactive(children=(FloatSlider(value=20.0, continuous_update=False, description='Data dimension:', max=1000…

<function __main__.<lambda>(d, cov_type, num_runs)>