In [1]:
# Source: Alexandru Tifrea and Fanny Yang, 2021.

# Python Notebook Commands
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

from copy import deepcopy
import numpy as np
import time

import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

import ipywidgets
from ipywidgets import interact, interactive, interact_manual

import sklearn
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV

from utils import generate_data, compute_population_risk, compute_empirical_risk, repeat_experiment, get_risk_vs_overparametrization

# Change these values if the images don't fit for your screen.
figure_width = 1200
figure_height = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data models

Quick overview of the data distributions used in this notebook. See the file utils.py for implementation details.

### Isotropic model

- $X \sim N(0, I_d)$
- $y = x^T\beta^* + \epsilon$, with $\epsilon \sim N(0, \sigma_{noise}^2)$

Note that the signal-to-noise ratio is given by SNR = $||\beta^*||_2^2$.





### Misspecified model

- $x = Wz + u \in \mathbb{R}^d$; the covariates are a linear combination of a lower dimensional latent code $z \in \mathbb{R}^p, p < d$
- $y=z^T\beta^*$, with $\beta^* \in \mathbb{R}^p$

Here $z \sim N(0, I_p)$, $u \sim N(0, I_d)$, and $W \in \mathbb{R}^{d \times p}$ is a deterministic matrix such that $W^TW=\frac{d}{p}I_p$.

This model is equivalent to sampling the covariates from a normal distribution $N(0, \Sigma)$ and having $y = x^T\tilde{\beta}+\epsilon$, where $\tilde{\beta}$ plays the role of the true coefficient vector in the linear model on $x$:

- $\Sigma = I_d + WW^T$
- $\tilde{\beta} = W(I_p+W^TW)^{-1}\beta^*$
- $\epsilon \sim N(0, \sigma_{noise}^2)$
- $\sigma_{noise}^2 = \beta^{*T}(I_p+W^TW)^{-1}\beta^*$

Once again, the signal-to-noise ratio is given by SNR = $||\beta^*||_2^2$.


### Closed form of the population risk

Assume the covariates are drawn from a multivariate Gaussian distribution $N(0, \Sigma)$. For a fixed training set $(X, y)$, the population risk of a linear regression estimator $\hat{\beta}$ can be written as:

$R(\hat{\beta})=\mathbb{E}_{x_{test}}(x_{test}^T\hat{\beta} - x_{test}^T\beta^*-\epsilon)^2 = (\hat{\beta} - \beta^*)^T\Sigma (\hat{\beta} - \beta^*) + \sigma_{noise}^2$

# Predictor risk for different degrees of overparameterization

In [2]:
def plot_risk_vs_overparametrization(n=None,
                                     d=None,
                                     cov_type="isotropic",
                                     num_runs=1):
    """Plot the population risk as a function of gamma = d/n.

    Either the number of samples ``n`` or the data dimension ``d`` is held
    fixed; when both are given, ``n`` takes precedence. The risks are obtained
    by calling ``repeat_experiment`` with ``get_risk_vs_overparametrization``
    and the parameter dictionary built below; the result is indexed by SNR
    value and plotted against the gamma grid on log-log axes.

    Args:
        n: Fixed number of samples, or None to fix ``d`` instead.
        d: Fixed data dimension; only used when ``n`` is None.
        cov_type: Name of the data model, forwarded to the experiment as
            ``"cov_type"``.
        num_runs: Number of repetitions, forwarded to ``repeat_experiment``.

    Raises:
        AssertionError: If neither ``n`` nor ``d`` is provided.
    """
    assert (n is not None) or (d is not None)

    all_snr = [1]
    all_gammas = np.concatenate((
        np.arange(0.1, 2, 0.1),
        # Uncomment this line for higher overparameterization, but at the cost of
        # longer computation time.
        # np.arange(3, 10)
    ))

    fix_n_vary_d = n is not None
    params = {
        "all_gammas": all_gammas,
        "all_snr": all_snr,
        "fix_n_vary_d": fix_n_vary_d,
        "cov_type": cov_type,
        "use_ridge": False,
    }
    # Only the quantity that stays fixed is passed on to the experiment.
    if fix_n_vary_d:
        params["n"] = n
    else:
        params["d"] = d
    aggregated_risks = repeat_experiment(num_runs,
                                         get_risk_vs_overparametrization,
                                         params)

    fig = go.Figure()
    for snr in all_snr:
        fig.add_trace(
            go.Scatter(x=all_gammas,
                       y=aggregated_risks[snr],
                       name=f"SNR={snr}"))

    title = f"Fixed n={int(n)}" if fix_n_vary_d else f"Fixed d={int(d)}"
    fig.update_layout(
        height=figure_height,
        width=figure_width,
        yaxis_type="log",
        # On a log axis Plotly reads the range as log10 exponents, so this
        # shows risks between 10^0 and 10^1.
        yaxis_range=[0, 1],
        yaxis_title="Population risk",
        xaxis_type="log",
        # Raw string: "\L" and "\g" are not valid Python escape sequences.
        xaxis_title=r"$\Large\gamma=d/n$",
        title=title,
    )
    fig.show()

### Select the number of samples

In [3]:
# Interactive controls for the risk curve with a fixed number of samples.
# The slider only triggers a re-computation when it is released
# (continuous_update=False). Dropdown has no `continuous_update` trait, so
# that argument is not passed to the dropdowns.
_ = interact(
    lambda n, cov_type, num_runs: plot_risk_vs_overparametrization(
        n=n, cov_type=cov_type, num_runs=num_runs),
    n=ipywidgets.FloatSlider(value=100,
                             min=100,
                             max=200,
                             step=10,
                             readout_format='d',
                             description='Number of samples:',
                             style={'description_width': 'initial'},
                             continuous_update=False),
    cov_type=ipywidgets.Dropdown(options=["isotropic", "misspecified"],
                                 value="isotropic",
                                 description='Covariance model:',
                                 disabled=False,
                                 style={'description_width': 'initial'}),
    num_runs=ipywidgets.Dropdown(options=[1, 3, 5, 10],
                                 value=1,
                                 description='Number of experiments:',
                                 disabled=False,
                                 style={'description_width': 'initial'}),
)

interactive(children=(FloatSlider(value=100.0, continuous_update=False, description='Number of samples:', max=…

### Select the dimension of the data

In [4]:
# Interactive controls for the risk curve with a fixed data dimension.
# The result of `interact` is assigned to `_` so that the wrapped function's
# repr is not emitted as the cell output. Dropdown has no `continuous_update`
# trait, so that argument is not passed to the dropdowns.
_ = interact(
    lambda d, cov_type, num_runs: plot_risk_vs_overparametrization(
        d=d, cov_type=cov_type, num_runs=num_runs),
    d=ipywidgets.FloatSlider(value=10,
                             min=10,
                             max=1000,
                             step=10,
                             readout_format='d',
                             description='Data dimension:',
                             style={'description_width': 'initial'},
                             continuous_update=False),
    cov_type=ipywidgets.Dropdown(options=["isotropic", "misspecified"],
                                 value="isotropic",
                                 description='Covariance model:',
                                 disabled=False,
                                 style={'description_width': 'initial'}),
    num_runs=ipywidgets.Dropdown(options=[1, 3, 5, 10],
                                 value=1,
                                 description='Number of experiments:',
                                 disabled=False,
                                 style={'description_width': 'initial'}),
)

interactive(children=(FloatSlider(value=10.0, continuous_update=False, description='Data dimension:', max=1000…

# Empirical risk vs Population risk

In [5]:
# We choose the data dimension to be 1 in order to be able to easily illustrate
# the functions that minimize the risk (i.e. the dotted lines in the plot).
d = 1
snr = 1
n_max = 1000
# Toggle to choose whether to sample a new data set each time n is changed.
sample_new_data = False
seed = 21
all_noise_sigmas = [0, 0.5, 1]

# Pre-generate one data set of n_max samples per noise level; the plotting
# function then uses the first n samples of it.
if not sample_new_data:
    all_X, all_y, all_beta_star, all_Sigma = {}, {}, {}, {}
    for noise_sigma in all_noise_sigmas:
        all_X[noise_sigma], all_y[noise_sigma], all_beta_star[
            noise_sigma], all_Sigma[noise_sigma] = generate_data(
                n_max, d, snr=snr, noise_sigma=noise_sigma, seed=seed)

# The function below compares the population risk with the empirical risk
# computed using different sample sizes.


def plot_empirical_and_population_risks(n, noise_sigma):
    """Compare the population risk with the empirical risk on ``n`` samples.

    Both risks are evaluated for every candidate 1D coefficient in
    ``np.arange(-10, 10, 0.1)`` and drawn as curves; the minimizer of each
    curve over that grid is marked with a dotted vertical line.

    Reads the notebook-level settings ``sample_new_data``, ``d``, ``snr``,
    ``seed`` and ``all_noise_sigmas`` and, unless ``sample_new_data`` is set,
    the pre-generated ``all_X``, ``all_y``, ``all_beta_star``, ``all_Sigma``.

    Args:
        n: Number of samples used for the empirical risk (converted to int).
        noise_sigma: Noise level; used as key into the pre-generated data or
            forwarded to ``generate_data``.
    """
    n = int(n)

    if sample_new_data:
        X, y, beta_star, Sigma = generate_data(n,
                                               d,
                                               snr=snr,
                                               noise_sigma=noise_sigma,
                                               seed=seed)
    else:
        X, y = all_X[noise_sigma], all_y[noise_sigma]
        beta_star, Sigma = all_beta_star[noise_sigma], all_Sigma[noise_sigma]
    # Keep only the first n samples (slice once instead of at every use).
    X, y = X[:n], y[:n]

    # Compute the empirical risk and the population risk for all the 1D functions
    # in a range.
    threshold_functions = np.arange(-10, 10, 0.1)
    empirical_risks = np.array([
        compute_empirical_risk(beta_hat, X, y)
        for beta_hat in threshold_functions
    ])
    population_risks = np.array([
        compute_population_risk(beta_star, beta_hat, noise_sigma, Sigma)
        for beta_hat in threshold_functions
    ])

    # Obtain the minimizers of the empirical risk and the population risk.
    empirical_minimizer = threshold_functions[np.argmin(empirical_risks)]
    population_minimizer = threshold_functions[np.argmin(population_risks)]

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=threshold_functions,
                   y=empirical_risks,
                   marker_color="blue",
                   name="Empirical risk"))
    fig.add_trace(
        go.Scatter(x=threshold_functions,
                   y=population_risks,
                   marker_color="red",
                   name="Population risk"))

    fig.add_vline(x=empirical_minimizer, line_dash="dot", line_color="blue")
    fig.add_vline(x=population_minimizer, line_dash="dot", line_color="red")

    # Raw strings: "\h" is not a valid Python escape sequence.
    fig.add_annotation(x=empirical_minimizer + 0.35,
                       y=22,
                       text=r"$\huge\hat{f}$",
                       showarrow=False)
    fig.add_annotation(x=population_minimizer - 0.4,
                       y=22,
                       text=r"$\huge f^*$",
                       showarrow=False)

    # All configured noise levels share the same y-range; anything else falls
    # back to a wider one.
    yaxis_range = [0, 25] if noise_sigma in all_noise_sigmas else [0, 500]

    fig.update_layout(height=figure_height,
                      width=figure_width,
                      yaxis_range=yaxis_range,
                      xaxis_range=[-6, 5],
                      yaxis_title="Risk",
                      hovermode='x')
    fig.show()


# Interactive controls for the empirical-vs-population risk plot.
# The number of samples is an integer quantity (the callback converts it with
# int()), so an IntSlider is used. Dropdown has no `continuous_update` trait,
# so that argument is only passed to the slider.
_ = interact(
    plot_empirical_and_population_risks,
    n=ipywidgets.IntSlider(value=10,
                           min=10,
                           max=n_max,
                           step=10,
                           readout_format='d',
                           description='Number of samples:',
                           style={'description_width': 'initial'},
                           continuous_update=True),
    noise_sigma=ipywidgets.Dropdown(options=all_noise_sigmas,
                                    value=0.5,
                                    description='Noise level:',
                                    disabled=False,
                                    style={'description_width': 'initial'}),
)

interactive(children=(FloatSlider(value=10.0, description='Number of samples:', max=1000.0, min=10.0, readout_…

We can also see that the empirical risk minimizer converges to the population minimizer, by comparing the difference between the two as the number of samples increases.

In [6]:
d = 1
snr = 1
n_max = 1000
all_noise_sigmas = [0, 0.5, 1]
# Toggle to choose whether to sample a new data set each time n is changed.
sample_new_data = False
seed = 21

# Pre-generate one data set of n_max samples per noise level.
if not sample_new_data:
    all_X, all_y, all_beta_star, all_Sigma = {}, {}, {}, {}
    for noise_sigma in all_noise_sigmas:
        all_X[noise_sigma], all_y[noise_sigma], all_beta_star[
            noise_sigma], all_Sigma[noise_sigma] = generate_data(
                n_max, d, snr=snr, noise_sigma=noise_sigma, seed=seed)

# Sample sizes at which the two minimizers are compared; also the x-values of
# the final plot.
sample_sizes = np.arange(10, n_max, 10)
# Grid of candidate 1D coefficients; it does not depend on n or the noise
# level, so it is built once.
threshold_functions = np.arange(-10, 10, 0.05)

# diffs[noise_sigma][i] is |empirical minimizer - population minimizer| over
# the grid, using the first sample_sizes[i] samples.
diffs = {}
for noise_sigma in all_noise_sigmas:
    diffs[noise_sigma] = []

    for n in sample_sizes:
        if not sample_new_data:
            X, y = all_X[noise_sigma], all_y[noise_sigma]
            beta_star, Sigma = all_beta_star[noise_sigma], all_Sigma[
                noise_sigma]
        else:
            X, y, beta_star, Sigma = generate_data(n,
                                                   d,
                                                   snr=snr,
                                                   noise_sigma=noise_sigma,
                                                   seed=seed)
        # Keep only the first n samples.
        X, y = X[:n], y[:n]

        empirical_risks = np.array([
            compute_empirical_risk(beta_hat, X, y)
            for beta_hat in threshold_functions
        ])
        population_risks = np.array([
            compute_population_risk(beta_star, beta_hat, noise_sigma, Sigma)
            for beta_hat in threshold_functions
        ])

        empirical_minimizer = threshold_functions[np.argmin(empirical_risks)]
        population_minimizer = threshold_functions[np.argmin(population_risks)]
        diffs[noise_sigma].append(
            np.fabs(empirical_minimizer - population_minimizer))

fig = go.Figure()
for noise_sigma in all_noise_sigmas:
    fig.add_trace(
        # The x-values must be the sample sizes the differences were computed
        # for; np.arange(0, n_max) has a different length and would place the
        # points at 0, 1, 2, ... instead of 10, 20, 30, ...
        go.Scatter(x=sample_sizes,
                   y=diffs[noise_sigma],
                   name=f"Noise level {noise_sigma}"))

fig.update_layout(height=figure_height,
                  width=figure_width,
                  xaxis_title="Number of samples",
                  # Raw string: "\l" and "\h" are not valid Python escapes.
                  yaxis_title=r"$\large|\hat{f} - f^*|$",
                  hovermode='x')

fig.show()