In [164]:
# Source: Alexandru Tifrea and Fanny Yang, 2021.
# Based on an earlier version by Sebastian Curi and Andreas Krause.

# Python Notebook Commands
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.display import HTML
from IPython import display

display.display(HTML("<style>.container { width:100% !important; }</style>"))

# General math and plotting modules.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import DEFAULT_PLOTLY_COLORS

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# Widget and formatting modules
import ipywidgets
from ipywidgets import interact, interactive, interact_manual, fixed, widgets
from matplotlib import rcParams

# Machine Learning library.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import datasets
from sklearn.metrics import mean_squared_error

import warnings

# Matplotlib defaults for the whole notebook. They are set exactly once, after
# all imports; an earlier assignment of the same two keys was dead code because
# it was overwritten here before any figure was drawn.
rcParams['figure.figsize'] = (15, 6)
rcParams['font.size'] = 20

# Regularized Polynomial Regression

### Regression with polynomial features

In the following we show how the estimator depends on hyperparameters such as the regularization coefficient (for LASSO and ridge penalties) or the degree of the polynomial used for the features.


Let's consider 1-dimensional data $\{(x_i, y_i)\}_{i=0}^n \subset \mathbb{R} \times \mathbb{R}$. To obtain a better feature representation for the data, we map the samples to the space of monomials of degree at most $d$, i.e. $\varphi: \mathbb{R} \rightarrow span(\{1, X, X^2, ..., X^d\})$. The maximum degree controls the complexity of the regression model: the higher the degree, the more complex the features we obtain. As you will see later in the course, regression with polynomial features is equivalent to using a polynomial kernel function.

We perform regularized regression and consider two different regularization penalties:

- LASSO penalty, i.e. minimizing $L_{\text{lasso}}(w; \lambda) := \sum_{i=0}^n (y_i - w^T\varphi(x_i))^2 + \lambda ||w||_1 $.

- ridge penalty, i.e. minimizing $L_{\text{ridge}}(w; \lambda) := \sum_{i=0}^n (y_i - w^T\varphi(x_i))^2 + \lambda ||w||_2^2 $.

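Below we show the mean squared error (MSE) computed on the training points, as well as the $L_2$ error of the estimator with respect to the ground truth function $f^*$, i.e. $||\hat{f}-f^*||_{L_2}^2$, which we approximate by the mean squared difference between $\hat{f}$ and $f^*$ on a set of test points covering $[0, 1]$.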

In [165]:
def true_regression_fun(ground_truth):
    """Return the noiseless target function selected by ``ground_truth``.

    Parameters
    ----------
    ground_truth : str
        Either ``"sine"`` or a polynomial specification: a string containing
        ``"poly"``, then ``":"``, then comma-separated coefficients listed in
        order of increasing degree (e.g. ``"poly:0,-1,0,0,1"`` is ``-x + x**4``).

    Returns
    -------
    callable
        A function mapping a scalar or array-like ``X`` to the target values.

    Raises
    ------
    RuntimeError
        If ``ground_truth`` is not a known option, or if a polynomial
        specification has no ``":"`` or contains a non-numeric coefficient.
    """
    if ground_truth == "sine":
        # NOTE: despite the "sine" label, the function evaluated is cos(3*pi*X).
        return lambda X: np.cos(3 * np.pi * X)

    if "poly" in ground_truth:
        parts = ground_truth.split(":")
        if len(parts) < 2:
            raise RuntimeError(
                f"Malformed polynomial ground truth {ground_truth!r}: "
                "expected 'poly:c0,c1,...'"
            )
        try:
            coefficients = [float(coef) for coef in parts[1].split(",")]
        except ValueError as err:
            raise RuntimeError(
                f"Malformed polynomial ground truth {ground_truth!r}: "
                "coefficients must be numbers"
            ) from err
        # np.poly1d expects the highest-degree coefficient first, hence the
        # reversal. The polynomial is built once instead of on every call.
        return np.poly1d(coefficients[::-1])

    raise RuntimeError(f"Unknown ground truth function {ground_truth}")

@ignore_warnings(category=ConvergenceWarning)
def poly_kernel_regression(ground_truth, n_samples, degree, reg_type, reg_coef, noise):
    """Fit a (regularized) polynomial regression and plot the result.

    Draws ``n_samples`` points uniformly from [0, 1], evaluates the selected
    ground truth function on them, adds Gaussian noise and fits a linear model
    on polynomial features. The left panel shows the fitted model, the true
    function and the samples; the right panel shows the absolute value of each
    fitted coefficient.

    Parameters
    ----------
    ground_truth : str
        Specification understood by ``true_regression_fun``.
    n_samples : int
        Number of training samples.
    degree : int
        Maximum degree of the polynomial features.
    reg_type : str
        ``"ridge"`` or ``"lasso"``; any other value gives unregularized
        least squares.
    reg_coef : float
        Regularization strength (``alpha``). Values ``<= 0`` disable
        regularization and fall back to ``LinearRegression``.
    noise : float
        Scale applied to the standard normal noise added to the targets.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    # Fixed seed so that every widget setting is evaluated on the same draw.
    np.random.seed(101)

    # Build the ground truth once and reuse it for training and evaluation.
    target_fun = true_regression_fun(ground_truth)

    X = np.sort(np.random.rand(n_samples))
    y = target_fun(X) + np.random.randn(n_samples) * noise

    # No intercept is fitted by the model itself (fit_intercept=False).
    if reg_type == "ridge" and reg_coef > 0:
        model = Ridge(alpha=reg_coef, fit_intercept=False, solver="svd")
    elif reg_type == "lasso" and reg_coef > 0:
        model = Lasso(alpha=reg_coef, fit_intercept=False, tol=1e-2, max_iter=10000)
    else:
        model = LinearRegression(fit_intercept=False)

    clf = make_pipeline(PolynomialFeatures(degree), model)
    clf.fit(X[:, np.newaxis], y)

    # The last pipeline step is the fitted linear model; indexing by position
    # avoids tracking the auto-generated step name of each estimator class.
    coefficients = clf[-1].coef_
    l1_norm = np.linalg.norm(coefficients, ord=1)
    l2_norm = np.linalg.norm(coefficients)

    # Evaluation grid: 100 evenly spaced points slightly beyond [0, 1], merged
    # with the training inputs so the plotted curve passes through them.
    X_test = np.sort(np.concatenate((np.linspace(0 - 1e-4, 1 + 1e-4, 100), X)))
    y_test_true = target_fun(X_test)
    y_test_pred = clf.predict(X_test[:, np.newaxis])

    train_mse = mean_squared_error(
        y_true=y,
        y_pred=clf.predict(X[:, np.newaxis])
    )
    # NOTE(review): the title labels this value "L2 error", but it is a mean
    # squared difference to the true function on X_test (no square root).
    test_mse = mean_squared_error(
        y_true=y_test_true,
        y_pred=y_test_pred
    )

    fig = make_subplots(rows=1, cols=2)
    fig.add_trace(go.Scatter(x=X_test,
                             y=y_test_pred,
                             line_width=3,
                             name="Model"),
                  row=1,
                  col=1)
    fig.add_trace(go.Scatter(x=X_test,
                             y=y_test_true,
                             line_dash="dash",
                             line_width=3,
                             name="True function"),
                  row=1,
                  col=1)
    fig.add_trace(go.Scatter(x=X,
                             y=y,
                             mode="markers",
                             marker_size=7,
                             marker_symbol="x",
                             marker_color="black",
                             name="Samples"),
                  row=1,
                  col=1)
    # One point per fitted coefficient, indexed by its position in coef_.
    fig.add_trace(go.Scatter(x=np.arange(coefficients.shape[0]),
                             y=np.fabs(coefficients),
                             line_width=3,
                             showlegend=False),
                  row=1,
                  col=2)

    fig.update_layout(
        title=(
            f"Training MSE = {train_mse:.6}"
            f"<br>L2 error = {test_mse:.6}"
            f"<br>l2 norm = {l2_norm:.2}; l1 norm = {l1_norm:.2}"
        ),
        margin=go.layout.Margin(
            l=0,  #left margin
            r=0,  #right margin
            b=0,  #bottom margin
            t=100,  #top margin
        ),
        xaxis1_range=[0, 1],
        xaxis1_title="x",
        yaxis1_range=[-2, 2],
        yaxis1_title="y",
        xaxis2_title="Degree",
        yaxis2_title="Abs. value of coefficient",
    )
    # Place the legend inside the left panel, near its right edge.
    fig.update_layout(legend=dict(
        yanchor="top",
        y=0.97,
        xanchor="left",
        x=0.37
    ))
    fig.show()


# Interactive controls for `poly_kernel_regression`. Sliders use
# continuous_update=False so the model is refit only when the handle is
# released. Dropdowns have no `continuous_update` trait, so it is not passed
# to them (doing so triggers an unrecognized-argument deprecation warning).
_ = interact(
    poly_kernel_regression,
    ground_truth=ipywidgets.Dropdown(options=["sine", "poly:0,-1,0,0,1"],
                                     value="sine",
                                     description='Ground truth function:',
                                     disabled=False,
                                     style={'description_width': 'initial'}),
    n_samples=ipywidgets.IntSlider(value=20,
                                   min=5,
                                   max=100,
                                   step=5,
                                   description='Number of samples:',
                                   style={'description_width': 'initial'},
                                   continuous_update=False),
    degree=ipywidgets.IntSlider(value=5,
                                min=1,
                                max=15,
                                step=1,
                                description='Polynomial Degree:',
                                style={'description_width': 'initial'},
                                continuous_update=False),
    reg_type=ipywidgets.Dropdown(options=["lasso", "ridge"],
                                 value="ridge",
                                 description='Regularization type:',
                                 disabled=False,
                                 style={'description_width': 'initial'}),
    # A coefficient of 0 selects plain least squares in poly_kernel_regression.
    reg_coef=ipywidgets.FloatSlider(value=0.,
                                    min=0,
                                    max=0.001,
                                    step=0.0001,
                                    readout_format='.4f',
                                    description='Regularization coefficient:',
                                    style={'description_width': 'initial'},
                                    continuous_update=False),
    noise=ipywidgets.FloatSlider(value=0.5,
                                 min=0,
                                 max=1,
                                 step=0.1,
                                 readout_format='.2f',
                                 description='Noise level:',
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
)

interactive(children=(Dropdown(description='Ground truth function:', options=('sine', 'poly:0,-1,0,0,1'), styl…

### Impact of noise on the estimator norm

In these figures we see how increasing the noise level leads to linear estimators whose weights have higher norm. This phenomenon holds true for both the $\ell_1$ and the $\ell_2$ norm. This observation motivates regularization with a norm-based penalty (like LASSO or ridge).

In [166]:
@ignore_warnings(category=ConvergenceWarning)
def norm_increase_with_noise(n_samples, degree, reg_type):
    """Plot the l1 and l2 norm of the fitted weights against the noise level.

    A fixed polynomial ground truth (``"poly:0,-1,0,0,1"``) is sampled at
    ``n_samples`` uniform points in [0, 1]. For each regularization coefficient
    in ``[0, 1e-2, 1e-1, 1]`` and each noise level in ``np.arange(0, 0.1, 0.01)``
    a polynomial regression is fitted, and the norms of its coefficient vector
    are recorded. The same noise draw is rescaled for every noise level, so the
    curves differ only through the noise magnitude.

    Parameters
    ----------
    n_samples : int
        Number of training samples.
    degree : int
        Maximum degree of the polynomial features.
    reg_type : str
        ``"ridge"`` or ``"lasso"``. A coefficient of 0 always uses
        unregularized ``LinearRegression``.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    np.random.seed(101)

    ground_truth = "poly:0,-1,0,0,1"
    noise_values = np.arange(0, 0.1, 0.01)
    reg_coef_values = [0., 1e-2, 1e-1, 1]

    X = np.sort(np.random.rand(n_samples))
    gauss_noise = np.random.randn(n_samples)

    # Loop-invariant quantities: the design column and the noiseless targets.
    features = X[:, np.newaxis]
    clean_targets = true_regression_fun(ground_truth)(X)

    fig = make_subplots(rows=1, cols=2)
    for i, reg_coef in enumerate(reg_coef_values):
        if reg_type == "ridge" and reg_coef > 0:
            model = Ridge(alpha=reg_coef, fit_intercept=False, solver="svd")
        elif reg_type == "lasso" and reg_coef > 0:
            model = Lasso(alpha=reg_coef, fit_intercept=False, tol=1e-2, max_iter=10000)
        else:
            model = LinearRegression(fit_intercept=False)

        l1_norms, l2_norms = [], []
        for noise in noise_values:
            clf = make_pipeline(PolynomialFeatures(degree), model)
            clf.fit(features, clean_targets + gauss_noise * noise)
            # The last pipeline step is the fitted linear model.
            coefficients = clf[-1].coef_
            l1_norms.append(np.linalg.norm(coefficients, ord=1))
            l2_norms.append(np.linalg.norm(coefficients, ord=2))

        # Raw f-string: the backslashes are LaTeX, not Python escape sequences.
        label = rf"${reg_type.title()}\ \lambda={reg_coef}$"

        # Both panels share colour and legend group; only the left one is
        # listed in the legend.
        fig.add_trace(go.Scatter(x=noise_values,
                                 y=l1_norms,
                                 line_width=3,
                                 marker_color=DEFAULT_PLOTLY_COLORS[i],
                                 name=label,
                                 legendgroup=label),
                      row=1,
                      col=1)
        fig.add_trace(go.Scatter(x=noise_values,
                                 y=l2_norms,
                                 line_width=3,
                                 marker_color=DEFAULT_PLOTLY_COLORS[i],
                                 name=label,
                                 legendgroup=label,
                                 showlegend=False),
                      row=1,
                      col=2)

    fig.update_layout(
        margin=go.layout.Margin(
            l=0,  #left margin
            r=0,  #right margin
            b=0,  #bottom margin
            t=10,  #top margin
        ),
        xaxis1_title="Noise level",
        yaxis1_title=r"$\ell_1\ norm$",
        xaxis2_title="Noise level",
        yaxis2_title=r"$\ell_2\ norm$",
    )
    fig.show()
    
# Interactive controls for `norm_increase_with_noise`. Sliders use
# continuous_update=False so the models are refit only when the handle is
# released. The Dropdown has no `continuous_update` trait, so it is not passed
# there (doing so triggers an unrecognized-argument deprecation warning).
_ = interact(
    norm_increase_with_noise,
    n_samples=ipywidgets.IntSlider(value=20,
                                   min=10,
                                   max=100,
                                   step=10,
                                   description='Number of samples:',
                                   style={'description_width': 'initial'},
                                   continuous_update=False),
    degree=ipywidgets.IntSlider(value=5,
                                min=1,
                                max=10,
                                step=1,
                                description='Polynomial Degree:',
                                style={'description_width': 'initial'},
                                continuous_update=False),
    reg_type=ipywidgets.Dropdown(options=["lasso", "ridge"],
                                 value="ridge",
                                 description='Regularization type:',
                                 disabled=False,
                                 style={'description_width': 'initial'}),
)

interactive(children=(IntSlider(value=20, continuous_update=False, description='Number of samples:', min=10, s…