In [14]:
# Source: Alexandru Tifrea and Fanny Yang, 2022.
# Based on an earlier version by Sebastian Curi and Andreas Krause.

# Python Notebook Commands
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.display import HTML
from IPython.display import display

display(HTML("<style>.container { width:100% !important; }</style>"))

# General math and plotting modules.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from copy import deepcopy
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Widget and formatting modules
import ipywidgets
from ipywidgets import interact, interactive, interact_manual, fixed, widgets
from matplotlib import rcParams

rcParams['figure.figsize'] = (15, 6)
rcParams['font.size'] = 20

# Machine Learning library.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import datasets
from sklearn.metrics import mean_squared_error

import warnings

# Size of the fixed pool of data points generated by each demo; the
# "Number of samples" sliders below are capped at this value.
MAX_NUM_SAMPLES = 100
# Values passed as `width` / `height` to the plotly figure layouts below.
figure_width = 1500
figure_height = 600

# Kernel Ridge Regression

### Regression with polynomial kernels

In the following we show how the estimator depends on hyperparameters like the ridge coefficient or the degree of the polynomial used to define the kernel.


Let's consider 1-dimensional data $\{(x_i, y_i)\}_{i=0}^n \subset \mathbb{R} \times \mathbb{R}$. We use a polynomial kernel of the form $k(x, z)=1 + xz + (xz)^2+...+(xz)^d$ for the regression task. This kernel induces a feature representation of the data in the space of monomials of degree at most $d$, i.e. $\varphi: \mathbb{R} \rightarrow span(\{1, X, X^2, ..., X^d\})$. Minimizing the kernel regression objective is equivalent to performing linear regression in this feature space. The maximum degree controls the complexity of the kernel function.

The kernel ridge regression objective that is minimized below can be written as: $L(w; \lambda) := \sum_{i=0}^n (y_i - w^T\varphi(x_i))^2 + \lambda ||w||_2^2 $.

Below we show the mean squared error (MSE) computed on the training points, as well as the L2 error of the estimator compared to the ground truth function $f^*$, i.e. $||\hat{f}-f^*||_{L_2}$.

In [10]:
def true_regression_fun(X):
    """Ground-truth regression function f*(x) = cos(3 * pi * x), applied elementwise."""
    angular_frequency = 3 * np.pi
    return np.cos(angular_frequency * X)


def poly_kernel_regression(n_samples, degree, l2_coef, noise):
    """Fit ridge regression on polynomial features and plot the fit.

    A pool of ``MAX_NUM_SAMPLES`` inputs in [0, 1) and matching standard-normal
    noise values is generated from a fixed seed, and ``n_samples`` distinct
    points are drawn from that pool. The targets are
    ``true_regression_fun(x) + noise * eps``.

    The figure has two rows: the fitted model, the true function and the
    training samples on top, and the absolute value of each fitted
    coefficient below. The title reports the training MSE and the MSE between
    the model and the true function on a test grid (labelled "L2 error").

    Args:
        n_samples: Number of distinct training points to draw from the pool;
            must not exceed ``MAX_NUM_SAMPLES``.
        degree: Degree passed to ``PolynomialFeatures``.
        l2_coef: Ridge penalty, passed to ``Ridge`` as ``alpha``.
        noise: Factor multiplying the standard-normal noise added to the targets.

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``n_samples`` is larger
            than ``MAX_NUM_SAMPLES``.
    """
    np.random.seed(101)
    X = np.sort(np.random.rand(MAX_NUM_SAMPLES))
    gaussian_noise = np.random.randn(MAX_NUM_SAMPLES)
    # Draw WITHOUT replacement: sampling with replacement would duplicate
    # points, so the training set would hold fewer distinct samples than the
    # slider value suggests.
    idx = np.random.choice(np.arange(MAX_NUM_SAMPLES), n_samples, replace=False)
    X, gaussian_noise = X[idx], gaussian_noise[idx]
    y = true_regression_fun(X) + gaussian_noise * noise

    clf = make_pipeline(
        PolynomialFeatures(degree),
        Ridge(alpha=l2_coef, fit_intercept=False, solver="svd"))
    clf.fit(X[:, np.newaxis], y)

    # Evaluation grid: 100 evenly spaced points slightly beyond [0, 1], merged
    # with the training inputs so the plotted curve passes through them.
    X_test = np.sort(np.concatenate((np.linspace(0 - 1e-4, 1 + 1e-4, 100), X)))
    # Predict once on the grid and reuse the values for the error and the plot.
    y_test_pred = clf.predict(X_test[:, np.newaxis])
    y_test_true = true_regression_fun(X_test)

    train_mse = mean_squared_error(
      y_true=y,
      y_pred=clf.predict(X[:, np.newaxis])
    )
    test_mse = mean_squared_error(
      y_true=y_test_true,
      y_pred=y_test_pred
    )

    coefficients = clf["ridge"].coef_

    fig = make_subplots(rows=2, cols=1, row_width=[0.15, 0.35])
    fig.add_trace(go.Scatter(x=X_test,
                             y=y_test_pred,
                             line_width=3,
                             name="Model"),
                  row=1,
                  col=1)
    fig.add_trace(go.Scatter(x=X_test,
                             y=y_test_true,
                             line_dash="dash",
                             line_width=3,
                             name="True function"),
                  row=1,
                  col=1)
    fig.add_trace(go.Scatter(x=X,
                             y=y,
                             mode="markers",
                             marker_size=7,
                             marker_symbol="x",
                             marker_color="black",
                             name="Samples"),
                  row=1,
                  col=1)
    # Second row: magnitude of each fitted coefficient against its index.
    fig.add_trace(go.Scatter(x=np.arange(coefficients.shape[0]),
                             y=np.fabs(coefficients),
                             line_width=3,
                             showlegend=False),
                  row=2,
                  col=1)

    fig.update_layout(
        width=figure_width, 
        height=figure_height,
        title=f"Training MSE = {train_mse:.6}" + "<br>L2 error" + f" = {test_mse:.6}",
        margin=go.layout.Margin(
            l=0,  #left margin
            r=0,  #right margin
            b=0,  #bottom margin
            t=60,  #top margin
        ),
        xaxis1_range=[0, 1],
        xaxis1_title="x",
        yaxis1_range=[-2, 2],
        yaxis1_title="y",
        xaxis2_title="Degree",
        yaxis2_title="Abs. value of coefficient",
    )
    fig.show()


# Interactive demo: every slider value is forwarded as the keyword argument of
# the same name to poly_kernel_regression, which redraws the figure.
_ = interact(
    poly_kernel_regression,
    # Number of training points; capped at the size of the sample pool.
    n_samples=ipywidgets.IntSlider(value=20,
                                   min=5,
                                   max=MAX_NUM_SAMPLES,
                                   step=5,
                                   description='Number of samples:',
                                   style={'description_width': 'initial'},
                                   continuous_update=False),
    # Degree handed to PolynomialFeatures inside poly_kernel_regression.
    degree=ipywidgets.IntSlider(value=10,
                                min=1,
                                max=30,
                                step=1,
                                description='Polynomial Degree:',
                                style={'description_width': 'initial'},
                                continuous_update=False),
    # Ridge penalty (used as `alpha` of Ridge); 0 means no regularization.
    l2_coef=ipywidgets.FloatSlider(value=0.,
                                   min=0,
                                   max=0.001,
                                   step=0.0001,
                                   readout_format='.4f',
                                   description='Ridge coefficient:',
                                   style={'description_width': 'initial'},
                                   continuous_update=False),
    # Multiplier applied to the standard-normal noise added to the targets.
    noise=ipywidgets.FloatSlider(value=0.5,
                                 min=0,
                                 max=1,
                                 step=0.1,
                                 readout_format='.2f',
                                 description='Noise level:',
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
)

interactive(children=(IntSlider(value=20, continuous_update=False, description='Number of samples:', min=5, st…

### Regression with RBF kernels

In the following we show how the estimator depends on hyperparameters like the ridge coefficient or the bandwidth for two commonly used radial basis function (RBF) kernels: the Gaussian and the Laplacian kernels. RBF kernels differ from polynomial kernels in that they induce a feature map from the inputs to an infinite-dimensional space.

The general form of RBF kernels is $k(x, z) = \exp\left( -\frac{|x-z|^p}{\sigma}\right)$; for $p=1$ and $p=2$ we recover the Laplacian and the Gaussian kernel, respectively. The bandwidth $\sigma$ controls the smoothness of the prediction function.

Below we show the mean squared error (MSE) computed on the training points, as well as the L2 error of the estimator compared to the ground truth function $f^*$, i.e. $||\hat{f}-f^*||_{L_2}$.

In [11]:
def rbf_kernel_regression(kernel_str, n_samples, bandwidth, l2_coef, noise):
    """Fit kernel ridge regression with an RBF-type kernel and plot the fit.

    A pool of ``MAX_NUM_SAMPLES`` inputs in [0, 1) and matching standard-normal
    noise values is generated from a fixed seed, and ``n_samples`` distinct
    points are drawn from that pool. The targets are
    ``true_regression_fun(x) + noise * eps``.

    The figure shows the fitted model, the true function and the training
    samples. The title reports the training MSE and the MSE between the model
    and the true function on a test grid (labelled "L2 error").

    Args:
        kernel_str: ``'Gaussian'`` or ``'Laplacian'``.
        n_samples: Number of distinct training points to draw from the pool;
            must not exceed ``MAX_NUM_SAMPLES``.
        bandwidth: Base-10 log of the bandwidth; the kernel coefficient is
            ``gamma = 10 ** (-bandwidth)``.
        l2_coef: Ridge penalty, passed to ``KernelRidge`` as ``alpha``.
        noise: Factor multiplying the standard-normal noise added to the targets.

    Raises:
        ValueError: If ``kernel_str`` is not one of the supported names, or
            (from ``np.random.choice``) if ``n_samples`` exceeds
            ``MAX_NUM_SAMPLES``.
    """
    # Map the display name to the KernelRidge kernel identifier, failing early
    # with a clear message instead of hitting an unbound `kernel` later on.
    kernel_by_name = {'Gaussian': "rbf", 'Laplacian': "laplacian"}
    if kernel_str not in kernel_by_name:
        raise ValueError(
            f"Unknown kernel {kernel_str!r}; expected one of {sorted(kernel_by_name)}")
    kernel = kernel_by_name[kernel_str]

    np.random.seed(101) 
    X = np.sort(np.random.rand(MAX_NUM_SAMPLES))
    gaussian_noise = np.random.randn(MAX_NUM_SAMPLES)
    # Draw WITHOUT replacement: duplicated inputs would make the kernel matrix
    # singular when l2_coef is 0 and would shrink the number of distinct samples.
    idx = np.random.choice(np.arange(MAX_NUM_SAMPLES), n_samples, replace=False)
    X, gaussian_noise = X[idx], gaussian_noise[idx]
    y = true_regression_fun(X) + gaussian_noise * noise

    gamma = np.power(10., -bandwidth)

    # KernelRidge expects a 2-D design matrix; keep the 1-D array for plotting.
    X_col = X[:, np.newaxis]
    with warnings.catch_warnings():
        # Silence warnings raised while fitting (e.g. for an ill-conditioned
        # kernel matrix when the ridge coefficient is 0).
        warnings.simplefilter('ignore')
        clf = KernelRidge(alpha=l2_coef, kernel=kernel, gamma=gamma)   
        clf.fit(X_col, y)
    
    # Evaluation grid: 100 evenly spaced points slightly beyond [0, 1], merged
    # with the training inputs so the plotted curve passes through them.
    X_test = np.sort(np.concatenate((np.linspace(0 - 1e-4, 1 + 1e-4, 100), X)))
    # Predict once on the grid and reuse the values for the error and the plot.
    y_test_pred = clf.predict(X_test[:, np.newaxis])
    y_test_true = true_regression_fun(X_test)

    train_mse = mean_squared_error(
      y_true=y,
      y_pred=clf.predict(X_col)
    )
    test_mse = mean_squared_error(
      y_true=y_test_true,
      y_pred=y_test_pred
    )

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=X_test,
                   y=y_test_pred,
                   line_width=3,
                   name="Model"))
    fig.add_trace(
        go.Scatter(x=X_test,
                   y=y_test_true,
                   line_dash="dash",
                   line_width=3,
                   name="True function"))
    fig.add_trace(
        go.Scatter(x=X,
                   y=y,
                   mode="markers",
                   marker_size=7,
                   marker_symbol="x",
                   marker_color="black",
                   name="Samples"))

    fig.update_layout(
        width=figure_width, 
        height=figure_height,
        title=f"{kernel_str} kernel" + "<br>" + f"Training MSE = {train_mse:.6}" + "<br>L2 error" + f" = {test_mse:.6}",
        margin=go.layout.Margin(
            l=0,  #left margin
            r=0,  #right margin
            b=0,  #bottom margin
            t=95,  #top margin
        ),
        xaxis_range=[0, 1],
        xaxis_title="x",
        yaxis_range=[-2, 2],
        yaxis_title="y",
    )
    fig.show()

### Gaussian kernel

In [15]:
# Interactive demo for the Gaussian kernel: the kernel name is held fixed and
# every slider value is forwarded as the keyword argument of the same name to
# rbf_kernel_regression.
_ = interact(
    rbf_kernel_regression,
    kernel_str=ipywidgets.fixed("Gaussian"),
    # Slider value b is an exponent: rbf_kernel_regression uses gamma = 10 ** (-b).
    bandwidth=ipywidgets.FloatSlider(value=-3,
                                     min=-4,
                                     max=-2,
                                     step=0.1,
                                     readout_format='.1f',
                                     description='Bandwidth 10^:',
                                     style={'description_width': 'initial'},
                                     continuous_update=False),
    # Number of training points; capped at the size of the sample pool.
    n_samples=ipywidgets.IntSlider(value=30,
                                   min=10,
                                   max=MAX_NUM_SAMPLES,
                                   step=10,
                                   description='Number of samples:',
                                   style={'description_width': 'initial'},
                                   continuous_update=False),
    # Ridge penalty (used as `alpha` of KernelRidge); 0 means no regularization.
    l2_coef=ipywidgets.FloatSlider(value=0.,
                                 min=0,
                                 max=1.,
                                 step=0.0001,
                                 readout_format='.5f',
                                 description='Ridge coefficient:',
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
    # Multiplier applied to the standard-normal noise added to the targets.
    noise=ipywidgets.FloatSlider(value=0.1,
                                 min=0,
                                 max=0.5,
                                 step=0.01,
                                 readout_format='.2f',
                                 description='Noise level:',
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
)

interactive(children=(IntSlider(value=30, continuous_update=False, description='Number of samples:', min=10, s…

### Laplacian kernel

In [16]:
# Interactive demo for the Laplacian kernel: the kernel name is held fixed and
# every slider value is forwarded as the keyword argument of the same name to
# rbf_kernel_regression.
_ = interact(
    rbf_kernel_regression,
    kernel_str=ipywidgets.fixed("Laplacian"),
    # Slider value b is an exponent: rbf_kernel_regression uses gamma = 10 ** (-b).
    bandwidth=ipywidgets.FloatSlider(value=-1,
                                     min=-2,
                                     max=0,
                                     step=0.1,
                                     readout_format='.1f',
                                     description='Bandwidth 10^:',
                                     style={'description_width': 'initial'},
                                     continuous_update=False),
    # Number of training points; capped at the size of the sample pool.
    n_samples=ipywidgets.IntSlider(value=30,
                                   min=10,
                                   max=MAX_NUM_SAMPLES,
                                   step=10,
                                   description='Number of samples:',
                                   style={'description_width': 'initial'},
                                   continuous_update=False),
    # Ridge penalty (used as `alpha` of KernelRidge); 0 means no regularization.
    l2_coef=ipywidgets.FloatSlider(value=0.,
                                 min=0,
                                 max=1.,
                                 step=0.0001,
                                 readout_format='.5f',
                                 description='Ridge coefficient:',
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
    # Multiplier applied to the standard-normal noise added to the targets.
    noise=ipywidgets.FloatSlider(value=0.1,
                                 min=0,
                                 max=0.5,
                                 step=0.01,
                                 readout_format='.2f',
                                 description='Noise level:',
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
)

interactive(children=(IntSlider(value=30, continuous_update=False, description='Number of samples:', min=10, s…

# RBF kernel classification with SVMs

Illustration of binary classification with SVM using RBF kernels (i.e. Gaussian and Laplacian). You can observe the decision boundary for a few different data distributions and you can control the spread of the samples by moving the "Variance" slider.

In [20]:
# Value passed as the `tol` argument of svm.SVC in kernelized_svm below.
tol = 1e-1


def laplacian_kernel(X, Y, gamma):
    """Return the Laplacian kernel matrix between the rows of X and the rows of Y.

    Entry (i, j) equals exp(-gamma * ||X[i] - Y[j]||_1). The result has shape
    (X.shape[0], Y.shape[0]).
    """
    n_x, n_y = X.shape[0], Y.shape[0]
    gram = np.empty((n_x, n_y))
    # Fill the matrix one column at a time: column j compares every row of X
    # with the j-th row of Y.
    for j in range(n_y):
        l1_distance = np.linalg.norm(X - Y[j, :], ord=1, axis=1)
        scaled = gamma * l1_distance
        gram[:, j] = np.exp(-scaled)
    return gram


def _four_cluster_dataset(centers, labels, noise, n_total):
    """Draw four seeded Gaussian clusters in 2-D around `centers`, labelled by `labels`."""
    np.random.seed(42)
    step = int(n_total / 4)

    X = np.zeros((n_total, 2))
    Y = np.zeros(n_total)
    last_group = len(centers) - 1
    for group, (center, label) in enumerate(zip(centers, labels)):
        start = group * step
        # The last group absorbs the remainder when n_total is not a multiple of 4.
        stop = n_total if group == last_group else start + step
        X[start:stop, :] = np.array(center) + noise * np.random.randn(stop - start, 2)
        Y[start:stop] = label
    return X, Y


def kernelized_svm(dataset, kernel, n_samples, reg, bw, noise):
    """Train an SVM with an RBF-type kernel on a toy 2-D dataset and plot it.

    A pool of ``MAX_NUM_SAMPLES`` labelled points is generated for the chosen
    dataset, ``n_samples`` of them are selected through a seeded permutation,
    and an ``svm.SVC`` is fitted. The training points, the predicted regions
    and the decision boundary are drawn on a new matplotlib figure.

    Args:
        dataset: One of ``'blobs'``, ``'circles'``, ``'moons'``, ``'xor'``,
            ``'periodic'``.
        kernel: ``'Gaussian'`` or ``'Laplacian'``; any other value is handed
            to ``svm.SVC`` unchanged.
        n_samples: Number of points kept for training.
        reg: Exponent of the regularization; the SVC uses ``C = 10 ** (-reg)``.
        bw: Exponent of the bandwidth; the kernel uses ``gamma = 10 ** (-bw)``.
        noise: Spread of the samples (``cluster_std = 10 * noise`` for blobs,
            the ``noise`` argument / Gaussian scale for the other datasets).

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    if dataset == 'blobs':
        X, Y = datasets.make_blobs(n_samples=MAX_NUM_SAMPLES,
                                   centers=2,
                                   random_state=3,
                                   cluster_std=10 * noise)
    elif dataset == 'circles':
        X, Y = datasets.make_circles(n_samples=MAX_NUM_SAMPLES,
                                     factor=.5,
                                     noise=noise,
                                     random_state=42)
    elif dataset == 'moons':
        X, Y = datasets.make_moons(n_samples=MAX_NUM_SAMPLES,
                                   noise=noise,
                                   random_state=42)
    elif dataset == 'xor':
        # Positive clusters on one diagonal of the unit square, negative on the other.
        X, Y = _four_cluster_dataset(centers=[[0, 0], [1, 1], [0, 1], [1, 0]],
                                     labels=[1, 1, -1, -1],
                                     noise=noise,
                                     n_total=MAX_NUM_SAMPLES)
    elif dataset == 'periodic':
        # Clusters stacked along the second axis with alternating labels.
        X, Y = _four_cluster_dataset(centers=[[0, 0], [0, 2], [0, 1], [0, 3]],
                                     labels=[1, 1, -1, -1],
                                     noise=noise,
                                     n_total=MAX_NUM_SAMPLES)
    else:
        # Without this guard an unknown name would only fail further down
        # with an unbound-variable error on X.
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected 'blobs', 'circles', "
            "'moons', 'xor' or 'periodic'")

    # Keep labels <= 1 and map the 0 label used by the sklearn generators to -1.
    X = X[Y <= 1, :]
    Y = Y[Y <= 1]
    Y[Y == 0] = -1

    # Remember the full pool so the plot limits do not depend on n_samples.
    # The subsampling below rebinds X to a new array, so no copy is needed.
    orig_X = X
    np.random.seed(42)
    idx = np.random.permutation(MAX_NUM_SAMPLES)
    X, Y = X[idx][:n_samples], Y[idx][:n_samples]

    # Add the 1 feature.
    X = np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)
    # Support-vector highlighting is switched off for both kernels.
    plot_support = False
    gamma = np.power(10., -bw)
    coef0 = 0
    if kernel == 'Gaussian':
        kernel = "rbf"
    elif kernel == 'Laplacian':
        # The Laplacian kernel is supplied as a callable closing over gamma.
        kernel = lambda X, Y: laplacian_kernel(X, Y, gamma)
        plot_support = False

    classifier = svm.SVC(kernel=kernel,
                         C=np.power(10., -reg),
                         gamma=gamma,
                         coef0=coef0,
                         tol=tol,
                         random_state=10)
    classifier.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    plt.figure()
    plt.clf()
    ax = plt.axes()
    positive = Y == 1
    negative = Y == -1
    plot_data(X[positive, 0],
              X[positive, 1],
              fig=ax,
              options={'marker': 'r*', 'label': '+'})
    plot_data(X[negative, 0],
              X[negative, 1],
              fig=ax,
              options={'marker': 'bo', 'label': '-'})

    if plot_support:
        plt.scatter(classifier.support_vectors_[:, 0],
                    classifier.support_vectors_[:, 1],
                    s=80,
                    facecolors='none',
                    edgecolors='k')

    # Plot window: bounding box of the full pool, padded by 1 on every side.
    mins = np.min(orig_X, 0)
    maxs = np.max(orig_X, 0)
    x_min = mins[0] - 1
    x_max = maxs[0] + 1
    y_min = mins[1] - 1
    y_max = maxs[1] + 1

    # Evaluate the decision function on a 200 x 200 grid; the grid points get
    # the same constant third feature as the training data.
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Xtest = np.c_[XX.ravel(), YY.ravel(), np.ones_like(XX.ravel())]
    Z = classifier.decision_function(Xtest)

    # Put the result into a color plot
    Z = Z.reshape(XX.shape)
    plt.contourf(XX, YY, Z > 0, cmap=plt.cm.jet, alpha=0.3)
    plt.contour(XX,
                YY,
                Z,
                colors=['k', 'k', 'k'],
                linestyles=['-'],
                levels=[0])

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)


def plot_data(X, Y, fig=None, options=None):
    """Plot the points (X, Y) on an axes-like object.

    Args:
        X: Horizontal coordinates of the points.
        Y: Vertical coordinates of the points.
        fig: Object exposing a matplotlib-style ``plot`` method (e.g. an
            ``Axes``). When omitted, the current matplotlib axes are used.
        options: Optional dict with any of the keys ``'marker'`` (format
            string, default ``'b*'``), ``'label'`` (default ``'Raw data'``),
            ``'fillstyle'`` (default ``'full'``) and ``'size'`` (marker size,
            default ``10``).
    """
    # Use None instead of a shared mutable dict as the default value.
    if options is None:
        options = {}
    # Fall back to the current axes rather than failing on `None.plot`.
    if fig is None:
        fig = plt.gca()
    fig.plot(X,
             Y,
             options.get('marker', 'b*'),
             label=options.get('label', 'Raw data'),
             fillstyle=options.get('fillstyle', 'full'),
             ms=options.get('size', 10))


# Interactive demo: the dropdown and slider values are forwarded as the keyword
# arguments of the same name to kernelized_svm, which redraws the figure.
_ = interact(
    kernelized_svm,
    dataset=['blobs', 'circles', 'moons', 'xor', 'periodic'],
    kernel=['Gaussian', 'Laplacian'],
    # Number of training points; capped at the size of the sample pool.
    n_samples=ipywidgets.IntSlider(value=100,
                                 min=20,
                                 max=MAX_NUM_SAMPLES,
                                 step=10,
                                 description='Number of samples:',
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
    # Slider value r is an exponent: kernelized_svm uses C = 10 ** (-r).
    reg=ipywidgets.FloatSlider(value=-3,
                               min=-3,
                               max=3,
                               step=0.5,
                               readout_format='.1f',
                               description='Regularization 10^:',
                               style={'description_width': 'initial'},
                               continuous_update=False),
    # Slider value b is an exponent: kernelized_svm uses gamma = 10 ** (-b).
    bw=ipywidgets.FloatSlider(value=-1,
                              min=-3,
                              max=3,
                              step=0.1,
                              readout_format='.1f',
                              description='Bandwidth 10^:',
                              style={'description_width': 'initial'},
                              continuous_update=False),
    # Shown as "Variance" but forwarded as the `noise` argument, which
    # kernelized_svm uses as the spread of the generated samples.
    noise=ipywidgets.FloatSlider(value=0.05,
                                 min=0.01,
                                 max=0.1,
                                 step=0.01,
                                 readout_format='.2f',
                                 description='Variance:',
                                 style={'description_width': 'initial'},
                                 continuous_update=False),
)

interactive(children=(Dropdown(description='dataset', options=('blobs', 'circles', 'moons', 'xor', 'periodic')…