In [1]:
import math

import matplotlib.pyplot as plt
import qpytorch
import torch
import tqdm
import tqdm.notebook  # `import tqdm` alone does not load the notebook submodule used for progress bars
%matplotlib inline


/bin/sh: brew: command not found



# Modifying the Variational Strategy/Variational Distribution

The predictive distribution for approximate QEPs is given by

$$
p( f(\mathbf x^*) ) = \int_{\mathbf u} p( f(\mathbf x^*) \mid \mathbf u) \: q(\mathbf u) \: d\mathbf u,
\quad
q(\mathbf u) = \mathcal Q( \mathbf m, \mathbf S).
$$

$\mathbf u$ represents the function values at the $m$ inducing points.
Here, $\mathbf m \in \mathbb R^m$ and $\mathbf S \in \mathbb R^{m \times m}$ are learnable parameters.

If $m$ (the number of inducing points) is quite large, the number of learnable parameters in $\mathbf S$ can be quite unwieldy.
Furthermore, a large $m$ might make some of the computations rather slow.
Here we show a few ways to use different [variational distributions](https://qepytorch.readthedocs.io/en/stable/variational.html#variational-distributions) and
[variational strategies](https://qepytorch.readthedocs.io/en/stable/variational.html#variational-strategies) to reduce the number of parameters and the computational cost.

### Experimental setup

We're going to train an approximate QEP on a medium-sized regression dataset, taken from the UCI repository.

In [2]:
import urllib.request
import os
from scipy.io import loadmat
from math import floor


# The testing framework sets the CI environment variable; in that case the
# notebook runs on small random data instead of the real dataset.
smoke_test = os.environ.get('CI') is not None

if smoke_test:
    # Random stand-in data: 1000 points with 3 features.
    X, y = torch.randn(1000, 3), torch.randn(1000)
else:
    # Fetch the dataset once; later runs reuse the file on disk.
    if not os.path.isfile('../elevators.mat'):
        print('Downloading \'elevators\' UCI dataset...')
        urllib.request.urlretrieve(
            'https://drive.google.com/uc?export=download&id=1jhWL3YUHvXIaftia4qeAyDwVxo6j1alk',
            '../elevators.mat',
        )

    data = torch.Tensor(loadmat('../elevators.mat')['data'])
    # The last column is the regression target, every other column is a feature.
    X = data[:, :-1]
    # Rescale each feature column to the range [-1, 1].
    X = X - X.min(dim=0).values
    X = 2 * (X / X.max(dim=0).values) - 1
    y = data[:, -1]

# First 80% of the rows are used for training, the remaining 20% for testing.
train_n = int(floor(0.8 * X.size(0)))
train_x, test_x = X[:train_n].contiguous(), X[train_n:].contiguous()
train_y, test_y = y[:train_n].contiguous(), y[train_n:].contiguous()

if torch.cuda.is_available():
    train_x, train_y, test_x, test_y = (
        tensor.cuda() for tensor in (train_x, train_y, test_x, test_y)
    )

In [3]:
from torch.utils.data import TensorDataset, DataLoader
# Both splits are served in mini-batches of 500; only the training loader shuffles.
test_dataset = TensorDataset(test_x, test_y)
train_dataset = TensorDataset(train_x, train_y)

test_loader = DataLoader(dataset=test_dataset, batch_size=500, shuffle=False)
train_loader = DataLoader(dataset=train_dataset, batch_size=500, shuffle=True)

### Some quick training/testing code

This will allow us to train/test different model classes.

In [4]:
# One epoch is enough when the testing framework is only checking that the notebook runs.
num_epochs = 10 if not smoke_test else 1


# Our testing script takes in a QPyTorch approximate QEP model class
# and then trains/tests a model of that class on the supplied dataset

def train_and_test_approximate_qep(model_cls):
    """Train an approximate QEP of the given class and print its test-set MAE.

    The model is built on 128 randomly initialised inducing points, trained for
    ``num_epochs`` passes over ``train_loader`` by maximising the variational
    ELBO with Adam (lr=0.1), and then evaluated on ``test_loader``.

    Args:
        model_cls: Model class whose constructor takes the inducing-point tensor
            as its only argument and whose instances expose a ``power``
            attribute (used to build the matching ``QExponentialLikelihood``).

    Returns:
        None. The mean absolute error is printed, not returned.
    """
    inducing_points = torch.randn(128, train_x.size(-1), dtype=train_x.dtype, device=train_x.device)
    model = model_cls(inducing_points)
    likelihood = qpytorch.likelihoods.QExponentialLikelihood(power=model.power)

    # Move the modules to the GPU *before* building the optimizer: the torch.optim
    # documentation asks for `.cuda()` to be called before optimizers are constructed.
    if torch.cuda.is_available():
        model = model.cuda()
        likelihood = likelihood.cuda()

    mll = qpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.numel())
    optimizer = torch.optim.Adam(list(model.parameters()) + list(likelihood.parameters()), lr=0.1)

    # Training
    model.train()
    likelihood.train()
    epochs_iter = tqdm.notebook.tqdm(range(num_epochs), desc=f"Training {model_cls.__name__}")
    for _ in epochs_iter:
        # Within each epoch, we go over every minibatch of the training data
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            epochs_iter.set_postfix(loss=loss.item())
            loss.backward()
            optimizer.step()

    # Testing
    model.eval()
    likelihood.eval()
    # Collect the per-batch predictive means and concatenate once at the end,
    # instead of growing a tensor seeded with a dummy element.
    batch_means = []
    with torch.no_grad():
        for x_batch, _ in test_loader:
            batch_means.append(model(x_batch).mean.cpu())
    # An empty test loader yields an empty tensor, as the original accumulation did.
    means = torch.cat(batch_means) if batch_means else torch.empty(0)
    error = torch.mean(torch.abs(means - test_y.cpu()))
    print(f"Test {model_cls.__name__} MAE: {error.item()}")

## The Standard Approach

As a baseline, we'll use the default [VariationalStrategy](https://qepytorch.readthedocs.io/en/stable/variational.html#id1) class with a [CholeskyVariationalDistribution](https://qepytorch.readthedocs.io/en/stable/variational.html#choleskyvariationaldistribution).
The `CholeskyVariationalDistribution` class allows $\mathbf S$ to be any positive semidefinite matrix. This is the most general/expressive option for approximate QEPs.

In [5]:
# Value passed as `power` to every distribution and model defined in this notebook.
POWER = 1.0


class StandardApproximateQEP(qpytorch.models.ApproximateQEP):
    """Approximate QEP using a ``CholeskyVariationalDistribution`` for q(u).

    Uses a constant mean, a scaled RBF kernel and learnable inducing locations.
    """

    def __init__(self, inducing_points):
        """Build the variational strategy on ``inducing_points``, then the mean and kernel modules."""
        # `power` is needed by the variational distribution, which is created
        # before the parent constructor runs.
        self.power = torch.tensor(POWER)
        num_inducing = inducing_points.size(-2)
        q_u = qpytorch.variational.CholeskyVariationalDistribution(num_inducing, power=self.power)
        strategy = qpytorch.variational.VariationalStrategy(
            self,
            inducing_points,
            q_u,
            learn_inducing_locations=True,
        )
        super().__init__(strategy)
        self.mean_module = qpytorch.means.ConstantMean()
        rbf_kernel = qpytorch.kernels.RBFKernel()
        self.covar_module = qpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the q-exponential prior at ``x`` from the mean and covariance modules."""
        return qpytorch.distributions.MultivariateQExponential(
            self.mean_module(x), self.covar_module(x), power=self.power
        )

In [6]:
# Baseline run: train the Cholesky-parameterised model and print its test MAE.
train_and_test_approximate_qep(StandardApproximateQEP)

Training StandardApproximateQEP:   0%|          | 0/10 [00:00<?, ?it/s]

Test StandardApproximateQEP MAE: 0.08965195715427399


## Reducing parameters

### MeanFieldVariationalDistribution: a diagonal $\mathbf S$ matrix 

One way to reduce the number of parameters is to restrict $\mathbf S$ to be diagonal. This is less expressive, but the number of parameters is now linear in $m$ instead of quadratic.

All we have to do is take the previous example, and change `CholeskyVariationalDistribution` (full $\mathbf S$ matrix) to [MeanFieldVariationalDistribution](https://qepytorch.readthedocs.io/en/stable/variational.html#meanfieldvariationaldistribution) (diagonal $\mathbf S$ matrix).

In [7]:
class MeanFieldApproximateQEP(qpytorch.models.ApproximateQEP):
    """Approximate QEP using a ``MeanFieldVariationalDistribution`` for q(u).

    Apart from the variational distribution, the model uses a constant mean,
    a scaled RBF kernel and learnable inducing locations.
    """

    def __init__(self, inducing_points):
        """Build the variational strategy on ``inducing_points``, then the mean and kernel modules."""
        # Set first: the variational distribution below reads it before the
        # parent constructor has been called.
        self.power = torch.tensor(POWER)
        m = inducing_points.size(-2)
        mean_field_dist = qpytorch.variational.MeanFieldVariationalDistribution(m, power=self.power)
        super().__init__(
            qpytorch.variational.VariationalStrategy(
                self, inducing_points, mean_field_dist, learn_inducing_locations=True
            )
        )
        self.mean_module = qpytorch.means.ConstantMean()
        self.covar_module = qpytorch.kernels.ScaleKernel(
            qpytorch.kernels.RBFKernel()
        )

    def forward(self, x):
        """Return the q-exponential prior at ``x`` from the mean and covariance modules."""
        prior_mean = self.mean_module(x)
        prior_covar = self.covar_module(x)
        return qpytorch.distributions.MultivariateQExponential(prior_mean, prior_covar, power=self.power)

In [8]:
# Train the mean-field (diagonal S) model and print its test MAE.
train_and_test_approximate_qep(MeanFieldApproximateQEP)

Training MeanFieldApproximateQEP:   0%|          | 0/10 [00:00<?, ?it/s]

Test MeanFieldApproximateQEP MAE: 0.08979436010122299


### DeltaVariationalDistribution: no $\mathbf S$ matrix 

A more extreme method of reducing parameters is to get rid of $\mathbf S$ entirely. This corresponds to learning a delta distribution ($\mathbf u = \mathbf m$) rather than a multivariate q-exponential distribution for $\mathbf u$. In other words, this corresponds to performing MAP estimation rather than variational inference.

In QPyTorch, getting rid of $\mathbf S$ can be accomplished by using a [DeltaVariationalDistribution](https://qepytorch.readthedocs.io/en/stable/variational.html#deltavariationaldistribution).

In [9]:
class MAPApproximateQEP(qpytorch.models.ApproximateQEP):
    """Approximate QEP using a ``DeltaVariationalDistribution`` for q(u).

    Apart from the variational distribution, the model uses a constant mean,
    a scaled RBF kernel and learnable inducing locations.
    """

    def __init__(self, inducing_points):
        """Build the variational strategy on ``inducing_points``, then the mean and kernel modules."""
        # Assigned up front because the variational distribution needs it
        # before the parent constructor is invoked.
        self.power = torch.tensor(POWER)
        delta_dist = qpytorch.variational.DeltaVariationalDistribution(
            inducing_points.size(-2),
            power=self.power,
        )
        vs = qpytorch.variational.VariationalStrategy(
            self, inducing_points, delta_dist, learn_inducing_locations=True
        )
        super().__init__(vs)
        self.mean_module = qpytorch.means.ConstantMean()
        kernel = qpytorch.kernels.RBFKernel()
        self.covar_module = qpytorch.kernels.ScaleKernel(kernel)

    def forward(self, x):
        """Return the q-exponential prior at ``x`` from the mean and covariance modules."""
        mu = self.mean_module(x)
        cov = self.covar_module(x)
        return qpytorch.distributions.MultivariateQExponential(mu, cov, power=self.power)

In [10]:
# Train the delta-distribution (no S matrix) model and print its test MAE.
train_and_test_approximate_qep(MAPApproximateQEP)

Training MAPApproximateQEP:   0%|          | 0/10 [00:00<?, ?it/s]

Test MAPApproximateQEP MAE: 0.08309302479028702


## Reducing computation (through decoupled inducing points)

One way to reduce the computational complexity is to use separate inducing points for the mean and covariance computations. The [Orthogonally Decoupled Variational Gaussian Processes](https://arxiv.org/abs/1809.08820) method of Salimbeni et al. (2018) uses more inducing points for the (computationally easy) mean computations and fewer inducing points for the (computationally intensive) covariance computations.

In QPyTorch we implement this method in a modular way. The [OrthogonallyDecoupledVariationalStrategy](https://qepytorch.readthedocs.io/en/stable/variational.html#qpytorch.variational.OrthogonallyDecoupledVariationalStrategy) defines the variational strategy for the mean inducing points. It wraps an existing variational strategy/distribution that defines the covariance inducing points:

In [14]:
def make_orthogonal_vs(model, train_x, num_mean_inducing=1000, num_covar_inducing=100):
    """Build an orthogonally decoupled variational strategy for ``model``.

    A standard ``VariationalStrategy`` with a Cholesky variational distribution
    is created on the covariance inducing points, then wrapped in an
    ``OrthogonallyDecoupledVariationalStrategy`` that adds a separate set of
    mean inducing points with a delta variational distribution.

    Args:
        model: The approximate QEP model the strategy is attached to.
        train_x: Tensor used only for its last-dimension size, dtype and device
            when the random inducing points are created.
        num_mean_inducing: Number of inducing points for the mean (default 1000).
        num_covar_inducing: Number of inducing points for the covariance (default 100).

    Returns:
        The ``OrthogonallyDecoupledVariationalStrategy`` instance.
    """
    input_dim = train_x.size(-1)
    # Mean points are drawn first, then covariance points, so results under a
    # fixed random seed match the previous hard-coded version.
    mean_inducing_points = torch.randn(num_mean_inducing, input_dim, dtype=train_x.dtype, device=train_x.device)
    covar_inducing_points = torch.randn(num_covar_inducing, input_dim, dtype=train_x.dtype, device=train_x.device)

    covar_variational_strategy = qpytorch.variational.VariationalStrategy(
        model, covar_inducing_points,
        qpytorch.variational.CholeskyVariationalDistribution(covar_inducing_points.size(-2), power=torch.tensor(POWER)),
        learn_inducing_locations=True,
    )

    variational_strategy = qpytorch.variational.OrthogonallyDecoupledVariationalStrategy(
        covar_variational_strategy, mean_inducing_points,
        qpytorch.variational.DeltaVariationalDistribution(mean_inducing_points.size(-2), power=torch.tensor(POWER)),
    )
    return variational_strategy

Putting it all together we have:

In [15]:
class OrthDecoupledApproximateQEP(qpytorch.models.ApproximateQEP):
    """Approximate QEP using the strategy returned by ``make_orthogonal_vs``.

    Uses a constant mean and a scaled RBF kernel.
    """

    def __init__(self, inducing_points):
        """Create the model.

        Args:
            inducing_points: Accepted so the constructor matches the other model
                classes in this notebook, but not used: ``make_orthogonal_vs``
                draws its own inducing points from the notebook-level ``train_x``.
        """
        self.power = torch.tensor(POWER)
        # The previous version also built a DeltaVariationalDistribution here
        # that was never passed to anything; it has been removed.
        variational_strategy = make_orthogonal_vs(self, train_x)
        super().__init__(variational_strategy)
        self.mean_module = qpytorch.means.ConstantMean()
        self.covar_module = qpytorch.kernels.ScaleKernel(qpytorch.kernels.RBFKernel())

    def forward(self, x):
        """Return the q-exponential prior at ``x`` from the mean and covariance modules."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return qpytorch.distributions.MultivariateQExponential(mean_x, covar_x, power=self.power)

In [16]:
# Train the orthogonally decoupled model and print its test MAE.
train_and_test_approximate_qep(OrthDecoupledApproximateQEP)

Training OrthDecoupledApproximateQEP:   0%|          | 0/10 [00:00<?, ?it/s]

  if nonzero_indices.storage():
  res = cls(index_tensor, value_tensor, interp_size)


Test OrthDecoupledApproximateQEP MAE: 0.08983905613422394
