# GP-UCB optimisation

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import HTML, set_matplotlib_formats

# Ask the inline backend to emit figures in the PDF and SVG formats.
# NOTE(review): newer IPython releases deprecate this helper in favour of
# matplotlib_inline.backend_inline.set_matplotlib_formats -- confirm against
# the IPython version pinned for this project.
set_matplotlib_formats('pdf', 'svg')

# Read the custom stylesheet; the context manager closes the file handle as
# soon as the contents have been read instead of leaving it to the garbage
# collector.
with open('../../../_static/custom_style.css', 'r') as css_file:
    css_style = css_file.read()

# Must stay the last expression of the cell: the returned HTML object is what
# gets displayed, which injects the <style> element into the notebook page.
HTML(f'<style>{css_style}</style>')

<div class="lemma">
    
**Lemma (Bounding the residues)** Suppose $\delta \in (0, 1)$ and let $\beta_t$ be a sequence
    
$$\begin{align}
\beta_t = 2 \log \left(\frac{|D| \pi_t}{\delta}\right),
\end{align}$$
    
such that $\sum \pi_t^{-1} = 1, \pi_t > 0$. Then 
    
$$\begin{align}
|f(\mathbf{x}) - \mu_{t - 1}(\mathbf{x})| \leq \beta_t^{1/2} \sigma_{t-1}(\mathbf{x})
\end{align}$$
    
holds for all $\mathbf{x} \in D$ and $t \geq 1$ with probability at least $1 - \delta$.
    
</div>
<br>

<details class="proof">
<summary>Derivation: Bounding the residues</summary>
<br>
    
Fix $t \geq 1$ and $\mathbf{x} \in D$, where $|D| < \infty$. The function value at $\mathbf{x}$ is Gaussian-distributed with
    
$$\begin{align}
f(\mathbf{x}) \sim \mathcal{N}\left(\mu_{t-1}(\mathbf{x}), \sigma^2_{t-1}(\mathbf{x})\right),
\end{align}$$
    
and we want to bound the absolute residue $|f(\mathbf{x}) - \mu_{t - 1}(\mathbf{x})|$. Now if $r \sim \mathcal{N}(0, 1)$ and $c > 0$, we have
    
$$\begin{align}
p(r > c) = \int_c^\infty (2\pi)^{-1/2} e^{-\frac{1}{2}r^2} dr
\end{align}$$
    
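while also
    
$$\begin{align}
\frac{d}{dc} p(r > c) &= \frac{d}{dc} \int_c^\infty (2\pi)^{-1/2} e^{-\frac{1}{2}r^2} dr = -(2\pi)^{-1/2} e^{-\frac{1}{2}c^2}, \\
\frac{d}{dc} \left[\frac{1}{2} e^{-\frac{1}{2}c^2}\right] &= -\frac{c}{2} e^{-\frac{1}{2}c^2}.
\end{align}$$
    
Consider the difference $g(c) = \frac{1}{2} e^{-\frac{1}{2}c^2} - p(r > c)$. It satisfies $g(0) = 0$ and $g(c) \to 0$ as $c \to \infty$, and its derivative $g'(c) = \left((2\pi)^{-1/2} - \frac{c}{2}\right) e^{-\frac{1}{2}c^2}$ is positive for $c < (2/\pi)^{1/2}$ and negative for $c > (2/\pi)^{1/2}$. So $g$ first increases from zero and then decreases monotonically towards zero, which means that $g(c) \geq 0$ for all $c \geq 0$, that is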
    
$$\begin{align}
p(r > c) \leq \frac{1}{2} e^{-\frac{1}{2}c^2}.
\end{align}$$
    
Then, setting $r = \left(f(\mathbf{x}) - \mu_{t - 1}(\mathbf{x})\right) / \sigma_{t-1}(\mathbf{x})$ and $c = \beta_t^{1/2}$, and using the symmetry of the Gaussian to bound both tails, $p(|r| > c) = 2 p(r > c)$, we see that
    
$$\begin{align}
p\left(|f(\mathbf{x}) - \mu_{t - 1}(\mathbf{x})| > \beta_t^{1/2} \sigma_{t-1}(\mathbf{x})\right) \leq e^{-\frac{\beta_t}{2}}.
\end{align}$$
    
Using the [union bound](https://mathworld.wolfram.com/BonferroniInequalities.html) over $\mathbf{x} \in D$ we see that 
    
$$\begin{align}
|f(\mathbf{x}) - \mu_{t - 1}(\mathbf{x})| \leq \beta_t^{1/2}\sigma_{t-1}(\mathbf{x})
\end{align}$$
    
holds for all $\mathbf{x} \in D$ with probability at least $1 - |D| e^{-\frac{\beta_t}{2}}$. Lastly, applying the union bound again over $t \in \mathbb{N}$, we see that the above inequality holds for all $\mathbf{x} \in D$ and all $t \geq 1$, with probability at least
    
$$\begin{align}
1 - |D| \sum_{t = 1}^\infty e^{-\frac{\beta_t}{2}}  = 1 - \delta,
\end{align}$$
    
where we have used $|D| e^{-\frac{\beta_t}{2}} = \frac{\delta}{\pi_t}$, which follows from the definition of $\beta_t$, and where $\pi_t$ is a sequence of positive numbers chosen such that
    
$$\begin{align}
\sum_{t = 1}^\infty \pi_t^{-1} = 1.
\end{align}$$
    
An example of such a sequence is $\pi_t = \frac{\pi^2t^2}{6}$.
    
</details>
<br>


<div class="lemma">
    
**Lemma (Bounding the regret by the confidence interval)** Suppose $t \geq 1$ and $|f(\mathbf{x}) - \mu_{t-1}(\mathbf{x})| \leq \beta_t^{1/2}\sigma_{t-1}(\mathbf{x})$ for all $\mathbf{x} \in D$. Then the instantaneous regret $r_t = f(\mathbf{x}^*) - f(\mathbf{x}_t)$ is bounded according to
    
$$\begin{align}
r_t \leq 2 \beta_t^{1/2} \sigma_{t-1}(\mathbf{x}_t).
\end{align}$$
    
</div>
<br>

<details class="proof">
<summary>Derivation: Bounding the regret by the confidence interval</summary>
<br>
    
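By the definition of $\mathbf{x}_t$ as a maximiser of the upper confidence bound $\mu_{t-1}(\mathbf{x}) + \beta_t^{1/2} \sigma_{t-1}(\mathbf{x})$ over $D$, we have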
    
$$\begin{align}
\mu_{t-1}(\mathbf{x}_t) + \beta_t^{1/2} \sigma_{t-1}(\mathbf{x}_t) \geq \mu_{t-1}(\mathbf{x}^*) + \beta_t^{1/2} \sigma_{t-1}(\mathbf{x}^*) \geq f(\mathbf{x}^*),
\end{align}$$
    
where the last inequality holds by assumption. Therefore
    
$$\begin{align}
r_t &= f(\mathbf{x}^*) - f(\mathbf{x}_t) \\
    &\leq \mu_{t-1}(\mathbf{x}_t) + \beta_t^{1/2} \sigma_{t-1}(\mathbf{x}_t) - f(\mathbf{x}_t) \\
    &\leq 2 \beta_t^{1/2} \sigma_{t-1}(\mathbf{x}_t).
\end{align}$$

    
</details>
<br>

<div class="lemma">
    
**Lemma (Mutual information of observations and function values)** The mutual information, or information gain, at the points selected can be expressed in terms of the predictive variances as
    
$$\begin{align}
I(\mathbf{y}_T; \mathbf{f}_T) = \frac{1}{2} \sum^{T}_{t = 1} \log\left(1 + \sigma^{-2}\sigma^2_{t-1}(\mathbf{x}_t)\right).
\end{align}$$
    
</div>
<br>

<details class="proof">
<summary>Derivation: Mutual information of observations and function values</summary>
<br>
    
First, we have the fact that
    
$$\begin{align}
I(\mathbf{y}_T; \mathbf{f}_T) &= H(\mathbf{y}_T) - H(\mathbf{y}_T | \mathbf{f}_T) \\
                              &= H(\mathbf{y}_T) - \frac{1}{2}\log|2\pi e \sigma^2 \mathbf{I}|.
\end{align}$$
    
Then, we can decompose the first entropy term as
    
$$\begin{align}
H(\mathbf{y}_T) &= H(\mathbf{y}_{T-1}) + H(y_T | \mathbf{y}_{T-1}) \\
                &~~\vdots \\
                &= \sum_{t=1}^T H(y_t | \mathbf{y}_{t-1})
\end{align}$$
    
where for convenience we define $H(y_1 | \mathbf{y}_0) \equiv H(y_1)$. Now since
    
$$\begin{align}
H(y_t | \mathbf{y}_{t-1}) = \frac{1}{2} \log \left(2\pi e (\sigma^2 + \sigma^2_{t-1}(x_t))\right),
\end{align}$$
    
we arrive at the result
    
$$\begin{align}
I(\mathbf{y}_T; \mathbf{f}_T) &= \frac{1}{2} \sum_{t=1}^T \log \left(1 + \sigma^{-2} \sigma^2_{t-1}(x_t)\right).
\end{align}$$
    
</details>
<br>

<div class="lemma">
    
**Lemma (Bounding the regret by the information gain)** Suppose $\delta \in (0, 1)$ and let $\beta_t = 2 \log \left(|D| \pi_t / \delta\right)$ with $\pi_t = \pi^2 t^2 / 6$. Then
    
$$\begin{align}
\sum_{t = 1}^T r_t^2 \leq C_1 \beta_T I(\mathbf{y}_T; \mathbf{f}_T) \leq C_1 \beta_T \gamma_T, \text{ for all } T \geq 1
\end{align}$$
    
holds with probability at least $1 - \delta$, where $C_1 = 8 / \log(1 + \sigma^{-2})$.
    
</div>
<br>

<details class="proof">
<summary>Derivation: Bounding the regret by the information gain</summary>
<br>
    
By the previous lemmas, the inequality
    
$$\begin{align}
r_t^2 \leq 4 \beta_t \sigma_{t-1}^2(\mathbf{x}_t),
\end{align}$$
    
holds for all $t \geq 1$, with probability at least $1 - \delta$. Since $\beta_t$ is non-decreasing in $t$, for every $t \leq T$ we have
    
$$\begin{align}
4 \beta_t \sigma_{t-1}^2(x_t) &\leq 4 \beta_T \sigma^2 \left(\sigma^{-2} \sigma_{t-1}^2(\mathbf{x}_t)\right).
\end{align}$$
    
To bound the RHS of this expression we can use the fact that
    
$$\begin{align}
\frac{u}{v} \leq \frac{\log (1 + u)}{\log(1 + v)},
\end{align}$$
    
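holds for all $u \in [0, v]$. We can see that this inequality holds because it holds with equality for $u = 0$ and for $u = v$, the LHS is linear in $u$, and the RHS is a concave function of $u$. Letting $v = \sigma^{-2}$ and $u = \sigma^{-2} \sigma_{t-1}^2(x_t) \leq \sigma^{-2}$ (which holds when the prior variance is at most one, since conditioning on data cannot increase the variance) and rearranging, we arrive at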
    
$$\begin{align}
\sigma^{-2} \sigma_{t-1}^2(x_t) \leq \sigma^{-2} \frac{\log \left(1 + \sigma^{-2} \sigma_{t-1}^2(x_t)\right)}{\log\left(1 + \sigma^{-2}\right)} = C_2 \log \left(1 + \sigma^{-2} \sigma_{t-1}^2(x_t)\right), \text{ where } C_2 = \frac{\sigma^{-2}}{\log\left(1 + \sigma^{-2}\right)},
\end{align}$$
    
which we can plug into the previous inequality, to arrive at
    
$$\begin{align}
4 \beta_t \sigma_{t-1}^2(x_t) &\leq 4 \beta_T \sigma^2 C_2 \log \left(1 + \sigma^{-2} \sigma_{t-1}^2(x_t)\right).
\end{align}$$
    
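Remembering that $I(\mathbf{y}_T; \mathbf{f}_T) \leq \gamma_T$ by definition, we can sum over $t$ and use the mutual information lemma, which gives $\sum_{t=1}^T \log \left(1 + \sigma^{-2} \sigma_{t-1}^2(x_t)\right) = 2 I(\mathbf{y}_T; \mathbf{f}_T)$, together with $C_1 = 8 \sigma^2 C_2 = 8 / \log(1 + \sigma^{-2})$, to arrive at the result that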
    
$$\begin{align}
\sum_{t = 1}^T r_t^2 \leq C_1 \beta_T I(\mathbf{y}_T; \mathbf{f}_T) \leq C_1 \beta_T \gamma_T, \text{ for all } T \geq 1
\end{align}$$
    
holds with probability at least $1 - \delta$.
    
</details>
<br>