The implementation of the following paper: https://arxiv.org/pdf/1706.07094.pdf

NEI stands for noisy expected improvement

## EI without noise correction

In [None]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import math
import scipy.stats
import time
from IPython import display
%matplotlib inline

In [None]:
# Evaluation grid: 1000 evenly spaced points on [0, 1]
x = np.linspace(0, 1, 1000)

# Ground-truth signal: a random constant offset plus sine and cosine harmonics
# of orders 1..5, each weighted by its own standard-normal coefficient
true_y = np.ones_like(x) * np.random.randn(1)
for i in range(1, 6):
    angle = i * math.pi * x
    true_y += np.random.randn(1) * np.sin(angle)
    true_y += np.random.randn(1) * np.cos(angle)

In [None]:
def k_gaussian(x1, x2, l=0.1):
    """Gaussian (squared-exponential) kernel matrix between two 1-D point sets.

    The output has shape (len(x2), len(x1)); the entry at [i, j] is
    k(x2[i], x1[j]) with

        k(a, b) = scale ** 2 * exp(-(a - b) ** 2 / (2 * l ** 2))

    where `l` is the length scale and the output scale is fixed to 1 below.
    """
    # kernel amplitude hyperparameter (kept fixed)
    amplitude = 1

    # A row of x1 broadcast against a column of x2 gives every pairwise difference
    as_row = np.reshape(x1, (1, -1))
    as_col = np.reshape(x2, (-1, 1))
    diff = as_row - as_col

    return np.exp(-diff ** 2 / (2 * l * l)) * amplitude ** 2

In [None]:
def log_likelihood(sample_x, sample_y, sigma_obs, l):
    """Log-likelihood of the observations under the zero-mean GP prior.

    The prior covariance of the noisy observations is
    sigma = k_gaussian(sample_x, sample_x, l) + sigma_obs ** 2 * I, and the
    returned value is

        -0.5 * y' * sigma^(-1) * y - 0.5 * n * log(2 * pi) - 0.5 * log|sigma|

    Parameters:
        sample_x: 1-D array of observed inputs
        sample_y: observed values, one per input (1-D or a column vector)
        sigma_obs: standard deviation of the observation noise
        l: length scale passed to k_gaussian

    Returns the log-likelihood as a scalar.
    np.linalg.cholesky raises LinAlgError if sigma is not positive definite.
    """
    n_obs = len(sample_x)
    sigma = k_gaussian(sample_x, sample_x, l) + np.eye(n_obs) * sigma_obs ** 2

    # Everything is expressed through the lower-triangular Cholesky factor of sigma
    chol = np.linalg.cholesky(sigma)

    # alpha = sigma^(-1) * y, obtained by solving sigma * alpha = y with the factor
    # (this trick is taken from the sklearn implementation of GP)
    y_col = sample_y.reshape(-1, 1)
    alpha = sp.linalg.cho_solve((chol, True), y_col)

    log_lik = -0.5 * np.dot(y_col.T, alpha)[0, 0]
    log_lik -= 0.5 * n_obs * np.log(2 * np.pi)

    # |sigma| = |chol| ** 2 and |chol| = prod(chol_ii), so 0.5 * log|sigma| = sum(log(chol_ii)).
    # Only the diagonal goes through the log: the entries above it are exactly zero
    # and would otherwise produce log(0) = -inf together with a divide-by-zero warning.
    log_lik -= np.sum(np.log(np.diag(chol)))

    return log_lik

In [None]:
def gp_posterior(sample_x, sample_y, x, sigma_obs):
    """Posterior mean and standard deviation of the GP at the points `x`.

    Observations are assumed to carry additional gaussian noise with standard
    deviation `sigma_obs`. The kernel length scale is picked by brute-force
    maximum likelihood (wrt the prior GP model) over 100 log-spaced candidates
    between 0.01 and 1.

    Important: the method cannot handle sigma_obs=0 when predicting at sample_x;
    with zero noise the matrix to invert may be numerically singular.

    Parameters:
        sample_x: 1-D array of observed inputs
        sample_y: observed values, one per input (1-D or a column vector)
        x: 1-D array of points to predict at
        sigma_obs: standard deviation of the observation noise

    Returns (mu, std_1d), both 1-D arrays with one entry per point of `x`.
    """
    # Picking the optimal kernel hyperparameter
    candidates = np.exp(np.linspace(np.log(0.01), np.log(1), 100))
    l = max(candidates,
            key=lambda z: log_likelihood(sample_x, sample_y, sigma_obs, z))

    # k_cross[i, j] = k(x[i], sample_x[j])
    k_cross = k_gaussian(sample_x, x, l)
    k_obs = k_gaussian(sample_x, sample_x, l) + np.eye(len(sample_x)) * sigma_obs ** 2

    # Matrix shared by the mean and the variance
    K = np.dot(k_cross, np.linalg.inv(k_obs))

    mu = np.dot(K, sample_y)

    # Only the diagonal of the posterior covariance is needed, so the full
    # len(x) by len(x) matrix is never formed:
    #   diag(k(x, x) - K * k(sample_x, x))[i] = k(x[i], x[i]) - sum_j K[i, j] * k_cross[i, j]
    # The kernel depends only on the difference of its arguments, so k(x[i], x[i])
    # is the same for every i and is evaluated once (the slice keeps empty x working).
    prior_var = k_gaussian(x[:1], x[:1], l).reshape(-1)
    var_1d = prior_var - np.sum(K * k_cross, axis=1)

    # Round-off can push a tiny variance slightly below zero; clip so sqrt never yields NaN
    std_1d = np.sqrt(np.maximum(var_1d, 0.0))

    return mu.reshape(-1), std_1d.reshape(-1)

def cholesky_posterior(sample_x, sample_y, sigma_obs):
    """Cholesky factor of the GP posterior covariance at the sampled points.

    The kernel length scale is picked by brute-force maximum likelihood
    (wrt the prior GP model) over 100 log-spaced candidates between 0.01 and 1.

    Parameters:
        sample_x: 1-D array of observed inputs
        sample_y: observed values, one per input
        sigma_obs: standard deviation of the observation noise

    Returns the lower-triangular factor `chol` with chol * chol' equal to the
    posterior covariance (plus whatever diagonal noise was needed to factorize it).
    """
    # Picking the optimal kernel hyperparameter
    candidates = np.exp(np.linspace(np.log(0.01), np.log(1), 100))
    l = max(candidates,
            key=lambda z: log_likelihood(sample_x, sample_y, sigma_obs, z))

    # Calculating the posterior covariance matrix for observed data
    # (the kernel matrix is evaluated once and reused)
    k_obs = k_gaussian(sample_x, sample_x, l)
    K = np.dot(k_obs, np.linalg.inv(k_obs + np.eye(len(sample_x)) * sigma_obs ** 2))
    sigma = k_obs - np.dot(K, k_obs)

    # Because of numerical issues the supposed posterior covariance matrix may not be positive definite
    # For that reason noise is added along the main diagonal until the factorization succeeds
    noise_addition = 0.00001
    while True:
        try:
            chol = np.linalg.cholesky(sigma)
            break
        except np.linalg.LinAlgError:
            # Only a failed factorization is retried; anything else
            # (including KeyboardInterrupt) propagates to the caller
            print("Problems with getting cholesky matrix, adding noise to main diagonal")
            sigma += noise_addition * np.eye(len(sample_x))

    return chol

The code below runs the basic EI loop. The GP posterior can come either from the custom implementation above or from the sklearn-based implementation in the appendix.

In [None]:
# Plain expected-improvement loop: the noisy observations are fed to the GP as they are

# Standard deviation of the observation noise
sigma_obs = 0.5

# Initial design: 5 distinct grid points, each observed with gaussian noise
pick_x = np.random.choice(range(len(x)), size=5, replace=False)
sample_x = x[pick_x]
sample_y = true_y[pick_x] + sigma_obs * np.random.randn(5)

# 99 acquisition steps
for t in range(1, 100):
    
    # Evaluating GP posterior
    mu, std_1d = gp_posterior(sample_x, sample_y, x, sigma_obs)
    
    # Expected improvement relative to the largest posterior mean on the grid.
    # Since z * std_1d = mu - max(mu), the expression below is
    # std * pdf(z) + (mu - max(mu)) * cdf(z)
    z = (mu - np.max(mu)) / std_1d
    ei = std_1d * scipy.stats.norm.pdf(z) + z * std_1d * scipy.stats.norm.cdf(z)

    # Query the grid point with the largest EI and record a new noisy observation there
    pick_x = np.argmax(ei)
    sample_x = np.append(sample_x, x[pick_x])
    sample_y = np.append(sample_y, true_y[pick_x] + sigma_obs * np.random.randn(1))
    
    # NOTE(review): log_lik is not read anywhere in the visible code - confirm it is still needed
    log_lik = log_likelihood(sample_x, sample_y, sigma_obs, 0.1)
    
    # mu / std_1d were computed before the newest point was added,
    # so the plotted posterior does not include the point drawn in blue
    plt.plot(x, true_y, label="true_signal")
    plt.plot(sample_x, sample_y, ".", color="r", label="picked_x")
    plt.plot([sample_x[-1]], [sample_y[-1]], ".", color="b", label="last_x")
    plt.plot(x, mu, color="g", label="posterior")
    # Shaded band of two posterior standard deviations around the mean
    plt.fill_between(x, mu - 2 * std_1d, mu + 2 * std_1d, color="g", alpha=0.5)

    plt.title("True and recovered signals")
    plt.legend()
    plt.xlabel("x")
    plt.show()

    # Notebook animation: redraw in place with a 2 second pause between frames
    # NOTE(review): plt.gcf() is requested after plt.show() - verify it still refers to the figure drawn above
    display.display(plt.gcf())
    display.clear_output(wait=True)
    time.sleep(2)
    

## EI with noise correction

In [None]:
# Noisy expected improvement loop: EI is summed over Monte-Carlo draws of the
# latent values at the sampled points

# Standard deviation of the observation noise
sigma_obs = 0.5
# Number of Monte-Carlo draws per acquisition step
n_mc = 10

# Initial design: a single grid point observed with gaussian noise
pick_x = np.random.choice(range(len(x)), size=1, replace=False)
sample_x = x[pick_x]
sample_y = true_y[pick_x] + sigma_obs * np.random.randn(1)

# 99 acquisition steps
for t in range(1, 100):
    
    # First step: figuring out posterior distributions for sampled points
    # (posterior mean at sample_x and the Cholesky factor of the posterior covariance there)
    sample_mu, _ = gp_posterior(sample_x, sample_y, sample_x, sigma_obs)   
    chol = cholesky_posterior(sample_x, sample_y, sigma_obs) 
    nei = np.zeros((len(x),))
    
    # Second step: sampling from posterior distribution and calculating EI as if the sample has no noise
    for n in range(n_mc):
        # Not going for inverse of normal cdf as for now I am fine with usual MC (compared to QMC)
        # fresh_y = sample_mu + chol * (standard normal vector), kept as a column vector
        fresh_y = np.dot(chol, np.random.randn(len(sample_x)).reshape(-1, 1)) + sample_mu.reshape(-1, 1)

        # Doing GP posterior as if sampled y are true noiseless values 
        # (noise is not zero so that the code would not crash)
        mu, std_1d = gp_posterior(sample_x, fresh_y, x, 0.001)
        
        # Same EI expression as in the noiseless loop, relative to the largest posterior mean on the grid
        z = (mu - np.max(mu)) / std_1d
        # nei holds the sum over draws rather than the mean; the argmax below is the same either way
        nei += std_1d * scipy.stats.norm.pdf(z) + z * std_1d * scipy.stats.norm.cdf(z)
    
    # Query the grid point with the largest accumulated EI and record a new noisy observation there
    pick_x = np.argmax(nei)
    sample_x = np.append(sample_x, x[pick_x])
    sample_y = np.append(sample_y, true_y[pick_x] + sigma_obs * np.random.randn(1))

    # Recalculating posterior for plotting
    # (now from the real observations, including the one just added)
    mu, std_1d = gp_posterior(sample_x, sample_y, x, sigma_obs)
    
    plt.plot(x, true_y, label="true_signal")
    plt.plot(sample_x, sample_y, ".", color="r", label="picked_x")
    plt.plot([sample_x[-1]], [sample_y[-1]], ".", color="b", label="last_x")
    plt.plot(x, mu, color="g", label="posterior")
    # Shaded band of two posterior standard deviations around the mean
    plt.fill_between(x, mu - 2 * std_1d, mu + 2 * std_1d, color="g", alpha=0.5)

    plt.title("True and recovered signals")
    plt.legend()
    plt.xlabel("x")
    plt.show()

    # Notebook animation: redraw in place with a 2 second pause between frames
    # NOTE(review): plt.gcf() is requested after plt.show() - verify it still refers to the figure drawn above
    display.display(plt.gcf())
    display.clear_output(wait=True)
    time.sleep(2)
    

## Appendix code

### sklearn implementation of GP

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

def gp_posterior(sample_x, sample_y, x, sigma_obs):
    """Posterior mean and standard deviation at `x`, computed with sklearn.

    Main difference from the custom version: sklearn estimates the kernel
    hyperparameters through ML, restarting the optimisation from random points
    (n_restarts_optimizer times). Observation noise is handled via `alpha`,
    the value added along the main diagonal of the kernel matrix.

    Returns (mu, std_1d), both flattened to 1-D.
    """
    kernel = RBF(length_scale=0.1)
    model = GaussianProcessRegressor(
        kernel=kernel,
        alpha=sigma_obs ** 2,
        n_restarts_optimizer=100,
        normalize_y=False,
    )

    # sklearn expects 2-D inputs, hence the column reshapes
    train_inputs = sample_x.reshape(-1, 1)
    train_targets = sample_y.reshape(-1, 1)
    model.fit(train_inputs, train_targets)

    mean, std = model.predict(x.reshape(-1, 1), return_std=True)

    return mean.reshape(-1), std.reshape(-1)

def cholesky_posterior(sample_x, sample_y, sigma_obs):
    """Cholesky factor of the sklearn GP posterior covariance at the sampled points.

    Parameters:
        sample_x: 1-D array of observed inputs
        sample_y: observed values, one per input
        sigma_obs: standard deviation of the observation noise (passed to sklearn as alpha = sigma_obs ** 2)

    Returns the lower-triangular factor of the posterior covariance
    (plus whatever diagonal noise was needed to factorize it).
    """
    gp = GaussianProcessRegressor(kernel=RBF(length_scale=0.1),
                                  alpha=sigma_obs**2,
                                  n_restarts_optimizer=100,
                                  normalize_y=False
                                 )

    gp.fit(sample_x.reshape(-1, 1), sample_y.reshape(-1, 1))

    _, sigma = gp.predict(sample_x.reshape(-1, 1), return_cov=True)

    # Because of numerical issues the supposed posterior covariance matrix may not be positive definite
    # For that reason noise is added along the main diagonal until the factorization succeeds
    noise_addition = 0.01
    while True:
        try:
            chol = np.linalg.cholesky(sigma)
            break
        except np.linalg.LinAlgError:
            # Only a failed factorization is retried; anything else
            # (including KeyboardInterrupt) propagates to the caller
            print("Problems with getting cholesky matrix, adding noise to main diagonal")
            sigma += noise_addition * np.eye(len(sample_x))

    return chol