# Prediction Frequentist Coverage Simulation
Run a simulation to compare the prediction performance of a Bayesian Gaussian process model with a reference prior against a Gaussian process model fit using a maximum likelihood approach.

In [1]:
import numpy as np
import scipy
from bbai.gp import BayesianGaussianProcessRegression, RbfCovarianceFunction
import matplotlib.pyplot as plt
import seaborn
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

# Seed NumPy's global random number generator so that random draws are repeatable.
np.random.seed(0)

## Generate Data Set

In [2]:
def make_location_matrix(num_train, num_test):
    """Build the 1-D input locations for one simulated data set.

    The first ``num_train`` rows are evenly spaced from 0 to 1; the remaining
    ``num_test`` rows are drawn with ``np.random.uniform(0, 1)`` from NumPy's
    global random state.

    Parameters
    ----------
    num_train : int
        Number of evenly spaced training locations.
    num_test : int
        Number of randomly placed test locations.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_train + num_test, 1)`` with the training
        locations first and the test locations after them.
    """
    res = np.zeros((num_train + num_test, 1))
    # A single training point has no spacing to compute; place it at 0 instead
    # of dividing by zero.
    step = 1.0 / (num_train - 1) if num_train > 1 else 0.0
    res[:num_train, 0] = np.arange(num_train) * step
    res[num_train:, 0] = np.random.uniform(0, 1, size=num_test)
    return res

def make_covariance_matrix(Z, params):
    """Build the covariance matrix used to generate the simulated targets.

    Entry ``(i, j)`` is::

        sigma2 * (exp(-0.5 * ||Z[i] - Z[j]||**2 / theta**2) + eta * [i == j])

    which is the same form as the kernel built in ``run_ml_model``
    (``sigma2 * RBF + WhiteKernel(sigma2 * eta)``).

    Parameters
    ----------
    Z : array-like
        Locations, one per entry along the first axis.
    params : tuple
        ``(sigma2, theta, eta)``: overall scale, length scale, and the noise
        term added to the diagonal before scaling.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(N, N)`` where ``N = Z.shape[0]``.
    """
    sigma2, theta, eta = params
    Z = np.asarray(Z, dtype=float)
    N = Z.shape[0]
    # Flatten every location to a vector so that 1-D and multi-dimensional
    # inputs are handled the same way as a per-pair norm would handle them.
    points = Z.reshape(N, int(np.prod(Z.shape[1:])))
    diff = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    d2 = np.sum(diff**2, axis=-1)
    res = np.exp(-0.5 * d2 / theta**2)
    res[np.diag_indices(N)] += eta
    # Apply the overall scale; previously sigma2 was unpacked but ignored.
    return sigma2 * res

def make_target_vector(K):
    """Draw one zero-mean multivariate normal sample with covariance ``K``.

    The draw uses NumPy's global random state.

    Parameters
    ----------
    K : numpy.ndarray
        Square covariance matrix.

    Returns
    -------
    numpy.ndarray
        Sample vector with ``K.shape[0]`` entries.
    """
    dim = K.shape[0]
    zero_mean = np.zeros(dim)
    return np.random.multivariate_normal(zero_mean, K)

## Set up Simulation

In [3]:
# Lower and upper CDF thresholds shared by run_ml_model and run_bay_model: a test
# target counts as covered when its predictive CDF value lies strictly between
# them (0.975 - 0.025 = a central 95% interval).
low = 0.025
high = 0.975

def run_ml_model(S_train, y_train, S_test, y_test, params0):
    """Fit scikit-learn's ``GaussianProcessRegressor`` and count covered test targets.

    The kernel is ``sigma2 * RBF(theta) + WhiteKernel(sigma2 * eta)``, with its
    starting values taken from ``params0``. Each test target is scored with a
    normal distribution built from the model's predicted mean and standard
    deviation, and counts as covered when its CDF value lies strictly between
    the module-level ``low`` and ``high``.

    Parameters
    ----------
    S_train, y_train
        Training locations and targets passed to ``fit``.
    S_test, y_test
        Test locations passed to ``predict`` and the targets to score.
    params0 : tuple
        ``(sigma2, theta, eta)`` used to initialise the kernel.

    Returns
    -------
    int
        Number of test targets that are covered.
    """
    sigma2, theta, eta = params0
    signal_kernel = sigma2 * RBF(length_scale=theta)
    noise_kernel = WhiteKernel(noise_level=sigma2*eta)
    gp = GaussianProcessRegressor(
        kernel=signal_kernel + noise_kernel,
        n_restarts_optimizer=25,
    )
    gp.fit(S_train, y_train)
    mean, std = gp.predict(S_test, return_std=True)
    return sum(
        1
        for idx, target in enumerate(y_test)
        if low < scipy.stats.norm(loc=mean[idx], scale=std[idx]).cdf(target) < high
    )

def run_bay_model(S_train, y_train, S_test, y_test, params0):
    """Fit ``BayesianGaussianProcessRegression`` and count covered test targets.

    The model is fit with an ``RbfCovarianceFunction`` kernel. ``predict`` is
    called with ``with_pdf=True`` and its second return value is indexed per
    test point; a target counts as covered when that object's ``cdf`` value
    lies strictly between the module-level ``low`` and ``high``.

    Parameters
    ----------
    S_train, y_train
        Training locations and targets passed to ``fit``.
    S_test, y_test
        Test locations passed to ``predict`` and the targets to score.
    params0
        Not used here; accepted so the signature matches ``run_ml_model``.

    Returns
    -------
    int
        Number of test targets that are covered.
    """
    bayes_gp = BayesianGaussianProcessRegression(kernel=RbfCovarianceFunction())
    bayes_gp.fit(S_train, y_train)
    _, predictive = bayes_gp.predict(S_test, with_pdf=True)
    return sum(
        1
        for idx, target in enumerate(y_test)
        if low < predictive[idx].cdf(target) < high
    )

In [4]:
def run_simulation(runner, params, num_train=20, num_test=20, num_runs=100):
    """Estimate the coverage rate of ``runner`` over repeated simulated data sets.

    Each repetition generates fresh locations, a covariance matrix and a
    target vector, splits them into training and test parts, and asks
    ``runner`` how many test targets fall inside its prediction interval.

    Parameters
    ----------
    runner : callable
        Called as ``runner(S_train, y_train, S_test, y_test, params)`` and
        expected to return the number of covered test targets
        (e.g. ``run_ml_model`` or ``run_bay_model``).
    params : tuple
        ``(sigma2, theta, eta)`` used to generate the data; also forwarded to
        ``runner``.
    num_train : int, optional
        Training points per data set (default 20).
    num_test : int, optional
        Test points per data set (default 20).
    num_runs : int, optional
        Number of simulated data sets (default 100).

    Returns
    -------
    float
        Covered test targets divided by ``num_test * num_runs``.

    Raises
    ------
    ValueError
        If ``num_test`` or ``num_runs`` is not positive, since the coverage
        rate would be undefined.
    """
    if num_test < 1 or num_runs < 1:
        raise ValueError("num_test and num_runs must be positive")
    covered = 0
    for _ in range(num_runs):
        S = make_location_matrix(num_train, num_test)
        K = make_covariance_matrix(S, params)
        y = make_target_vector(K)
        # Rows before num_train are the training set, the rest the test set.
        covered += runner(
            S[:num_train], y[:num_train],
            S[num_train:], y[num_train:],
            params)
    return covered / (num_test * num_runs)

## Run Simulation

In [5]:
# Data-generating parameters: sigma2 scales the RBF kernel, theta is its length
# scale, and eta sets the noise level relative to sigma2.
sigma2 = 1
thetas = [0.1, 0.2, 0.5]
etas = [0.001, 0.01, 0.1, 0.2]

# Every (sigma2, theta, eta) combination, with eta varying fastest. A
# comprehension keeps the loop variables out of the notebook namespace.
px = [(sigma2, theta, eta) for theta in thetas for eta in etas]

In [6]:
# Re-seed NumPy's global generator so this cell's results are repeatable when it is
# re-run on its own.
np.random.seed(0)
# Print each (sigma2, theta, eta) setting with the coverage rate measured for
# the scikit-learn model (run_ml_model).
for p in px:
    cov = run_simulation(run_ml_model, p)
    print(p, cov)

(1, 0.1, 0.001) 0.8115
(1, 0.1, 0.01) 0.8375
(1, 0.1, 0.1) 0.8465
(1, 0.1, 0.2) 0.8525


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


(1, 0.2, 0.001) 0.9055
(1, 0.2, 0.01) 0.912
(1, 0.2, 0.1) 0.8925
(1, 0.2, 0.2) 0.8925
(1, 0.5, 0.001) 0.9335
(1, 0.5, 0.01) 0.9185
(1, 0.5, 0.1) 0.9195
(1, 0.5, 0.2) 0.903


In [7]:
# Re-seed NumPy's global generator so this cell's results are repeatable when it is
# re-run on its own.
np.random.seed(0)
# Print each (sigma2, theta, eta) setting with the coverage rate measured for
# the Bayesian model (run_bay_model).
for p in px:
    cov = run_simulation(run_bay_model, p)
    print(p, cov)

(1, 0.1, 0.001) 0.9185
(1, 0.1, 0.01) 0.9385
(1, 0.1, 0.1) 0.929
(1, 0.1, 0.2) 0.936
(1, 0.2, 0.001) 0.9505
(1, 0.2, 0.01) 0.953
(1, 0.2, 0.1) 0.9425
(1, 0.2, 0.2) 0.9365
(1, 0.5, 0.001) 0.942
(1, 0.5, 0.01) 0.944
(1, 0.5, 0.1) 0.9315
(1, 0.5, 0.2) 0.938
