In [1]:
import numpy as np
from numpy.linalg import inv


In [2]:
from numpy.linalg import cholesky, det, lstsq
from scipy.optimize import minimize


In [55]:
def f(X, noise=0.2):
    """Evaluate the toy target -sin(3x) - x^2 + 0.7x with additive Gaussian noise.

    Args:
        X: Array of input locations; the result has the same shape.
        noise: Scale applied to standard-normal samples drawn via np.random.randn.
    Returns:
        Array of noisy function values, same shape as X.
    """
    clean = -np.sin(3*X) - X**2 + 0.7*X
    jitter = noise * np.random.randn(*X.shape)
    return clean + jitter

# Five training inputs arranged as a column vector, with noisy targets drawn from f.
X_train = np.array([-0.9, 0, 0.5, 0.9, 1.1]).reshape(-1, 1)
Y_train = f(X_train)

In [9]:
# Gaussian process training data: noisy samples of sin(x) on the integer grid -3..3.
noise = 0.4
# Inputs as a (7, 1) column vector; targets get noise-scaled standard-normal jitter.
X_train = np.arange(-3, 4)[:, np.newaxis]
Y_train = np.sin(X_train) + noise * np.random.randn(*X_train.shape)


In [4]:
# Inspect the dimensions of the training inputs.
X_train.shape

(7, 1)

In [16]:
# Row-wise sum of squares of the training inputs (one value per training point).
np.sum(X_train**2,1)

array([9, 4, 1, 0, 1, 4, 9], dtype=int32)

In [5]:
def RBF_noise_kernel(X1, X2, l = 1.0, sigma_f = 1.0, sigma_y = 1.0):
    '''
    Isotropic squared exponential kernel (radial basis function kernel) with an
    additive white-noise term. Computes a covariance matrix from points in X1 and X2:
    sigma_f^2 * exp(-||x1 - x2||^2 / (2 * l^2)) for every pair of points, plus
    sigma_y^2 on the diagonal when X1 and X2 are the same set of points.

    Args:
        X1: Array of m points (m x d).
        X2: Array of n points (n x d).
        l: Kernel length scale.
        sigma_f: Signal scale of the Gaussian Process (enters as sigma_f**2).
        sigma_y: Noise scale (enters as sigma_y**2).
    Returns:
        Covariance matrix (m x n).
    '''
    # ||a - b||^2 expanded as ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once.
    sqdist = np.sum(X1**2, 1).reshape(-1,1) + np.sum(X2**2, 1) -  2 * np.dot(X1, X2.T)
    K = sigma_f**2 * np.exp(-0.5 / l**2 * sqdist)
    # The noise term is sized from the arguments themselves (not from a notebook
    # global) and is only applied when both arguments are the same point set, so
    # cross-covariances between different point sets stay noise-free and may be
    # rectangular.
    if X1 is X2 or np.array_equal(X1, X2):
        K = K + sigma_y**2 * np.eye(len(X1))
    return K

In [6]:
def posterior_predictive(X_s, X_train, Y_train, l = 1.0, sigma_f = 1.0, sigma_y = 1e-8, kernel = RBF_noise_kernel):
    '''
    Computes the sufficient statistics of the GP posterior predictive distribution
    from m training data X_train and Y_train and n new inputs X_s.

    Args:
        X_s: New input locations (n x d).
        X_train: Training locations (m x d).
        Y_train: Training targets (m x 1).
        l: Kernel length parameter.
        sigma_f: Kernel vertical variation parameter.
        sigma_y: Noise parameter (added as sigma_y**2 on the diagonal of the
                 training covariance).
        kernel: Kernel function for the GP, called as kernel(X1, X2, l, sigma_f).
                If it also accepts a `sigma_y` keyword, it is called with
                sigma_y=0.0 so that observation noise is added exactly once, here.
    Returns:
        Posterior mean vector (n x 1) and covariance matrix (n x n).
    '''
    import inspect

    # Kernels that carry their own noise term must have it switched off; otherwise
    # their default noise would be added on top of sigma_y**2 and would also leak
    # into K_s and K_ss. Kernels without a `sigma_y` parameter are called as before.
    try:
        kernel_has_noise_arg = 'sigma_y' in inspect.signature(kernel).parameters
    except (TypeError, ValueError):
        kernel_has_noise_arg = False

    def noise_free(A, B):
        if kernel_has_noise_arg:
            return kernel(A, B, l, sigma_f, sigma_y=0.0)
        return kernel(A, B, l, sigma_f)

    # Training covariance with noise added along the matrix diagonal
    K = noise_free(X_train, X_train) + sigma_y**2 * np.eye(len(X_train))
    # Cross-covariance between training points and new inputs (m x n)
    K_s = noise_free(X_train, X_s)
    # Covariance of the new inputs, with a small jitter on the diagonal
    K_ss = noise_free(X_s, X_s) + 1e-8 * np.eye(len(X_s))

    # Solve linear systems with K instead of forming its explicit inverse.
    # Mean of the posterior predictive distribution
    mu_s = K_s.T.dot(np.linalg.solve(K, Y_train))

    # Covariance matrix of the posterior predictive distribution
    cov_s = K_ss - K_s.T.dot(np.linalg.solve(K, K_s))

    return mu_s, cov_s

In [7]:
def nll_full_fn(X_train, Y_train, naive=True):
    '''
    Returns a function that computes the negative log marginal
    likelihood for training data X_train and Y_train and an unknown
    noise level.

    The returned function takes theta = [l, sigma_f, sigma_y] and evaluates
    0.5 * log|K| + 0.5 * Y^T K^-1 Y + 0.5 * m * log(2 * pi), where K is
    RBF_noise_kernel(X_train, X_train) built with those parameters.

    Args:
        X_train: training locations (m x d).
        Y_train: training targets (m x 1).
        naive: if True use a direct implementation of the formula above, if
               False use a numerically more stable implementation.

    Returns:
        Minimization objective; for (m x 1) targets it returns a (1 x 1) array.
    '''
    from scipy.linalg import solve_triangular

    def nll_naive(theta):
        # Naive implementation of the equation for log marginal likelihood. Works well for simple examples
        # but is numerically less stable compared to
        # the implementation in nll_stable below.
        K = RBF_noise_kernel(X_train, X_train, l=theta[0], sigma_f=theta[1], sigma_y=theta[2])
        return 0.5 * np.log(det(K)) + \
               0.5 * Y_train.T.dot(inv(K).dot(Y_train)) + \
               0.5 * len(X_train) * np.log(2*np.pi)

    def nll_stable(theta):
        # Numerically more stable implementation of equation for log marginal likelihood as described
        # in http://www.gaussianprocess.org/gpml/chapters/RW2.pdf, Section
        # 2.2, Algorithm 2.1.
        K = RBF_noise_kernel(X_train, X_train, l=theta[0], sigma_f=theta[1], sigma_y=theta[2])
        L = cholesky(K)
        # K = L L^T, so K^-1 Y is obtained from two triangular solves:
        # forward substitution with L, then back substitution with L^T.
        forward = solve_triangular(L, Y_train, lower=True)
        alpha = solve_triangular(L.T, forward, lower=False)
        # 0.5 * log|K| equals the sum of the logs of the Cholesky factor's diagonal.
        return np.sum(np.log(np.diagonal(L))) + \
               0.5 * Y_train.T.dot(alpha) + \
               0.5 * len(X_train) * np.log(2*np.pi)

    if naive:
        return nll_naive
    else:
        return nll_stable

In [8]:
# Draw a 10 x 3 array of samples distributed uniformly on [0, 3).
np.random.uniform(0, 3, size=(10, 3))

array([[0.3269925 , 2.90431074, 1.87032437],
       [0.21828521, 2.69015686, 0.704197  ],
       [0.97760723, 0.44340326, 2.54062448],
       [2.23805631, 1.96205646, 1.5922256 ],
       [2.97053554, 2.4440606 , 1.30513295],
       [0.02356643, 1.05392543, 0.21336335],
       [0.18284152, 2.23634398, 0.76604553],
       [2.64453164, 2.99947593, 0.8744783 ],
       [1.92692148, 1.46257513, 1.30458259],
       [1.08338357, 2.28104967, 1.55716542]])

In [9]:
# Single L-BFGS-B fit of (l, sigma_f, sigma_y) starting from [1, 1, 1];
# each parameter is bounded below by 1e-5 and unbounded above.
res = minimize(
    nll_full_fn(X_train, Y_train),
    [1, 1, 1],
    method='L-BFGS-B',
    bounds=((1e-5, None), (1e-5, None), (1e-5, None)),
)

In [10]:
# Parameter vector returned by the optimizer, in the order (l, sigma_f, sigma_y).
res.x

array([1.44775204e+00, 1.33911646e+00, 1.00000000e-05])

In [11]:
# Minimize the negative log-likelihood w.r.t. parameters l, sigma_f, and sigma_y.
# The minimization is run several times with different random
# initializations to avoid local minima, and the best result is kept.
dim = 3
# Start from +inf so that the first run is always accepted; a finite initial
# threshold would silently discard every optimum above it and leave min_x as None.
min_val = np.inf
min_x = None
n_restarts = 20

# Find the best optimum by starting from n_restarts different random points.
for x0 in np.random.uniform(0, 2, size=(n_restarts, dim)):
    res = minimize(nll_full_fn(X_train, Y_train, naive= False), x0 = x0,
        bounds = ((1e-5, None), (1e-5, None),(1e-5, None)),
        method = 'L-BFGS-B')
    # Depending on the SciPy version, res.fun is either a Python float or a
    # size-1 array; normalize it to a plain float before comparing and storing.
    fun_val = np.asarray(res.fun).item()
    if fun_val < min_val:
        min_val = fun_val
        min_x = res.x



In [12]:
# Best parameter vector kept by the multi-start search, in the order (l, sigma_f, sigma_y).
min_x

array([1.44775361e+00, 1.33912055e+00, 1.00000000e-05])

array([[10.35523829]])

In [6]:
# Two small 2 x 2 matrices, written as nested lists.
a = [[1, 0], [1,1]]
b = [[2, 3], [1,2]]

In [8]:
# Matrix product of a and b.
np.matmul(a, b)

array([[2, 3],
       [3, 5]])