In [None]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

# Gaussian Process for Regression and Classification

In this section, we discuss Gaussian process methods for regression and classification problems.

## 8.6 Gaussian Process for Regression and Classification

### 8.6.1 Joint, Marginal, and Conditional Probabilities

Let \( y_1, \ldots, y_n \) be \( n \) random variables with joint probability \( p(y_1, \ldots, y_n) \) or \( p(y) \) for short. We partition the variables into two groups, \( y_A \) and \( y_B \), where \( A \cup B = \{1, \ldots, n\} \) and \( A \cap B = \emptyset \), so that \( p(y) = p(y_A, y_B) \). The marginal probability of \( y_A \) is given by:

$$
p(y_A) = \int p(y_A, y_B) \, dy_B
$$

For discrete variables, the integral is replaced by a sum. If the joint distribution is equal to the product of the marginals, then the variables are said to be independent; otherwise, they are dependent.

The conditional probability function is defined, for \( p(y_B) > 0 \), as:

$$
p(y_A \mid y_B) = \frac{p(y_A, y_B)}{p(y_B)}
$$

Bayes' theorem relates these probabilities:

$$
p(y_A \mid y_B) = \frac{p(y_B \mid y_A) \, p(y_A)}{p(y_B)}
$$

### 8.6.2 Gaussian Process

A Gaussian Process (GP) generalizes the multivariate Gaussian distribution to functions. 

#### Multivariate Gaussian Distribution

For a multivariate Gaussian distribution, the joint probability density is given by:

$$
p(x \mid \mu, \Sigma) = \frac{1}{(2 \pi)^{N/2} \, |\Sigma|^{1/2}} \exp \left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right)
$$

where \( \mu \) is the mean vector, \( \Sigma \) is the covariance matrix, and \( N \) is the dimension of \( x \).

#### Gaussian Process Definition

A Gaussian Process \( f(x) \) is a collection of random variables, any finite number of which have joint Gaussian distributions. Formally, a Gaussian Process is defined as:

$$
f \sim \text{GP}(\mu, K)
$$

where \( \mu(x) = E[f(x)] \) is the mean function and \( K(x, x') \) is the covariance function.

For a vector-valued Gaussian process, we have:

$$
\mu(x) = E[f(x)]
$$

$$
K(x_i, x_j) = E\left[(f(x_i) - \mu(x_i))^T (f(x_j) - \mu(x_j))\right]
$$

and the Gaussian process is expressed as:

$$
f(x) \sim \text{GP}(\mu, \Sigma)
$$

where the \((i, j)\)-th element of the covariance matrix \( \Sigma \) is:

$$
\Sigma_{ij} = K(x_i, x_j)
$$

The covariance function \( K(x_i, x_j) \), also known as the kernel function, plays a crucial role in Gaussian processes: it encodes the structure present in the data. According to Mercer’s theorem, any symmetric positive semi-definite kernel can be written as an inner product of feature maps \( \phi(\cdot) \):

$$
K(x_i, x_j) = \phi(x_i)^T \phi(x_j)
$$

In [2]:
import numpy as np

class GaussianProcessRegressor:
    """Gaussian-process regression with a zero prior mean.

    Parameters
    ----------
    kernel : callable
        Called as ``kernel(A, B)``; its result is used as the covariance
        matrix between the rows of ``A`` and the rows of ``B``.
    sigma_n : float, default 1e-5
        Noise level. It is *squared* before being added to the diagonal of
        the training covariance, i.e. it acts as a noise standard deviation.

    Attributes set by ``fit``
    -------------------------
    X_train, y_train : the training data exactly as passed in.
    K : training covariance including the ``sigma_n**2`` diagonal term.
    L : lower Cholesky factor of ``K``.
    alpha : solution of ``K @ alpha = y_train``.
    """

    def __init__(self, kernel, sigma_n=1e-5):
        self.kernel = kernel
        self.sigma_n = sigma_n  # noise standard deviation (squared in fit)

    def fit(self, X_train, y_train):
        """Factorise the noisy training covariance and solve for ``alpha``."""
        self.X_train, self.y_train = X_train, y_train

        gram = self.kernel(X_train, X_train)
        noise_diag = self.sigma_n**2 * np.eye(len(X_train))
        self.K = gram + noise_diag
        self.L = np.linalg.cholesky(self.K)

        # Two solves against the Cholesky factor: L z = y, then L^T alpha = z.
        forward = np.linalg.solve(self.L, y_train)
        self.alpha = np.linalg.solve(self.L.T, forward)

    def predict(self, X_test):
        """Return the posterior mean and covariance at ``X_test``.

        Returns
        -------
        (mean, cov)
            ``mean`` is ``K_s^T alpha``; ``cov`` is
            ``K_ss - v^T v`` with ``v = L^{-1} K_s``, where ``K_ss`` carries
            an extra ``1e-8`` on its diagonal.
        """
        cross_cov = self.kernel(self.X_train, X_test)
        test_cov = self.kernel(X_test, X_test) + 1e-8 * np.eye(len(X_test))

        posterior_mean = cross_cov.T @ self.alpha

        whitened = np.linalg.solve(self.L, cross_cov)
        posterior_cov = test_cov - whitened.T @ whitened

        return posterior_mean, posterior_cov

def rbf_kernel(X1, X2, length_scale=1.0):
    """Squared-exponential (RBF) kernel matrix between two point sets.

    Computes ``K[i, j] = exp(-||X1[i] - X2[j]||^2 / (2 * length_scale**2))``.

    Parameters
    ----------
    X1 : array-like of shape (n1, d)
    X2 : array-like of shape (n2, d)
    length_scale : float, default 1.0
        Kernel width; must be non-zero.

    Returns
    -------
    ndarray of shape (n1, n2)
    """
    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    sqdist = np.sum(X1**2, 1).reshape(-1, 1) + np.sum(X2**2, 1) - 2 * np.dot(X1, X2.T)
    # The expanded form |a|^2 + |b|^2 - 2 a.b can come out slightly negative
    # through floating-point cancellation; clip so no kernel value exceeds 1.
    sqdist = np.maximum(sqdist, 0.0)
    return np.exp(-0.5 / length_scale**2 * sqdist)


In [3]:
from scipy.special import expit
from scipy.optimize import minimize

class GaussianProcessClassifier:
    """Binary Gaussian-process classifier (logistic link, Laplace approximation).

    The latent function ``f`` has a zero-mean GP prior with covariance given by
    ``kernel``. The posterior over the training latents is approximated by a
    Gaussian centred at its mode, which is found with Newton iterations
    (Rasmussen & Williams, "Gaussian Processes for Machine Learning",
    Algorithms 3.1 and 3.2).

    Parameters
    ----------
    kernel : callable
        Called as ``kernel(A, B)``; must return the covariance matrix between
        the rows of ``A`` and the rows of ``B``.
    max_iter : int, default 100
        Maximum number of Newton iterations used by ``fit``.
    tol : float, default 1e-10
        ``fit`` stops once the Newton objective changes by less than this.

    Attributes set by ``fit``
    -------------------------
    X_train, y_train : the training data (``y_train`` exactly as passed in).
    n_samples : number of training points.
    K : prior covariance of the training inputs.
    K_inv : inverse of ``K`` plus a 1e-6 diagonal jitter. Kept only for
        backward compatibility; it is not used by any computation here.
    alpha : representer weights at the posterior mode, so that the latent
        mode is ``K @ alpha`` and the latent test mean is ``K_s.T @ alpha``.
    """

    def __init__(self, kernel, max_iter=100, tol=1e-10):
        self.kernel = kernel
        self.max_iter = max_iter
        self.tol = tol

    @staticmethod
    def _log_likelihood(f, targets):
        """Return sum_i log p(y_i | f_i) for 0/1 ``targets`` and a logistic link."""
        # log sigmoid(f) = f - log(1 + e^f); logaddexp keeps this overflow-safe.
        return float(np.sum(targets * f - np.logaddexp(0.0, f)))

    @staticmethod
    def _chol_solve(L, rhs):
        """Solve ``(L @ L.T) x = rhs`` given the lower Cholesky factor ``L``."""
        return np.linalg.solve(L.T, np.linalg.solve(L, rhs))

    def _factorize(self, f):
        """Return ``(pi, sqrt_W, L)`` evaluated at latent values ``f``.

        ``pi = sigmoid(f)``, ``W = pi * (1 - pi)`` and ``L`` is the Cholesky
        factor of ``B = I + W^{1/2} K W^{1/2}``. Every eigenvalue of ``B`` is
        at least 1, so no jitter is required.
        """
        pi = expit(f)
        sqrt_W = np.sqrt(pi * (1.0 - pi))
        B = np.eye(self.n_samples) + sqrt_W[:, None] * self.K * sqrt_W[None, :]
        return pi, sqrt_W, np.linalg.cholesky(B)

    def fit(self, X_train, y_train):
        """Find the mode of the latent posterior for the given training set.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)
            Binary labels, either in {0, 1} or in {-1, +1}.

        Raises
        ------
        ValueError
            If the labels are not binary or the lengths do not match.
        """
        X_train = np.asarray(X_train)
        labels = np.asarray(y_train).ravel()
        if labels.shape[0] != X_train.shape[0]:
            raise ValueError("X_train and y_train must have the same number of samples")

        distinct = set(np.unique(labels).tolist())
        if distinct <= {0, 1}:
            targets = labels.astype(float)
        elif distinct <= {-1, 1}:
            targets = (labels > 0).astype(float)
        else:
            raise ValueError("y_train must contain binary labels in {0, 1} or {-1, +1}")

        self.X_train = X_train
        self.y_train = y_train
        self.n_samples = X_train.shape[0]
        self.K = self.kernel(X_train, X_train)
        # Backward-compatibility attribute only; jitter keeps the inverse finite.
        self.K_inv = np.linalg.inv(self.K + 1e-6 * np.eye(self.n_samples))
        self._targets = targets

        # Newton iterations on Psi(f) = log p(y | f) - 0.5 * f^T K^{-1} f,
        # parameterised through a with f = K a so K is never inverted.
        f = np.zeros(self.n_samples)
        a = np.zeros(self.n_samples)
        previous = -np.inf
        for _ in range(self.max_iter):
            pi, sqrt_W, L = self._factorize(f)
            b = sqrt_W**2 * f + (targets - pi)
            a = b - sqrt_W * self._chol_solve(L, sqrt_W * (self.K @ b))
            f = self.K @ a
            objective = -0.5 * (a @ f) + self._log_likelihood(f, targets)
            if abs(objective - previous) < self.tol:
                break
            previous = objective

        self.alpha = a
        # Cache the factorisation at the mode for predictive variances.
        _, self._sqrt_W, self._L = self._factorize(f)

    def predict_proba(self, X_test):
        """Return class-1 probabilities and the latent covariance at ``X_test``.

        Returns
        -------
        (probs, f_var)
            ``probs`` has shape (n_test,) and is the sigmoid of the latent
            predictive mean (the MAP prediction). ``f_var`` has shape
            (n_test, n_test) and is the Laplace-approximate covariance of the
            latent function, ``K_ss - K_s^T (K + W^{-1})^{-1} K_s``.
        """
        K_s = self.kernel(self.X_train, X_test)
        f_mean = K_s.T @ self.alpha
        v = np.linalg.solve(self._L, self._sqrt_W[:, None] * K_s)
        f_var = self.kernel(X_test, X_test) - v.T @ v

        return expit(f_mean), f_var

    def neg_log_marginal_likelihood(self, alpha=None):
        """Laplace-approximate negative log marginal likelihood.

        Parameters
        ----------
        alpha : array-like of shape (n_samples,), optional
            Representer weights defining the latent values ``f = K @ alpha``.
            Defaults to the fitted ``self.alpha``.

        Returns
        -------
        float
            ``0.5 * alpha^T K alpha - log p(y | f) + sum(log(diag(L)))`` with
            ``L`` the Cholesky factor of ``I + W^{1/2} K W^{1/2}`` at ``f``.
        """
        weights = self.alpha if alpha is None else np.asarray(alpha, dtype=float)
        f = self.K @ weights
        _, _, L = self._factorize(f)
        log_q = (
            -0.5 * (weights @ f)
            + self._log_likelihood(f, self._targets)
            - np.sum(np.log(np.diagonal(L)))
        )
        return float(-log_q)

def rbf_kernel(X1, X2, length_scale=1.0):
    """Squared-exponential (RBF) kernel matrix between two point sets.

    Computes ``K[i, j] = exp(-||X1[i] - X2[j]||^2 / (2 * length_scale**2))``.

    NOTE: a function with this same name and signature is already defined in
    an earlier cell of this notebook; this later definition shadows it.

    Parameters
    ----------
    X1 : array-like of shape (n1, d)
    X2 : array-like of shape (n2, d)
    length_scale : float, default 1.0
        Kernel width; must be non-zero.

    Returns
    -------
    ndarray of shape (n1, n2)
    """
    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    sqdist = np.sum(X1**2, 1).reshape(-1, 1) + np.sum(X2**2, 1) - 2 * np.dot(X1, X2.T)
    # The expanded form |a|^2 + |b|^2 - 2 a.b can come out slightly negative
    # through floating-point cancellation; clip so no kernel value exceeds 1.
    sqdist = np.maximum(sqdist, 0.0)
    return np.exp(-0.5 / length_scale**2 * sqdist)
# --- Synthetic regression data: y = sin(x) sampled at x = 1, ..., 5 ---
X_train = np.arange(1, 6).reshape(-1, 1)
y_train = np.sin(X_train).ravel()

# --- GP regression: fit, then evaluate on a dense grid over [0, 6] ---
gp = GaussianProcessRegressor(kernel=rbf_kernel)
gp.fit(X_train, y_train)
X_test = np.linspace(0, 6, 100).reshape(-1, 1)
mu_s, cov_s = gp.predict(X_test)

# --- GP classification: same inputs, labels switch from 0 to 1 at x = 3 ---
X_train_cls = np.arange(1, 6).reshape(-1, 1)
y_train_cls = np.array([0] * 2 + [1] * 3)  # Binary classification
gp_cls = GaussianProcessClassifier(kernel=rbf_kernel)
gp_cls.fit(X_train_cls, y_train_cls)
X_test_cls = np.linspace(0, 6, 100).reshape(-1, 1)
probs, _ = gp_cls.predict_proba(X_test_cls)  # latent covariance is discarded
