In [None]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

## Kernel Partial Least Squares Regression

Given two data blocks \( \mathbf{X} \) and \( \mathbf{Y} \), the kernel partial least squares (PLS) regression is a natural extension of PLS regression. The key steps involved in the PLS regression are:

1. $$ \mathbf{w} = \frac{\mathbf{X}^T \mathbf{u}}{\mathbf{u}^T \mathbf{u}} $$
2. $$ \mathbf{t} = \mathbf{Xw} $$
3. $$ \mathbf{c} = \frac{\mathbf{Y}^T \mathbf{t}}{\mathbf{t}^T \mathbf{t}} $$
4. $$ \mathbf{u} = \frac{\mathbf{Y}^T \mathbf{c}}{\mathbf{c}^T \mathbf{c}} $$
5. $$ \mathbf{p} = \frac{\mathbf{X}^T \mathbf{t}}{\mathbf{t}^T \mathbf{t}} $$
6. $$ \mathbf{q} = \frac{\mathbf{Y}^T \mathbf{u}}{\mathbf{u}^T \mathbf{u}} $$
7. $$ \mathbf{X} = \mathbf{X} - \mathbf{tp}^T $$
8. $$ \mathbf{Y} = \mathbf{Y} - \mathbf{tc}^T $$

These steps can also be expressed as:

$$ \mathbf{t} = \frac{\mathbf{X} \mathbf{X}^T \mathbf{u}}{\mathbf{u}^T \mathbf{u}} $$

$$ \mathbf{c} = \mathbf{Y}^T \mathbf{t} $$

$$ \mathbf{u} = \frac{\mathbf{Y}^T \mathbf{c}}{\mathbf{c}^T \mathbf{c}} $$

$$ \mathbf{X} = \mathbf{X} - \mathbf{tt}^T \mathbf{X} $$

$$ \mathbf{Y} = \mathbf{Y} - \mathbf{tc}^T $$

Using \( \Phi = \Phi(\mathbf{X}) \) instead of \( \mathbf{X} \), the equations become:

$$ \mathbf{t} = \frac{\Phi \Phi^T \mathbf{u}}{\mathbf{u}^T \mathbf{u}} $$

$$ \Phi = \Phi - \mathbf{tt}^T \Phi $$

Therefore, the key steps of kernel nonlinear iterative partial least squares (NIPALS) regression are as follows:

Given \( \Phi_0 = \Phi \) and the data block \( \mathbf{Y}_0 = \mathbf{Y} \):

1. Randomly initialize \( \mathbf{u} \).
2. $$ \mathbf{t} = \Phi \Phi^T \mathbf{u}, \quad \mathbf{t} \leftarrow \frac{\mathbf{t}}{\mathbf{t}^T \mathbf{t}} $$
3. $$ \mathbf{c} = \mathbf{Y}^T \mathbf{t} $$
4. $$ \mathbf{u} = \mathbf{Y}^T \mathbf{u}, \quad \mathbf{u} \leftarrow \frac{\mathbf{u}}{\mathbf{u}^T \mathbf{u}} $$


\textbf{Kernel NIPALS Regression Algorithm}

5. Repeat Steps 2–4 until convergence of \( t \).

6. Deflate the matrix:
$$
\mathbf{T} = (\mathbf{I} - \mathbf{t}\mathbf{t}^T)\mathbf{T}(\mathbf{I} - \mathbf{t}\mathbf{t}^T)^T.
$$

7. Deflate the matrix:
$$
\mathbf{Y} = \mathbf{Y} - \mathbf{t}\mathbf{c}^T.
$$

The kernel NIPALS regression is an iterative process: after extraction of the first component \( t_1 \), the algorithm starts again using the deflated matrices \( \mathbf{T} \) and \( \mathbf{Y} \) computed in Step 6 and Step 7, and repeat Steps 2–7 until the deflated matrix \( \mathbf{T} \) or \( \mathbf{Y} \) becomes a null matrix.

Once two matrices \( \mathbf{T} = [t_1, \ldots, t_p] \) and \( \mathbf{U} = [u_1 , \ldots, u_p] \) are found by using the NIPALS regression algorithm, then the matrix regression coefficients \( \mathbf{B} \) can be computed in the form similar to (6.9.26):
$$
\mathbf{B} = \mathbf{T}^0 \mathbf{U}(\mathbf{T}^T \mathbf{T}^0 \mathbf{U})^{-1} \mathbf{T}^T \mathbf{Y}^0.
$$

Then for a given new data block \( X_{\text{new}} \) and \( \mathbf{T}_{\text{new}} = \mathbf{T}(X_{\text{new}}) \), the unknown \( \mathbf{Y} \)-values can be predicted as:
$$
\hat{\mathbf{Y}}_{\text{new}} = \mathbf{T}_{\text{new}}\mathbf{B}.
$$


In [8]:
import numpy as np

def kernel_nipals(X, Y, n_components):
    # X: Input data matrix (n_samples, n_features)
    # Y: Output data matrix (n_samples, n_targets)
    # n_components: Number of components to extract
    
    n_samples, n_features = X.shape
    n_targets = Y.shape[1]
    
    # Initialize matrices to store results
    T = np.zeros((n_samples, n_components))
    P = np.zeros((n_features, n_components))
    C = np.zeros((n_targets, n_components))
    
    for i in range(n_components):
        # Step 1: Initialize t (randomly or with the first column of X)
        t = X[:, 0].reshape(-1, 1)  # Shape (n_samples, 1)
        
        while True:
            # Step 2: Compute u = (Y.T @ t) / (t.T @ t)
            t_t = t.T @ t  # Scalar value (1x1 matrix)
            u = (Y.T @ t) / t_t  # Shape (n_targets, 1)
            u = u / np.linalg.norm(u)  # Normalize u
            
            # Step 3: Compute new t = (X @ u) / (u.T @ u)
            u_u = u.T @ u  # Scalar value (1x1 matrix)
            t_new = (X @ u) / u_u  # Shape (n_samples, 1)
            
            # Check for convergence
            if np.linalg.norm(t_new - t) < 1e-10:
                t = t_new
                break
            t = t_new
        
        # Step 4: Deflation
        p = (X.T @ t) / t_t  # Shape (n_features, 1)
        c = (Y.T @ t) / t_t  # Shape (n_targets, 1)
        
        # Store components
        T[:, i] = t.flatten()
        P[:, i] = p.flatten()
        C[:, i] = c.flatten()
        
        # Deflate X and Y
        X -= t @ p.T  # X shape (n_samples, n_features), p.T shape (1, n_features)
        Y -= t @ c.T  # Y shape (n_samples, n_targets), c.T shape (1, n_targets)
    
    # Compute regression coefficients B
    T_T_T = T.T @ T  # Shape (n_components, n_components)
    T_T_Y = T.T @ Y  # Shape (n_components, n_targets)
    B = np.linalg.pinv(T_T_T) @ T_T_Y  # Shape (n_features, n_targets)
    
    return B, T, P, C

# Predict function
def predict(X_new, B):
    return X_new @ B  # Shape (n_samples, n_features) @ (n_features, n_targets)

# Example usage
if __name__ == "__main__":
    # Example input data
    X = np.array([[1, 2], [2, 3], [3, 4]])  # (3 samples, 2 features)
    Y = np.array([[1], [2], [3]])           # (3 samples, 1 target)
    
    # Number of components
    n_components = 1
    
    # Train the model
    B, T, P, C = kernel_nipals(X, Y, n_components)
    
    # Predict new data
    X_new = np.array([[4, 5]])  # (1 sample, 2 features)
    Y_pred = predict(X_new, B)
    
    print("Predicted Y:", Y_pred)


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 1 is different from 2)

### Laplacian Support Vector Machines (LapSVM)

**Problem Statement:**

Consider a set of labeled graph examples \(\{(x_i, y_i)\}_{i=1}^l\) and a set of unlabeled graph examples \(\{x_j\}_{j=l+1}^{l+u}\). The goal is to solve the following optimization problem for a semi-supervised SVM:

$$
f^* = \arg\min_f \left[ \frac{1}{l} \sum_{i=1}^l \left( 1 - y_i f(x_i) \right)_+ + \gamma_A \frac{1}{2} \|f\|_K^2 + \frac{(u - l)^2}{2} f^T L f \right]
$$

where:
- \(\left(1 - y_i f(x_i)\right)_+\) denotes the hinge loss for the \(i\)-th labeled example.
- \(\gamma_A\) is the regularization parameter for the kernel norm.
- \(L\) is the Laplacian matrix of the graph.

**Solution Representation:**

According to the extended representer theorem, the solution \(f^*\) can be expressed as:

$$
f^* = \sum_{i=1}^{l+u} \alpha_i^* K(x, x_i) + b
$$

**Primal Problem for \(\alpha\):**

Substituting \(f^*\) into the optimization problem and adding a bias term \(b\), the primal problem for optimizing \(\alpha\) can be written as:

$$
\begin{aligned}
\min_{\alpha, \xi} & \quad \frac{1}{2} \alpha^T \left( \gamma_A K + \frac{(u - l)^2}{2} KLK \right) \alpha + \sum_{i=1}^l \xi_i \\
\text{subject to} & \quad y_i \left( \sum_{j=1}^{l+u} \alpha_j K(x_i, x_j) + b \right) \geq 1 - \xi_i, \quad \forall i = 1, \ldots, l \\
& \quad \xi_i \geq 0, \quad \forall i = 1, \ldots, l
\end{aligned}
$$

**Lagrangian Formulation:**

The Lagrangian function incorporating the constraints is:

$$
\begin{aligned}
L(\alpha, \xi, b, \beta, \zeta) = & \frac{1}{2} \alpha^T \left( \gamma_A K + \frac{(u - l)^2}{2} KLK \right) \alpha + \sum_{i=1}^l \xi_i \\
& - \sum_{i=1}^l \beta_i \left( y_i \left( \sum_{j=1}^{l+u} \alpha_j K(x_i, x_j) + b \right) - 1 + \xi_i \right) \\
& - \sum_{i=1}^l \zeta_i \xi_i
\end{aligned}
$$

From the first-order optimization conditions, we get:

$$
\frac{\partial L}{\partial b} = 0 \implies \beta_i y_i = 0
$$

$$
\frac{\partial L}{\partial \xi_i} = 0 \implies \beta_i + \zeta_i = 1
$$

$$
0 \leq \beta_i \leq \zeta_i, \quad \text{where } \zeta_i \text{ and } \xi_i \text{ are nonnegative}
$$

**Reduced Lagrangian Function:**

The reduced Lagrangian function is:

$$
\begin{aligned}
L_R(\alpha, \beta) = & \frac{1}{2} \alpha^T \left( \gamma_A K + \frac{(u - l)^2}{2} KLK \right) \alpha - \sum_{i=1}^l \beta_i \left( y_i \left( \sum_{j=1}^{l+u} \alpha_j K(x_i, x_j) - 1 \right) \right) \\
& + \beta_i
\end{aligned}
$$

where \(J = [1, \ldots, 1, 0, \ldots, 0]\) is a \(1 \times (l + u)\) matrix with the first \(l\) entries as 1 and the rest as 0, and \(Y = \text{Diag}(y_1, \ldots, y_l)\).

From the first-order optimization condition:

$$
\frac{\partial L_R}{\partial \alpha} = \gamma_A K \alpha + \frac{(u - l)^2}{2} KLK \alpha - K J^T Y \beta = 0
$$


In [11]:
import numpy as np

def compute_kernel(X, gamma=1.0):
    """Compute the Gaussian RBF kernel matrix."""
    pairwise_dists = np.linalg.norm(X[:, None] - X, axis=2) ** 2
    K = np.exp(-gamma * pairwise_dists)
    return K

def compute_laplacian(K):
    """Compute the Laplacian matrix L from the kernel matrix K."""
    D = np.diag(K.sum(axis=1))
    L = D - K
    return L

def solve_linear_system(A, b):
    """Solve the linear system Ax = b."""
    return np.linalg.solve(A, b)

def laplacian_svm(X, y, gamma_A=1.0, C=1.0):
    """Train a Laplacian Support Vector Machine."""
    n_samples = X.shape[0]
    K = compute_kernel(X)
    L = compute_laplacian(K)
    
    # Define matrix for the quadratic term
    A = gamma_A * K + (C / n_samples) * L
    
    # Define vector for the linear term
    b = y
    
    # Solve the system A * alpha = b
    alpha = solve_linear_system(A, b)
    
    return alpha

# Example usage
if __name__ == "__main__":
    # Sample data
    X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
    y = np.array([1, -1, 1, -1])
    
    # Parameters
    gamma_A = 1.0
    C = 1.0
    
    # Train the LapSVM
    alpha = laplacian_svm(X, y, gamma_A=gamma_A, C=C)
    
    print("Optimal alpha:", alpha)


Optimal alpha: [ 1.07962249 -1.14805856  1.14805856 -1.07962249]


## Support Vector Machine Regression

Support Vector Machine Regression (SVR) is an algorithm that looks for an optimal hyperplane to predict continuous values in a high-dimensional space.

### 8.3.1 Support Vector Machine Regressor

Given a training set of \( N \) data points \( \{ (x_k, y_k) \}_{k=1}^N \), where \( x_k \in \mathbb{R}^n \) is the \( k \)-th input pattern and \( y_k \in \mathbb{R} \) is the associated "truth", we aim to find a hyperplane \( (w, b) \) that satisfies certain conditions.

The SVM learning algorithm seeks to minimize the following function:

$$
\min_{w, b} \quad \frac{1}{2} \|w\|^2
$$

subject to:

$$
y_i \left(w^T \phi(x_i) - b \right) \geq \epsilon
$$

where \( \phi : I \subseteq \mathbb{R}^n \rightarrow F \subseteq \mathbb{R}^N \) is a mapping from the input space \( I \) to the feature space \( F \), and \( \phi(x_i) \) is the extracted feature of the input \( x_i \).

Here, the distance between the point \( x_i \) and the decision boundary is given by:

$$
\text{Quality} = \left\langle w, \phi(x_i) \right\rangle - b
$$

The margin \( \gamma \) is defined as:

$$
\gamma = \frac{1}{N} \sum_{i=1}^N y_i \left(\left\langle w, \phi(x_i) \right\rangle - b \right)
$$

The constrained optimization problem can be rewritten in Lagrangian form as an unconstrained optimization problem:

$$
\min_{w, b} \quad L(w, b) = \frac{1}{2} \|w\|^2 - \sum_{i=1}^N \alpha_i \left[ y_i \left\langle w, \phi(x_i) \right\rangle - b \right]
$$

where the Lagrange multipliers \( \alpha_i \) are nonnegative.


## Optimization Conditions for Support Vector Machine Regression

From the optimization conditions, we have:

1. The gradient of the Lagrangian with respect to \( w \) is:

$$
\frac{\partial L(w, b)}{\partial w} = w - \sum_{i=1}^N \alpha_i y_i \phi(x_i) = 0 \quad \Rightarrow \quad w = \sum_{i=1}^N \alpha_i y_i \phi(x_i)
$$

2. The gradient of the Lagrangian with respect to \( b \) is:

$$
\frac{\partial L(w, b)}{\partial b} = -\sum_{i=1}^N \alpha_i y_i = 0
$$

### Constrained Optimization with Respect to \( \alpha \)

Substituting these results into the original constrained optimization problem, we obtain the following dual optimization problems:

#### Minimization Problem

$$
\min_{\alpha} \quad J_1(\alpha) = \sum_{i=1}^N \sum_{j=1}^N \alpha_i \alpha_j y_i y_j K(x_i, x_j) - \sum_{i=1}^N \alpha_i
$$

#### Maximization Problem

$$
\max_{\alpha} \quad J_2(\alpha) = \sum_{i=1}^N \alpha_i - \frac{1}{2} \sum_{i=1}^N \sum_{j=1}^N \alpha_i \alpha_j y_i y_j K(x_i, x_j)
$$

subject to:

$$
\sum_{i=1}^N \alpha_i y_i = 0
$$

and

$$
\alpha_i \geq 0, \quad i = 1, \ldots, N
$$

where \( K(x_i, x_j) = \langle \phi(x_i), \phi(x_j) \rangle = \phi(x_i)^T \phi(x_j) \) is the kernel function.

### Algorithm for Support Vector Machine Regression

1. Solve the maximization problem (8.3.8) with constraints (8.3.9) to obtain the Lagrange multipliers \( \alpha_i \), \( i = 1, \ldots, N \).
2. Update the bias \( b \) using:

$$
b \leftarrow b - \eta \sum_{i=1}^N \alpha_i y_i
$$

3. Calculate the support vector regressor \( w \) using:

$$
w = \sum_{i=1}^N \alpha_i y_i \phi(x_i)
$$


In [12]:
import numpy as np
from cvxopt import matrix, solvers

def compute_kernel(X, gamma=1.0):
    """Compute the Gaussian RBF kernel matrix."""
    pairwise_dists = np.linalg.norm(X[:, None] - X, axis=2) ** 2
    K = np.exp(-gamma * pairwise_dists)
    return K

def svr(X, y, C=1.0, epsilon=0.1, gamma=1.0):
    """Train a Support Vector Machine Regressor using quadratic programming."""
    N = X.shape[0]
    K = compute_kernel(X, gamma)
    
    # Define the parameters for quadratic programming
    H = np.outer(y, y) * K
    H = (H + H.T) / 2  # Ensure symmetry
    P = matrix(H)
    
    # Linear term
    q = np.ones(N) * -1
    q = matrix(q)
    
    # Constraints
    G = np.vstack([-np.eye(N), np.eye(N)])
    h = np.hstack([np.zeros(N), np.ones(N) * C])
    G = matrix(G)
    h = matrix(h)
    
    # Equality constraint: sum of alpha_i * y_i = 0
    A = np.array(y, dtype=float)
    A = matrix(A, (1, N))
    b = matrix(np.zeros(1))
    
    # Solve the quadratic programming problem
    solution = solvers.qp(P, q, G, h, A, b)
    alpha = np.array(solution['x']).flatten()
    
    # Compute the bias term
    support_vector_indices = alpha > 1e-5
    support_vectors = X[support_vector_indices]
    support_vector_labels = y[support_vector_indices]
    support_vector_alphas = alpha[support_vector_indices]
    
    K_sv = compute_kernel(support_vectors, gamma)
    b = np.mean(support_vector_labels - np.dot(K_sv, support_vector_alphas))
    
    # Compute the weights
    w = np.dot(alpha * y, K)
    
    return w, b, alpha

# Example usage
if __name__ == "__main__":
    # Sample data
    X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
    y = np.array([1, -1, 1, -1])
    
    # Parameters
    C = 1.0
    epsilon = 0.1
    gamma = 1.0
    
    # Train the SVR
    w, b, alpha = svr(X, y, C=C, epsilon=epsilon, gamma=gamma)
    
    print("Weights:", w)
    print("Bias:", b)
    print("Alpha:", alpha)


ModuleNotFoundError: No module named 'cvxopt'

In [13]:
import numpy as np

def compute_kernel(X, gamma=1.0):
    """Compute the Gaussian RBF kernel matrix."""
    pairwise_dists = np.linalg.norm(X[:, None] - X, axis=2) ** 2
    K = np.exp(-gamma * pairwise_dists)
    return K

def svr(X, y, C=1.0, epsilon=0.1, gamma=1.0, tol=1e-6, max_iter=1000):
    """Train a Support Vector Machine Regressor using a basic quadratic solver."""
    N = X.shape[0]
    K = compute_kernel(X, gamma)
    
    # Initialize alpha values
    alpha = np.zeros(N)
    b = 0
    
    # Define the function for the dual problem
    def objective(alpha):
        return 0.5 * np.sum(alpha[:, None] * alpha[None, :] * y[:, None] * y[None, :] * K) - np.sum(alpha)
    
    # Simple gradient ascent for solving dual problem
    for _ in range(max_iter):
        gradient = np.dot((alpha * y), K) - 1
        alpha += 0.01 * gradient  # Basic step size, needs tuning
        alpha = np.clip(alpha, 0, C)
        if np.linalg.norm(gradient) < tol:
            break
    
    # Calculate the bias term
    support_vector_indices = alpha > 1e-5
    support_vectors = X[support_vector_indices]
    support_vector_labels = y[support_vector_indices]
    support_vector_alphas = alpha[support_vector_indices]
    
    K_sv = compute_kernel(support_vectors, gamma)
    b = np.mean(support_vector_labels - np.dot(K_sv, support_vector_alphas))
    
    # Compute the weights
    w = np.dot(alpha * y, K)
    
    return w, b, alpha

# Example usage
if __name__ == "__main__":
    # Sample data
    X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
    y = np.array([1, -1, 1, -1])
    
    # Parameters
    C = 1.0
    epsilon = 0.1
    gamma = 1.0
    
    # Train the SVR
    w, b, alpha = svr(X, y, C=C, epsilon=epsilon, gamma=gamma)
    
    print("Weights:", w)
    print("Bias:", b)
    print("Alpha:", alpha)


Weights: [0. 0. 0. 0.]
Bias: nan
Alpha: [0. 0. 0. 0.]


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
