In [None]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

$$
\textbf{ν-Support Vector Machine Binary Classifier}
$$

$$
\text{The primal optimization problem of ν-SVC is described by:}
$$

$$
\min_{w, \xi, \rho} \quad \frac{1}{2} \lVert w \rVert^2 - \nu \rho + \sum_{i=1}^{N} \xi_i 
$$
$$
\text{subject to:} \quad y_i (w^T \phi(x_i) + b) \geq \rho - \xi_i, \quad \xi_i \geq 0 \quad (i = 1, \dots, N), \quad \rho \geq 0
$$

$$
\text{Let Lagrange multipliers} \, \alpha_i, \beta_i, \delta \geq 0, \text{ and the Lagrange function is:}
$$

$$
L(w, \xi, b, \rho, \alpha, \beta, \delta) = \frac{1}{2} \lVert w \rVert^2 - \nu \rho + \sum_{i=1}^{N} \xi_i - \delta \rho - \sum_{i=1}^{N} \alpha_i \left( y_i (w^T x_i + b) - \rho + \xi_i \right) + \sum_{i=1}^{N} \beta_i \xi_i
$$

$$
\text{The first-order optimization conditions give:}
$$

$$
\frac{\partial L}{\partial w} = 0 \quad \Rightarrow \quad w = \sum_{i=1}^{N} \alpha_i y_i x_i
$$

$$
\frac{\partial L}{\partial \xi_i} = 0 \quad \Rightarrow \quad \alpha_i + \beta_i = \frac{1}{N}
$$

$$
\frac{\partial L}{\partial b} = 0 \quad \Rightarrow \quad \sum_{i=1}^{N} \alpha_i y_i = 0
$$

$$
\frac{\partial L}{\partial \rho} = 0 \quad \Rightarrow \quad \sum_{i=1}^{N} \alpha_i - \delta = \nu
$$

$$
\text{The Wolfe dual optimization problem for ν-SVC is:}
$$

$$
\max_{\alpha} W(\alpha) = - \frac{1}{2} \sum_{i=1}^{N} \sum_{j=1}^{N} \alpha_i \alpha_j y_i y_j K(x_i, x_j)
$$

$$
\text{subject to:} \quad 0 \leq \alpha_i \leq \frac{1}{N}, \quad \sum_{i=1}^{N} \alpha_i y_i = 0, \quad \alpha_i \geq \nu
$$

$$
\text{To compute the parameters \( b \) and \( \rho \), define two sets} \, S^+ \, \text{and} \, S^-:
$$

$$
s_1 = |S^+| = |\{i \mid 0 < \alpha_i < 1, y_i = 1\}|
$$

$$
s_2 = |S^-| = |\{i \mid 0 < \alpha_i < 1, y_i = -1\}|
$$

$$
r_1 = \frac{1}{s_1} \sum_{x \in S^+} \sum_{j=1}^{N} \alpha_j y_j K(x, x_j)
$$

$$
r_2 = \frac{1}{s_2} \sum_{x \in S^-} \sum_{j=1}^{N} \alpha_j y_j K(x, x_j)
$$

$$
b = -\frac{r_1 - r_2}{2}, \quad \rho = \frac{r_1 + r_2}{2}
$$


In [1]:
# Importing necessary libraries
import numpy as np
from sklearn.svm import NuSVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate a random binary classification dataset
X, y = make_classification(n_samples=100, n_features=20, n_classes=2, random_state=42)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize ν-SVC with a kernel
nu = 0.5  # ν parameter
classifier = NuSVC(nu=nu, kernel='rbf', gamma='scale')

# Train the classifier
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Support Vectors
support_vectors = classifier.support_vectors_
print(f'Support Vectors:\n{support_vectors}')


Accuracy: 93.33%
Support Vectors:
[[-0.48760622  1.34700796 -2.03812454 ... -0.40807537 -0.32602353
   1.20121392]
 [-0.76325916  1.89033108 -0.65183611 ... -1.2110162  -1.66152006
  -0.0660798 ]
 [-1.22576566  0.80742726  0.65232288 ...  0.88365994 -0.03850847
  -0.1726273 ]
 ...
 [-0.9354387   0.15985512  0.7999419  ... -0.30317978 -0.3493168
  -0.01941961]
 [ 0.84064355  0.37531604 -0.96697614 ...  0.42545756  0.76041466
   0.78580016]
 [-0.42018682 -0.24038388  0.9843224  ... -0.99835404  0.23421473
   1.55050049]]


In [2]:
def kernel(x1, x2):
    return np.dot(x1, x2)  # Linear kernel
N = len(X_train)  # Number of training samples
alpha = np.zeros(N)  # Lagrange multipliers
b = 0  # Bias term
C = 1 / N  # Constant related to the number of samples
rho = 0  # Threshold parameter
def compute_objective(X, y, alpha, kernel):
    # Dual objective function
    total = 0
    for i in range(N):
        for j in range(N):
            total += alpha[i] * alpha[j] * y[i] * y[j] * kernel(X[i], X[j])
    return np.sum(alpha) - 0.5 * total
def update_alpha(X, y, alpha, C, kernel, tol=1e-4, max_passes=10):
    passes = 0
    while passes < max_passes:
        num_changed_alphas = 0
        for i in range(N):
            Ei = np.dot(alpha * y, kernel(X, X[i])) + b - y[i]
            if (y[i] * Ei < -tol and alpha[i] < C) or (y[i] * Ei > tol and alpha[i] > 0):
                j = np.random.randint(0, N)
                while j == i:
                    j = np.random.randint(0, N)
                Ej = np.dot(alpha * y, kernel(X, X[j])) + b - y[j]
                
                alpha_i_old = alpha[i]
                alpha_j_old = alpha[j]

                # Compute bounds L and H
                if y[i] != y[j]:
                    L = max(0, alpha[j] - alpha[i])
                    H = min(C, C + alpha[j] - alpha[i])
                else:
                    L = max(0, alpha[i] + alpha[j] - C)
                    H = min(C, alpha[i] + alpha[j])

                if L == H:
                    continue
                
                # Update alpha_j
                eta = 2 * kernel(X[i], X[j]) - kernel(X[i], X[i]) - kernel(X[j], X[j])
                if eta >= 0:
                    continue
                alpha[j] -= y[j] * (Ei - Ej) / eta
                alpha[j] = np.clip(alpha[j], L, H)
                
                if abs(alpha[j] - alpha_j_old) < tol:
                    continue

                # Update alpha_i
                alpha[i] += y[i] * y[j] * (alpha_j_old - alpha[j])

                # Compute b
                b1 = b - Ei - y[i] * (alpha[i] - alpha_i_old) * kernel(X[i], X[i]) - y[j] * (alpha[j] - alpha_j_old) * kernel(X[i], X[j])
                b2 = b - Ej - y[i] * (alpha[i] - alpha_i_old) * kernel(X[i], X[j]) - y[j] * (alpha[j] - alpha_j_old) * kernel(X[j], X[j])
                if 0 < alpha[i] < C:
                    b = b1
                elif 0 < alpha[j] < C:
                    b = b2
                else:
                    b = (b1 + b2) / 2

                num_changed_alphas += 1
        
        if num_changed_alphas == 0:
            passes += 1
        else:
            passes = 0

    return alpha, b
def predict(X, y, alpha, b, kernel, X_test):
    y_pred = []
    for i in range(len(X_test)):
        prediction = np.dot(alpha * y, [kernel(x, X_test[i]) for x in X]) + b
        y_pred.append(np.sign(prediction))
    return np.array(y_pred)
# Generate synthetic data
N = 100
X = np.random.randn(N, 2)
y = np.where(np.dot(X, np.array([2, -1])) > 0, 1, -1)

# Train ν-SVM
alpha, b = update_alpha(X, y, alpha, C, kernel)

# Test on some data points
X_test = np.random.randn(10, 2)
y_pred = predict(X, y, alpha, b, kernel, X_test)

print('Predictions:', y_pred)
# Fix it------- May be a small issue

ValueError: operands could not be broadcast together with shapes (70,) (100,) 

Standard SVMs are powerful tools for data classification by assigning them to one
of two disjoint halfspaces in either the original input space for linear classifiers,
or in a higher-dimensional feature space for nonlinear classifiers. The least squares
SVMs (LS-SVMs) developed in [37] and the proximal SVMs (PSVMs) presented in
[14, 15] are two much simpler classifiers, in which each class of points is assigned
to the closest of two parallel planes (in input or feature space) such that they are
pushed apart as far as possible.

The LS-SVMs formulate the binary classification problem as:

$$
\min_{\mathbf{w}, b, \mathbf{e}} \quad \frac{1}{2} \|\mathbf{w}\|^2 + \frac{C}{2} \sum_{k=1}^{N} e_k
$$
subject to:
$$
y_k (\mathbf{w}^T \phi(x_k) + b) = 1 - e_k, \quad k = 1, \dots, N
$$
for the binary classification \(y_k \in \{-1, 1\}\).

Define the Lagrange function (Lagrangian):
$$
L = L(\mathbf{w}, b, \mathbf{e}; \boldsymbol{\alpha}) = \frac{1}{2} \|\mathbf{w}\|^2 + \frac{C}{2} \sum_{k=1}^{N} e_k - \sum_{k=1}^{N} \alpha_k \left( y_k (\mathbf{w}^T \phi(x_k) + b) - 1 + e_k \right)
$$
where \(\alpha_k\) are Lagrange multipliers. Unlike Lagrange multipliers in SVM
with inequality constraints, in LS-SVM, Lagrange multipliers \(\alpha_k\) can be either
positive or negative due to the equality constraints.

Based on the KKT conditions, we get the optimality conditions as follows:

$$
\frac{\partial L}{\partial \mathbf{w}} = 0 \Rightarrow \mathbf{w} = \sum_{k=1}^{N} \alpha_k y_k \phi(x_k) \Rightarrow \mathbf{w} = Z \boldsymbol{\alpha}
$$
$$
\frac{\partial L}{\partial b} = 0 \Rightarrow \sum_{k=1}^{N} \alpha_k y_k = 0 \Rightarrow \mathbf{y}^T \boldsymbol{\alpha} = 0
$$
$$
\frac{\partial L}{\partial \mathbf{e}} = 0 \Rightarrow \alpha_k = C e_k, \quad k = 1, \dots, N \Rightarrow \boldsymbol{\alpha} = C \mathbf{e}
$$
$$
\frac{\partial L}{\partial \alpha_k} = 0 \Rightarrow y_k (\mathbf{w}^T \phi(x_k) + b) - 1 + e_k = 0, \quad k = 1, \dots, N \Rightarrow Z^T \mathbf{w} + b \mathbf{y} + \mathbf{e} = 1
$$

Where \(Z = [y_1 \phi(x_1), \dots, y_N \phi(x_N)] \in \mathbb{R}^{m \times N}\), \(\mathbf{y} = [y_1, \dots, y_N]^T\), \(\mathbf{1} = [1, \dots, 1]^T\), \(\mathbf{e} = [e_1, \dots, e_N]^T\), and \(\boldsymbol{\alpha} = [\alpha_1, \dots, \alpha_N]^T\).

The KKT conditions can be written as a matrix equation:

$$
\begin{bmatrix}
I & 0 & 0 & -Z \\
0 & 0 & 0 & -\mathbf{y}^T \\
0 & 0 & C I & I \\
-Z^T & -\mathbf{y} & I & 0
\end{bmatrix}
\begin{bmatrix}
\mathbf{w} \\
b \\
\mathbf{e} \\
\boldsymbol{\alpha}
\end{bmatrix}
=
\begin{bmatrix}
0 \\
0 \\
0 \\
1
\end{bmatrix}
$$

By eliminating \(\mathbf{w}\) and \(\mathbf{e}\), we simplify the KKT system as:

$$
\begin{bmatrix}
0 & \mathbf{y}^T \\
\mathbf{y} & Z^T Z + C^{-1} I
\end{bmatrix}
\begin{bmatrix}
b \\
\boldsymbol{\alpha}
\end{bmatrix}
=
\begin{bmatrix}
0 \\
\mathbf{1}
\end{bmatrix}
$$

Mercer’s condition can be applied to the matrix \(Z^T Z\):

$$
[Z^T Z]_{ij} = y_i y_j \phi(x_i)^T \phi(x_j) = y_i y_j K(x_i, x_j)
$$

Given a training set \(\{(x_i, y_i) \,|\, x_i \in \mathbb{R}^n, y_i \in \{-1, 1\}, i = 1, \dots, N \}\), constant \(C > 0\), and the kernel function \(K(x_i, x_j)\), the LS-SVM binary classification algorithm performs the following learning step:

- Construct the \(N \times N\) matrix \([Z^T Z]_{ij} = y_i y_j K(x_i, x_j)\).
- Solve the KKT matrix equation for \(\boldsymbol{\alpha} = [\alpha_1, \dots, \alpha_N]^T\) and \(b\).

In the testing step, for a given test sample \(x \in \mathbb{R}^n\), the decision function is:

$$
\text{class of } x = \text{sign}\left( \sum_{j=1}^{N} \alpha_j y_j K(x, x_j) + b \right)
$$


$$
b = \sum_{i=1}^{N} \alpha_i y_i
\tag{8.4.53}
$$

Here,

$$
Z = \begin{bmatrix} y_1 x_1, \dots, y_N x_N \end{bmatrix} \tag{8.4.54}
$$

$$
y = \begin{bmatrix} y_1, \dots, y_N \end{bmatrix}^T \tag{8.4.55}
$$

$$
\alpha = \begin{bmatrix} \alpha_1, \dots, \alpha_N \end{bmatrix}^T \tag{8.4.56}
$$

Similar to LS-SVM, the training data \( x \) can be mapped from the input space \( \mathbb{R}^n \) into the feature space \( \phi : x \to \phi(x) \). Hence, the nonlinear PSVM classifier still has the KKT equation (8.4.52) with:

$$
Z = \begin{bmatrix} y_1 \phi(x_1), \dots, y_N \phi(x_N) \end{bmatrix}
\tag{8.4.54}
$$

### Algorithm 8.1: PSVM Binary Classification Algorithm

**Input:**  
Training set \( \{(x_i, y_i) | x_i \in \mathbb{R}^n, y_i \in \{-1, 1\}, i = 1, \dots, N\} \), constant \( C > 0 \), and the kernel function \( K(x, x_i) \).

**Initialization:**  
\( y = \begin{bmatrix} y_1, \dots, y_N \end{bmatrix}^T \)

**Learning Step:**

1. Construct the \( N \times N \) matrix:
   $$
   [Z^T Z]_{ij} = y_i y_j \phi^T(x_i) \phi(x_j) = y_i y_j K(x_i, x_j)
   $$

2. Solve the KKT matrix equation (8.4.52) for:
   $$
   \alpha = \begin{bmatrix} \alpha_1, \dots, \alpha_N \end{bmatrix}^T
   $$

3. Compute:
   $$
   b = \sum_{i=1}^{N} \alpha_i y_i
   $$

**Testing Step:**  
For a given testing sample \( x \in \mathbb{R}^n \), its decision is given by:

$$
\text{class of } x = \text{sign} \left( \sum_{i=1}^{N} \alpha_i y_i K(x, x_i) + b \right)
$$

The decision functions of binary SVM, LS-SVM, and PSVM classifiers have the same form:

$$
f(x) = \text{sign} \left( \sum_{i=1}^{N} \alpha_i y_i K(x, x_i) + b \right) \tag{8.4.57}
$$

where \( y_i \) is the corresponding target class label of the training data \( x_i \), \( \alpha_i \) is the Lagrange multiplier, and \( K(x, x_i) \) is a suitable kernel function.

### Comparisons of SVM, LS-SVM, and PSVM

- **SVM**, **LS-SVM**, and **PSVM** are originally proposed for binary classification.
- LS-SVM and PSVM provide faster implementations of traditional SVMs.
- Both LS-SVM and PSVM use equality constraints instead of inequality constraints, avoiding quadratic programming and allowing a direct least-square solution.


In [4]:
import numpy as np

# Define a kernel function (linear or RBF kernel)
def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def rbf_kernel(x1, x2, sigma=1.0):
    return np.exp(-np.linalg.norm(x1 - x2)**2 / (2 * (sigma**2)))

# PSVM Implementation
class PSVM:
    def __init__(self, C=1.0, kernel=linear_kernel):
        self.C = C  # Regularization parameter
        self.kernel = kernel  # Kernel function
    
    # Fit the PSVM model
    def fit(self, X, y):
        N = X.shape[0]  # Number of samples
        # Compute the kernel matrix K
        K = np.array([[self.kernel(X[i], X[j]) for j in range(N)] for i in range(N)])

        # Construct matrix A and vector B
        A = np.vstack([np.hstack([K, np.ones((N, 1))]), np.hstack([y[:, np.newaxis].T, np.array([[0]])])])
        B = np.hstack([np.zeros(N), [0]])
        
        # Solve the linear system A * alpha = B
        result = np.linalg.solve(A, B)
        self.alpha = result[:-1]  # Alpha coefficients
        self.b = result[-1]  # Bias term

    # Predict the class of new data points
    def predict(self, X_test):
        y_pred = []
        for x in X_test:
            decision_value = sum(self.alpha[i] * y[i] * self.kernel(X[i], x) for i in range(len(X))) + self.b
            y_pred.append(np.sign(decision_value))
        return np.array(y_pred)

# Example usage
if __name__ == "__main__":
    # Example training data (binary classification)
    X = np.array([[1, 2], [2, 3], [3, 3], [4, 5], [1, 0], [0, 1]])  # Training features
    y = np.array([1, 1, 1, -1, -1, -1])  # Labels (+1 or -1)

    # Initialize and train PSVM
    psvm = PSVM(C=1.0, kernel=linear_kernel)  # Using linear kernel
    psvm.fit(X, y)

    # Predict on new data points
    X_test = np.array([[3, 2], [1, 1], [2, 2]])
    predictions = psvm.predict(X_test)

    print("Predictions:", predictions)


Predictions: [0. 0. 0.]
