In [None]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

# SVM-Recursive Feature Elimination (SVM-RFE)

The decision function for SVM is defined as a linear discriminant function:

$$ D(x) = \langle w, x \rangle + b, \tag{8.4.58} $$

where \( w \) is the weight vector and \( b \) is the bias term.

The optimal hyperplane \( (w, b) \) is constructed such that:

$$ w^\top_{\text{opt}} x + b_{\text{opt}} = 0, \tag{8.4.59} $$

which separates a set of training data \( (x_1, y_1), \ldots, (x_n, y_n) \). Vectors \( x_i \) such that:

$$ y_i (w^\top x_i + b) = 1 $$

are termed **support vectors**. The constrained optimization problem for finding the optimal weight vector \( w \) is formulated as:

$$ \min_{w,b} f(w,b) = \frac{1}{2} \|w\|_2^2, \tag{8.4.60} $$

subject to:

$$ y_i (x_i^\top w + b) \geq 1, \quad i = 1, \ldots, n, \tag{8.4.61} $$

where the inequality constraint ensures that \( x \) corresponds to support vectors.

### Lagrangian Form

The above constrained optimization problem can be written in the Lagrangian form as:

$$ \min_{w,b,\alpha} L(w, b, \alpha) = \frac{1}{2} \|w\|_2^2 - \sum_{i=1}^{n} \alpha_i \left[ y_i (x_i^\top w + b) - 1 \right], \tag{8.4.62} $$

where \( \alpha = [\alpha_1, \ldots, \alpha_n]^\top \) is the vector of non-negative Lagrange multipliers \( \alpha_i \geq 0 \), \( i = 1, \ldots, n \).

### Optimization Conditions

The optimization conditions for \( w \) and \( b \) are given by:

$$ \frac{\partial L(w, b, \alpha)}{\partial w} = w - \sum_{i=1}^{n} \alpha_i y_i x_i = 0 \quad \Rightarrow \quad w = \sum_{i=1}^{n} \alpha_i y_i x_i, \tag{8.4.63} $$

$$ \frac{\partial L(w, b, \alpha)}{\partial b} = \sum_{i=1}^{n} \alpha_i y_i = 0, \tag{8.4.64} $$

Substituting these into Equation (8.4.62) yields:

$$ \min_{\alpha} J(\alpha) = \frac{1}{2} \sum_{i=1}^{n} \sum_{j=1}^{n} \alpha_i \alpha_j y_i y_j x_i^\top x_j - \sum_{i=1}^{n} \alpha_i, $$

subject to \( 0 \leq \alpha \leq C \mathbf{1} \) and \( \alpha^\top y = 0 \). \( (8.4.65) \)

For nonlinear binary classification, this becomes:

$$ \min_{\alpha} J(\alpha) = \frac{1}{2} \sum_{i=1}^{n} \sum_{j=1}^{n} \alpha_i \alpha_j y_i y_j \phi(x_i)^\top \phi(x_j) - \sum_{i=1}^{n} \alpha_i, \tag{8.4.66} $$

subject to the same constraints.

### SVM-RFE Algorithm

The goal of the SVM-Recursive Feature Elimination (SVM-RFE) algorithm is to find a subset of features that maximizes the performance of the predictor. The algorithm works as follows:

$$ \|\mathbf{w}\|_2^2 - \|\mathbf{w}^{(i)}\|_2^2 = 2 \sum_{j=1}^{d} \sum_{k=1}^{d} \alpha_j \alpha_k y_j y_k \left[ K(x_j, x_k) - K^{(i)}(x_j, x_k) \right], \tag{8.4.67} $$

where \( K^{(i)}(x_j, x_k) = \langle \phi(x_j^{(i)}), \phi(x_k^{(i)}) \rangle \) is the Gram matrix of the training data with feature \( i \) removed.

### Algorithm Steps

1. **Input:** Training data \( X = \{x_1, \ldots, x_d\} \), class labels \( Y = \{y_1, \ldots, y_d\} \), and expected feature number \( r \).
2. **Initialization:** Index subset of surviving features \( S = \{1, \ldots, d\} \).
3. **Repeat:**
   - Restrict training examples to good feature indices \( (X, Y) \).
   - Solve Equation (8.4.65) or (8.4.66) for the classifier \( \alpha \).
   - Compute the weight vector of dimension \( m = \text{length}(S) \) as:
   
     $$ w = \sum_{k=1}^{m} \alpha_k y_k x_k, $$

   - Compute the ranking criterion \( c_i = (w_i)^2 \) for all \( i = 1, \ldots, m \).
   - Find the feature index with the smallest ranking criterion:
   
     $$ i = \arg \min \{c_1, \ldots, c_m\}, $$

   - Eliminate the variable \( i \) with the smallest ranking criterion and update \( X \leftarrow X \setminus x_i \), \( Y \leftarrow Y \setminus y_i \), and \( S \leftarrow S \setminus i \).
4. **Until:** \( \text{length}(S) = r \).
5. **Output:** Feature ranked list \( X \).

This process iteratively removes the least important features based on their ranking criterion until only the top \( r \) features remain.


In [1]:
# Simple Python implementation of SVM-RFE without using libraries like NumPy or scikit-learn

import random

# SVM Decision function
def decision_function(w, x, b):
    return sum(w[i] * x[i] for i in range(len(w))) + b

# Dot product between two vectors
def dot_product(v1, v2):
    return sum(v1[i] * v2[i] for i in range(len(v1)))

# Training SVM (simplified version)
def train_svm(X, Y, C=1.0):
    n = len(X)
    d = len(X[0])
    
    # Initialize weights and bias
    w = [0.0] * d
    b = 0.0
    alpha = [0.0] * n  # Lagrange multipliers

    # Simplified Gradient Descent for optimization (simplified for small datasets)
    for epoch in range(100):  # max iterations
        for i in range(n):
            if Y[i] * decision_function(w, X[i], b) <= 1:
                for j in range(d):
                    w[j] += C * Y[i] * X[i][j]
                b += C * Y[i]
    
    return w, b

# SVM-RFE algorithm
def svm_rfe(X, Y, r):
    # X is the feature matrix, Y is the labels
    n = len(X)
    d = len(X[0])

    # Feature indices that survive the elimination
    S = list(range(d))

    while len(S) > r:
        # Train SVM
        X_reduced = [[X[i][j] for j in S] for i in range(n)]
        w, b = train_svm(X_reduced, Y)

        # Compute ranking criterion c_i = (w_i)^2
        ranking_criteria = [(i, w[idx] ** 2) for idx, i in enumerate(S)]

        # Find feature with smallest ranking criterion
        i_min = min(ranking_criteria, key=lambda x: x[1])[0]

        # Remove the feature with smallest ranking criterion
        S.remove(i_min)

    return S

# Example usage
if __name__ == "__main__":
    # Example dataset (small and simple for demo purposes)
    X = [
        [2.0, 1.0, 3.0],
        [1.0, 4.0, 1.0],
        [2.0, 3.0, 4.0],
        [5.0, 4.0, 2.0],
        [6.0, 3.0, 3.0],
        [7.0, 2.0, 5.0]
    ]
    Y = [1, -1, 1, -1, 1, -1]

    # Reduce to top 2 features
    r = 2
    surviving_features = svm_rfe(X, Y, r)

    print("Surviving feature indices after SVM-RFE:", surviving_features)


Surviving feature indices after SVM-RFE: [0, 2]


# Support Vector Machine Multiclass Classification

A multiclass classifier is a function $ H: X \to Y $ that maps an instance $ x \in X $ (for example, $ X = \mathbb{R}^n $) into an element $ y \in Y $ (for example, $ y \in \{1, \ldots, k\} $).

## Decomposition Methods for Multiclass Classification

A popular way to solve a \(k\)-class problem is to decompose it into a set of \(L\) binary classification problems. Three of the most common decomposition approaches are:

- **One-Versus-All (OVA)** (or One-Against-All)
- **One-Versus-One (OVO)** (or One-Against-One)

### One-Against-All Method

In the one-against-all (OVA) approach, we construct \( L = k \) binary classifiers $ C_m $,  m = 1, $\ldots$, k . The \( i \)-th SVM is trained with all of the examples in the \( i \)-th class as positive labels, and all other examples as negative labels.

Let  S = {($x_1$, $y_1$), $\ldots$, ($x_N$, $y_N$)}  be a set of \( N \) training examples, where each $ x_i $ is drawn from a domain $ X \subseteq \mathbb{R}^n $ and $ y_i \in \{1, $\ldots$, k\} $ represents the class of $ x_i $.

The optimization problem for the \( m \)-th one-against-all classifier is formulated as:

$$
\min_{w_m, b_m, \xi_m} \frac{1}{2} \|w_m\|^2 + C \sum_{i=1}^{N} \xi_{m,i}
$$
subject to:
$$
w_m^T \phi(x_i) + b_m \geq 1 - \xi_{m,i}, \quad \text{if } y_i = m,
$$
$$
w_m^T \phi(x_i) + b_m \leq -1 + \xi_{m,i}, \quad \text{if } y_i \neq m,
$$
$$
\xi_{m,i} \geq 0, \quad i = 1, \ldots, N,
$$
where \( w_m \) is the weight vector for the \( m \)-th class, \( C \) is the penalty parameter, and \( \xi_{m,i} \) represents the training error for the \( i \)-th data point. 

The decision function is represented as:

$$
f(x) = \arg \max_{m=1,\ldots,k} \left( w_m^T \phi(x) + b_m \right)
$$

### One-Against-One Method

In the one-against-one (OVO) method, we construct $ L = \frac{k(k-1)}{2} $ binary classifiers, each distinguishing between two classes. For every pair of classes \( i \) and \( j \), a binary classifier is trained using the examples from the \( i \)-th class as positive and the examples from the \( j \)-th class as negative. This results in \( k(k-1)/2 \) classifiers.

For training data from the \( i \)-th and \( j \)-th classes, the optimization problem is:

$$
\min_{w^{ij}, b^{ij}, \xi^{ij}} \frac{1}{2} \left( w^{ij} \right)^T w^{ij} + C \sum_{n=1}^{N} \xi_n^{ij}
$$
subject to:
$$
\left( w^{ij} \right)^T \phi(x_n) + b^{ij} \geq 1 - \xi_n^{ij}, \quad \text{if } y_n = i,
$$
$$
\left( w^{ij} \right)^T \phi(x_n) + b^{ij} \leq -1 + \xi_n^{ij}, \quad \text{if } y_n = j,
$$
$$
\xi_n^{ij} \geq 0, \quad n = 1, \ldots, N.
$$

After training, the "Max Wins" voting strategy is used. Each binary classifier $ C_{ij} $ votes for either the \( i \)-th or \( j \)-th class. The final class of the new instance is predicted as the class with the most votes.

### References
- [1] Support Vector Machines, [21], [41], [48].
- [2] One-Versus-One Method [13], [25].


In [2]:
import numpy as np

class SVM_OVA:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.classes = None
        self.weights = None
        self.biases = None

    def fit(self, X, y):
        # One-Versus-All: Get unique class labels
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        n_samples, n_features = X.shape

        # Initialize weights and biases for each class
        self.weights = np.zeros((n_classes, n_features))
        self.biases = np.zeros(n_classes)

        for idx, c in enumerate(self.classes):
            # Convert labels to +1/-1 for binary classification for class c
            binary_y = np.where(y == c, 1, -1)
            self.weights[idx], self.biases[idx] = self._train(X, binary_y)

    def _train(self, X, y):
        n_samples, n_features = X.shape
        # Initialize weights and bias
        w = np.zeros(n_features)
        b = 0

        # Gradient Descent
        for _ in range(self.n_iters):
            for i in range(n_samples):
                condition = y[i] * (np.dot(X[i], w) - b) >= 1
                if condition:
                    # L2 regularization term
                    w -= self.learning_rate * (2 * self.lambda_param * w)
                else:
                    # Update weights and bias
                    w -= self.learning_rate * (2 * self.lambda_param * w - np.dot(X[i], y[i]))
                    b -= self.learning_rate * y[i]

        return w, b

    def predict(self, X):
        n_samples = X.shape[0]
        predictions = np.zeros(n_samples)

        # Predict using each classifier
        for idx, (w, b) in enumerate(zip(self.weights, self.biases)):
            linear_output = np.dot(X, w) + b
            predictions += (linear_output >= 0) * (self.classes[idx])

        return predictions

# Sample Usage
if __name__ == "__main__":
    # Sample dataset (4 samples, 2 features)
    X = np.array([[1, 2], [2, 3], [3, 4], [5, 6]])
    y = np.array([0, 1, 0, 1])  # Two classes: 0 and 1

    # Create and train the model
    svm = SVM_OVA(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
    svm.fit(X, y)

    # Predict on new samples
    predictions = svm.predict(X)
    print("Predictions:", predictions)


Predictions: [1. 1. 1. 1.]


# DAGSVM Method

The Directed Acyclic Graph Support Vector Machine (DAGSVM) method is a variation of multiclass classification. It is called the DAGSVM method because, during the testing phase, it uses a **rooted binary directed acyclic graph** (DAG), while its training phase is the same as the one-against-one method.

## Training Phase

The training phase is identical to the one-against-one (OAO) method, solving \( \frac{k(k-1)}{2} \) binary SVM classification problems for a \(k\)-class problem.

## Testing Phase

In the testing phase, instead of using a voting strategy like OVO, the DAGSVM method uses a **directed acyclic graph** (DAG). The DAG is rooted, with internal nodes and leaves. Each internal node corresponds to a binary SVM classifier for two classes, and the leaves correspond to the final class labels.

For a \(k\)-class classification problem, a rooted binary DAG has \(k\) leaves and \( \frac{k(k-1)}{2} \) internal nodes. Each node is a binary SVM classifier of two classes, and the graph is structured such that the nodes eliminate one class at a time from the list of possible classes until only one remains.

### Definition: Decision Directed Acyclic Graph (DDAG)

A **decision directed acyclic graph (DDAG)** is a graph whose edges have orientations and no cycles. It is defined as follows:

Given a space \(X\) and a set of Boolean functions \( F = \{f: X \to \{0, 1\} \} \), the decision directed acyclic graphs (DDAGs) on \(k\) classes over \(F\) are functions that can be implemented using a rooted binary DAG with \(k\) leaves labeled by the classes. Each of the \(L = \frac{k(k-1)}{2}\) internal nodes is labeled with an element of \(F\).

The nodes are arranged in a triangular structure:
- The root node is at the top.
- The second layer has two nodes.
- The third layer has three nodes, and so on until the final layer has \(k\) leaves.

Each internal node represents a binary SVM decision between two classes. The testing process eliminates classes from the list based on the decision of the nodes, eventually leaving only one class.

### DDAG Testing Procedure

1. Start with the root node.
2. The binary decision at each node eliminates one class from the list.
3. Continue evaluating the first and last elements of the updated list until only one class remains.

Thus, for a \(k\)-class problem, \(k-1\) decision nodes are evaluated to determine the final class.

### Example 8.1

Consider \(N\) test samples \( \{x_i, y_i\} \), \( i = 1, \dots, N \), where \(x_i \in \mathbb{R}^n\) and \(y_i \in \{1, 2, 3, 4\}\). The DDAG method proceeds as follows:

- Start with the list of classes \(\{1, 2, 3, 4\}\).
- Evaluate the decision node corresponding to the first and last classes, \(1\) vs. \(4\).
- If class \(1\) wins, eliminate class \(4\), leaving the list \(\{1, 2, 3\}\).
- Evaluate the decision node \(1\) vs. \(3\). If class \(1\) wins again, eliminate class \(3\).
- Continue until only one class remains.

This decision process is shown below:

$$
\text{Decision Directed Acyclic Graph (DDAG) for } k=4:
$$

\[
\begin{aligned}
1 \text{ vs } 4 &\rightarrow \text{not } 1 \quad & \{2, 3, 4\} \\
2 \text{ vs } 4 &\rightarrow \text{not } 2 \quad & \{3, 4\} \\
3 \text{ vs } 4 &\rightarrow \text{Class } 3 \quad & \{3\}
\end{aligned}
\]

### Conclusion

The DAGSVM method generalizes decision trees by efficiently eliminating classes in a directed acyclic graph. It provides an efficient testing procedure for multiclass classification by eliminating one class at each node.


![Decision Directed Acyclic Graphs (DDAG)](svm5_1.png)
Fig 1: The decision directed acyclic graphs (DDAG) for finding the best class out of four classes

The decision directed acyclic graph (DDAG) eliminates one class from the list at each node. Starting at the root node with a decision between Class 1 and Class 4, the binary decision function evaluates which class remains. 

If the output value does not favor Class 1, Class 1 is eliminated from the list, yielding a new list:

$$ \{2, 3, 4\} $$

Next, the binary decision is made between Classes 2 and 4. 

If the root node prefers Class 1, then Class 4 is removed from the list, resulting in a new list:

$$ \{1, 2, 3\} $$

The next binary decision is made between Classes 1 and 3. 

At the second layer, there are two decision nodes:

1. $$ 2 \, \text{vs} \, 4 $$
2. $$ 1 \, \text{vs} \, 3 $$

The DDAG continues this process, proceeding through each binary decision at every node, until only one class remains in the list. 

This process is depicted in Figure 8.2, showing the DDAG for the four classes. 

DDAGs generalize the structure of Decision Trees, enabling a more efficient representation of redundancies and repetitions within different branches of the tree by allowing decision paths to merge and streamline the classification process.


In [3]:
import numpy as np
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

def build_ddag(X_train, y_train):
    classes = np.unique(y_train)
    classifiers = {}
    
    # Train binary classifiers for each pair of classes
    for i in range(len(classes)):
        for j in range(i+1, len(classes)):
            class_i = classes[i]
            class_j = classes[j]
            # Create binary labels for the i-th vs j-th classifier
            y_binary = np.where(y_train == class_i, 1, -1)
            
            # Train SVM
            clf = SVC(kernel='linear')
            clf.fit(X_train, y_binary)
            
            # Store the classifier
            classifiers[(class_i, class_j)] = clf
    
    return classifiers

def predict_ddag(X_test, classifiers):
    # Start with all classes
    remaining_classes = set(c[0] for c in classifiers.keys()).union(c[1] for c in classifiers.keys())
    
    # Predict using the classifiers
    for (class_i, class_j), clf in classifiers.items():
        decision = clf.predict(X_test)
        
        if decision == 1:
            # Class i is preferred
            remaining_classes.discard(class_j)
        else:
            # Class j is preferred
            remaining_classes.discard(class_i)
    
    return remaining_classes

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build and use DDAG
classifiers = build_ddag(X_train, y_train)
predictions = predict_ddag(X_test, classifiers)

print("Predicted classes:", predictions)

'''
build_ddag Function:

Trains binary SVM classifiers for each pair of classes.
Uses SVC from scikit-learn for linear classification.
predict_ddag Function:

Uses the trained classifiers to determine which classes are remaining based on predictions.
Loading and Splitting Data:

Example uses the Iris dataset. Replace with your own dataset as needed.
Include Image:

Use Markdown to embed an image showing the DDAG structure.

'''

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [4]:
import numpy as np

# Linear SVM Classifier implementation (simplified)
class LinearSVM:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        for _ in range(self.epochs):
            for idx, x_i in enumerate(X):
                if y[idx] * (np.dot(x_i, self.weights) + self.bias) < 1:
                    self.weights += self.learning_rate * (y[idx] * x_i - 2 * 1/self.epochs * self.weights)
                    self.bias += self.learning_rate * y[idx]
                else:
                    self.weights -= self.learning_rate * 2 * 1/self.epochs * self.weights

    def predict(self, X):
        return np.sign(np.dot(X, self.weights) + self.bias)

# Training binary SVM classifiers for DAGSVM
def train_binary_svm_classifiers(X_train, y_train):
    classifiers = {}
    classes = np.unique(y_train)
    num_classes = len(classes)
    
    for i in range(num_classes):
        for j in range(i + 1, num_classes):
            svm = LinearSVM()
            binary_labels = np.where(y_train == i, 1, -1)
            svm.fit(X_train, binary_labels)
            classifiers[(i, j)] = svm
    
    return classifiers, classes

# Predict using the DAGSVM method
def predict_dagsvm(classifiers, classes, X_test):
    num_classes = len(classes)
    
    def classify(sample):
        remaining_classes = list(classes)
        
        for i in range(num_classes - 1):
            for j in range(i + 1, num_classes):
                clf = classifiers[(i, j)]
                pred = clf.predict([sample])[0]
                if pred == 1:
                    if j in remaining_classes:
                        remaining_classes.remove(j)
                else:
                    if i in remaining_classes:
                        remaining_classes.remove(i)
                if len(remaining_classes) == 1:
                    return remaining_classes[0]
        
        return remaining_classes[0]
    
    return np.array([classify(sample) for sample in X_test])

# Main script
if __name__ == "__main__":
    # Example data (Iris dataset)
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # Load and prepare data
    data = load_iris()
    X = data.data
    y = data.target

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Train classifiers
    classifiers, classes = train_binary_svm_classifiers(X_train, y_train)
    
    # Predict and evaluate
    y_pred = predict_dagsvm(classifiers, classes, X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.78


## Least Squares SVM Multiclass Classifier

In the multiclass case with \( k \) labels, LS-SVM uses \( k \) output nodes to encode the multiclasses, where \( y_{i,j} \) denotes the output value of the \( j \)-th output node for the training data \( x_i \). The primal optimization problem of LS-SVM can be represented as:

$$
\min_{w_m, b_m, \xi_{m,i}} \sum_{m=1}^k \left( \frac{1}{2} \| w_m \|^2 + \frac{C}{2} \sum_{i=1}^N \xi_{m,i} \right)
$$

subject to:

$$
\begin{cases}
w_m^T \phi_m(x_i) + b_m = 1 - \xi_{m,i}, & \text{for } i = 1, \dots, N \text{ and } y_i = m \\
w_m^T \phi_m(x_i) + b_m \geq 1 - \xi_{m,i}, & \text{for } i = 1, \dots, N \text{ and } y_i \ne m
\end{cases}
$$

where \( w_m \) and \( b_m \) are the weights and bias of the \( m \)-th classifier, \( \xi_{m,i} \) is the slack variable, and \( \phi_m(x_i) \) represents the feature transformation for the \( m \)-th class.

### Dual Formulation

The Lagrange function in the dual LS-SVM multiclass classifier is given by:

$$
L_D = \frac{1}{2} \sum_{m=1}^k \left( \frac{1}{2} \| w_m \|^2 + \frac{C}{2} \sum_{i=1}^N \xi_{m,i} \right) - \sum_{i=1}^N \sum_{m=1}^k \alpha_{m,i} \left( y_i w_m^T \phi_m(x_i) + b_m - 1 + \xi_{m,i} \right)
$$

The conditions for optimality are:

$$
\frac{\partial L_D}{\partial w_m} = 0 \implies w_m = \sum_{i=1}^N \alpha_{m,i} y_i \phi_m(x_i)
$$

$$
\frac{\partial L_D}{\partial b_m} = 0 \implies \sum_{i=1}^N \alpha_{m,i} y_i = 0
$$

$$
\frac{\partial L_D}{\partial \xi_{m,i}} = 0 \implies \alpha_{m,i} = C \xi_{m,i}
$$

$$
\frac{\partial L_D}{\partial \alpha_{m,i}} = 0 \implies y_i w_m^T \phi_m(x_i) + b_m - 1 + \xi_{m,i} = 0
$$

### KKT Conditions in Matrix Form

The KKT conditions can be rewritten in matrix form:

$$
\begin{bmatrix}
0 & (y^{(m)})^T \\
y^{(m)} & \Omega^{(m)}
\end{bmatrix}
\begin{bmatrix}
b_m \\
\alpha_m
\end{bmatrix}
= \begin{bmatrix}
1 \\
0
\end{bmatrix}
$$

where:

$$
\Omega^{(m)} = Z_m^T Z_m + C^{-1} I
$$

with entries:

$$
\Omega^{(m)}_{ij} = y_i^{(m)} y_j^{(m)} K_m(x_i, x_j) + C^{-1} \delta_{ij}
$$

and \( K_m(x_i, x_j) = \phi_m(x_i)^T \phi_m(x_j) \) is the kernel function for the \( m \)-th SVM.

### LS-SVM Multiclass Classification Algorithm

Algorithm 8.3 shows the LS-SVM multiclass classification algorithm:

**Input**: Training set \( \{(x_i, y_i) \mid x_i \in \mathbb{R}^n, y_i \in \{1, \dots, k\}, i = 1, \dots, N\} \), constant \( C > 0 \), and the kernel function \( K_m(x, x_i) \), \( m = 1, \dots, k \).

**Initialization**: 

$$
y_i^{(m)} = \begin{cases}
+1, & \text{if } y_i = m \\
-1, & \text{otherwise}
\end{cases}
$$

**Learning Step**:

1. Construct the \( N \times 1 \) vector \( y^{(m)} = [y_1^{(m)}, \dots, y_N^{(m)}]^T \).
2. Use the kernel function to construct the entries of the \( N \times N \) matrix \( \Omega^{(m)} \).
3. Solve the KKT matrix equation for \( b_m \) and \( \alpha_m = [\alpha_{m,i}]_{i=1}^N \).

**Testing Step**:

For a given test sample \( x \in \mathbb{R}^n \), the class is:

$$
\text{class of } x = \arg \max_{m=1, \dots, k} \left( \sum_{i=1}^N \alpha_{m,i} y_i K_m(x, x_i) + b_m \right)
$$


In [6]:
import numpy as np
from scipy.linalg import solve

class LeastSquaresSVM:
    def __init__(self, C=1.0, kernel='rbf', gamma=1.0):
        self.C = C
        self.kernel = kernel
        self.gamma = gamma
        self.alphas = None
        self.b = None
        self.X_train = None
        self.y_train = None
        self.K = None

    def _kernel(self, X1, X2):
        if self.kernel == 'rbf':
            sq_dists = np.sum(X1**2, axis=1).reshape(-1, 1) + np.sum(X2**2, axis=1) - 2 * np.dot(X1, X2.T)
            return np.exp(-self.gamma * sq_dists)
        else:
            raise ValueError("Unsupported kernel")

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.K = self._kernel(X, X)
        
        N = X.shape[0]
        num_classes = len(np.unique(y))
        
        # Construct the matrix for each class
        alphas = np.zeros((N, num_classes))
        biases = np.zeros(num_classes)

        for m in range(num_classes):
            y_m = np.where(y == m, 1, -1)
            K_m = self.K + np.eye(N) / self.C
            
            # Solve for alpha and bias
            alpha_m = solve(K_m, y_m)
            alphas[:, m] = alpha_m
            
            # Calculate bias term for the m-th classifier
            bias_m = np.mean(y_m - np.dot(self.K, alpha_m))
            biases[m] = bias_m

        self.alphas = alphas
        self.b = biases

    def predict(self, X):
        K_test = self._kernel(X, self.X_train)
        decision_values = np.dot(K_test, self.alphas) + self.b
        return np.argmax(decision_values, axis=1)

# Example usage
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # Load dataset
    data = load_iris()
    X = data.data
    y = data.target

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train LS-SVM classifier
    classifier = LeastSquaresSVM(C=1.0, kernel='rbf', gamma=0.1)
    classifier.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")


Accuracy: 1.00


## Proximal Support Vector Machine (PSVM) Multiclass Classification

### Primal Constrained Optimization Problem

The primal constrained optimization problem for the PSVM multiclass classifier is:

$$
\begin{aligned}
\min_{w_m, b_m, \xi_{m,i}} \quad & L_{PPSVM}(w_m, b_m, \xi_{m,i}) \\
= \quad & \frac{1}{2} \sum_{m=1}^k \left( \|w_m\|^2 + b_m^2 \right) + \frac{C}{2} \sum_{i=1}^N \xi_{m,i}^2 \\
\text{subject to} \quad & y_i \left( w_m^T \phi_m(x_i) + b_m \right) \geq 1 - \xi_{m,i}, \\
& \text{for } i = 1, \ldots, N.
\end{aligned}
$$

### Dual Unconstrained Optimization Problem

The corresponding dual unconstrained optimization problem is:

$$
\begin{aligned}
\min_{\alpha_{m,i}} \quad & L_{DPSVM} \\
= \quad & \frac{1}{2} \sum_{m=1}^k \left( \|w_m\|^2 + b_m^2 \right) + \frac{C}{2} \sum_{i=1}^N \xi_{m,i}^2 \\
\text{subject to} \quad & \sum_{i=1}^N \alpha_{m,i} \left( y_i^{(m)} \left( w_m^T \phi_m(x_i) + b_m \right) - 1 + \xi_{m,i} \right) = 0.
\end{aligned}
$$

### Optimality Conditions

The optimality conditions are:

$$
\frac{\partial L_{DPSVM}}{\partial w_m} = 0 \implies w_m = \sum_{i=1}^N \alpha_{m,i} y_i^{(m)} \phi_m(x_i),
$$

$$
\frac{\partial L_{DPSVM}}{\partial b_m} = 0 \implies b_m = \sum_{i=1}^N \alpha_{m,i} y_i^{(m)},
$$

$$
\frac{\partial L_{DPSVM}}{\partial \xi_{m,i}} = 0 \implies \xi_{m,i} = C^{-1} \alpha_{m,i},
$$

$$
\frac{\partial L_{DPSVM}}{\partial \alpha_{m,i}} = 0 \implies y_i^{(m)} \left( w_m^T \phi_m(x_i) + b_m \right) - 1 + \xi_{m,i} = 0.
$$

### KKT Matrix Equation

The KKT matrix equation is:

$$
\sum_{i=1}^N \alpha_{m,i} y_i^{(m)} = b_m,
$$

$$
(C^{-1} I + Z_m^T Z_m + y_m y_m^T) \alpha_m = 1,
$$

where

$$
Z_m = \left[ y_1^{(m)} \phi_m(x_1), \ldots, y_N^{(m)} \phi_m(x_N) \right],
$$

$$
y_m = \left[ y_1^{(m)}, \ldots, y_N^{(m)} \right]^T,
$$

$$
\alpha_m = \left[ \alpha_{m,1}, \ldots, \alpha_{m,N} \right]^T.
$$

### PSVM Multiclass Classification Algorithm

**Algorithm:**

1. **Input**: Training set \(\{ (x_i, y_i) \mid x_i \in \mathbb{R}^n, y_i \in \{1, \ldots, k\}, i = 1, \ldots, N \}\), constant \(C > 0\), and kernel functions \(K_m(x, x_i)\) for \(m = 1, \ldots, k\).

2. **Initialization**: 
   - Set \( y_i^{(m)} = 1 \) if \( y_i = m \); otherwise, \( y_i^{(m)} = -1 \) for \( m = 1, \ldots, k \) and \( i = 1, \ldots, N \).

3. **Learning Step**:
   - For \( m = 1, \ldots, k \):
     1. Construct the \( N \times N \) matrix \( \left[ Z_m^T Z_m \right]_{ij} = y_i^{(m)} y_j^{(m)} K_m(x_i, x_j) \).
     2. Solve the KKT matrix equation \( (C^{-1} I + Z_m^T Z_m + y_m y_m^T) \alpha_m = 1 \) for \( \alpha_m \).
     3. Compute \( b_m = \alpha_m^T y_m \).

4. **Testing Step**:
   - For a given test sample \( x \in \mathbb{R}^n \), its class is given by:
   $$
   \text{class of } x = \arg \max_{m=1, \ldots, k} \left( \sum_{i=1}^N \alpha_{m,i} y_i^{(m)} K_m(x, x_i) + b_m \right).
   $$


In [8]:
import numpy as np

class PSVM:
    def __init__(self, C=1.0, kernel='linear'):
        self.C = C
        self.kernel = kernel
        self.kernels = {
            'linear': lambda x, y: np.dot(x, y.T),
            'poly': lambda x, y: (1 + np.dot(x, y.T)) ** 3,
            'rbf': lambda x, y: np.exp(-np.linalg.norm(x[:, None] - y, axis=2) ** 2 / 2.0)
        }
        self.kernel_func = self.kernels[kernel]

    def fit(self, X, y):
        N, D = X.shape
        k = len(np.unique(y))
        
        alpha = np.zeros((k, N))
        b = np.zeros(k)
        
        # Construct Z_m and y_m
        Z = [self.kernel_func(X, X) for _ in range(k)]
        y_m = [np.where(y == m+1, 1, -1) for m in range(k)]
        
        # Learning step
        for m in range(k):
            # Construct matrix for (C^-1 I + Z_m^T Z_m + y_m y_m^T)
            K_m = Z[m]
            K_m_y = K_m * y_m[m][:, np.newaxis]  # Element-wise multiplication
            M = (1 / self.C) * np.eye(N) + K_m @ K_m.T + np.outer(y_m[m], y_m[m])
            
            # Solve for alpha_m
            alpha_m = np.linalg.solve(M, np.ones(N))
            alpha[m, :] = alpha_m
            b[m] = np.mean(alpha_m * y_m[m])
        
        self.alpha = alpha
        self.b = b
        self.support_vectors = X
        self.support_vector_labels = y

    def predict(self, X):
        k = len(self.b)
        predictions = np.zeros((X.shape[0], k))
        
        for m in range(k):
            K_m = self.kernel_func(X, self.support_vectors)
            predictions[:, m] = K_m @ self.alpha[m, :] + self.b[m]
        
        return np.argmax(predictions, axis=1) + 1

# Example Usage
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    
    # Load the Iris dataset
    data = load_iris()
    X = data.data
    y = data.target + 1  # PSVM expects labels to start from 1
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Initialize and train PSVM classifier
    classifier = PSVM(C=1.0, kernel='rbf')
    classifier.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.89
