# Part I – Kernel Perceptron: (Handwritten Digit Classification)

## Load data

In [1]:
import numpy as np
import torch
from pathlib import Path
import matplotlib.pyplot as plt

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_digit_file(path):
    """
    Load a whitespace-separated digit file into tensors on the global `device`.

    Column 0 of every row is read as the class label, the remaining columns
    as the feature values.

    Args:
        path: anything `np.loadtxt` accepts (file name, `Path`, open file or
            an iterable of text lines).
    Returns:
        X: float32 tensor of shape (n, d) holding the features.
        y: int64 tensor of shape (n,) holding the labels.
    """
    # ndmin=2 keeps a one-row file two-dimensional; without it loadtxt returns
    # a 1-D array and the column slicing below raises IndexError.
    data = np.loadtxt(path, ndmin=2)
    # Fixed-width int64 so the labels become torch.long on every platform.
    labels = data[:, 0].astype(np.int64)
    features = data[:, 1:].astype(np.float32)
    X = torch.from_numpy(features).to(device)
    y = torch.from_numpy(labels).to(device)
    return X, y

# Digit data set that the Part I experiments below split into train/test parts.
X, y = load_digit_file("/datasets/sl2-data/zipcombo.dat")

## Polynomial One-versus-Rest 

One-versus-Rest is a method which trains k binary classifiers, where k is the number of classes. Each binary classifier is trained to determine whether the input belongs to class c, in which case the label +1 is assigned, or to one of the other classes, in which case the label −1 is assigned. The predicted class is the one whose classifier assigns the input to its own class most confidently, i.e. the classifier with the highest score.

### Polynomial OvR Perceptron

In [4]:
# ----- polynomial kernel -----

def poly_kernel(X, Z, degree):
    """
    Polynomial-kernel Gram matrix between the rows of X and the rows of Z.

    Entry (i, j) of the result is the inner product of row i of X with
    row j of Z, raised to the power `degree`.
    X: (n1, d), Z: (n2, d)  ->  result: (n1, n2)
    """
    inner_products = torch.matmul(X, Z.T)
    return torch.pow(inner_products, degree)

# ----- binary kernel perceptron (vectorized) -----

class KernelPerceptronBinary:
    """
    Binary kernel perceptron with the polynomial kernel K(x, z) = (x · z)^degree.

    Training is done in batch form: in every epoch all training points are
    scored with the current coefficients, and y_t is then added to alpha_t for
    every point t that was misclassified in that pass (all at once, not one
    point at a time).

    Args:
        epochs: maximum number of passes over the training set.
        degree: exponent of the polynomial kernel.
        device: torch device used for all tensors; the CPU when omitted.
    """

    def __init__(self, epochs=5, degree=3, device=None):
        self.epochs = epochs
        self.degree = degree
        self.device = device or torch.device("cpu")
        self.alpha = None      # (m,)   dual coefficients, set by fit
        self.X_sv = None       # (m, d) training points kept for prediction
        self.K = None          # (m, m) training Gram matrix

    def fit(self, X, y):
        """
        Train on X: (m, d) with labels y: (m,) in {+1, -1}.

        Stops early as soon as a pass produces no misclassification.
        """
        X = X.to(self.device)
        y = y.to(self.device).float()
        m = X.shape[0]
        self.X_sv = X
        self.alpha = torch.zeros(m, device=self.device)

        # The Gram matrix does not change during training, so build it once.
        self.K = (X @ X.T).pow(self.degree)   # (m, m)

        for _ in range(self.epochs):
            # scores_t = sum_i alpha_i K_{i,t}
            scores = self.K.T @ self.alpha    # (m,)
            y_hat = torch.sign(scores)
            y_hat[y_hat == 0] = 1             # a zero score counts as +1
            mis = (y_hat != y)
            if not mis.any():
                # Converged: alpha can no longer change, so further epochs
                # would only recompute the same scores.
                break
            # Batch update: alpha_t += y_t for every misclassified t.
            self.alpha[mis] += y[mis]

    def decision_function(self, X):
        """
        Return the real-valued scores, shape (n,), for X: (n, d).

        Raises:
            RuntimeError: if the model has not been fitted yet.
        """
        if self.alpha is None or self.X_sv is None:
            raise RuntimeError(
                "KernelPerceptronBinary must be fitted before it can score data."
            )
        X = X.to(self.device)
        # Kernel between the query points and the stored training points: (n, m)
        K_test = (X @ self.X_sv.T).pow(self.degree)
        return K_test @ self.alpha

    def predict(self, X):
        """Return labels in {+1, -1} for X: (n, d); a zero score maps to +1."""
        scores = self.decision_function(X)
        y_hat = torch.sign(scores)
        y_hat[y_hat == 0] = 1
        return y_hat

# ----- one‑vs‑rest multiclass (polynomial kernel) -----

class KernelPerceptronOvR:
    """
    One-versus-rest multiclass wrapper around KernelPerceptronBinary.

    One binary classifier is trained per class label 0..classes-1 (label +1
    for that class, -1 for every other class); prediction picks the class
    whose classifier returns the largest score.
    """

    def __init__(self, classes=10, epochs=5, degree=3, device=None):
        self.classes = classes
        self.epochs = epochs
        self.degree = degree
        self.device = device if device else torch.device("cpu")
        self.classifiers = []

    def fit(self, X, y):
        """Train one binary perceptron per class on X with integer labels y."""
        X = X.to(self.device)
        y = y.to(self.device)
        self.classifiers = []
        for label in range(self.classes):
            # +1 for the current class, -1 for all the others
            targets = torch.where(y == label, 1, -1).float()
            binary = KernelPerceptronBinary(
                epochs=self.epochs, degree=self.degree, device=self.device
            )
            binary.fit(X, targets)
            self.classifiers.append(binary)

    def predict(self, X):
        """Return, for every row of X, the index of the highest-scoring classifier."""
        X = X.to(self.device)
        per_class = [binary.decision_function(X) for binary in self.classifiers]
        stacked = torch.stack(per_class, dim=1)      # (n, classes)
        return stacked.argmax(dim=1)

### Toy Data

In [7]:
# Quick check of the OvR perceptron on the dtrain123 / dtest123 files.
X_train, y_train = load_digit_file("/datasets/sl2-data/dtrain123.dat")
X_test,  y_test  = load_digit_file("/datasets/sl2-data/dtest123.dat")

# Pass the global device explicitly: the classifier defaults to the CPU while
# load_digit_file places the labels on `device`, so on a CUDA machine the
# comparison below would otherwise mix CPU predictions with GPU labels.
clf = KernelPerceptronOvR(classes=10, epochs=5, degree=3, device=device)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
err = (y_pred != y_test).float().mean().item()  # scalar float
print("Test error:", err)

### Random Split Function

In [7]:
def randomsplit(X, y, train_ratio=0.8):
    """
    Shuffle the rows of (X, y) and cut them into a training and a test part.

    Args:
        X: data, indexed along its first dimension
        y: labels, aligned with the rows of X
        train_ratio: fraction of the rows that goes into the training part
            (the default 0.8 gives an 80/20 split).
    Returns:
        X_tr, y_tr, X_te, y_te, train_idx, test_idx
    """
    total = X.shape[0]
    cut = int(train_ratio * total)
    # The permutation is created on X's device so it can index X directly.
    shuffled = torch.randperm(total, device=X.device)
    train_idx, test_idx = shuffled[:cut], shuffled[cut:]
    return X[train_idx], y[train_idx], X[test_idx], y[test_idx], train_idx, test_idx

The implementation loops over the k classes and, for each class c, builds binary labels that separate y == c from y ≠ c. The loop then fits a binary classifier on (X, label), and the classifier is stored in a list or dictionary. Once training is complete, for any new x, the score expressing how confidently x is assigned to class c is computed for every c, and the argmax of the scores is returned. The weight function w is represented as a sum over (a, X) pairs, where a is the array of coefficients and X is the array of training points, combined through the kernel function K(x, z): w(x) = Σ_i a_i K(x_i, x). The sum is evaluated for any x as the vectorised dot product of alpha with the vector of kernel values, to which only the indices i with a_i ≠ 0 contribute. The prediction ŷ is then the sign of w(x). A new term is added by setting a_t = y_t if the prediction is wrong and a_t = 0 otherwise; that is, the new point x_t is appended to X and its coefficient is updated as a_t += y_t.

Each epoch iterates over the training data once. Although the predictions should improve with more epochs, the risk of overfitting and the computation time and cost increase as well. Each epoch adds O(m^2) cost. The number of epochs was limited to 5 to keep the computation time manageable while making sure convergence occurred.

### PolyOvR Basic Results

In [13]:
# Experiment grid: polynomial degrees 1..7, evaluated on `runs` random splits.
degrees = range(1, 8)
runs  = 20
epochs  = 5
classes = 10

# Error rates indexed as [degree index, run].
ovrbasic_train = np.zeros((len(degrees), runs))
ovrbasic_test = np.zeros((len(degrees), runs))

for r in range(runs):
    # One fresh random split per run, shared by every degree of that run.
    X_tr, y_tr, X_te, y_te, _, _ = randomsplit(X, y, train_ratio=0.8)
    for di, d in enumerate(degrees):
        # Use the `classes` setting defined above rather than a literal 10, so
        # changing the setting actually changes the experiment.
        clf = KernelPerceptronOvR(classes=classes, epochs=epochs, degree=d, device=device)
        clf.fit(X_tr, y_tr)
        y_tr_pred = clf.predict(X_tr)
        y_te_pred = clf.predict(X_te)

        tr_err = (y_tr_pred != y_tr).float().mean().item()
        te_err = (y_te_pred != y_te).float().mean().item()

        ovrbasic_train[di, r] = tr_err
        ovrbasic_test[di,  r] = te_err

    print(f"Run {r+1}/{runs} done.")

# Mean and sample standard deviation (ddof=1) over the runs, per degree.
ovrbasic_train_mean = ovrbasic_train.mean(axis=1)
ovrbasic_train_std  = ovrbasic_train.std(axis=1, ddof=1)
ovrbasic_test_mean  = ovrbasic_test.mean(axis=1)
ovrbasic_test_std   = ovrbasic_test.std(axis=1, ddof=1)

print("\nResults over 20 runs (OvR, polynomial kernel)")
for di, d in enumerate(degrees):
    print(
        f"d={d}: "
        f"train {ovrbasic_train_mean[di]:.4f} ± {ovrbasic_train_std[di]:.4f}, "
        f"test {ovrbasic_test_mean[di]:.4f} ± {ovrbasic_test_std[di]:.4f}"
    )

Results over 20 runs (OvR, polynomial kernel)
d=1: train 0.6657 ± 0.0114, test 0.6656 ± 0.0083
d=2: train 0.3193 ± 0.0042, test 0.3280 ± 0.0135
d=3: train 0.2480 ± 0.0039, test 0.2559 ± 0.0123
d=4: train 0.2024 ± 0.0034, test 0.2099 ± 0.0096
d=5: train 0.1731 ± 0.0025, test 0.1860 ± 0.0088
d=6: train 0.1491 ± 0.0024, test 0.1678 ± 0.0087
d=7: train 0.1280 ± 0.0021, test 0.1537 ± 0.0090

The training results show a lower mean error as d increases, and a decreasing standard deviation until d=5, after which there are diminishing returns in terms of standard deviation. The test means similarly decreased as d increased, but the standard deviation showed little difference.

### Polynomial 5-fold Cross Validation: d* selection

In [10]:
def kfold_indices(n, k=5, device=device):
    """
    Randomly partition the indices 0..n-1 into exactly k folds.

    Args:
        n: number of samples.
        k: number of folds.
        device: device on which the index tensors are created.
    Returns:
        Tuple of k 1-D index tensors whose sizes differ by at most one.
    """
    idx = torch.randperm(n, device=device)
    # tensor_split always returns exactly k parts. torch.chunk may return fewer
    # than k chunks when n is not a multiple of k (n=6, k=4 gives 3 chunks),
    # which breaks callers that index the folds 0..k-1.
    folds = torch.tensor_split(idx, k)
    return folds  # tuple of 1D index tensors

def best_d(X_tr, y_tr, degrees, ClfClass, folds=5, epochs=5, classes=10, device=device):
    """
    Pick the polynomial degree with the lowest mean k-fold validation error.

    The same random fold partition is used for every candidate degree.

    Args:
        X_tr, y_tr: training data and labels to cross-validate on.
        degrees: indexable sequence of candidate degrees.
        ClfClass: classifier class constructed as
            ClfClass(classes=..., epochs=..., degree=..., device=...).
        folds: number of cross-validation folds.
        epochs, classes, device: forwarded to ClfClass.
    Returns:
        (best degree, tensor with the mean validation error of each degree)
    """
    partition = kfold_indices(X_tr.shape[0], k=folds, device=device)

    def fold_error(degree, held_out):
        # Train on every fold except `held_out`, validate on `held_out`.
        fit_idx = torch.cat([partition[i] for i in range(folds) if i != held_out])
        val_idx = partition[held_out]
        model = ClfClass(classes=classes, epochs=epochs, degree=degree, device=device)
        model.fit(X_tr[fit_idx], y_tr[fit_idx])
        wrong = model.predict(X_tr[val_idx]) != y_tr[val_idx]
        return wrong.float().mean().item()

    cv_means = []
    for degree in degrees:
        per_fold = [fold_error(degree, f) for f in range(folds)]
        cv_means.append(sum(per_fold) / len(per_fold))

    mean_errors_t = torch.tensor(cv_means)
    winner = int(torch.argmin(mean_errors_t))
    return degrees[winner], mean_errors_t


### PolyOvR CV Results

In [16]:
# Settings for the cross-validated OvR experiment.
degrees = range(1, 8)
runs  = 20
epochs  = 5
classes = 10
# Per-run results: selected degree d*, training error and test error.
ovrcv_best_ds = torch.zeros(runs, dtype=torch.long, device=device)
ovrcv_train   = torch.zeros(runs, device=device)
ovrcv_test    = torch.zeros(runs, device=device)

for r in range(runs):
    X_tr, y_tr, X_te, y_te, _, _ = randomsplit(X, y, train_ratio=0.8)
    # Choose d* by 5-fold cross-validation on the training part only.
    d_star, _ = best_d(
        X_tr, y_tr, degrees, KernelPerceptronOvR,
        folds=5, epochs=epochs, classes=classes, device=device
    )
    ovrcv_best_ds[r] = d_star

    # Retrain on the whole training part with d* and evaluate on both parts.
    clf = KernelPerceptronOvR(classes=classes, epochs=epochs, degree=int(d_star), device=device)
    clf.fit(X_tr, y_tr)
    y_tr_pred = clf.predict(X_tr)
    y_te_pred = clf.predict(X_te)
    ovrcv_train[r] = (y_tr_pred != y_tr).float().mean()
    ovrcv_test[r]  = (y_te_pred != y_te).float().mean()

# Mean and sample standard deviation (unbiased=True) over the runs.
print("Polynomial CV OvR Results:")
print("d* mean ± std:", ovrcv_best_ds.float().mean().item(), ovrcv_best_ds.float().std(unbiased=True).item())
print("train err mean ± std:", ovrcv_train.mean().item(), ovrcv_train.std(unbiased=True).item())
print("test err mean ± std:",  ovrcv_test.mean().item(),  ovrcv_test.std(unbiased=True).item())

Polynomial CV OvR Results over 20 runs:
train err mean ± std: 0.12785695493221283 0.002016906626522541
test err mean ± std: 0.15177418291568756 0.008331004530191422

The results reached with CV were comparable to those of the basic results at d = 7. 

### Confusion Matrix

In [13]:
def confusion_matrix(y_true, y_pred, classes=10):
    """
    Count how often each true class was predicted as each class.

    Args:
        y_true: true labels (tensor or sequence of integers in 0..classes-1).
        y_pred: predicted labels, aligned with y_true.
        classes: number of classes.
    Returns:
        (classes, classes) long tensor on the CPU; entry [t, p] is the number
        of samples with true label t that were predicted as p.
    Raises:
        IndexError: if a label lies outside 0..classes-1.
    """
    true_flat = torch.as_tensor(y_true).reshape(-1).long().cpu()
    pred_flat = torch.as_tensor(y_pred).reshape(-1).long().cpu()
    # Pair the labels up like zip(): surplus entries of a longer input are ignored.
    n = min(true_flat.numel(), pred_flat.numel())
    true_flat, pred_flat = true_flat[:n], pred_flat[:n]

    out_of_range = (
        (true_flat < 0) | (true_flat >= classes)
        | (pred_flat < 0) | (pred_flat >= classes)
    )
    if bool(out_of_range.any()):
        raise IndexError(f"labels must lie in the range 0..{classes - 1}")

    # Encode every (true, predicted) pair as one cell number and count all
    # cells in a single vectorised pass instead of one Python step per sample.
    cell = true_flat * classes + pred_flat
    counts = torch.bincount(cell, minlength=classes * classes)
    return counts.reshape(classes, classes)

### PolyOvR CM Results

In [22]:
# Settings for the confusion-matrix experiment.
degrees = range(1, 8)
runs  = 20
epochs  = 5
classes = 10
# One matrix of confusion rates per run.
cms = torch.zeros(runs, classes, classes, dtype=torch.float32)

for r in range(runs):
    X_tr, y_tr, X_te, y_te, _, _ = randomsplit(X, y, train_ratio=0.8)
    # Degree chosen by 5-fold cross-validation on the training part.
    d_star, _ = best_d(X_tr, y_tr, degrees, KernelPerceptronOvR,
                             folds=5, epochs=epochs, classes=classes, device=device)

    clf = KernelPerceptronOvR(classes=classes, epochs=epochs, degree=int(d_star), device=device)
    clf.fit(X_tr, y_tr)
    y_te_pred = clf.predict(X_te)

    # Rows are true classes, columns are predicted classes.
    cm_counts = confusion_matrix(y_te.cpu(), y_te_pred.cpu(), classes=classes).float()
    # Normalise every row by its sample count; the clamp avoids 0/0 for a
    # class that does not occur in this test split.
    row_counts = cm_counts.sum(dim=1, keepdim=True).clamp(min=1.0)
    cm_rates = cm_counts / row_counts
    # Keep only the error rates: correct predictions on the diagonal are zeroed.
    cm_rates.fill_diagonal_(0.0)

    cms[r] = cm_rates

# Entry-wise mean and sample standard deviation over the runs.
cm_mean = cms.mean(dim=0)
cm_std  = cms.std(dim=0, unbiased=True)

# Print one line per true class in the form "mean±std".
for a in range(classes):
    row = []
    for b in range(classes):
        row.append(f"{cm_mean[a, b].item():.3f}±{cm_std[a, b].item():.3f}")
    print("  ".join(row))


Confusion Matrix Results over 20 runs 

0.000±0.000  0.006±0.004  0.003±0.003  0.001±0.002  0.004±0.004  0.001±0.001  0.015±0.011  0.005±0.003  0.001±0.001  0.003±0.003
0.000±0.000  0.000±0.000  0.000±0.000  0.000±0.000  0.003±0.003  0.000±0.000  0.000±0.000  0.000±0.000  0.000±0.000  0.000±0.000
0.022±0.009  0.155±0.029  0.000±0.000  0.030±0.012  0.027±0.009  0.000±0.000  0.012±0.006  0.075±0.019  0.021±0.010  0.015±0.010
0.010±0.007  0.041±0.015  0.001±0.003  0.000±0.000  0.002±0.003  0.008±0.007  0.000±0.000  0.026±0.013  0.011±0.009  0.019±0.011
0.004±0.004  0.143±0.031  0.005±0.006  0.000±0.000  0.000±0.000  0.000±0.000  0.003±0.005  0.004±0.004  0.000±0.001  0.105±0.022
0.033±0.015  0.086±0.023  0.002±0.005  0.034±0.014  0.033±0.015  0.000±0.000  0.033±0.012  0.015±0.008  0.011±0.007  0.032±0.015
0.024±0.009  0.071±0.019  0.003±0.003  0.000±0.000  0.008±0.006  0.001±0.002  0.000±0.000  0.003±0.003  0.001±0.002  0.001±0.003
0.000±0.000  0.026±0.009  0.000±0.000  0.000±0.000  0.010±0.006  0.000±0.000  0.000±0.000  0.000±0.000  0.000±0.000  0.048±0.020
0.017±0.010  0.157±0.037  0.000±0.002  0.046±0.013  0.014±0.008  0.004±0.005  0.006±0.007  0.013±0.008  0.000±0.000  0.058±0.019
0.004±0.005  0.060±0.019  0.000±0.000  0.002±0.004  0.025±0.013  0.000±0.000  0.000±0.000  0.054±0.015  0.002±0.003  0.000±0.000


In [4]:
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion-rate table as "mean±std" text, one row per true class.
raw = """
0.000±0.000  0.006±0.004  0.003±0.003  0.001±0.002  0.004±0.004  0.001±0.001  0.015±0.011  0.005±0.003  0.001±0.001  0.003±0.003
0.000±0.000  0.000±0.000  0.000±0.000  0.000±0.000  0.003±0.003  0.000±0.000  0.000±0.000  0.000±0.000  0.000±0.000  0.000±0.000
0.022±0.009  0.155±0.029  0.000±0.000  0.030±0.012  0.027±0.009  0.000±0.000  0.012±0.006  0.075±0.019  0.021±0.010  0.015±0.010
0.010±0.007  0.041±0.015  0.001±0.003  0.000±0.000  0.002±0.003  0.008±0.007  0.000±0.000  0.026±0.013  0.011±0.009  0.019±0.011
0.004±0.004  0.143±0.031  0.005±0.006  0.000±0.000  0.000±0.000  0.000±0.000  0.003±0.005  0.004±0.004  0.000±0.001  0.105±0.022
0.033±0.015  0.086±0.023  0.002±0.005  0.034±0.014  0.033±0.015  0.000±0.000  0.033±0.012  0.015±0.008  0.011±0.007  0.032±0.015
0.024±0.009  0.071±0.019  0.003±0.003  0.000±0.000  0.008±0.006  0.001±0.002  0.000±0.000  0.003±0.003  0.001±0.002  0.001±0.003
0.000±0.000  0.026±0.009  0.000±0.000  0.000±0.000  0.010±0.006  0.000±0.000  0.000±0.000  0.000±0.000  0.000±0.000  0.048±0.020
0.017±0.010  0.157±0.037  0.000±0.002  0.046±0.013  0.014±0.008  0.004±0.005  0.006±0.007  0.013±0.008  0.000±0.000  0.058±0.019
0.004±0.005  0.060±0.019  0.000±0.000  0.002±0.004  0.025±0.013  0.000±0.000  0.000±0.000  0.054±0.015  0.002±0.003  0.000±0.000
"""

# parse "mean±std": every match yields a (mean, std) pair, read row by row
pairs = re.findall(r'([\d.]+)\s*±\s*([\d.]+)', raw)
means = np.array([float(m) for m, s in pairs]).reshape(10, 10)
# stds is parsed alongside the means; only the means are plotted below.
stds  = np.array([float(s) for m, s in pairs]).reshape(10, 10)

sns.set(style="white")

# Heatmap of the mean rates; the colour scale runs from 0 to the largest mean.
plt.figure(figsize=(5, 4))
ax = sns.heatmap(
    means,
    cmap=sns.color_palette("rocket", as_cmap=True),   # try "mako", "magma", "viridis"
    square=True,
    linewidths=0.5,
    linecolor="white",
    cbar_kws={"label": "Mean rate"},
    vmin=0.0,
    vmax=means.max()
)

ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
ax.set_title("Confusion Matrix over 20 runs",
             fontsize=14, fontweight="bold", pad=12)

plt.tight_layout()
plt.show()


### PolyOvR Hard-to-predict Samples

In [23]:
# For every sample of the full data set, count in how many runs it was
# misclassified while being part of the test split.
# NOTE: `runs`, `degrees`, `epochs` and `classes` are not set in this cell;
# they come from an earlier cell.
samples = X.shape[0]
wrong_counts = torch.zeros(samples, dtype=torch.long)

for r in range(runs):
    X_tr, y_tr, X_te, y_te, _, test_idx = randomsplit(X, y, train_ratio=0.8)
    d_star, _ = best_d(X_tr, y_tr, degrees, KernelPerceptronOvR,
                             folds=5, epochs=epochs, classes=classes, device=device)
    clf = KernelPerceptronOvR(classes=classes, epochs=epochs, degree=int(d_star), device=device)
    clf.fit(X_tr, y_tr)
    y_te_pred = clf.predict(X_te)

    mis_mask = (y_te_pred != y_te)
    # test_idx lives on X's device while wrong_counts is a CPU tensor; a CPU
    # tensor cannot be indexed with CUDA indices, so move them to the CPU first.
    mis_idx_global = test_idx[mis_mask].cpu()
    # Indices within one test split are unique, so += counts each sample once.
    wrong_counts[mis_idx_global] += 1

# The five samples that were misclassified most often.
hardest_idx = torch.argsort(-wrong_counts)[:5]
X_hard = X[hardest_idx]
y_hard = y[hardest_idx]

def show_hard_examples(X_hard, y_hard):
    """
    Draw the given samples side by side as 16x16 grayscale images.

    Each panel is titled with the sample's label; pixel values are displayed
    on a fixed scale from -1 to 1.
    """
    n_panels = len(X_hard)
    plt.figure(figsize=(10, 2))
    for pos in range(1, n_panels + 1):
        pixels = X_hard[pos - 1].cpu().numpy().reshape(16, 16)
        plt.subplot(1, n_panels, pos)
        plt.imshow(pixels, cmap='gray', vmin=-1, vmax=1)
        plt.axis('off')
        plt.title(f"Label: {int(y_hard[pos - 1].item())}")
    plt.tight_layout()
    plt.show()

# Display the most frequently misclassified samples selected above.
show_hard_examples(X_hard, y_hard)

The 2, 8 and 9 are more difficult samples, since the hand-writing leads to slanted and loopy figures, and the circular part of 9 is flattened. 7 and 4 however, are more unexpected as they have quite standard presentations.

## Gaussian Kernel OvR

S = [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]

For the Gaussian Kernel, small c gives a very smooth decision boundary, but if c is too small, all points will look similar and the kernel becomes constant. Large c makes the kernel very local and reduces generalisation as the kernel becomes nearly 0 for all but almost identical points. This set is chosen to demonstrate this, while respecting the common practice of searching for c in a logarithmic grid (here, powers of 10).

### RBF OvR Perceptron

In [22]:
def rbf_kernel(X, Z, c=1.0):
    """
    Compute the RBF (Gaussian) Gram matrix:
        K[i,j] = exp(-c * ||X_i - Z_j||^2)
    X: (n1, d), Z: (n2, d)  ->  K: (n1, n2)
    """
    # ||x - z||^2 expanded as ||x||^2 + ||z||^2 - 2 x·z, for all pairs at once
    X_sq = (X**2).sum(dim=1, keepdim=True)        # (n1, 1)
    Z_sq = (Z**2).sum(dim=1, keepdim=True).T      # (1, n2)
    dist2 = X_sq + Z_sq - 2.0 * (X @ Z.T)
    # Rounding in the expansion can leave tiny negative "squared distances"
    # (typically for identical or near-identical points), which would give
    # kernel values above 1. A squared distance cannot be negative: clamp at 0.
    dist2 = dist2.clamp(min=0.0)
    return torch.exp(-c * dist2)

# ----- binary RBF kernel perceptron -----

class RBFKPerceptronBinary:
    """
    Binary kernel perceptron with the Gaussian kernel computed by rbf_kernel.

    Training is done in batch form: in every epoch all training points are
    scored with the current coefficients, and y_t is then added to alpha_t for
    every point t that was misclassified in that pass.

    Args:
        epochs: maximum number of passes over the training set.
        c: kernel width parameter passed to rbf_kernel.
        device: torch device used for all tensors.
    """

    def __init__(self, epochs=5, c=1.0, device=device):
        self.epochs = epochs
        self.c = c
        self.device = device
        self.alpha = None   # (m,)   dual coefficients, set by fit
        self.X_sv = None    # (m, d) training points kept for prediction
        self.K = None       # (m, m) training Gram matrix

    def fit(self, X, y):
        """
        Train on X: (m, d) with labels y: (m,) in {+1, -1}.

        Stops early as soon as a pass produces no misclassification.
        """
        X = X.to(self.device)
        y = y.to(self.device).float()
        m = X.shape[0]
        self.X_sv = X
        self.alpha = torch.zeros(m, device=self.device)
        # The Gram matrix does not change during training, so build it once.
        self.K = rbf_kernel(X, X, self.c)

        for _ in range(self.epochs):
            scores = self.K @ self.alpha
            y_hat = torch.sign(scores)
            y_hat[y_hat == 0] = 1             # a zero score counts as +1
            mis = (y_hat != y)
            if not mis.any():
                # Converged: alpha can no longer change, so further epochs
                # would only recompute the same scores.
                break
            # Batch update: alpha_t += y_t for every misclassified t.
            self.alpha[mis] += y[mis]

    def decision_function(self, X):
        """
        Return the real-valued scores, shape (n,), for X: (n, d).

        Raises:
            RuntimeError: if the model has not been fitted yet.
        """
        if self.alpha is None or self.X_sv is None:
            raise RuntimeError(
                "RBFKPerceptronBinary must be fitted before it can score data."
            )
        X = X.to(self.device)
        K = rbf_kernel(X, self.X_sv, self.c)
        return K @ self.alpha

    def predict(self, X):
        """Return labels in {+1, -1} for X: (n, d); a zero score maps to +1."""
        scores = self.decision_function(X)
        y_hat = torch.sign(scores)
        y_hat[y_hat == 0] = 1
        return y_hat

# ----- OvR with Gaussian kernel -----

class RBFKPerceptronOvR:
    """
    One-versus-rest multiclass wrapper around RBFKPerceptronBinary.

    One binary classifier is trained per class label 0..classes-1 (label +1
    for that class, -1 for every other class); prediction picks the class
    whose classifier returns the largest score.
    """

    def __init__(self, classes=10, epochs=5, c=1.0, device=device):
        self.classes = classes
        self.epochs = epochs
        self.c = c
        self.device = device
        self.classifiers = []

    def fit(self, X, y):
        """Train one Gaussian-kernel perceptron per class on X with integer labels y."""
        X = X.to(self.device)
        y = y.to(self.device)
        self.classifiers = []
        for label in range(self.classes):
            # +1 for the current class, -1 for all the others
            targets = torch.where(y == label, 1, -1).float()
            binary = RBFKPerceptronBinary(
                epochs=self.epochs,
                c=self.c,
                device=self.device,
            )
            binary.fit(X, targets)
            self.classifiers.append(binary)

    def predict(self, X):
        """Return, for every row of X, the index of the highest-scoring classifier."""
        X = X.to(self.device)
        per_class = []
        for binary in self.classifiers:
            per_class.append(binary.decision_function(X))   # (n,)
        stacked = torch.stack(per_class, dim=1)              # (n, classes)
        return stacked.argmax(dim=1)

### RBFOvR Basic Results

In [25]:
# Candidate kernel widths on a logarithmic grid.
# NOTE: `runs`, `epochs` and `classes` are not set in this cell; they come
# from an earlier cell.
C_values = [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]

# Error rates indexed as [c index, run].
gaussovrbasic_train = torch.zeros(len(C_values), runs, device=device)
gaussovrbasic_test  = torch.zeros(len(C_values), runs, device=device)

for r in range(runs):
    # One fresh random split per run, shared by every c of that run.
    X_tr, y_tr, X_te, y_te, _, _ = randomsplit(X, y, train_ratio=0.8)

    for ci, c in enumerate(C_values):
        clf = RBFKPerceptronOvR(classes=classes, epochs=epochs, c=c, device=device)
        clf.fit(X_tr, y_tr)
        y_tr_pred = clf.predict(X_tr)
        y_te_pred = clf.predict(X_te)

        gaussovrbasic_train[ci, r] = (y_tr_pred != y_tr).float().mean()
        gaussovrbasic_test[ci, r]  = (y_te_pred != y_te).float().mean()

    print(f"Gaussian OvR run {r+1}/{runs} done.")

# Mean and sample standard deviation (unbiased=True) over the runs, per c.
gaussovrbasic_train_mean = gaussovrbasic_train.mean(dim=1)
gaussovrbasic_train_std  = gaussovrbasic_train.std(dim=1, unbiased=True)
gaussovrbasic_test_mean  = gaussovrbasic_test.mean(dim=1)
gaussovrbasic_test_std   = gaussovrbasic_test.std(dim=1, unbiased=True)

print("\nResults over 20 runs (OvR, Gaussian kernel)")
for ci, c in enumerate(C_values):
    print(
        f"c={c}: "
        f"train {gaussovrbasic_train_mean[ci].item():.4f} ± {gaussovrbasic_train_std[ci].item():.4f}, "
        f"test {gaussovrbasic_test_mean[ci].item():.4f} ± {gaussovrbasic_test_std[ci].item():.4f}"
    )

Results over 20 runs (OvR, Gaussian kernel)
c=0.001: train 0.6986 ± 0.0027, test 0.6952 ± 0.0107
c=0.01: train 0.2217 ± 0.0054, test 0.2274 ± 0.0100
c=0.1: train 0.0004 ± 0.0001, test 0.0456 ± 0.0049
c=1.0: train 0.0000 ± 0.0000, test 0.0510 ± 0.0050
c=10.0: train 0.0000 ± 0.0000, test 0.6950 ± 0.0102
c=100.0: train 0.0000 ± 0.0000, test 0.8234 ± 0.0091

c = 0.1 had the best results for both training and test runs. Once c ≥ 10, overfitting is evident in the perfect training results but much worse test results. 

### Gaussian CV: c* selection

In [16]:
def best_c(X_tr, y_tr, C_values, ClfClass, folds=5, epochs=5, classes=10, device=None):
    """
    Pick the kernel width c with the lowest mean k-fold validation error.

    The same random fold partition is used for every candidate value.

    Args:
        X_tr, y_tr: training data and labels to cross-validate on.
        C_values: indexable sequence of candidate c values.
        ClfClass: classifier class constructed as
            ClfClass(classes=..., epochs=..., c=..., device=...).
        folds: number of cross-validation folds.
        epochs, classes: forwarded to ClfClass.
        device: device for the fold indices and classifiers; the device of
            X_tr when omitted.
    Returns:
        (best c, tensor with the mean validation error of each candidate)
    """
    device = X_tr.device if device is None else device
    partition = kfold_indices(X_tr.shape[0], k=folds, device=device)

    def fold_error(width, held_out):
        # Train on every fold except `held_out`, validate on `held_out`.
        fit_idx = torch.cat([partition[i] for i in range(folds) if i != held_out])
        val_idx = partition[held_out]
        model = ClfClass(classes=classes, epochs=epochs, c=width, device=device)
        model.fit(X_tr[fit_idx], y_tr[fit_idx])
        wrong = model.predict(X_tr[val_idx]) != y_tr[val_idx]
        return wrong.float().mean().item()

    cv_means = []
    for width in C_values:
        per_fold = [fold_error(width, f) for f in range(folds)]
        cv_means.append(sum(per_fold) / len(per_fold))

    mean_errors_t = torch.tensor(cv_means, device=device)
    winner = int(torch.argmin(mean_errors_t))
    return C_values[winner], mean_errors_t

### RBFOvR CV Results

In [25]:
# Settings for the cross-validated Gaussian OvR experiment.
runs  = 20
epochs  = 5
classes = 10
C_values = [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]
# Per-run results: selected width c*, training error and test error.
gaussovr_best_cs = torch.zeros(runs, device=device)
gaussovrcv_train = torch.zeros(runs, device=device)
gaussovrcv_test  = torch.zeros(runs, device=device)

for r in range(runs):
    X_tr, y_tr, X_te, y_te, _, _ = randomsplit(X, y, train_ratio=0.8)
    # Choose c* by 5-fold cross-validation on the training part only.
    c_star, _ = best_c(
        X_tr, y_tr, C_values, RBFKPerceptronOvR,
        folds=5, epochs=epochs, classes=classes, device=device
    )
    gaussovr_best_cs[r] = c_star

    # Retrain on the whole training part with c* and evaluate on both parts.
    clf = RBFKPerceptronOvR(classes=classes, epochs=epochs, c=c_star, device=device)
    clf.fit(X_tr, y_tr)
    y_tr_pred = clf.predict(X_tr)
    y_te_pred = clf.predict(X_te)

    gaussovrcv_train[r] = (y_tr_pred != y_tr).float().mean()
    gaussovrcv_test[r]  = (y_te_pred != y_te).float().mean()

# Mean and sample standard deviation (unbiased=True) over the runs.
print("Gaussian CV OvR Results:")
print("c* mean ± std:", gaussovr_best_cs.mean().item(), gaussovr_best_cs.std(unbiased=True).item())
print("train err mean ± std:", gaussovrcv_train.mean().item(), gaussovrcv_train.std(unbiased=True).item())
print("test err mean ± std:",  gaussovrcv_test.mean().item(),  gaussovrcv_test.std(unbiased=True).item())

Gaussian CV OvR Results over 20 runs:
train err mean ± std: 0.00040333421202376485 0.00011540669947862625
test err mean ± std: 0.04467742517590523 0.0029234865214675665

The Gaussian CV results show much less variation than the polynomial results, for both training and test sets. Compared to the polynomial kernel, CV made a marked improvement on learning.

## One-versus-One

The OvO method turns each pair of the k classes into a binary classification problem. So for each pair of classes (a, b) with a < b, only the data with those labels are extracted and used for training, giving k(k−1)/2 classifiers in total. A vote counter is kept and the class with the most votes is selected. Although OvO has O(k^2) cost from the higher number of classifiers, each classifier is trained on fewer samples, which is better for methods such as the kernel perceptron which scale poorly with data size. OvO also tends to provide cleaner decision boundaries for more difficult pairs.

### Polynomial OvO Perceptron

In [26]:
class KernelPerceptronOvO:
    """
    One-versus-one multiclass wrapper around KernelPerceptronBinary.

    A binary classifier is trained for every pair of classes (a, b) with
    a < b, using only the samples of those two classes. At prediction time
    every pair classifier casts one vote per sample and the class with the
    most votes is returned.
    """

    def __init__(self, classes=10, epochs=5, degree=3, device=None):
        self.classes = classes
        self.epochs = epochs
        self.degree = degree
        self.device = device if device else torch.device("cpu")
        self.pair_clfs = []  # entries are (a, b, clf)

    def fit(self, X, y):
        """
        Train one classifier per class pair (a, b): class a is labelled +1,
        class b is labelled -1.
        X, y: torch tensors
        """
        X = X.to(self.device)
        y = y.to(self.device)
        self.pair_clfs = []
        pairs = [
            (a, b)
            for a in range(self.classes)
            for b in range(a + 1, self.classes)
        ]
        for a, b in pairs:
            # keep only the samples that belong to one of the two classes
            keep = (y == a) | (y == b)
            targets = torch.where(y[keep] == a, 1, -1).float()
            duel = KernelPerceptronBinary(
                epochs=self.epochs, degree=self.degree, device=self.device
            )
            duel.fit(X[keep], targets)
            self.pair_clfs.append((a, b, duel))

    def predict(self, X):
        """Return the class with the most pairwise votes for every row of X."""
        X = X.to(self.device)
        votes = torch.zeros(
            X.shape[0], self.classes, dtype=torch.long, device=self.device
        )
        for a, b, duel in self.pair_clfs:
            # +1 is a vote for a, anything else is a vote for b
            a_wins = duel.predict(X) == 1
            votes[:, a] += a_wins.long()
            votes[:, b] += (~a_wins).long()
        return votes.argmax(dim=1)

### PolyOvO Basic Results

In [27]:
# Polynomial degrees evaluated for the OvO classifier.
# NOTE: `runs`, `epochs` and `classes` are not set in this cell; they come
# from an earlier cell.
degrees = range(1, 8)

# Error rates indexed as [degree index, run].
ovobasic_train = torch.zeros(len(degrees), runs, device=device)
ovobasic_test  = torch.zeros(len(degrees), runs, device=device)

for r in range(runs):
    # One fresh random split per run, shared by every degree of that run.
    X_tr, y_tr, X_te, y_te, _, _ = randomsplit(X, y, train_ratio=0.8)

    for di, d in enumerate(degrees):
        clf = KernelPerceptronOvO(classes=classes, epochs=epochs, degree=d, device=device)
        clf.fit(X_tr, y_tr)
        y_tr_pred = clf.predict(X_tr)
        y_te_pred = clf.predict(X_te)

        ovobasic_train[di, r] = (y_tr_pred != y_tr).float().mean()
        ovobasic_test[di, r]  = (y_te_pred != y_te).float().mean()

    print(f"OvO run {r+1}/{runs} done.")

# Mean and sample standard deviation (unbiased=True) over the runs, per degree.
ovobasic_train_mean = ovobasic_train.mean(dim=1)
ovobasic_train_std  = ovobasic_train.std(dim=1, unbiased=True)
ovobasic_test_mean  = ovobasic_test.mean(dim=1)
ovobasic_test_std   = ovobasic_test.std(dim=1, unbiased=True)

print("\nResults over 20 runs (OvO, polynomial kernel)")
for di, d in enumerate(degrees):
    print(
        f"d={d}: "
        f"train {ovobasic_train_mean[di].item():.4f} ± {ovobasic_train_std[di].item():.4f}, "
        f"test {ovobasic_test_mean[di].item():.4f} ± {ovobasic_test_std[di].item():.4f}"
    )

Results over 20 runs (OvO, polynomial kernel)
d=1: train 0.1871 ± 0.0171, test 0.1936 ± 0.0212
d=2: train 0.1609 ± 0.0152, test 0.1685 ± 0.0187
d=3: train 0.1367 ± 0.0027, test 0.1509 ± 0.0100
d=4: train 0.1401 ± 0.0022, test 0.1597 ± 0.0104
d=5: train 0.1349 ± 0.0019, test 0.1633 ± 0.0105
d=6: train 0.1244 ± 0.0022, test 0.1622 ± 0.0112
d=7: train 0.1114 ± 0.0020, test 0.1590 ± 0.0102

For the basic results, OvO performed better much sooner than OvR in training, reaching the same numbers as OvR by d=3. The test results showed a similar pattern for the means, although the standard deviations were marginally higher than those of OvR.

### PolyOvO CV Results

In [28]:
# Candidate degrees for the OvO cross-validation: 1..10.
# NOTE: `runs`, `epochs` and `classes` are not set in this cell; they come
# from an earlier cell.
degrees_ovo = range(1, 11)

# Per-run results: selected degree d*, training error and test error.
ovo_best_ds = torch.zeros(runs, dtype=torch.long, device=device)
ovocv_train = torch.zeros(runs, device=device)
ovocv_test  = torch.zeros(runs, device=device)

for r in range(runs):
    X_tr, y_tr, X_te, y_te, _, _ = randomsplit(X, y, train_ratio=0.8)

    # Choose d* by 5-fold cross-validation on the training part only.
    d_star, _ = best_d(
        X_tr, y_tr, degrees_ovo,
        ClfClass=KernelPerceptronOvO,
        folds=5, epochs=epochs, classes=classes, device=device
    )
    ovo_best_ds[r] = d_star

    # Retrain on the whole training part with d* and evaluate on both parts.
    clf = KernelPerceptronOvO(classes=classes, epochs=epochs, degree=int(d_star), device=device)
    clf.fit(X_tr, y_tr)
    y_tr_pred = clf.predict(X_tr)
    y_te_pred = clf.predict(X_te)

    ovocv_train[r] = (y_tr_pred != y_tr).float().mean()
    ovocv_test[r]  = (y_te_pred != y_te).float().mean()

# Mean and sample standard deviation (unbiased=True) over the runs.
print("OvO CV Results:")
print("d* mean ± std:", ovo_best_ds.float().mean().item(), ovo_best_ds.float().std(unbiased=True).item())
print("train err mean ± std:", ovocv_train.mean().item(), ovocv_train.std(unbiased=True).item())
print("test err mean ± std:",  ovocv_test.mean().item(),  ovocv_test.std(unbiased=True).item())

OvO CV Results over 20 runs:
train err mean ± std: 0.07894594967365265 0.015663130208849907
test err mean ± std: 0.13978494703769684 0.008001072332262993

CV meant an improvement for OvO where it made little difference to OvR. The training mean is much lower for OvO, and the test mean is also slightly lower. The standard deviation for training, however, is higher than without CV, while the test standard deviation is lower with CV, a difference not seen with OvR.

# Part II

## Semi-supervised Learning via Laplacian Interpolation

### Build graph W D L

In [4]:
def build_W_3nn(X, k=3):
    """
    Build the symmetric k-nearest-neighbour adjacency matrix (k=3 by default).

    W[i, j] = 1 when x_i is one of the k nearest neighbours of x_j or x_j is
    one of the k nearest neighbours of x_i (Euclidean distance), otherwise 0.
    The diagonal is always 0.

    Args:
        X: (m, d) array-like of points.
        k: number of neighbours selected for every point.
    Returns:
        W: (m, m) float array with entries in {0, 1}.
    """
    X = np.asarray(X)
    m = X.shape[0]
    W = np.zeros((m, m), dtype=float)
    for j in range(m):
        dists = np.linalg.norm(X - X[j], axis=1)
        # Exclude the point itself explicitly instead of assuming it sorts
        # first: with duplicate points another zero-distance point may come
        # first, and the point would then pick itself and lose a neighbour.
        dists[j] = np.inf
        # A stable sort breaks distance ties by index, so the result is reproducible.
        nn_idx = np.argsort(dists, kind="stable")[:k]
        W[nn_idx, j] = 1
    # symmetrise: 'xi is k-NN of xj or xj is k-NN of xi'
    W = np.maximum(W, W.T)
    np.fill_diagonal(W, 0.0)
    return W

def laplacian(W):
    """Return the graph Laplacian D - W, where D is the diagonal matrix of the row sums of W."""
    row_sums = np.sum(W, axis=1)
    return np.diag(row_sums) - W

### Laplacian Interpolation

In [7]:
def laplacian_interpolation(Lmat, y, labelled_idx):
    """
    Laplacian (harmonic) interpolation of labels over a graph.

    The labelled entries of the result are fixed to their given labels; the
    unlabelled entries solve  L_UU v_U = -L_UL y_L, where U are the unlabelled
    and L the labelled vertices.

    Args:
        Lmat: (m, m) graph Laplacian.
        y: (m,) label array; only the entries at labelled_idx are read.
        labelled_idx: indices of the labelled vertices.
    Returns:
        v: (m,) float array of interpolated values.
    Raises:
        numpy.linalg.LinAlgError: if L_UU is singular.
    """
    m = Lmat.shape[0]
    Lset = np.array(labelled_idx)
    Uset = np.setdiff1d(np.arange(m), Lset)

    y_L = y[Lset]

    v = np.zeros(m, dtype=float)
    v[Lset] = y_L
    if len(Uset) > 0:
        # Only the two sub-blocks that appear in the linear system are extracted.
        L_UU = Lmat[np.ix_(Uset, Uset)]
        L_UL = Lmat[np.ix_(Uset, Lset)]
        v[Uset] = -np.linalg.solve(L_UU, L_UL @ y_L)
    return v

### Prediction and empirical error

In [11]:
def discrete_error(v, y, labelled_idx):
    """
    Fraction of unlabelled points whose predicted sign differs from the label.

    Predictions are sign(v) on every index that is not in labelled_idx; a
    value of exactly 0 gives the prediction 0.
    """
    unlabelled = np.setdiff1d(np.arange(len(y)), labelled_idx)
    predicted = np.sign(v[unlabelled])
    mistakes = predicted != y[unlabelled]
    return np.mean(mistakes)

### Laplacian kernel interpolation

In [15]:
def laplacian_kernel_interpolation(Lmat, y, labelled_idx):
    """
    Interpolate labels using the pseudoinverse of the Laplacian as a kernel.

    With K the sub-matrix of L^+ restricted to the labelled vertices, the
    coefficients are alpha = K^+ y_L and the returned values are
    v = L^+[:, labelled] @ alpha.

    Args:
        Lmat: (m, m) graph Laplacian.
        y: (m,) label array; only the entries at labelled_idx are read.
        labelled_idx: indices of the labelled vertices.
    Returns:
        v: (m,) array of interpolated values.
    """
    labelled = np.array(labelled_idx)
    pseudo_inverse = np.linalg.pinv(Lmat)

    # kernel matrix between the labelled vertices
    gram = pseudo_inverse[np.ix_(labelled, labelled)]
    coefficients = np.linalg.pinv(gram) @ y[labelled]

    # v_j = sum over labelled i of alpha_i * (L^+)_{j,i}
    return pseudo_inverse[:, labelled] @ coefficients

### Sampling label sets L

In [19]:
def sample_L_per_class(y, labels, ell):
    """
    Draw `ell` distinct indices for every label, without replacement.

    Args:
        y: label array.
        labels: the label values to sample for, in output order.
        ell: number of indices drawn per label.
    Returns:
        1-D array with the chosen indices of all labels concatenated.
    """
    picks = [
        np.random.choice(np.nonzero(y == c)[0], size=ell, replace=False)
        for c in labels
    ]
    return np.concatenate(picks)

In [35]:
import numpy as np

# Data files used for the semi-supervised experiment.
datasets = [
    "/datasets/sl2-data/dtrain13_50.dat",
    "/datasets/sl2-data/dtrain13_100.dat",
    "/datasets/sl2-data/dtrain13_200.dat",
    "/datasets/sl2-data/dtrain13_400.dat",
]

# Number of labelled points drawn per class.
label_sizes = [1, 2, 4, 8, 16]

for fname in datasets:
    X, y_raw = load_digit_file(fname)
    # load_digit_file returns torch tensors on `device`, while everything
    # below is NumPy code; NumPy cannot read CUDA tensors, so convert once here.
    X = X.cpu().numpy()
    y_raw = y_raw.cpu().numpy()

    # map the two digits to {-1, +1}
    unique_digits = np.unique(y_raw)
    y = np.where(y_raw == unique_digits[0], -1, 1)

    # build graph and Laplacian once per dataset
    W = build_W_3nn(X, k=3)
    Lmat = laplacian(W)

    print(f"\nDataset: {fname}")
    for ell in label_sizes:
        err_LI_runs  = []
        err_LKI_runs = []

        for run in range(20):
            # The same labelled set is used for both methods in a run.
            L_idx = sample_L_per_class(y, labels=[-1, +1], ell=ell)

            v_LI  = laplacian_interpolation(Lmat, y, L_idx)
            errLI = discrete_error(v_LI, y, L_idx)

            v_LKI  = laplacian_kernel_interpolation(Lmat, y, L_idx)
            errLKI = discrete_error(v_LKI, y, L_idx)

            err_LI_runs.append(errLI)
            err_LKI_runs.append(errLKI)

        # Mean and sample standard deviation (ddof=1) over the 20 runs.
        errLI_mean  = np.mean(err_LI_runs)
        errLI_std   = np.std(err_LI_runs, ddof=1)
        errLKI_mean = np.mean(err_LKI_runs)
        errLKI_std  = np.std(err_LKI_runs, ddof=1)

        print(
            f"  ell={ell}: "
            f"LI  {errLI_mean:.3f} ± {errLI_std:.3f}, "
            f"LKI {errLKI_mean:.3f} ± {errLKI_std:.3f}"
        )

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cb182644-878e-48cb-992b-68a78a5afe3d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>