In [12]:
########################
# Kernel SVM (dual)
########################

# A kernel is a function that calculates the inner product of data in a higher-dimensional feature space without explicitly performing the mapping
# K(x,z) is a kernel function
# it returns the inner product of x and z in some feature space phi
# K(x,z) = phi(x) . phi(z)
# phi itself is never constructed

# dual svm only depends on inner products between data points
# because w = sum_i alpha_i * y_i * phi(x_i)
# substituting this into ||w||^2 and the margin removes w completely
# everything reduces to pairwise dot products

# kernel trick:
# replace (x_i . x_j) with K(x_i, x_j)
# optimization over alpha stays exactly the same

# linear kernel:
# K(x,z) = x . z
# phi(x) = x
# same as linear svm

# polynomial kernel:
# K(x,z) = (x . z + c)^d
# equivalent to explicit polynomial features
# feature space is finite but large
# kernel avoids writing phi(x) manually

# rbf (radial basis function) / gaussian kernel:
# K(x,z) = exp(-||x - z||^2 / (2σ^2))
# corresponds to an infinite dimensional phi(x)
# a space which contains all polynomial features of all degrees
# phi(x) cannot be written or stored explicitly
# only K(x,z) can be evaluated

# important:
# kernel svm does not add features explicitly
# it changes the geometry by redefining similarity
# linear separation happens in phi-space, not x-space

# decision function:
# same idea as in the linear dual SVM: everything is expressed through pairwise similarities between points
# (K(x_i, x) instead of the dot product x_i . x), so explicit feature-space coordinates are never needed
# f(x) = sum_i alpha_i * y_i * K(x_i, x) + b
# an explicit w in input space only exists for the linear kernel
# for rbf / poly, w lives in phi-space and is never formed; it is only used implicitly through K

# soft margin:
# alpha_i constrained to [0, C]
# allows margin violations
# bounds influence of individual points
# kernel increases capacity but margin still controls generalization


In [None]:
import numpy as np

def linear_kernel(x, z):
    """Linear kernel: the plain inner product K(x, z) = x . z.

    Args:
        x: First input vector.
        z: Second input vector.

    Returns:
        The result of ``np.dot(x, z)``.
    """
    inner_product = np.dot(x, z)
    return inner_product

def polynomial_kernel(x, z, degree=2, coeff=1.0):
    """Polynomial kernel: K(x, z) = (x . z + coeff) ** degree.

    Args:
        x: First input vector.
        z: Second input vector.
        degree: Exponent applied to the shifted inner product.
        coeff: Constant added to the inner product before exponentiation.

    Returns:
        The kernel value for the pair (x, z).
    """
    shifted = np.dot(x, z) + coeff
    return shifted ** degree

def rbf_kernel(x, z, sigma=1.0):
    """Gaussian (RBF) kernel: K(x, z) = exp(-||x - z||^2 / (2 * sigma^2)).

    Args:
        x: First input vector (must support ``x - z``).
        z: Second input vector.
        sigma: Bandwidth of the Gaussian.

    Returns:
        The kernel value for the pair (x, z).
    """
    squared_distance = np.linalg.norm(x - z) ** 2
    bandwidth = 2 * sigma ** 2
    return np.exp(-squared_distance / bandwidth)

def kernel_matrix(X, kernel):
    """Build the Gram matrix of pairwise kernel evaluations.

    Args:
        X: Array whose first axis indexes the samples.
        kernel: Callable ``kernel(a, b)`` evaluated on pairs of rows of X.

    Returns:
        A float array ``G`` of shape (n, n) with ``G[i, j] = kernel(X[i], X[j])``,
        where n is ``X.shape[0]``.
    """
    size = X.shape[0]
    gram = np.zeros((size, size))
    # np.ndindex walks (row, col) pairs in row-major order, i.e. the same
    # order as two nested range() loops.
    for row, col in np.ndindex(size, size):
        gram[row, col] = kernel(X[row], X[col])
    return gram

def dual_svm_kernel(X,Y,kernel,C=None,step_size=1e-3,cycles=20000):
    """Fit the dual coefficients of a kernel SVM with a simple coordinate-wise update.

    Each cycle sweeps over the samples once. For sample i the functional margin
    ``Y[i] * (sum_j alpha[j] * Y[j] * K[j, i] + b)`` is evaluated; if it is
    below 1, ``alpha[i]`` is increased by ``step_size * (1 - margin)`` and then
    projected onto ``[0, C]`` (or ``[0, inf)`` when C is None). After each sweep
    the bias is re-estimated as the mean of ``Y[s] - sum_j alpha[j]*Y[j]*K[j, s]``
    over all samples s with ``alpha[s] > 1e-6`` (this includes samples whose
    alpha sits at the C bound).

    Note: this update rule never decreases an alpha and does not enforce the
    equality constraint ``sum_i alpha[i] * Y[i] = 0``.

    Args:
        X: Training samples; the first axis indexes samples.
        Y: One label per sample, shape ``(n_samples,)``.
        kernel: Callable ``kernel(a, b)`` applied to pairs of rows of X.
        C: Upper bound on each alpha (soft margin); None means no upper bound.
        step_size: Multiplier for the alpha increment.
        cycles: Maximum number of sweeps over the training set.

    Returns:
        Tuple ``(alpha, b)``: the coefficient array of shape ``(n_samples,)``
        and the bias.

    Raises:
        ValueError: If Y does not have shape ``(n_samples,)``.
    """
    # Accept plain lists as well as arrays (boolean-mask indexing needs arrays).
    X = np.asarray(X)
    Y = np.asarray(Y)

    n_samples = X.shape[0]
    if Y.shape != (n_samples,):
        raise ValueError(
            f"Y must have shape ({n_samples},) to match X, got {Y.shape}"
        )

    alpha = np.zeros(n_samples)
    b = 0.0
    K = kernel_matrix(X, kernel)

    for _ in range(cycles):
        alpha_changed = False
        for i in range(n_samples):
            margin = Y[i] * (np.sum(alpha * Y * K[:, i]) + b)
            if margin < 1:
                updated = alpha[i] + step_size * (1 - margin)
                if C is not None:
                    updated = np.clip(updated, 0, C)
                else:
                    updated = max(updated, 0)
                if updated != alpha[i]:
                    alpha_changed = True
                alpha[i] = updated

        sv = alpha > 1e-6
        if np.any(sv):
            # (alpha * Y) @ K[:, sv] gives, for every support vector s,
            # sum_j alpha[j] * Y[j] * K[j, s]. An elementwise product here
            # would broadcast the (n,) vector across the wrong axis of the
            # (n, n_sv) slice and fail (or be wrong) whenever n_sv != n.
            b = np.mean(Y[sv] - (alpha * Y) @ K[:, sv])

        # If a full sweep left alpha untouched, b was just recomputed from the
        # same alpha as in the previous cycle, so every further cycle would be
        # an identical no-op; stop early.
        if not alpha_changed:
            break

    return alpha, b
                     
# Training produces alpha and b; the training points with alpha > 0 are the support vectors (SVs).
# To score a new point, the kernel measures how similar it is to each SV.
def decision_function(X_train, Y, alpha, b, kernel, x):
    """Evaluate f(x) = sum_i alpha_i * y_i * K(x_i, x) + b over the support vectors.

    Args:
        X_train: Training samples; the first axis indexes samples.
        Y: Training labels, indexable with a boolean mask.
        alpha: Dual coefficients, one per training sample.
        b: Bias term.
        kernel: Callable ``kernel(a, b)`` applied to (support vector, x).
        x: The point to score.

    Returns:
        The decision value for x.
    """
    # Only samples with a non-negligible alpha contribute to the sum.
    is_support = alpha > 1e-6
    weights = alpha[is_support] * Y[is_support]
    similarities = np.array([kernel(support_vec, x) for support_vec in X_train[is_support]])
    return np.sum(weights * similarities) + b


In [14]:
# Linearly separable toy data: four points labelled -1 and four labelled +1.
X = np.array([
    [1.0, 1.0],
    [1.2, 0.8],
    [0.8, 1.3],
    [1.1, 1.4],
    [3.0, 3.1],
    [2.8, 2.9],
    [3.2, 2.7],
    [3.1, 3.3],
])

Y = np.array([-1] * 4 + [1] * 4)

# Hard-margin fit: C is left at its default (None), so alpha is only bounded below by 0.
alpha_h, b_h = dual_svm_kernel(X, Y, kernel=linear_kernel)

# y_i * f(x_i) for every training point; a positive entry means the point is on the correct side.
check_h = np.array([
    label * decision_function(X, Y, alpha_h, b_h, linear_kernel, point)
    for point, label in zip(X, Y)
])

print("================= SEPARABLE: LINEAR KERNEL ====================")
print(f"b : {b_h:.4f}")
print(f"alpha : {alpha_h}")
print(f"check : {check_h}")


b : -2.5376
alpha : [0.05415677 0.05498561 0.059014   0.25710127 0.06070088 0.20068304
 0.07499278 0.04230803]
check : [1.29514513 1.27763723 1.2549085  0.99766852 1.24745975 0.99897372
 1.14947859 1.42944731]


In [15]:
# XOR layout: opposite corners of the unit square share a label.
X = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])

Y = np.array([-1, 1, 1, -1])

# Soft-margin fit with the RBF kernel (default sigma); every alpha is capped at C = 1.0.
alpha_s, b_s = dual_svm_kernel(X, Y, kernel=rbf_kernel, C=1.0)

# y_i * f(x_i) for every training point; a positive entry means the point is on the correct side.
check_s = np.array([
    label * decision_function(X, Y, alpha_s, b_s, rbf_kernel, point)
    for point, label in zip(X, Y)
])

print("\n================= XOR: RBF KERNEL ====================")
print(f"b : {b_s:.4f}")
print(f"alpha : {alpha_s}")
print(f"check : {check_s}")



b : 0.0000
alpha : [1. 1. 1. 1.]
check : [0.15481812 0.15481812 0.15481812 0.15481812]
