In [2]:
import pandas as pd
import numpy as np

from warnings import filterwarnings
from sklearn import svm
from scipy.spatial.distance import pdist, squareform
from scipy import exp
from scipy.linalg import eigh
from sklearn.metrics import accuracy_score

# Disable warnings from being printed
filterwarnings('ignore')

In [3]:
# Get the train and validation data
train = pd.read_csv("arcene_train.data.txt", header=None, sep=" ", usecols=range(10000))
train_labels = pd.read_csv("arcene_train.labels.txt", header=None)
valid = pd.read_csv("arcene_valid.data.txt", header=None, sep=" ", usecols=range(10000))
valid_labels = pd.read_csv("arcene_valid.labels.txt", header=None)

#train = pd.read_csv("madelon_train.data.txt", header=None, sep=" ", usecols=range(500))
#train_labels = pd.read_csv("madelon_train.labels.txt", header=None)
#valid = pd.read_csv("madelon_valid.data.txt", header=None, sep=" ", usecols=range(500))
#valid_labels = pd.read_csv("madelon_valid.labels.txt", header=None)

In [4]:
def KLDA(X, X_labels, lmb):
    # Calculating the squared Euclidean distances for every pair of points
    # in the MxN dimensional dataset.
    sq_dists = pdist(X, 'sqeuclidean')

    # Converting the pairwise distances into a symmetric MxM matrix.
    mat_sq_dists = squareform(sq_dists)

    # Computing the MxM RBF kernel matrix.
    
    # For RBF kernel
    # K = exp(-gamma * mat_sq_dists)
      
    # For linear kernel
    K = X.dot(X.T)
    
    Karr = np.array(K, dtype=np.float)
    yarr = np.array(X_labels, dtype=np.int)

    
    labels = np.unique(yarr)
    n = yarr.shape[0]

    idx1 = np.where(yarr==labels[0])[0]
    idx2 = np.where(yarr==labels[1])[0]
    n1 = idx1.shape[0]
    n2 = idx2.shape[0]
    
    K1, K2 = Karr[:, idx1], Karr[:, idx2]
    
    N1 = np.dot(np.dot(K1, np.eye(n1) - (1 / float(n1))), K1.T)
    N2 = np.dot(np.dot(K2, np.eye(n2) - (1 / float(n2))), K2.T)
    N = N1 + N2 + np.diag(np.repeat(lmb, n))

    M1 = np.sum(K1, axis=1) / float(n1)
    M2 = np.sum(K2, axis=1) / float(n2)
    M = M1 - M2
    
    coeff = np.linalg.solve(N, M).reshape(-1, 1)
            
    return coeff

In [5]:
lmb = 1e-3

In [31]:
def project(data, X, coeff):
    projected_data = np.zeros((data.shape[0], 1))
    X_arr = np.array(X)
    data_arr = np.array(data)
    for i in range(data_arr.shape[0]):
        cur_dist = np.array([np.sum((data_arr[i]-x)**2) for x in X_arr])
        cur_k = cur_dist
        projected_data[i, :] = cur_k.dot(coeff)
    return projected_data    

In [32]:
coeff = KLDA(train, train_labels, lmb)
projected_valid = project(valid, train, coeff)
projected_train = project(train, train, coeff)
clf = svm.SVC(kernel="linear", max_iter=1000000)
clf.fit(projected_train, train_labels)
results = clf.predict(projected_valid)
print(accuracy_score(valid_labels, results))

[ -2.19556872e+12]
[ -1.89851594e+12]
[ -2.54353204e+12]
[ -2.17921390e+12]
[ -1.65327946e+12]
[ -1.85520114e+12]
[ -2.87900071e+12]
[ -1.41225791e+12]
[ -3.11996279e+12]
[ -2.51361226e+12]
[ -2.16707661e+12]
[ -2.98105144e+12]
[ -3.15408741e+12]
[ -1.99722525e+12]
[ -1.95818500e+12]
[ -2.19369099e+12]
[ -2.73247921e+12]
[ -1.92275634e+12]
[ -2.10577020e+12]
[ -2.80538383e+12]
[ -1.61504764e+12]
[ -3.45839247e+12]
[ -1.77701085e+12]
[ -2.63457941e+12]
[ -2.01596636e+12]
[ -2.06263458e+12]
[ -2.96126451e+12]
[ -1.68869028e+12]
[ -1.93390231e+12]
[ -2.58209368e+12]
[ -3.47608463e+12]
[ -2.48132578e+12]
[ -2.44480675e+12]
[ -1.75594691e+12]
[ -2.24775983e+12]
[ -3.46632150e+12]
[ -2.31869965e+12]
[ -1.97380680e+12]
[ -1.91834432e+12]
[ -2.73610269e+12]
[ -2.32240384e+12]
[ -2.42227368e+12]
[ -2.74720814e+12]
[ -2.42616603e+12]
[ -1.96490338e+12]
[ -2.17718797e+12]
[ -2.20545107e+12]
[ -2.40332817e+12]
[ -2.06020440e+12]
[ -3.32122839e+12]
[ -2.53070478e+12]
[ -2.58455540e+12]
[ -1.8625894