In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import time
import os
import cvxopt
from cvxopt import matrix as cvm


In [2]:
# Load the three data files (none has a header row).
# The two-space delimiter is longer than one character, so pandas interprets it
# as a regular expression, which only the python parsing engine supports.
# Naming the engine explicitly gives the same parse without the ParserWarning
# that the automatic fallback emits.
test = pd.read_csv("data/dtest123.dat", delimiter = "  ", header = None, engine = "python")
minitrain = pd.read_csv("data/dtrain123.dat", delimiter = "  ", header = None, engine = "python")
train = pd.read_csv("data/zipcombo.dat", delimiter = " ", header = None)

# Full training set: column 0 is used as the target, columns 1..256 as features.
train = np.array(train)
xtrain = train[:,1:257]
ytrain = train[:,0]

# Smaller training set, split the same way.
one = np.array(minitrain)
xone = one[:, 1:257]
yone = one[:,0]

# Test set: column 0 is the target, every remaining column is a feature.
test = np.array(test)
xtest = test[:, 1:]
ytest = test[:,0]
def split_data(fullK, jtrain, jtest, y):
    """Cut a precomputed kernel matrix and its labels into train/test pieces.

    Parameters
    ----------
    fullK : 2-D array
        Kernel matrix over all samples.
    jtrain, jtest : index arrays (or lists)
        Sample indices that make up the training and the test part.
    y : 1-D array
        Labels, indexed the same way as the rows of ``fullK``.

    Returns
    -------
    Ktrain : array, ``fullK`` restricted to rows ``jtrain`` and columns ``jtrain``.
    ytrain : array, ``y[jtrain]``.
    Ktest : array, ``fullK`` restricted to rows ``jtrain`` and columns ``jtest``
        (training samples along the rows, test samples along the columns).
    ytest : array, ``y[jtest]``.
    """
    # Both kernel blocks share the same rows, so select them once.
    train_rows = fullK[jtrain]
    kernel_train = train_rows[:, jtrain].copy()
    kernel_test = train_rows[:, jtest].copy()
    return kernel_train, y[jtrain], kernel_test, y[jtest]

In [3]:
def K(p, q, d = 2):
    """Polynomial kernel: ``p.T.dot(q)`` raised element-wise to the power ``d``.

    Parameters
    ----------
    p, q : arrays
        Operands of the inner product; ``p`` is transposed before the product.
    d : exponent applied to the product (default 2).

    Returns
    -------
    The powered inner product (a scalar for 1-D inputs, an array otherwise).
    """
    inner = p.T.dot(q)
    return inner ** d

def GaussK(p, q, c):
    """Gaussian kernel ``exp(-c * sum((p - q)**2))``.

    Parameters
    ----------
    p, q : arrays of matching (broadcastable) shape.
    c : width parameter multiplying the squared distance.

    Returns
    -------
    A scalar: the squared differences are summed over every element.
    """
    squared_distance = np.sum(np.square(p - q))
    return np.exp(-c * squared_distance)

In [4]:
class onevsall:
    """One-vs-all multiclass classifier made of binary ``SVM`` models.

    One binary model is trained per class value ``0 .. max(ytrain)``; each sees
    the target class relabelled as +1 and every other class as -1. All models
    share the same precomputed training kernel matrix.

    Parameters
    ----------
    xtrain : array of training samples (one per row).
    ytrain : array of class labels; assumed to be non-negative integer values.
    k : precomputed kernel matrix over the training samples.
    c : box-constraint constant forwarded to each ``SVM``.
    """

    def __init__(self, xtrain, ytrain, k, c = 1):
        self.N = len(xtrain)
        self.models = []
        self.x = xtrain
        self.y = ytrain
        self.K = k
        self.c = c

    def train(self):
        """Fit one binary model per class; does nothing if models already exist."""
        self.vals = range(int(max(self.y)+1))
        if len(self.models) == 0:
            self.ytemps = []
            for val in self.vals:
                # Build the +1/-1 targets from a mask taken on the original
                # labels, so the relabelling never depends on assignment order.
                is_val = self.y == val
                ytemp = self.y.copy()
                ytemp[~is_val] = -1
                ytemp[is_val] = 1
                model = SVM(self.x, ytemp, self.K, c = self.c)
                self.models.append(model)
                self.ytemps.append(ytemp.copy())
                model.optimize()

    def predict(self, x, sign = False, full = False):
        """Predict class indices from a train-by-test kernel matrix.

        Parameters
        ----------
        x : 2-D array whose columns correspond to the samples to classify
            (it is passed unchanged to every binary model's ``predict``).
        sign : forwarded to the binary models; if True they return signs
            instead of raw decision values.
        full : if True, return the whole score matrix instead of labels.

        Returns
        -------
        If ``full`` is False, an integer array with, per sample, the index of
        the model with the highest score. If ``full`` is True, the
        ``(n_samples, n_models)`` score matrix itself.
        """
        scores = np.zeros((x.shape[1], len(self.models)))
        for i, model in enumerate(self.models):
            scores[:, i] = model.predict(x, sign)
        if full:
            # Previously this branch hit a bare ``return`` and yielded None.
            return scores
        # Take the argmax of the raw scores: clipping negatives to zero first
        # made every sample without a positive score collapse onto class 0,
        # even when another model scored it clearly higher.
        return scores.argmax(axis = 1)

In [5]:
class SVM:
    """Binary soft-margin SVM solved in the dual with cvxopt on a precomputed kernel.

    Parameters
    ----------
    xtrain : array of training samples (one per row); only used to store the
        support vectors in ``svx``.
    ytrain : array of +1/-1 targets.
    k : precomputed kernel matrix over the training samples. Required; the
        ``None`` default only exists for signature compatibility.
    c : upper bound on every dual variable.

    Raises
    ------
    TypeError
        If ``k`` is None.
    """

    def __init__(self, xtrain, ytrain, k = None, c = 1):
        if k is None:
            # Same exception type as the former ``len(None)`` failure, but with
            # a message that says what is actually missing.
            raise TypeError("SVM requires a precomputed kernel matrix `k`")
        self.K = k
        self.y = ytrain
        self.n = len(k)
        self.c = c
        self.x = xtrain

    def optimize(self):
        """Solve the dual QP and store ``alpha``, the support vectors and the bias ``b``.

        The problem handed to ``cvxopt.solvers.qp`` is: minimise
        ``1/2 a'Pa - 1'a`` with ``P = (y y') * K``, subject to ``0 <= a <= c``
        and ``y'a = 0``.
        """
        n = self.n
        # cvxopt needs double-precision, contiguous data; integer-typed labels
        # or kernels would otherwise produce an integer matrix that qp rejects.
        y = np.array(self.y, dtype = float).reshape(n)
        gram = np.asarray(self.K, dtype = float)

        P = cvm(np.outer(y, y) * gram)
        q = cvm(-np.ones(n))

        # Stacked inequality G a <= H: first n rows are -a <= 0, last n rows a <= c.
        G = cvm(np.vstack((-np.eye(n), np.eye(n))))
        H = np.zeros((2 * n, 1))
        H[n:] = self.c
        H = cvm(H)

        A = cvm(y.reshape(1, n).copy())
        b = cvm(np.zeros(1))
        self.A = A

        sol = cvxopt.solvers.qp(P, q, G, H, A, b,  options =  {"show_progress": False})
        self.alpha = np.array(sol["x"])
        # Support vectors: dual variables above a small numerical threshold.
        self.sv = np.where(self.alpha > 1e-5)[0]
        self.svx = self.x[self.sv]
        self.svy = self.y[self.sv]

        if len(self.sv) == 0:
            # No support vectors (e.g. only one class present): the kernel part
            # of the decision function is zero, so fall back to the label mean
            # instead of dividing by zero.
            self.b = float(np.mean(y))
            return

        alpha = self.alpha.flatten()
        coef = alpha[self.sv] * y[self.sv]
        # The bias is defined by the support vectors lying exactly on the
        # margin (alpha strictly below c); bound ones carry slack and would
        # skew the average. Use all support vectors only if none is free.
        free = self.sv[alpha[self.sv] < self.c - 1e-5]
        ref = free if len(free) else self.sv
        kernel_part = coef.dot(gram[np.ix_(self.sv, ref)])
        self.b = float(np.mean(y[ref] - kernel_part))

    def predict(self, ktest, sign = False):
        """Evaluate the decision function ``sum_i alpha_i y_i K(x_i, x) + b``.

        Parameters
        ----------
        ktest : 2-D array with one row per training sample and one column per
            sample to score.
        sign : if True, return ``np.sign`` of the decision values.

        Returns
        -------
        1-D array with one value per column of ``ktest``.
        """
        coef = self.alpha[self.sv].flatten() * self.svy.flatten()
        # The bias computed in ``optimize`` belongs to the decision function;
        # it was previously left out here.
        yhat = coef.dot(ktest[self.sv]) + self.b

        if sign:
            return np.sign(yhat)
        return yhat


1

In [6]:
class onevsone:
    """One-vs-one multiclass classifier made of pairwise binary ``SVM`` models.

    For every class pair ``i < j`` a binary model is trained on the samples of
    those two classes only, with class ``i`` as +1 and class ``j`` as -1.

    Parameters
    ----------
    xtrain : array of training samples (one per row).
    ytrain : array of class labels; assumed to be non-negative integer values.
    k : precomputed kernel matrix over the training samples.
    c : box-constraint constant forwarded to each ``SVM``.
    """

    def __init__(self, xtrain, ytrain, k, c = 1):
        self.x = xtrain
        self.y = ytrain
        self.K = k
        self.c = c
        self.models = []
        self.n = len(xtrain)

    def train(self):
        """Fit one binary model per class pair and remember each pair's sample indices."""
        self.m = int(max(self.y)+1)
        self.vals = range(self.m)

        def empty_grid():
            # m x m table of None; only the cells with i < j get filled.
            return [self.m * [None] for _ in range(self.m)]

        self.Ks = empty_grid()
        self.ijs = empty_grid()
        self.ys = empty_grid()
        self.models = empty_grid()
        for i in range(self.m):
            for j in range(i+1, self.m):
                # Training samples belonging to either class of the pair.
                ij = np.where((self.y==i)|(self.y==j))[0]
                self.ijs[i][j] = ij.copy()
                ktemp = self.K[ij][:,ij]
                labels = self.y[ij]
                ytemp = labels.copy()
                ytemp[labels==j] = -1
                ytemp[labels==i] = 1
                self.models[i][j] = SVM(self.x[ij], ytemp, ktemp, self.c)
                self.models[i][j].optimize()

    def predict(self, K):
        """Predict class indices from a train-by-test kernel matrix.

        Parameters
        ----------
        K : 2-D array with one row per training sample and one column per
            sample to classify.

        Returns
        -------
        Integer array with one class index per column of ``K``.
        """
        n = K.shape[1]
        preds = np.zeros((n, self.m, self.m))
        for i in range(self.m):
            for j in range(i+1, self.m):
                # Each pairwise model only knows the kernel rows of its own samples.
                preds[:,i,j] = self.models[i][j].predict(K[self.ijs[i][j]], sign = False)
        # Class i gains the decision value of every pair (i, j) in which it is
        # the positive class and loses that of every pair (k, i) in which it is
        # the negative class. The score table has one column per class; it was
        # previously allocated as (n, n), which raised IndexError whenever
        # there were fewer test samples than classes.
        scores = preds.sum(axis = 2) - preds.sum(axis = 1)
        return np.argmax(scores, axis = 1)
    
                

In [7]:
class bag:
    """Voting ensemble of ``onevsall`` classifiers trained on contiguous slices.

    The training set is cut into ``n`` consecutive, nearly equal slices; one
    ``onevsall`` model is trained per slice and predictions are combined by
    majority vote.

    Parameters
    ----------
    n : number of slices (and therefore of sub-models).
    xtrain : array of training samples (one per row).
    ytrain : array of class labels.
    k : precomputed kernel matrix over the training samples.
    c : box-constraint constant forwarded to every sub-model.
    """

    def __init__(self, n, xtrain, ytrain, k, c=1):
        self.nsplits= n
        self.x = xtrain
        self.y = ytrain
        self.K = k
        self.c = c
        self.n = len(xtrain)

    def train(self):
        """Train one ``onevsall`` model per slice of the training data."""
        self.models = []
        # Slice boundaries: nsplits + 1 evenly spaced integer cut points in [0, n].
        self.splits = np.linspace(0, self.n, self.nsplits + 1).astype("int")
        for i in range(1, self.nsplits+1):
            n, nprev = self.splits[i], self.splits[i-1]
            xtemp = self.x[nprev:n]
            ytemp = self.y[nprev:n]
            ktemp = self.K[nprev:n][:,nprev:n]
            self.models.append(onevsall(xtemp, ytemp, ktemp, c = self.c))
            self.models[-1].train()

    def predict(self, ktest, full = False):
        """Combine the sub-models' predictions by voting.

        Parameters
        ----------
        ktest : 2-D array with one row per training sample and one column per
            sample to classify; each sub-model receives the rows of its slice.
        full : if True, return the vote counts instead of the winning labels.

        Returns
        -------
        If ``full`` is False, a float array with the most-voted label per
        sample. If ``full`` is True, an ``(n_samples, n_classes)`` array of vote
        counts, where ``n_classes`` is at least 10 and grows when a larger
        label receives a vote.
        """
        ntest = ktest.shape[1]
        votes = np.zeros((ntest, self.nsplits), dtype = "int")
        for i in range(1, self.nsplits+1):
            n, nprev = self.splits[i], self.splits[i-1]
            votes[:,i-1] = self.models[i-1].predict(ktest[nprev:n])
        if full:
            # Keep the historical width of 10 columns, but widen the table when
            # a vote exceeds label 9; a fixed width made the row assignment
            # below fail with a shape mismatch in that case.
            n_classes = 10
            if votes.size:
                n_classes = max(n_classes, int(votes.max()) + 1)
            yhat = np.zeros((ntest, n_classes))
            for i in range(ntest):
                yhat[i] = np.bincount(votes[i], minlength = n_classes)
            return yhat
        yhat = np.zeros(ntest)
        for i in range(ntest):
            yhat[i] = np.bincount(votes[i]).argmax()
        return yhat