In [14]:
# This class manages dataset that will be used to train SVM
class SVMDataset :
    def __init__(self) :
        import numpy
        self.np = numpy
        
    # Load dataset from file, preprocess and normalize
    def load_dataset(self, file) :
        dataset = self.np.genfromtxt(file, delimiter=',', dtype="|S50", autostrip=True)
        train = self.sanitize(dataset)
        self.calc_mean_std(train)
        self.normalize_train(train)
        self.dataset = train
        self.np.random.shuffle(self.dataset)
    
    # Calculate mean and standard variance of each feature
    def calc_mean_std(self, train) :
        self.mean = self.np.empty(14)
        self.std = self.np.empty(14)
        for i in xrange(14) :
            self.mean[i] = self.np.mean(train[:,i])
            self.std[i] = self.np.std(train[:,i])
        
    # Normalize data using mean and standard deviation of features
    def normalize_train(self, train) :
        for i in xrange(len(train)) :
            for j in xrange(14) :
                train[i][j] = (train[i][j] - self.mean[j])/self.std[j]
     
    # Replace strings with a number
    def sanitize(self, dataset) :
        self.replacer = []
        for i in xrange(len(dataset[0])) :
            u, indices = self.np.unique(dataset[:, i], return_inverse=True)
            self.replacer.append(u[self.np.argmax(self.np.bincount(indices))])
        for i in xrange(len(dataset[0])) :
            dataset[:,i][dataset[:,i] == '?'] = self.replacer[i]
        data = self.np.zeros((len(dataset), len(dataset[0])), dtype=self.np.float)
        data[:,0] = dataset[:,0].astype(self.np.float)
        self.col1dict, data[:,1] = self.np.unique(dataset[:, 1], return_inverse=True)
        data[:,2] = dataset[:,2].astype(self.np.float)
        self.col3dict, data[:,3] = self.np.unique(dataset[:, 3], return_inverse=True)
        data[:,4] = dataset[:,4].astype(self.np.float)
        self.col5dict, data[:,5] = self.np.unique(dataset[:, 5], return_inverse=True)
        self.col6dict, data[:,6] = self.np.unique(dataset[:, 6], return_inverse=True)
        self.col7dict, data[:,7] = self.np.unique(dataset[:, 7], return_inverse=True)
        self.col8dict, data[:,8] = self.np.unique(dataset[:, 8], return_inverse=True)
        self.col9dict, data[:,9] = self.np.unique(dataset[:, 9], return_inverse=True)
        data[:,10] = dataset[:,10].astype(self.np.float)
        data[:,11] = dataset[:,11].astype(self.np.float)
        data[:,12] = dataset[:,12].astype(self.np.float)
        self.col13dict, data[:,13] = self.np.unique(dataset[:, 13], return_inverse=True)
        data[:,14] = dataset[:,14].astype(self.np.float)
        return data
    
    # Replace strings with numbers in validation and test set
    # The number to be used for replacing is decided in above function itself
    def sanitize_valid(self, dataset) :
        data = self.np.zeros((len(dataset), len(dataset[0])), dtype=self.np.float)
        for i in xrange(len(dataset[0])) :
            dataset[dataset[:, i] == '?', i] = self.replacer[i]
        data[:,0] = dataset[:,0].astype(self.np.float)
        for i in xrange(len(self.col1dict)) :
            data[:,1][dataset[:,1] == self.col1dict[i]] = i
        data[:,2] = dataset[:,2].astype(self.np.float)
        for i in xrange(len(self.col3dict)) :
            data[:,3][dataset[:,3] == self.col3dict[i]] = i
        data[:,4] = dataset[:,4].astype(self.np.float)
        for i in xrange(len(self.col5dict)) :
            data[:,5][dataset[:,5] == self.col5dict[i]] = i
        for i in xrange(len(self.col6dict)) :
            data[:,6][dataset[:,6] == self.col6dict[i]] = i
        for i in xrange(len(self.col7dict)) :
            data[:,7][dataset[:,7] == self.col7dict[i]] = i
        for i in xrange(len(self.col8dict)) :
            data[:,8][dataset[:,8] == self.col8dict[i]] = i
        for i in xrange(len(self.col9dict)) :
            data[:,9][dataset[:,9] == self.col9dict[i]] = i
        data[:,10] = dataset[:,10].astype(self.np.float)
        data[:,11] = dataset[:,11].astype(self.np.float)
        data[:,12] = dataset[:,12].astype(self.np.float)
        for i in xrange(len(self.col13dict)) :
            data[:,13][dataset[:,13] == self.col13dict[i]] = i
        return data

In [15]:
import numpy as np

# Linear kernel
def linear_kernel(X1, X2) :
    return np.dot(X1, X2.T)

# Polynomial kenel of degree 2
def polynomial_kernel(X1, X2, q) :
    return (np.dot(X1, X2.T)+1)**q

# Gaussian kernel with s as parameter
def gaussian_kernel(X1, X2, s) :
    gram_matrix = np.zeros((X1.shape[0], X2.shape[0]))
    for i, x1 in enumerate(X1):
        for j, x2 in enumerate(X2):
            gram_matrix[i, j] = np.exp(- np.linalg.norm(x1.flatten() - x2.flatten())**2 / 2*float(s**2) )
    return gram_matrix

# Calculate accuracy of the predicted data
def get_accuracy(test_result_y, test_y):
    correct = 0;
    for i in xrange(len(test_y)):
        if test_y[i] == test_result_y[i]:
            correct += 1
    return correct, float(correct)/len(test_y)


In [17]:
import numpy as np
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# Initialize dataset from file
s = SVMDataset()
s.load_dataset('data/train.csv')

# Seperate X (input) and y (output) from dataset
X = s.dataset[:,:-1]
y = s.dataset[:,-1]
X = X[:3000,:]
y = y[:3000]

# Do stratified k fold splitting for cross validation
skf = StratifiedKFold(n_splits=5)

In [18]:
# Initialize SVM with custom kernel
svc_linear = SVC(kernel='precomputed')
svc_polynomial = SVC(kernel='precomputed')
svc_gaussian = SVC(kernel='precomputed')

In [19]:
accuracy_linear_list = []

# For each cross validation set run linear kernel SVM
for train_index, test_index in skf.split(X, y) :
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train_linear = linear_kernel(X_train, X_train) 
    svc_linear.fit(kernel_train_linear, y_train)
    # Test
    kernel_test_linear = linear_kernel(X_test, X_train)
    y_pred_linear = svc_linear.predict(kernel_test_linear)
    # Calculate Accuracy
    _, accuracy_linear = get_accuracy(y_pred_linear, y_test)
    accuracy_linear_list.append(accuracy_linear)
    print 'Accuracy for linear kernel: ', accuracy_linear

# Calculate Average Accuracy
print "Average accuracy for Linear Kernel : ", np.mean(accuracy_linear_list)

Accuracy for linear kernel:  0.826955074875
Accuracy for linear kernel:  0.838602329451
Accuracy for linear kernel:  0.825
Accuracy for linear kernel:  0.818030050083
Accuracy for linear kernel:  0.797996661102
Average accuracy for Linear Kernel :  0.821316823102


In [20]:
accuracy_polynomial_list = []

# For each cross validation set run polynomial kernel SVM
for train_index, test_index in skf.split(X, y) :
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train_polynomial = polynomial_kernel(X_train, X_train, 2) 
    svc_polynomial.fit(kernel_train_polynomial, y_train)
    # Test
    kernel_test_polynomial = polynomial_kernel(X_test, X_train, 2)
    y_pred_polynomial = svc_polynomial.predict(kernel_test_polynomial)
    # Calculate Accuracy
    _, accuracy_polynomial = get_accuracy(y_pred_polynomial, y_test)
    accuracy_polynomial_list.append(accuracy_polynomial)
    print 'Accuracy for Polynomial kernel: ', accuracy_polynomial
    
# Calculate Average Accuracy
print "Average accuracy for Polynomial Kernel : ", np.mean(accuracy_polynomial_list)

Accuracy for Polynomial kernel:  0.865224625624
Accuracy for Polynomial kernel:  0.835274542429
Accuracy for Polynomial kernel:  0.836666666667
Accuracy for Polynomial kernel:  0.85141903172
Accuracy for Polynomial kernel:  0.828046744574
Average accuracy for Polynomial Kernel :  0.843326322203


In [23]:
accuracy_polynomial_list = []

# For each cross validation set run polynomial kernel SVM
for train_index, test_index in skf.split(X, y) :
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train_polynomial = polynomial_kernel(X_train, X_train, 3) 
    svc_polynomial.fit(kernel_train_polynomial, y_train)
    # Test
    kernel_test_polynomial = polynomial_kernel(X_test, X_train, 3)
    y_pred_polynomial = svc_polynomial.predict(kernel_test_polynomial)
    # Calculate Accuracy
    _, accuracy_polynomial = get_accuracy(y_pred_polynomial, y_test)
    accuracy_polynomial_list.append(accuracy_polynomial)
    print 'Accuracy for Polynomial kernel: ', accuracy_polynomial
    
# Calculate Average Accuracy
print "Average accuracy for Polynomial Kernel : ", np.mean(accuracy_polynomial_list)

Accuracy for Polynomial kernel:  0.793677204659
Accuracy for Polynomial kernel:  0.777038269551
Accuracy for Polynomial kernel:  0.771666666667
Accuracy for Polynomial kernel:  0.786310517529
Accuracy for Polynomial kernel:  0.766277128548
Average accuracy for Polynomial Kernel :  0.778993957391


In [24]:
accuracy_polynomial_list = []

# For each cross validation set run polynomial kernel SVM
for train_index, test_index in skf.split(X, y) :
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train_polynomial = polynomial_kernel(X_train, X_train, 4) 
    svc_polynomial.fit(kernel_train_polynomial, y_train)
    # Test
    kernel_test_polynomial = polynomial_kernel(X_test, X_train, 4)
    y_pred_polynomial = svc_polynomial.predict(kernel_test_polynomial)
    # Calculate Accuracy
    _, accuracy_polynomial = get_accuracy(y_pred_polynomial, y_test)
    accuracy_polynomial_list.append(accuracy_polynomial)
    print 'Accuracy for Polynomial kernel: ', accuracy_polynomial
    
# Calculate Average Accuracy
print "Average accuracy for Polynomial Kernel : ", np.mean(accuracy_polynomial_list)

Accuracy for Polynomial kernel:  0.780366056572
Accuracy for Polynomial kernel:  0.765391014975
Accuracy for Polynomial kernel:  0.763333333333
Accuracy for Polynomial kernel:  0.74958263773
Accuracy for Polynomial kernel:  0.726210350584
Average accuracy for Polynomial Kernel :  0.756976678639


In [25]:
accuracy_gaussian_list = []

# For each cross validation set gaussian kernel SVM
for train_index, test_index in skf.split(X, y) :
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train_gaussian = gaussian_kernel(X_train, X_train, 0.5)
    svc_gaussian.fit(kernel_train_gaussian, y_train)
    # Test
    kernel_test_gaussian = gaussian_kernel(X_test, X_train, 0.5)
    y_pred_gaussian = svc_gaussian.predict(kernel_test_gaussian)
    # Calculate Accuracy
    _, accuracy_gaussian = get_accuracy(y_pred_gaussian, y_test)
    accuracy_gaussian_list.append(accuracy_gaussian)
    print 'Accuracy for Gaussian kernel: ', accuracy_gaussian
    
# Calculate Average Accuracy
print "Average accuracy for Gaussian Kernel : ", np.mean(accuracy_gaussian_list)

Accuracy for Gaussian kernel:  0.860232945092
Accuracy for Gaussian kernel:  0.833610648918
Accuracy for Gaussian kernel:  0.835
Accuracy for Gaussian kernel:  0.841402337229
Accuracy for Gaussian kernel:  0.819699499165
Average accuracy for Gaussian Kernel :  0.837989086081


In [26]:
accuracy_gaussian_list = []

# For each cross validation set gaussian kernel SVM
for train_index, test_index in skf.split(X, y) :
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train_gaussian = gaussian_kernel(X_train, X_train, 0.25)
    svc_gaussian.fit(kernel_train_gaussian, y_train)
    # Test
    kernel_test_gaussian = gaussian_kernel(X_test, X_train, 0.25)
    y_pred_gaussian = svc_gaussian.predict(kernel_test_gaussian)
    # Calculate Accuracy
    _, accuracy_gaussian = get_accuracy(y_pred_gaussian, y_test)
    accuracy_gaussian_list.append(accuracy_gaussian)
    print 'Accuracy for Gaussian kernel: ', accuracy_gaussian
    
# Calculate Average Accuracy
print "Average accuracy for Gaussian Kernel : ", np.mean(accuracy_gaussian_list)

Accuracy for Gaussian kernel:  0.83693843594
Accuracy for Gaussian kernel:  0.83693843594
Accuracy for Gaussian kernel:  0.831666666667
Accuracy for Gaussian kernel:  0.836393989983
Accuracy for Gaussian kernel:  0.819699499165
Average accuracy for Gaussian Kernel :  0.832327405539


In [27]:
accuracy_gaussian_list = []

# For each cross validation set gaussian kernel SVM
for train_index, test_index in skf.split(X, y) :
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train_gaussian = gaussian_kernel(X_train, X_train, 0.75)
    svc_gaussian.fit(kernel_train_gaussian, y_train)
    # Test
    kernel_test_gaussian = gaussian_kernel(X_test, X_train, 0.75)
    y_pred_gaussian = svc_gaussian.predict(kernel_test_gaussian)
    # Calculate Accuracy
    _, accuracy_gaussian = get_accuracy(y_pred_gaussian, y_test)
    accuracy_gaussian_list.append(accuracy_gaussian)
    print 'Accuracy for Gaussian kernel: ', accuracy_gaussian
    
# Calculate Average Accuracy
print "Average accuracy for Gaussian Kernel : ", np.mean(accuracy_gaussian_list)

Accuracy for Gaussian kernel:  0.848585690516
Accuracy for Gaussian kernel:  0.821963394343
Accuracy for Gaussian kernel:  0.828333333333
Accuracy for Gaussian kernel:  0.821368948247
Accuracy for Gaussian kernel:  0.816360601002
Average accuracy for Gaussian Kernel :  0.827322393488


In [28]:
accuracy_gaussian_list = []

# For each cross validation set gaussian kernel SVM
for train_index, test_index in skf.split(X, y) :
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train_gaussian = gaussian_kernel(X_train, X_train, 0.6)
    svc_gaussian.fit(kernel_train_gaussian, y_train)
    # Test
    kernel_test_gaussian = gaussian_kernel(X_test, X_train, 0.6)
    y_pred_gaussian = svc_gaussian.predict(kernel_test_gaussian)
    # Calculate Accuracy
    _, accuracy_gaussian = get_accuracy(y_pred_gaussian, y_test)
    accuracy_gaussian_list.append(accuracy_gaussian)
    print 'Accuracy for Gaussian kernel: ', accuracy_gaussian
    
# Calculate Average Accuracy
print "Average accuracy for Gaussian Kernel : ", np.mean(accuracy_gaussian_list)

Accuracy for Gaussian kernel:  0.855241264559
Accuracy for Gaussian kernel:  0.83693843594
Accuracy for Gaussian kernel:  0.836666666667
Accuracy for Gaussian kernel:  0.831385642738
Accuracy for Gaussian kernel:  0.813021702838
Average accuracy for Gaussian Kernel :  0.834650742548


In [29]:
# MultiKernelfixedrules Class
class MultiKernelfixedrules(object):
    
    def __init__(self, kernels, X=None):
        self.kernels = kernels
        self.X = X
        
    # Calculoates gram matrix with given alphas
    def get_gram_matrix(self, Y, X=None, alphas=[0.33, 0.33, 0.34]):
        if X is not None:
            self.X = X
        
        gram_matrix = np.zeros((X.shape[0], Y.shape[0]))
        
        for i in xrange(len(self.kernels)):
            gram_matrix += alphas[i]*self.kernels[i](X, Y)
            
        return gram_matrix

In [30]:
# Define Kernel functions with best parameters found in above case

# Polynomial kernel
def polynomial_kernel_2(X1, X2) :
    return polynomial_kernel(X1, X2, 2)

# Gaussain kernel
def gaussian_kernel_2(X1, X2) :
    return gaussian_kernel(X1, X2, 0.5)

# Initailize SVM with custom kernel
svc = SVC(kernel='precomputed')
# Initailize MKL kernel with the 3 kernels
mkl = MultiKernelfixedrules([linear_kernel, polynomial_kernel_2, gaussian_kernel_2])

In [31]:
accuracy = []

# For each cross validation train and test set run mkl kernel SVM
for train_index, test_index in skf.split(X, y):
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train = mkl.get_gram_matrix(X_train, X_train)  # linear kernel
    svc.fit(kernel_train, y_train)
    # Test
    kernel_test = mkl.get_gram_matrix(X_train, X_test)
    y_pred = svc.predict(kernel_test)
    # Calculate Accuracy
    accuracy.append(get_accuracy(y_pred, y_test)[1])
    print 'Accuracy : ', get_accuracy(y_pred, y_test)[1]

# Calculate Average Accuracy
print "Average accuracy for Multi Kernel : ", np.mean(accuracy)

Accuracy :  0.865224625624
Accuracy :  0.840266222962
Accuracy :  0.835
Accuracy :  0.854757929883
Accuracy :  0.834724540902
Average accuracy for Multi Kernel :  0.845994663874


In [32]:
accuracy = []

# For each cross validation train and test set run mkl kernel SVM
for train_index, test_index in skf.split(X, y):
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train = mkl.get_gram_matrix(X_train, X_train, alphas=[0.25, 0.4, 0.35])  # linear kernel
    svc.fit(kernel_train, y_train)
    # Test
    kernel_test = mkl.get_gram_matrix(X_train, X_test, alphas=[0.25, 0.4, 0.35])
    y_pred = svc.predict(kernel_test)
    # Calculate Accuracy
    accuracy.append(get_accuracy(y_pred, y_test)[1])
    print 'Accuracy : ', get_accuracy(y_pred, y_test)[1]

# Calculate Average Accuracy
print "Average accuracy for Multi Kernel : ", np.mean(accuracy)

Accuracy :  0.865224625624
Accuracy :  0.838602329451
Accuracy :  0.835
Accuracy :  0.854757929883
Accuracy :  0.83305509182
Average accuracy for Multi Kernel :  0.845327995356


In [33]:
accuracy = []

# For each cross validation train and test set run mkl kernel SVM
for train_index, test_index in skf.split(X, y):
    # Dataset
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train
    kernel_train = mkl.get_gram_matrix(X_train, X_train, alphas=[0.25, 0.4, 0.35])  # linear kernel
    svc.fit(kernel_train, y_train)
    # Test
    kernel_test = mkl.get_gram_matrix(X_train, X_test, alphas=[0.25, 0.35, 0.4])
    y_pred = svc.predict(kernel_test)
    # Calculate Accuracy
    accuracy.append(get_accuracy(y_pred, y_test)[1])
    print 'Accuracy : ', get_accuracy(y_pred, y_test)[1]

# Calculate Average Accuracy
print "Average accuracy for Multi Kernel : ", np.mean(accuracy)

Accuracy :  0.861896838602
Accuracy :  0.838602329451
Accuracy :  0.851666666667
Accuracy :  0.843071786311
Accuracy :  0.823038397329
Average accuracy for Multi Kernel :  0.843655203672


## Report

* Due to time constraints I ran only for 3000 datapoints

### Accuracy details (4a)
* Linear kernel : 0.821316823102
* Polynomial kernel :
    * q = 2 : 0.843326322203
    * q = 3 : 0.778993957391
    * q = 4 : 0.756976678639
    * Best q value is 2
* Gaussian Kernel :
    * s = 0.5 : 0.837989086081
    * s = 0.25 : 0.832327405539
    * s = 0.75 : 0.827322393488
    * s = 0.6 : 0.834650742548
    * Best s value is 0.5
    
### For multi kernel (4b)
* Multikernelfixedrules : [Linear, Polynomial(q = 2), Gaussian(s = 0.5)]
    * alphas = [0.33, 0.33, 0.34] : 0.845994663874
    * alphas = [0.25, 0.4, 0.35] : 0.845327995356
    * alphas = [0.25, 0.35, 0.4] : 0.843655203672
    * Best alphas combination : [0.33, 0.33, 0.34]