In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import *
import numpy as np
import math
from sklearn.datasets import make_classification
from numpy import linalg
from collections import Counter

from implementation import *

In [2]:
"""
Abalone dataset, we choose the subset of 2 classes with most count
"""

data = pd.read_csv(
    'abalone.data',
    names=['sex', 'length', 'diameter', 'height', 'whole_w',
           'shucked_w', 'viscera_w', 'shell_w', 'label'],
)

# The two most frequent labels (9 and 10) gave poor results (see report), so a
# hand-picked pair of classes that yields better results is used instead.
# The name `most_common_label` is kept even though the pair is hand-picked.
most_common_label = [7, 11]

# .copy() makes the filtered frame independent of the full frame, so the label
# assignment below does not write into a slice (avoids SettingWithCopyWarning).
data = data[data.label.isin(most_common_label)].copy()


def encode_label(x):
    """Return -1 for the first selected class and 1 for any other value."""
    if x == most_common_label[0]:
        return -1
    return 1


# Convert label to -1 and 1
data['label'] = data['label'].apply(encode_label)

# One-hot encode the nominal `sex` feature. dtype=float keeps every feature
# column numeric: newer pandas versions emit boolean dummy columns by default,
# which would turn the mixed float/bool `.values` below into an object array.
data = pd.get_dummies(data, dtype=float)

# Feature matrix: every column except the label; target: 1-D vector of -1/1.
X = data.drop(columns='label').values
y = data['label'].values

In [3]:
"""
Use KFold to compare polynomial and gaussian on abalone dataset
"""

kf = KFold(n_splits=5)
kernels = ['polynomial', 'radial']

# One entry per kernel, in the same order as `kernels`
average_test_accuracies = []
average_train_accuracies = []

for kernel in kernels:
    train_accuracy = []
    test_accuracy = []
    # Run 5-fold cross-validation for the current kernel
    for train_index, val_index in kf.split(X):
        # Split into training and validation parts for this fold
        X_train_kfold, X_val = X[train_index], X[val_index]
        y_train_kfold, y_val = y[train_index], y[val_index]

        # Train a fresh SVM for this fold; only the kernel type is specified
        SVM = KernelSVM(kernel=kernel)
        SVM.fit(X_train_kfold, y_train_kfold)

        # Predict on both the training and the held-out part
        train_pred = SVM.predict(X_train_kfold)
        test_pred = SVM.predict(X_val)

        # Save the accuracy of this fold; accuracy_score expects (y_true, y_pred)
        train_accuracy.append(accuracy_score(y_train_kfold, train_pred))
        test_accuracy.append(accuracy_score(y_val, test_pred))

    # Average the accuracy across the 5 folds for this kernel
    average_test_accuracies.append(np.average(test_accuracy))
    average_train_accuracies.append(np.average(train_accuracy))

for kernel_name, average_train_accuracy, average_test_accuracy in zip(
        kernels, average_train_accuracies, average_test_accuracies):
    print(f"SVM with {kernel_name} kernel:")
    print(f"\tAverage training accuracy score: {average_train_accuracy*100}")
    print(f"\tAverage testing accuracy score: {average_test_accuracy*100}")

     pcost       dcost       gap    pres   dres
 0: -1.0702e+03 -4.7219e+04  1e+05  7e-01  7e-13
 1: -8.5762e+02 -2.2239e+04  3e+04  2e-01  9e-13
 2: -7.5690e+02 -1.0880e+04  1e+04  6e-02  8e-13
 3: -8.6851e+02 -4.2844e+03  4e+03  2e-02  7e-13
 4: -1.0944e+03 -2.7646e+03  2e+03  4e-03  8e-13
 5: -1.1735e+03 -2.4745e+03  1e+03  3e-03  8e-13
 6: -1.2759e+03 -2.0567e+03  8e+02  1e-03  8e-13
 7: -1.3241e+03 -1.9185e+03  6e+02  1e-03  8e-13
 8: -1.3728e+03 -1.7630e+03  4e+02  5e-04  8e-13
 9: -1.4115e+03 -1.6544e+03  2e+02  3e-04  9e-13
10: -1.4333e+03 -1.5990e+03  2e+02  1e-04  1e-12
11: -1.4626e+03 -1.5344e+03  7e+01  4e-05  9e-13
12: -1.4771e+03 -1.5084e+03  3e+01  1e-05  1e-12
13: -1.4836e+03 -1.4982e+03  1e+01  4e-06  9e-13
14: -1.4887e+03 -1.4906e+03  2e+00  4e-07  1e-12
15: -1.4895e+03 -1.4896e+03  5e-02  7e-09  1e-12
16: -1.4896e+03 -1.4896e+03  6e-04  9e-11  1e-12
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -1.0954e+03 -5.1370e+04  1e+05  7e-01  6e-1

SVM with polynomial kernel:
	Average training accuracy score: 91.51491572544204
	Average testing accuracy score: 89.52597402597404
SVM with radial kernel:
	Average training accuracy score: 86.44664907822802
	Average testing accuracy score: 85.19350649350649


In [4]:
"""
Use KFold to compare polynomial SVM of different degree
"""

kf = KFold(n_splits=5)
degrees = [2, 3, 4]

# One entry per degree, in the same order as `degrees`
average_test_accuracies = []
average_train_accuracies = []

for degree in degrees:
    train_accuracy = []
    test_accuracy = []
    # Run 5-fold cross-validation for the current polynomial degree
    for train_index, val_index in kf.split(X):
        # Split into training and validation parts for this fold
        X_train_kfold, X_val = X[train_index], X[val_index]
        y_train_kfold, y_val = y[train_index], y[val_index]

        # Train a fresh polynomial-kernel SVM of the given degree for this fold
        SVM = KernelSVM(kernel="polynomial", degree=degree)
        SVM.fit(X_train_kfold, y_train_kfold)

        # Predict on both the training and the held-out part
        train_pred = SVM.predict(X_train_kfold)
        test_pred = SVM.predict(X_val)

        # Save the accuracy of this fold; accuracy_score expects (y_true, y_pred)
        train_accuracy.append(accuracy_score(y_train_kfold, train_pred))
        test_accuracy.append(accuracy_score(y_val, test_pred))

    # Average the accuracy across the 5 folds for this degree
    average_test_accuracies.append(np.average(test_accuracy))
    average_train_accuracies.append(np.average(train_accuracy))

for poly_degree, average_train_accuracy, average_test_accuracy in zip(
        degrees, average_train_accuracies, average_test_accuracies):
    print(f"Polynomial SVM with degree {poly_degree}:")
    print(f"\tAverage training accuracy score: {average_train_accuracy*100}")
    print(f"\tAverage testing accuracy score: {average_test_accuracy*100}")

     pcost       dcost       gap    pres   dres
 0: -1.0702e+03 -4.7219e+04  1e+05  7e-01  7e-13
 1: -8.5762e+02 -2.2239e+04  3e+04  2e-01  9e-13
 2: -7.5690e+02 -1.0880e+04  1e+04  6e-02  8e-13
 3: -8.6851e+02 -4.2844e+03  4e+03  2e-02  7e-13
 4: -1.0944e+03 -2.7646e+03  2e+03  4e-03  8e-13
 5: -1.1735e+03 -2.4745e+03  1e+03  3e-03  8e-13
 6: -1.2759e+03 -2.0567e+03  8e+02  1e-03  8e-13
 7: -1.3241e+03 -1.9185e+03  6e+02  1e-03  8e-13
 8: -1.3728e+03 -1.7630e+03  4e+02  5e-04  8e-13
 9: -1.4115e+03 -1.6544e+03  2e+02  3e-04  9e-13
10: -1.4333e+03 -1.5990e+03  2e+02  1e-04  1e-12
11: -1.4626e+03 -1.5344e+03  7e+01  4e-05  9e-13
12: -1.4771e+03 -1.5084e+03  3e+01  1e-05  1e-12
13: -1.4836e+03 -1.4982e+03  1e+01  4e-06  9e-13
14: -1.4887e+03 -1.4906e+03  2e+00  4e-07  1e-12
15: -1.4895e+03 -1.4896e+03  5e-02  7e-09  1e-12
16: -1.4896e+03 -1.4896e+03  6e-04  9e-11  1e-12
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -1.0954e+03 -5.1370e+04  1e+05  7e-01  6e-1

 8: -1.5557e+03 -1.7963e+03  2e+02  2e-04  2e-12
 9: -1.5759e+03 -1.7430e+03  2e+02  9e-05  2e-12
10: -1.5981e+03 -1.6897e+03  9e+01  3e-05  2e-12
11: -1.6181e+03 -1.6512e+03  3e+01  5e-06  2e-12
12: -1.6282e+03 -1.6352e+03  7e+00  4e-07  3e-12
13: -1.6307e+03 -1.6318e+03  1e+00  3e-08  2e-12
14: -1.6312e+03 -1.6313e+03  9e-02  2e-09  2e-12
15: -1.6312e+03 -1.6312e+03  3e-03  6e-11  2e-12
16: -1.6312e+03 -1.6312e+03  6e-05  9e-13  2e-12
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -6.6353e+02 -4.6419e+04  1e+05  7e-01  8e-12
 1: -5.1793e+02 -2.0505e+04  3e+04  2e-01  1e-11
 2: -4.8971e+02 -1.3257e+04  2e+04  6e-02  9e-12
 3: -5.9292e+02 -7.8672e+03  9e+03  3e-02  7e-12
 4: -6.7491e+02 -5.7179e+03  6e+03  1e-02  8e-12
 5: -8.1678e+02 -2.5587e+03  2e+03  3e-03  9e-12
 6: -9.0671e+02 -1.8809e+03  1e+03  1e-03  8e-12
 7: -9.6162e+02 -1.6261e+03  7e+02  6e-04  7e-12
 8: -1.0192e+03 -1.3634e+03  4e+02  3e-04  8e-12
 9: -1.0649e+03 -1.2231e+03  2e+02  8e-05  8e-

In [5]:
"""
Use KFold to compare gaussian SVM of different sigma
"""

kf = KFold(n_splits=5)
sigmas = [0.1, 1.0, 5.0, 10.0]

# One entry per sigma, in the same order as `sigmas`
average_test_accuracies = []
average_train_accuracies = []

for sigma in sigmas:
    train_accuracy = []
    test_accuracy = []
    # Run 5-fold cross-validation for the current sigma
    for train_index, val_index in kf.split(X):
        # Split into training and validation parts for this fold
        X_train_kfold, X_val = X[train_index], X[val_index]
        y_train_kfold, y_val = y[train_index], y[val_index]

        # Train a fresh radial-kernel SVM with the given sigma for this fold
        SVM = KernelSVM(kernel="radial", sigma=sigma)
        SVM.fit(X_train_kfold, y_train_kfold)

        # Predict on both the training and the held-out part
        train_pred = SVM.predict(X_train_kfold)
        test_pred = SVM.predict(X_val)

        # Save the accuracy of this fold; accuracy_score expects (y_true, y_pred)
        train_accuracy.append(accuracy_score(y_train_kfold, train_pred))
        test_accuracy.append(accuracy_score(y_val, test_pred))

    # Average the accuracy across the 5 folds for this sigma
    average_test_accuracies.append(np.average(test_accuracy))
    average_train_accuracies.append(np.average(train_accuracy))

for sigma_value, average_train_accuracy, average_test_accuracy in zip(
        sigmas, average_train_accuracies, average_test_accuracies):
    print(f"Gaussian SVM with sigma {sigma_value}:")
    print(f"\tAverage training accuracy score: {average_train_accuracy*100}")
    print(f"\tAverage testing accuracy score: {average_test_accuracy*100}")

     pcost       dcost       gap    pres   dres
 0:  5.4557e+02 -2.4192e+04  4e+04  2e-01  6e-15
 1:  3.6095e+01 -4.4269e+03  5e+03  2e-02  5e-15
 2: -4.8695e+02 -1.7240e+03  1e+03  4e-03  4e-15
 3: -6.7259e+02 -1.1832e+03  5e+02  1e-03  5e-15
 4: -7.4685e+02 -9.5744e+02  2e+02  2e-04  5e-15
 5: -7.7435e+02 -8.5869e+02  8e+01  3e-05  5e-15
 6: -7.8673e+02 -8.1169e+02  2e+01  5e-06  6e-15
 7: -7.9168e+02 -7.9702e+02  5e+00  6e-14  6e-15
 8: -7.9288e+02 -7.9414e+02  1e+00  1e-14  6e-15
 9: -7.9323e+02 -7.9338e+02  2e-01  7e-15  6e-15
10: -7.9327e+02 -7.9328e+02  1e-02  1e-13  5e-15
11: -7.9328e+02 -7.9328e+02  3e-04  6e-14  6e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0:  4.1571e+02 -2.5933e+04  4e+04  2e-01  6e-15
 1: -8.0392e+01 -4.8249e+03  5e+03  2e-02  6e-15
 2: -6.4858e+02 -2.0360e+03  1e+03  4e-03  6e-15
 3: -8.6249e+02 -1.3496e+03  5e+02  9e-04  6e-15
 4: -9.4133e+02 -1.1236e+03  2e+02  2e-04  6e-15
 5: -9.7245e+02 -1.0296e+03  6e+01  2e-05  7e-1

14: -2.2797e+03 -2.2797e+03  8e-05  1e-13  2e-13
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -7.6681e+02 -5.4520e+04  1e+05  7e-01  7e-14
 1: -3.9030e+02 -1.4534e+04  1e+04  7e-15  9e-14
 2: -1.6034e+03 -4.4395e+03  3e+03  5e-14  9e-14
 3: -2.0140e+03 -3.4029e+03  1e+03  6e-14  1e-13
 4: -2.2419e+03 -2.9234e+03  7e+02  7e-14  1e-13
 5: -2.3598e+03 -2.7133e+03  4e+02  3e-14  1e-13
 6: -2.4118e+03 -2.6315e+03  2e+02  3e-14  1e-13
 7: -2.4485e+03 -2.5708e+03  1e+02  1e-13  1e-13
 8: -2.4680e+03 -2.5398e+03  7e+01  9e-14  2e-13
 9: -2.4896e+03 -2.5083e+03  2e+01  2e-16  2e-13
10: -2.4949e+03 -2.5010e+03  6e+00  3e-14  2e-13
11: -2.4965e+03 -2.4989e+03  2e+00  1e-13  2e-13
12: -2.4971e+03 -2.4981e+03  1e+00  9e-14  2e-13
13: -2.4975e+03 -2.4976e+03  1e-01  2e-13  2e-13
14: -2.4976e+03 -2.4976e+03  3e-03  3e-14  2e-13
15: -2.4976e+03 -2.4976e+03  3e-05  2e-13  2e-13
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -7.1157e+02 -5.3070