In [7]:
import pandas as pd
import numpy as np
from sklearn.svm import NuSVC
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [8]:
def listconversion(directory):
    """Load a whitespace-delimited numeric text file into a NumPy array.

    Every non-blank line is split on whitespace, each field is parsed with
    ``float`` and the resulting rows are stacked with ``np.array``.

    Parameters
    ----------
    directory : str or path-like
        Path of the text file to read.  Despite the parameter name it is a
        file path: it is passed straight to ``open``.

    Returns
    -------
    numpy.ndarray
        One row per non-blank line (2-D when every line has the same number
        of fields).  A file without any data lines yields an empty array.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a field cannot be parsed as a float.
    """
    datalist = []
    with open(directory) as f:
        # Iterate the file lazily instead of materialising it via readlines().
        for line in f:
            # split() with no separator already discards the trailing newline
            # and any surrounding whitespace.
            fields = line.split()
            # A blank / whitespace-only line (typically a trailing newline at
            # the end of the file) would otherwise append an empty row and
            # make the stacked array ragged.
            if not fields:
                continue
            datalist.append([float(field) for field in fields])

    return np.array(datalist)

In [9]:
def datasplitting(Data):
    """Split a 2-D data array into its column groups.

    Parameters
    ----------
    Data : numpy.ndarray
        2-D array indexed as ``Data[:, k]``.

    Returns
    -------
    tuple
        ``(column 0, columns 2 onward, column 1)`` in that order.  Columns 0
        and 1 come back 1-D, while the ``2:`` slice stays 2-D.  The callers in
        this notebook name these three values numbers, symmetry and intensity.
    """
    first_column, second_column = Data[:, 0], Data[:, 1]
    # Slicing with "2:" (rather than a single index) keeps this part 2-D.
    remaining_columns = Data[:, 2:]

    return first_column, remaining_columns, second_column

In [10]:
def readingdata(train_path="features.train.txt", test_path="features.test.txt"):
    """Load the training and test files and split each into column groups.

    Parameters
    ----------
    train_path : str or path-like, optional
        Training file, read with ``listconversion``.  Defaults to
        ``"features.train.txt"`` so existing zero-argument calls are unchanged.
    test_path : str or path-like, optional
        Test file, read with ``listconversion``.  Defaults to
        ``"features.test.txt"``.

    Returns
    -------
    list
        ``[[numbers, symmetry, intensity], [numbers, symmetry, intensity]]``:
        the three values returned by ``datasplitting`` for the training file,
        followed by the same three for the test file.
    """
    # Training file is read first, then the test file.
    train_parts = datasplitting(listconversion(train_path))
    test_parts = datasplitting(listconversion(test_path))

    # datasplitting returns tuples; callers receive plain lists.
    return [list(train_parts), list(test_parts)]

In [11]:
def labels1_5(actualnumbers, trainingdata, labels, class1, class2):
    """Keep only the samples of two classes and label them +1 / -1.

    Parameters
    ----------
    actualnumbers : array-like
        Class of every sample (assumed 1-D, one entry per sample).
    trainingdata : numpy.ndarray
        Feature values, one per sample.  Selection uses ``np.take`` without
        an axis, which indexes the *flattened* array, so this is only
        meaningful for a 1-D array or a single-column ``(n, 1)`` array.
    labels : numpy.ndarray
        Per-sample array used only as the template for the output labels:
        its selected entries are copied and every one of them is then
        overwritten with 1 or -1.  The input array itself is not modified.
    class1 : scalar
        Class value that is labelled ``1``.
    class2 : scalar
        Class value that is labelled ``-1``.

    Returns
    -------
    tuple
        ``(newtraindata, newlabeldata)``: the selected feature values (1-D)
        and the matching +1 / -1 labels, both in original sample order.
    """
    actualnumbers = np.asarray(actualnumbers)

    # Indices of the samples that belong to either class; np.where already
    # yields them in ascending order, so the original sample order is kept.
    is_selected = (actualnumbers == class1) | (actualnumbers == class2)
    selected_indices = np.where(is_selected)[0]

    newclassdata = np.take(actualnumbers, selected_indices)
    newtraindata = np.take(trainingdata, selected_indices)
    # np.take returns a new array, so writing the labels below never touches
    # the caller's array.
    newlabeldata = np.take(labels, selected_indices)

    # Relabel using the requested classes (these were previously hard-coded to
    # 1 and 5, which silently ignored class1 / class2).
    newlabeldata[newclassdata == class1] = 1
    newlabeldata[newclassdata == class2] = -1

    return newtraindata, newlabeldata

In [12]:
def linear_get_training_and_test_data():
    """Build the class-1-versus-class-5 training and test sets.

    Reads both data files through ``readingdata`` and filters each split with
    ``labels1_5(..., 1, 5)``.

    Returns
    -------
    list
        ``[[train_X, train_Y], [test_X, test_Y]]``.
    """
    train_parts, test_parts = readingdata()
    train_digits, train_symmetry, train_intensity = train_parts
    test_digits, test_symmetry, test_intensity = test_parts

    # The symmetry values are passed as the feature data and the intensity
    # values as the array whose entries labels1_5 replaces with labels.
    # NOTE(review): this means only the symmetry column reaches the
    # classifier as a feature -- confirm that dropping intensity is intended.
    train_pair = labels1_5(train_digits, train_symmetry, train_intensity, 1, 5)
    test_pair = labels1_5(test_digits, test_symmetry, test_intensity, 1, 5)

    return [list(train_pair), list(test_pair)]

In [13]:
def classifier_generic(classifier, train_X, train_Y, test_X, test_Y):
    """Fit ``classifier`` and report its errors on the training and test data.

    Parameters
    ----------
    classifier
        Estimator providing ``fit``, ``predict``, ``score`` and a
        ``support_`` attribute after fitting.
    train_X, test_X : numpy.ndarray
        Feature values; each is reshaped to a single column before use, so
        every value is treated as one single-feature sample.
    train_Y, test_Y : array-like
        Target labels for the two sets.

    Returns
    -------
    tuple
        ``(training_error, test_error, num_support_vectors, accuracy)`` where
        the errors are ``1 - classifier.score(...)``, ``num_support_vectors``
        is the *shape tuple* of ``classifier.support_`` (e.g. ``(24,)``) rather
        than a plain integer, and ``accuracy`` is ``accuracy_score`` of the
        test predictions.
    """
    train_column = train_X.reshape(-1, 1)
    classifier.fit(train_column, train_Y)

    test_column = test_X.reshape(-1, 1)
    predicted_labels = classifier.predict(test_column)

    train_err = 1 - classifier.score(train_column, train_Y)
    test_err = 1 - classifier.score(test_column, test_Y)
    # np.shape gives a tuple; callers print it as-is.
    support_shape = np.shape(classifier.support_)
    test_accuracy = accuracy_score(test_Y, predicted_labels)

    return train_err, test_err, support_shape, test_accuracy

In [14]:
def classify_SVM_linear(train_X, train_Y, test_X, test_Y, nu_value_NuSVC = 0.5):
    """Train a linear-kernel ``NuSVC`` and evaluate it on the test data.

    Parameters
    ----------
    train_X, train_Y, test_X, test_Y
        Forwarded unchanged to ``classifier_generic``.
    nu_value_NuSVC : float, optional
        Value passed as ``nu`` to ``NuSVC`` (default 0.5).

    Returns
    -------
    tuple
        ``(number_of_support_vectors, accuracy_score_)``: the third and fourth
        values returned by ``classifier_generic``.
    """
    linear_model = NuSVC(kernel='linear', nu=nu_value_NuSVC, decision_function_shape='ovo')
    # The training and test errors are computed as well but not reported here.
    _train_err, _test_err, support_info, test_accuracy = classifier_generic(
        linear_model, train_X, train_Y, test_X, test_Y
    )
    return support_info, test_accuracy

In [15]:
def SVM_on_limited_samples(samples, train_X, train_Y, test_X, test_Y):
    """Run the linear SVM on the first ``samples`` training points.

    Parameters
    ----------
    samples : int
        Number of leading training samples to use.  ``0`` means "use the
        whole training set".
    train_X, train_Y
        Training features and labels; sliced with ``[:samples,]`` when
        ``samples`` is non-zero.
    test_X, test_Y
        Test features and labels, always used in full.

    Returns
    -------
    tuple
        ``(num_support_vectors, accuracy_score_)`` as returned by
        ``classify_SVM_linear``.
    """
    default_nu = 0.5
    # nu used for specific subset sizes; any other size uses default_nu.
    nu_for_sample_count = {50: 0.1, 100: 0.2, 200: 0.3, 800: 0.4}

    if samples == 0:
        subset_X, subset_Y = train_X, train_Y
        nu_value = default_nu
    else:
        subset_X = train_X[:samples,]
        subset_Y = train_Y[:samples,]
        nu_value = nu_for_sample_count.get(samples, default_nu)

    num_support_vectors, accuracy_score_ = classify_SVM_linear(
        subset_X, subset_Y, test_X, test_Y, nu_value
    )
    return num_support_vectors, accuracy_score_


In [16]:
def classify_SVM_Polynomial(train_X, train_Y, test_X, test_Y, Q, C_value):
    """Train a polynomial-kernel ``SVC`` and evaluate it.

    Parameters
    ----------
    train_X, train_Y, test_X, test_Y
        Forwarded unchanged to ``classifier_generic``.
    Q : int
        Passed as ``degree`` to ``SVC``.
    C_value : float
        Passed as ``C`` to ``SVC``.

    Returns
    -------
    tuple
        ``(training_error, test_error, num_support_vectors, accuracy_score_)``
        exactly as returned by ``classifier_generic``.
    """
    # gamma is fixed at 0.4 for every degree / C combination.
    poly_model = SVC(kernel='poly', degree=Q, C=C_value, gamma=0.4, decision_function_shape='ovo')
    train_err, test_err, support_info, test_accuracy = classifier_generic(
        poly_model, train_X, train_Y, test_X, test_Y
    )
    return train_err, test_err, support_info, test_accuracy


In [17]:
def classify_SVM_RBF_kernel(train_X, train_Y, test_X, test_Y, C_value, gamma_value):
    """Train an RBF-kernel ``SVC`` and report its training and test error.

    Parameters
    ----------
    train_X, train_Y, test_X, test_Y
        Forwarded unchanged to ``classifier_generic``.
    C_value : float
        Passed as ``C`` to ``SVC``.
    gamma_value : float
        Passed as ``gamma`` to ``SVC``.

    Returns
    -------
    tuple
        ``(training_error, test_error)``: the first two values returned by
        ``classifier_generic``.
    """
    rbf_model = SVC(kernel='rbf', C=C_value, gamma=gamma_value)
    # The support-vector info and accuracy are computed but not reported.
    train_err, test_err, _support_info, _test_accuracy = classifier_generic(
        rbf_model, train_X, train_Y, test_X, test_Y
    )
    return train_err, test_err

### PART A & B

In [20]:
# Training-set sizes to try; 0 makes SVM_on_limited_samples use every sample.
no_of_samples = [0, 50, 100, 200, 800]

# Build the 1-vs-5 data once.  train_X / train_Y / test_X / test_Y stay at
# module level because the Part C and Part D cells reuse them.
train_test_data = linear_get_training_and_test_data()
(train_X, train_Y), (test_X, test_Y) = train_test_data

for i in no_of_samples:
    num_support_vectors, accuracy_score_ = SVM_on_limited_samples(
        i, train_X, train_Y, test_X, test_Y
    )
    print("No of samples:",i,"no of support vectors:",num_support_vectors,"accuracy score:",accuracy_score_)

No of samples: 0 no of support vectors: (782,) accuracy score: 0.9834905660377359
No of samples: 50 no of support vectors: (6,) accuracy score: 0.9811320754716981
No of samples: 100 no of support vectors: (20,) accuracy score: 0.9811320754716981
No of samples: 200 no of support vectors: (60,) accuracy score: 0.9811320754716981
No of samples: 800 no of support vectors: (320,) accuracy score: 0.9811320754716981


### PART C

In [21]:
# Polynomial degrees to compare.
Q = [2, 5]
for degree in Q:
    # C takes the values 1, 0.1, 0.01, 0.001 and 0.0001.
    for exponent in range(5):
        C_value = 10 ** -exponent
        (training_error, test_error,
         num_support_vectors, accuracy_score_) = classify_SVM_Polynomial(
            train_X, train_Y, test_X, test_Y, degree, C_value
        )
        print("Q=",degree,"C=",C_value,"Training error:",training_error,"test error:",test_error,"Support vectors:", num_support_vectors,"Accuracy score:",accuracy_score_)

Q= 2 C= 1 Training error: 0.004484304932735439 test error: 0.018867924528301883 Support vectors: (24,) Accuracy score: 0.9811320754716981
Q= 2 C= 0.1 Training error: 0.004484304932735439 test error: 0.018867924528301883 Support vectors: (30,) Accuracy score: 0.9811320754716981
Q= 2 C= 0.01 Training error: 0.004484304932735439 test error: 0.01650943396226412 Support vectors: (64,) Accuracy score: 0.9834905660377359
Q= 2 C= 0.001 Training error: 0.00832799487508007 test error: 0.021226415094339646 Support vectors: (190,) Accuracy score: 0.9787735849056604
Q= 2 C= 0.0001 Training error: 0.029468289557975647 test error: 0.04481132075471694 Support vectors: (642,) Accuracy score: 0.9551886792452831
Q= 5 C= 1 Training error: 0.004484304932735439 test error: 0.01650943396226412 Support vectors: (26,) Accuracy score: 0.9834905660377359
Q= 5 C= 0.1 Training error: 0.004484304932735439 test error: 0.01650943396226412 Support vectors: (26,) Accuracy score: 0.9834905660377359
Q= 5 C= 0.01 Training

### PART D

In [22]:
# C takes the values 0.01, 1, 100, 10000 and 1000000.
for i in range(-2, 8, 2):
    C_value = 10 ** i
    # gamma is tied to C: 0.01 once C reaches 10 or more, 10 for smaller C.
    gamma_value = 0.01 if C_value >= 10 else 10
    training_error, test_error = classify_SVM_RBF_kernel(
        train_X, train_Y, test_X, test_Y, C_value, gamma_value
    )
    print("C value:  ",C_value,"  train_error:  ",training_error,"  Test_error:  ",test_error)

C value:   0.01   train_error:   0.00832799487508007   Test_error:   0.04481132075471694
C value:   1   train_error:   0.004484304932735439   Test_error:   0.021226415094339646
C value:   100   train_error:   0.004484304932735439   Test_error:   0.021226415094339646
C value:   10000   train_error:   0.004484304932735439   Test_error:   0.018867924528301883
C value:   1000000   train_error:   0.004484304932735439   Test_error:   0.021226415094339646
