In [1]:
import numpy as np
import matplotlib 
matplotlib.use('nbagg')
import matplotlib.pyplot as plt
import random
import pandas as pd
from statistics import mean 
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt

In [2]:
# Defining a function to train SVM on a model and compute the error on any specified dataset
def SVM_analysis(data_training,data_testing,plus_one_label,c,q,kernel_type,gamma_val,coef0_val,minus_one_label=10):
    """
    Inputs-
    data_training: A Pandas dataframe of Nx3 to be used as a training set where the first column is the specific digit, second column
                   is the intensity and third is symmetry
    data_testing: A Pandas dataframe of Nx3 to be used as a test set where the first column is the specific digit, second column
                   is the intensity and third is symmetry
    plus_one_label: digit to be assigned the label +1
    minus_one_label: digit to be assigned -1 label, only valid if we are doing one digit versus another one
                     with all others being disregarded (by default it is 10)
    c: penalty for violation (a scalar)
    q: Degree of the polynomial Kernel (a scalar)
    kernel_type: can be any of the types accepted by scikit learn (a string)
    gamma_val: see the scikit learn documentation (https://scikit-learn.org/stable/modules/svm.html#svm-kernels)
    coef0_val: see the scikit learn documentation
    
    Output-
    error_SVM - computes the error of the model on the dataset specified for testing
    no_of_support_vectors - intuitive
    """
    data_training['resp_label'] = np.where(data_training['digit']==plus_one_label, 1, -1)
    data_testing['resp_label'] = np.where(data_testing['digit']==plus_one_label, 1, -1)
    if 0<=minus_one_label<10:
        data_training = data_training.loc[(data_training['digit']==plus_one_label) | (data_training['digit']==minus_one_label)]
        data_testing = data_testing.loc[(data_testing['digit']==plus_one_label) | (data_testing['digit']==minus_one_label)] 
    labelled_training_data = data_training.copy()
    labelled_testing_data = data_testing.copy()
    clf = svm.SVC(C=c, degree=q, kernel=kernel_type,gamma=gamma_val,coef0=coef0_val)
    # https://scikit-learn.org/stable/modules/svm.html#svm-kernels
    train_values = labelled_training_data.values
    test_values = labelled_testing_data.values
    svm_fit = clf.fit(train_values[:,1:3], (train_values[:,3]).reshape(-1,))
    pred_svm = (svm_fit.predict(test_values[:,1:3])).reshape(-1,1)
    error_SVM = np.mean(pred_svm!=((test_values[:,3]).reshape(-1,1)))
    no_of_support_vectors = (svm_fit.support_vectors_).shape[0]
    return (error_SVM,no_of_support_vectors)

In [10]:
# Performs k cross validation for SVM with binary classification error(as discussed in the course) and for any kernel 
def cross_validation_for_C(data_training,plus_one_label,c,q,nsims,kernel_type,gamma_val,coef0_val,kfold,minus_one_label=10):
    """
    Inputs-
    data_training: A Pandas dataframe of Nx3 to be used as a training set where the first column is the specific digit, second column
                   is the intensity and third is symmetry
    plus_one_label: digit to be assigned the label +1
    minus_one_label: digit to be assigned -1 label, only valid if we are doing one digit versus another one
                     with all others being disregarded (by default it is 10)
    c: penalty for violation (a list)
    q: Degree of the polynomial Kernel (a scalar)
    nsims: number of simulations (a scalar)
    kernel_type: can be any of the types accepted by scikit learn (a string)
    gamma_val: see the scikit learn documentation (https://scikit-learn.org/stable/modules/svm.html#svm-kernels)
    coef0_val: see the scikit learn documentation
    kfold: number of folds for cross validation
    
    Output-
    dist_c - gives the number of times each value was selected in "c" (a list)
    avg_err_cv - Average cross validation error for the finally selected model 
    """
    data_training['resp_label'] = np.where(data_training['digit']==plus_one_label, 1, -1)
    if 0<=minus_one_label<10:
        data_training = data_training.loc[(data_training['digit']==plus_one_label) | (data_training['digit']==minus_one_label)]
    labelled_training_data = data_training.copy()
    train_values = labelled_training_data.values
    selected_val_c = []
    dist_c = []
    for i in range(nsims):
        Ecv_penalty = []
        for penalty in c:
            clf = svm.SVC(C=penalty, degree=q, kernel=kernel_type,gamma=gamma_val,coef0=coef0_val) 
            kf = KFold(n_splits=kfold,shuffle=True)
            X = train_values[:,1:3]
            y = train_values[:,3]
            error_svm = []
            for folds in range(kfold):
                for a,b in kf.split(X):
                    X1, X2 = X[a], X[b]
                    y1, y2 = y[a], y[b]
                svm_fit = clf.fit(X1, y1.reshape(-1,))
                pred_svm = (svm_fit.predict(X2)).reshape(-1,1)
                error_svm.append(np.mean(pred_svm!=(y2.reshape(-1,1))))
            Ecv_penalty.append(mean(error_svm))
        selected_val_c.append(c[Ecv_penalty.index(min(Ecv_penalty))])
    
    # Generating the Distribution vector for selected c-values
    for j in c:
        dist_c.append(selected_val_c.count(j))
        
    # Computing the average cross validation error for the final selected model over 100 iterations
    final_c = c[dist_c.index(max(dist_c))]
    Ecv_sims = []
    for k in range(nsims):
        clf = svm.SVC(C=penalty, degree=q, kernel=kernel_type,gamma=gamma_val,coef0=coef0_val) 
        kf = KFold(n_splits=kfold,shuffle=True)
        X = train_values[:,1:3]
        y = train_values[:,3]
        error_svm = []
        for folds in range(kfold):
            for a,b in kf.split(X):
                X1, X2 = X[a], X[b]
                y1, y2 = y[a], y[b]
            svm_fit = clf.fit(X1, y1.reshape(-1,))
            pred_svm = (svm_fit.predict(X2)).reshape(-1,1)
            error_svm.append(np.mean(pred_svm!=(y2.reshape(-1,1))))
        Ecv_sims.append(mean(error_svm))
    return (dist_c,mean(Ecv_sims))

In [4]:
# Importing Digit Training and Test datasets
train_data = pd.read_csv('digit_identification_training.txt', sep="\t")
test_data = pd.read_csv('digit_identification_testing.txt', sep="\t")

In [5]:
# Problem 2
i = range(0,9,2)
in_sample_error_even = []
no_support_vectors_even = []
for j in i:
    mod_res_even = SVM_analysis(train_data,train_data,j,0.01,2,'poly',1,0)
    in_sample_error_even.append(mod_res_even[0])
    no_support_vectors_even.append(mod_res_even[1])
    
print("The maximum in sample error among even digits is observed for : ", i[in_sample_error_even.index(max(in_sample_error_even))])

# Problem 3
k = range(1,8,2)
in_sample_error_odd = []
no_support_vectors_odd = []
for j in k:
    mod_res_odd = SVM_analysis(train_data,train_data,j,0.01,2,'poly',1,0)
    in_sample_error_odd.append(mod_res_odd[0])
    no_support_vectors_odd.append(mod_res_odd[1])
    
print("The minimum in sample error among odd digits is observed for : ", k[in_sample_error_odd.index(min(in_sample_error_odd))])

# Problem 4
print("The difference between the number of support vectors based on the models selected in Problem 2 and 3 is: ",no_support_vectors_even[0] - no_support_vectors_odd[0])

The maximum in sample error among even digits is observed for :  0
The minimum in sample error among odd digits is observed for :  1
The difference between the number of support vectors based on the models selected in Problem 2 and 3 is:  1879


In [6]:
# Problem 5 and Problem 6: 1 vs 5 classifier
C = [0.0001,0.001,0.01,0.1,1]

# q=2 case
no_c_support_vectors = []
Ein_c = []
Eout_c = []
for penalty in C:
    mod_res_c_train = SVM_analysis(train_data,train_data,1,penalty,2,'poly',1,0,5)
    mod_res_c_test = SVM_analysis(train_data,test_data,1,penalty,2,'poly',1,0,5)
    no_c_support_vectors.append(mod_res_c_train[1])
    Ein_c.append(mod_res_c_train[0])
    Eout_c.append(mod_res_c_test[0])
print("In sample error vector for second degree polynomial kernel is: ",Ein_c)
print("Out of sample error vector for second degree polynomial kernel is: ", Eout_c)
print("Number of support vectors for second degree polynomial kernel: ", no_c_support_vectors)

# q=5 case
no_c_support_vectors5 = []
Ein_c5 = []
Eout_c5 = []
for penalty in C:
    mod_res_c_train = SVM_analysis(train_data,train_data,1,penalty,5,'poly',1,0,5)
    mod_res_c_test = SVM_analysis(train_data,test_data,1,penalty,5,'poly',1,0,5)
    no_c_support_vectors5.append(mod_res_c_train[1])
    Ein_c5.append(mod_res_c_train[0])
    Eout_c5.append(mod_res_c_test[0])
print("In sample error vector for fifth degree polynomial kernel is: ",Ein_c5)
print("Out of sample error vector for fifth degree polynomial kernel is: ", Eout_c5)
print("Number of support vectors for fifth degree polynomial kernel is: ", no_c_support_vectors5)

In sample error vector for second degree polynomial kernel is:  [0.010249839846252402, 0.004484304932735426, 0.004484304932735426, 0.004484304932735426, 0.003843689942344651]
Out of sample error vector for second degree polynomial kernel is:  [0.01650943396226415, 0.01650943396226415, 0.018867924528301886, 0.018867924528301886, 0.018867924528301886]
Number of support vectors for second degree polynomial kernel:  [244, 80, 34, 24, 24]
In sample error vector for fifth degree polynomial kernel is:  [0.005124919923126201, 0.004484304932735426, 0.005124919923126201, 0.004484304932735426, 0.004484304932735426]
Out of sample error vector for fifth degree polynomial kernel is:  [0.01650943396226415, 0.01650943396226415, 0.01650943396226415, 0.018867924528301886, 0.01650943396226415]
Number of support vectors for fifth degree polynomial kernel is:  [27, 26, 27, 24, 24]


In [78]:
# Testing the usage of KFold function to implement cross validation with the binary classification error
# as specified in the course. Scikit Learn doesn't enables it automatically
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8],[9,10],[11,12],[13,14],[15,16]])
y = np.array([1, -1, 1, 1,1,-1,1,-1])
kf = KFold(n_splits=4,shuffle=True)
kf.get_n_splits(X)
for X1,X2 in kf.split(X):
    X_train,X_test = X[X1],X[X2]
    y_train, y_test = y[X1], y[X2]

In [11]:
# Problem 7 and Problem 8: Cross Validation
C = [0.0001,0.001,0.01,0.1,1]
out_cv = cross_validation_for_C(train_data,1,C,2,100,'poly',1,0,10,5)
c_vec = out_cv[0]
print("The best value of C as per cross-validation is:, ",C[c_vec.index(max(c_vec))])
print("The average cross validation error for the best value of penalty parameter is: ",out_cv[1])

The best value of C as per cross-validation is:,  0.001
The average cross validation error for the best value of penalty parameter is:  0.0048397435897435895


In [12]:
# Problem 9 and Problem 10: Using RBF Kernel for 1 vs 5 classifier
C = [0.01,1,100,10e4,10e6]
in_sample_error_rbf = []
out_sample_error_rbf = []
for j in C:
    mod_res_train_rbf = SVM_analysis(train_data,train_data,1,j,2,'rbf',1,0,5)
    mod_res_test_rbf = SVM_analysis(train_data,test_data,1,j,2,'rbf',1,0,5)
    in_sample_error_rbf.append(mod_res_train_rbf[0])
    out_sample_error_rbf.append(mod_res_test_rbf[0])
    
print("The minimum in sample error is observed for C= : ", C[in_sample_error_rbf.index(min(in_sample_error_rbf))])
print("The minimum out of sample error is observed for C= : ", C[out_sample_error_rbf.index(min(out_sample_error_rbf))])

The minimum in sample error is observed for C= :  10000000.0
The minimum out of sample error is observed for C= :  100


In [15]:
# Bonus: Using 7 fold cross validation to find optimal C for rbf kernel
out_cv = cross_validation_for_C(train_data,1,C,2,100,'rbf',1,0,10,5)
c_vec = out_cv[0]
print("The best value of C as per cross-validation is: ", C[c_vec.index(max(c_vec))])
print("The average cross validation error for the best value of penalty parameter is: ",out_cv[1])

The best value of C as per cross-validation is:  0.01
The average cross validation error for the best value of penalty parameter is:  0.007346153846153846


The above example is quite important to note that there may be times when even cross validation is not sufficient to properly estimate the hyperparameters and alternate techniques must be applied to select/corroborate the same. Now, as a further experiment we try the cross validation after remvoving the previously most preferred values which was 0.01.

In [14]:
# Performing 10 fold on a reduced C list (removing 0.01)
C_ch = [1,100,10e4,10e6]
out_cv = cross_validation_for_C(train_data,1,C_ch,2,100,'rbf',1,0,10,5)
c_vec = out_cv[0]
print("The best value of C as per cross-validation is: ", C_ch[c_vec.index(max(c_vec))])
print("The average cross validation error for the best value of penalty parameter is: ",out_cv[1])

The best value of C as per cross-validation is:  100
The average cross validation error for the best value of penalty parameter is:  0.00733974358974359


Here we see that as we remove 0.01 cross validation picks up the value of the penalty parameter as suggested by the test dataset. This implies that the search space for hyperparameter optimization plays a very crucial role in identifying the optimal values of the same. Also, notice that as we do not change the folds by much the average error estimate remains constant.
