# Philip Carr
# CS/CNS/EE_156a_Homework_8_Code (Jupyter Notebook)

Code for SVM With Soft Margins Section (Problems 2 - 10)

In [1]:
import random as rn
import numpy as np

# Package imported for SVM: Scikit-Learn sklearn.svm SVC
from sklearn.svm import SVC

# Scikit-Learn Cross Validation function
from sklearn.model_selection import cross_val_score, KFold

In [2]:
def get_data_from_file(filename):
    """
    Read the file with the given name filename and return its
    (intensity, symmetry) points together with the digit of each
    point.

    Every non-blank line must hold at least three
    whitespace-separated numbers: the digit, the intensity value and
    the symmetry value, in that order. Blank lines are skipped.

    Dataset used: US Postal Service Zip Code data set

    Return type: tuple (points, values), where points is a float64
    array with one [intensity, symmetry] row per line read and values
    is a float64 array with the digit of each row (truncated to a
    whole number).
    """
    points = []
    values = []
    # The with-block closes the file even if a malformed line raises
    # while it is being parsed.
    with open(filename, "r") as file1:
        for line in file1:
            data = line.split()
            if not data:
                # Blank line (e.g. a trailing newline at the end of
                # the file): nothing to parse.
                continue
            y = int(float(data[0])) # digit
            x0 = float(data[1]) # intensity value
            x1 = float(data[2]) # symmetry value
            points.append([x0, x1])
            values.append(y)
    return (np.array(points, dtype=np.float64),
            np.array(values, dtype=np.float64))

In [3]:
def get_n_vs_all_values(values, n):
    """
    Relabel values for an n-vs-all classification: every entry equal
    to n becomes 1 and every other entry becomes -1.

    Return type: 1D integer array with one label per entry of values
    """
    labels = [1 if digit == n else -1 for digit in values]
    return np.array(labels, dtype=int)

In [4]:
def get_n_vs_m_data(points, values, n, m):
    """
    Build the dataset for an n-vs-m classification.

    Points whose value equals n are kept with label 1, points whose
    value equals m are kept with label -1, and every other point is
    dropped. A value equal to both n and m is labelled 1. The kept
    points stay in their original order.

    Return type: tuple (points, values) of a float64 array of the
    kept points and an integer array of their +1/-1 labels
    """
    kept_points = []
    kept_labels = []
    for index, digit in enumerate(values):
        point = points[index]
        if digit == n:
            label = 1
        elif digit == m:
            label = -1
        else:
            # Neither of the two digits of interest: discard.
            continue
        kept_points.append(point)
        kept_labels.append(label)
    kept_points = np.array(kept_points, dtype=np.float64)
    kept_labels = np.array(kept_labels, dtype=int)
    return (kept_points, kept_labels)

In [5]:
def SVM_poly_n_vs_all_test(n_vs_all_list, svm_return, C=0.01, Q=2):
    """
    Fit one n-vs-all SVM (Support Vector Machine) with a polynomial
    kernel of degree Q and soft-margin parameter C for every digit n
    in n_vs_all_list, print the in-sample error of each, and return
    one of the fitted classifiers.

    The classifiers are trained on the US Postal Service Zip Code
    data set read from "features.train".

    svm_return selects the classifier that is returned: "lowest" for
    the smallest in-sample error, "highest" for the largest. When
    several digits share that error, the first one in n_vs_all_list
    is kept.

    Return type: fitted sklearn.svm.SVC
    """
    assert len(n_vs_all_list) > 0
    assert svm_return in ("lowest", "highest")

    want_lowest = svm_return == "lowest"

    # The features are shared by all classifiers; only the labels
    # change from digit to digit.
    points, digits = get_data_from_file("features.train")

    # Best-so-far record; starts at the worst possible error for the
    # requested direction so the first classifier always replaces it.
    best_E_in = float("inf") if want_lowest else -float("inf")
    best_n = -1
    best_svm = None

    for n in n_vs_all_list:
        labels = get_n_vs_all_values(digits, n)

        classifier = SVC(C=C, kernel="poly", degree=Q, gamma=1.0,
                         coef0=1.0)
        classifier.fit(points, labels)

        # score() is the mean accuracy, so the error is 1 - score.
        E_in = 1.0 - classifier.score(points, labels)

        print("In-sample error (E_in) for", str(n) + "-vs-all SVM:",
              E_in)

        # Strict comparisons: a tie never displaces the earlier digit.
        if want_lowest:
            is_new_best = E_in < best_E_in
        else:
            is_new_best = E_in > best_E_in

        if is_new_best:
            best_E_in = E_in
            best_n = n
            best_svm = classifier

    if want_lowest:
        print("\nSVM chosen with min E_in:", str(best_n)
              + "-vs-all SVM")
    else:
        print("\nSVM chosen with max E_in:", str(best_n)
              + "-vs-all SVM")
    return best_svm

For Problem 2

In [6]:
# Problem 2: fit the 0-, 2-, 4-, 6- and 8-vs-all polynomial SVMs with
# the function defaults (C=0.01, Q=2) and keep the classifier with the
# highest in-sample error.
svm_prob2 = SVM_poly_n_vs_all_test([0, 2, 4, 6, 8], "highest")

In-sample error (E_in) for 0-vs-all SVM: 0.10588396653408316
In-sample error (E_in) for 2-vs-all SVM: 0.10026059525442321
In-sample error (E_in) for 4-vs-all SVM: 0.08942531888629812
In-sample error (E_in) for 6-vs-all SVM: 0.09107118365107669
In-sample error (E_in) for 8-vs-all SVM: 0.074338225209162

SVM chosen with max E_in: 0-vs-all SVM


For Problem 3

In [7]:
# Problem 3: fit the 1-, 3-, 5-, 7- and 9-vs-all polynomial SVMs with
# the function defaults (C=0.01, Q=2) and keep the classifier with the
# lowest in-sample error.
svm_prob3 = SVM_poly_n_vs_all_test([1, 3, 5, 7, 9], "lowest")

In-sample error (E_in) for 1-vs-all SVM: 0.014401316691811772
In-sample error (E_in) for 3-vs-all SVM: 0.09024825126868741
In-sample error (E_in) for 5-vs-all SVM: 0.07625840076807022
In-sample error (E_in) for 7-vs-all SVM: 0.08846523110684401
In-sample error (E_in) for 9-vs-all SVM: 0.08832807570977919

SVM chosen with min E_in: 1-vs-all SVM


For Problem 4

In [8]:
# Problem 4: compare the classifiers selected in Problems 2 and 3.
# The length of each classifier's support_ attribute is used as its
# number of support vectors; the Problem 3 count is subtracted from
# the Problem 2 count.
n_support_diff_2_3 = len(svm_prob2.support_) \
                     - len(svm_prob3.support_)
print("Difference between the number of support vectors of the two",
      "selected classifiers from Problems 2 and 3:",
      n_support_diff_2_3)

Difference between the number of support vectors of the two selected classifiers from Problems 2 and 3: 1793


In [9]:
def SVM_poly_n_vs_m_test1(n, m, C_list=[0.001, 0.01, 0.1, 1], Q=2):
    """
    For every C in C_list, fit an n-vs-m SVM (Support Vector Machine)
    with a polynomial kernel of degree Q and print its in-sample
    error, its out-of-sample error and its number of support vectors.

    Training data is read from "features.train" and test data from
    "features.test" (US Postal Service Zip Code data set); only the
    points of digits n and m are used. n and m must be digits in the
    range 0 to 9.

    Return type: None (results are printed)
    """
    assert 0 <= n <= 9
    assert 0 <= m <= 9

    print(str(n) + "-vs-" + str(m) + " SVM with polynomial kernel,",
          "Q = " + str(Q) + ":\n")

    # Load each file and keep only the rows of digits n and m,
    # relabelled +1 (n) and -1 (m).
    train_x, train_y = get_n_vs_m_data(
        *get_data_from_file("features.train"), n, m)
    test_x, test_y = get_n_vs_m_data(
        *get_data_from_file("features.test"), n, m)

    for C in C_list:
        print("C =", str(C) + ":")

        classifier = SVC(C=C, kernel="poly", degree=Q, gamma=1.0,
                         coef0=1.0)
        classifier.fit(train_x, train_y)

        # score() is the mean accuracy, so each error is 1 - score.
        E_in = 1.0 - classifier.score(train_x, train_y)
        print("In-sample error (E_in):", E_in)

        E_out = 1.0 - classifier.score(test_x, test_y)
        print("Out-of-sample error (E_out):", E_out)

        n_support = len(classifier.support_)
        print("Number of support vectors:", str(n_support) + "\n")

For Problem 5

In [10]:
# Problem 5: print E_in, E_out and the number of support vectors of
# the 1-vs-5 polynomial SVM (Q defaults to 2) for each listed C.
SVM_poly_n_vs_m_test1(1, 5, C_list=[0.001, 0.01, 0.1, 1])

1-vs-5 SVM with polynomial kernel, Q = 2:

C = 0.001:
In-sample error (E_in): 0.004484304932735439
Out-of-sample error (E_out): 0.01650943396226412
Number of support vectors: 76

C = 0.01:
In-sample error (E_in): 0.004484304932735439
Out-of-sample error (E_out): 0.018867924528301883
Number of support vectors: 34

C = 0.1:
In-sample error (E_in): 0.004484304932735439
Out-of-sample error (E_out): 0.018867924528301883
Number of support vectors: 24

C = 1:
In-sample error (E_in): 0.0032030749519538215
Out-of-sample error (E_out): 0.018867924528301883
Number of support vectors: 24



For Problem 6

In [11]:
# Problem 6: run the same sweep over C for the 1-vs-5 classifier with
# Q=2 and then with Q=5, so the two kernels can be compared at each C.
SVM_poly_n_vs_m_test1(1, 5, C_list=[0.0001, 0.001, 0.01, 1], Q=2)
# Visual separator between the Q=2 and Q=5 reports.
print("-" * 60 + "\n")
SVM_poly_n_vs_m_test1(1, 5, C_list=[0.0001, 0.001, 0.01, 1], Q=5)

1-vs-5 SVM with polynomial kernel, Q = 2:

C = 0.0001:
In-sample error (E_in): 0.008968609865470878
Out-of-sample error (E_out): 0.01650943396226412
Number of support vectors: 236

C = 0.001:
In-sample error (E_in): 0.004484304932735439
Out-of-sample error (E_out): 0.01650943396226412
Number of support vectors: 76

C = 0.01:
In-sample error (E_in): 0.004484304932735439
Out-of-sample error (E_out): 0.018867924528301883
Number of support vectors: 34

C = 1:
In-sample error (E_in): 0.0032030749519538215
Out-of-sample error (E_out): 0.018867924528301883
Number of support vectors: 24

------------------------------------------------------------

1-vs-5 SVM with polynomial kernel, Q = 5:

C = 0.0001:
In-sample error (E_in): 0.004484304932735439
Out-of-sample error (E_out): 0.018867924528301883
Number of support vectors: 26

C = 0.001:
In-sample error (E_in): 0.004484304932735439
Out-of-sample error (E_out): 0.021226415094339646
Number of support vectors: 25

C = 0.01:
In-sample error (E_in):

In [12]:
def SVM_poly_n_vs_m_test2(n, m, C_list=[0.001, 0.01, 0.1, 1], Q=2,
                          trials=100):
    """
    Find the value of C in C_list that most often gives the lowest
    10-fold cross-validation error for an n-vs-m SVM (Support Vector
    Machine) with a polynomial kernel of degree Q.

    In each of the given number of trials every C is cross-validated
    on the n-vs-m points of "features.train" (US Postal Service Zip
    Code data set); a new shuffled 10-fold split is drawn for every
    (trial, C) pair. The C with the lowest error wins the trial, and
    an exact tie goes to the smaller C. The function prints how often
    each C won, then the most frequent winner and its cross-validation
    error averaged over the trials it won. If several C values won
    equally often, the first of them in C_list is selected.

    n and m must be digits in the range 0 to 9.

    Return type: the selected element of C_list
    """
    assert 0 <= n <= 9
    assert 0 <= m <= 9

    # Per C: [number of trials won, sum of the winning E_cv values].
    tally = {C: [0, 0] for C in C_list}

    print(str(n) + "-vs-" + str(m) + " SVM with polynomial kernel,",
          "Q = " + str(Q) + ":")
    print("C values:", str(C_list) + "\n")

    # Keep only the training rows of digits n and m, relabelled
    # +1 (n) and -1 (m).
    train_x, train_y = get_n_vs_m_data(
        *get_data_from_file("features.train"), n, m)

    for _ in range(trials):
        winner_C = -1
        winner_E_cv = float("inf")
        for C in C_list:
            classifier = SVC(C=C, kernel="poly", degree=Q, gamma=1.0,
                             coef0=1.0)

            # cross_val_score returns one accuracy per fold; the
            # cross-validation error is 1 minus their mean.
            folds = KFold(n_splits=10, shuffle=True)
            fold_scores = cross_val_score(classifier, train_x,
                                          train_y, cv=folds)
            E_cv = 1 - np.mean(fold_scores)

            strictly_better = E_cv < winner_E_cv
            tie_with_smaller_C = E_cv == winner_E_cv \
                and C < winner_C
            if strictly_better or tie_with_smaller_C:
                winner_E_cv = E_cv
                winner_C = C

        tally[winner_C][0] += 1
        tally[winner_C][1] += winner_E_cv

    most_chosen_C = -1
    most_chosen_count = -1
    for C in tally:
        record = tally[C]
        wins = record[0]
        print("Number of times C =", C, "chosen out of", trials,
              "times:", wins)
        # Turn the summed error into an average; a C that never won
        # keeps its initial 0.
        if wins > 0:
            record[1] /= wins
        # Strict comparison: on equal counts the earlier C is kept.
        if wins > most_chosen_count:
            most_chosen_C = C
            most_chosen_count = wins

    print("\nValue of C selected most often: C =", most_chosen_C)
    print("Average cross-validation error of C selected most often",
          "(C =", str(most_chosen_C) + ") (averaged using",
          str(tally[most_chosen_C][0]) + " trials): E_cv =",
          tally[most_chosen_C][1])

    return most_chosen_C

For Problem 7

In [13]:
# Problem 7: over the default 100 trials, count how often each listed
# C gives the lowest 10-fold cross-validation error for the 1-vs-5
# classifier with Q=2, and keep the C that is chosen most often.
C_max_count = SVM_poly_n_vs_m_test2(1, 5,
                                    C_list=[0.0001, 0.001, 0.01,
                                            0.1, 1],
                                    Q=2)

1-vs-5 SVM with polynomial kernel, Q = 2:
C values: [0.0001, 0.001, 0.01, 0.1, 1]

Number of times C = 0.0001 chosen out of 100 times: 0
Number of times C = 0.001 chosen out of 100 times: 30
Number of times C = 0.01 chosen out of 100 times: 34
Number of times C = 0.1 chosen out of 100 times: 17
Number of times C = 1 chosen out of 100 times: 19

Value of C selected most often: C = 0.01
Average cross-validation error of C selected most often (C = 0.01) (averaged using 34 trials): E_cv = 0.004463402215369208


For Problem 8

In [14]:
# Problem 8: rerun the cross-validation with the C selected in
# Problem 7 as the only candidate, so the printed E_cv for that C is
# averaged over all trials rather than only the trials it won.
C_max_count = SVM_poly_n_vs_m_test2(1, 5, C_list=[C_max_count], Q=2)

1-vs-5 SVM with polynomial kernel, Q = 2:
C values: [0.01]

Number of times C = 0.01 chosen out of 100 times: 100

Value of C selected most often: C = 0.01
Average cross-validation error of C selected most often (C = 0.01) (averaged using 100 trials): E_cv = 0.004683447656377596


In [15]:
def SVM_rbf_n_vs_m_test(n, m, C_list=[0.01, 1, 100, 1e4, 1e6]):
    """
    For every C in C_list, fit an n-vs-m SVM (Support Vector Machine)
    with an rbf (radial basis function) kernel and print its
    in-sample and out-of-sample errors.

    Training data is read from "features.train" and test data from
    "features.test" (US Postal Service Zip Code data set); only the
    points of digits n and m are used. n and m must be digits in the
    range 0 to 9.

    Return type: None (results are printed)
    """
    assert 0 <= n <= 9
    assert 0 <= m <= 9

    print(str(n) + "-vs-" + str(m) + " SVM with rbf kernel:\n")

    # Load each file and keep only the rows of digits n and m,
    # relabelled +1 (n) and -1 (m).
    train_x, train_y = get_n_vs_m_data(
        *get_data_from_file("features.train"), n, m)
    test_x, test_y = get_n_vs_m_data(
        *get_data_from_file("features.test"), n, m)

    for C in C_list:
        print("C =", str(C) + ":")

        classifier = SVC(C=C, kernel="rbf", gamma=1.0)
        classifier.fit(train_x, train_y)

        # score() is the mean accuracy, so each error is 1 - score.
        E_in = 1.0 - classifier.score(train_x, train_y)
        print("In-sample error (E_in):", E_in)

        E_out = 1.0 - classifier.score(test_x, test_y)
        print("Out-of-sample error (E_out):", str(E_out) + "\n")

For Problems 9 and 10

In [16]:
# Problems 9 and 10: print E_in and E_out of the 1-vs-5 rbf-kernel SVM
# for the default C values (0.01, 1, 100, 1e4, 1e6).
SVM_rbf_n_vs_m_test(1, 5)

1-vs-5 SVM with rbf kernel:

C = 0.01:
In-sample error (E_in): 0.0038436899423446302
Out-of-sample error (E_out): 0.02358490566037741

C = 1:
In-sample error (E_in): 0.004484304932735439
Out-of-sample error (E_out): 0.021226415094339646

C = 100:
In-sample error (E_in): 0.0032030749519538215
Out-of-sample error (E_out): 0.018867924528301883

C = 10000.0:
In-sample error (E_in): 0.002562459961563124
Out-of-sample error (E_out): 0.02358490566037741

C = 1000000.0:
In-sample error (E_in): 0.0006406149903908087
Out-of-sample error (E_out): 0.02358490566037741

