Question 5 Basic SGD

In [None]:
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
import math

In [None]:
# Load the training set. Column 0 is used as the class label and the
# remaining columns as the features.
bc_train_data = np.load("breast_cancer_train.npy")
# Shuffle the rows with a fixed seed so that "Restart Kernel -> Run All"
# reproduces the same sample order (and therefore the same training run).
RANDOM_SEED = 0
np.random.default_rng(RANDOM_SEED).shuffle(bc_train_data)
bc_train_data_points = np.asarray(bc_train_data[:,1:])
bc_train_labels = bc_train_data[:, 0]

# Column-wise (per-feature) L1 norms of the training features.
bc_train_L1_norms = np.linalg.norm(bc_train_data_points, ord=1, axis=0)
# A feature that is zero for every training sample has norm 0; divide it by 1
# instead so the scaling below cannot produce NaN/inf.
bc_train_L1_norms = np.where(bc_train_L1_norms == 0, 1.0, bc_train_L1_norms)
bc_train_data_points = 100 * bc_train_data_points / bc_train_L1_norms

bc_test_data = np.load("breast_cancer_test.npy")
bc_test_data_points = np.asarray(bc_test_data[:,1:])
# The test features are scaled with the *training* norms so both sets share
# the same feature scale.
bc_test_data_points = 100 * bc_test_data_points / bc_train_L1_norms
bc_test_labels = bc_test_data[:, 0]

In [None]:
def sigmoid_function(X):
    """Return the logistic sigmoid 1 / (1 + e**(-X)), element-wise.

    The value is computed from exp(-|X|), which never overflows, so very
    large positive or negative inputs saturate cleanly to 1.0 / 0.0 instead
    of raising OverflowError (Python floats) or emitting overflow warnings
    (NumPy arrays).

    Args:
        X: a scalar or array-like of real numbers.

    Returns:
        A NumPy float for scalar input, otherwise an ndarray with the same
        shape as X.
    """
    z = np.asarray(X, dtype=float)
    # exp(-|z|) always lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z)  (the same function).
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves real
    # arrays untouched.
    return result[()]

In [None]:
def error(tpl,tl):
    """Return the percentage of positions where predictions and labels differ.

    Both values at each position are truncated with int() before they are
    compared.

    Args:
        tpl: indexable sequence of predicted labels; its length is the
            denominator of the rate.
        tl: indexable sequence of true labels, indexed at the same positions.

    Returns:
        The misclassification rate as a float percentage in [0, 100].
    """
    total = len(tpl)
    mismatches = sum(
        1 for idx in range(total) if int(tpl[idx]) != int(tl[idx])
    )
    return (mismatches / total) * 100

In [None]:
def plot(data, label, w):
    """Draw the line w[0] + w[1]*x + w[2]*y = 0 over a scatter of two classes.

    Args:
        data: 2-D array; column 1 is plotted on the x-axis and column 2 on
            the y-axis (presumably column 0 is the augmented constant --
            TODO confirm against the caller).
        label: array of class labels (1 or 2) used to mask the rows of data.
        w: three weights; w[2] is the divisor when solving for y.
    """
    horizontal = data[:, 1]
    x = np.linspace(min(horizontal), max(horizontal))
    # Solve w[0] + w[1]*x + w[2]*y = 0 for y.
    y = -(w[0]+w[1]*x)/w[2]

    plt.plot(x, y, label='Decision Boundary')
    # Shade the area between the boundary line and the horizontal line y = 4.
    plt.fill_between(x, y, 4, color='blue', alpha=.5)
    class_styles = ((1, 'red', 'Class 1'), (2, 'blue', 'Class 2'))
    for class_id, colour, legend_name in class_styles:
        members = label == class_id
        plt.scatter(data[members, 1], data[members, 2],
                    color=colour, label=legend_name)
    plt.legend()

In [None]:
def LR(r_data_points, labels, wi, eta=1, max_itr=10000):
    """Fit logistic-regression weights with per-sample gradient descent (SGD).

    Every iteration takes one sample (cycling through the rows in order),
    moves the weights along the negative gradient of that sample's
    cross-entropy loss, and then records the mean cross-entropy over the
    whole data set. The weights with the lowest recorded cost are returned.

    Args:
        r_data_points: 2-D array of shape (m, d), one sample per row.
        labels: m target values in {0, 1}, i.e. the values the sigmoid output
            is compared against. When the rows of one class have been
            reflected (negated), the target of every row is 1.
        wi: initial weights, d values (shape (d,) or (d, 1)).
        eta: learning rate (step size).
        max_itr: maximum total number of single-sample updates.

    Returns:
        A tuple (w_opt, Jw_ar, itr):
            w_opt: (d, 1) array of the weights with the lowest recorded cost
                (the initial weights if no update was performed).
            Jw_ar: list of floats, the cost recorded after each update.
            itr: number of updates performed.

    Raises:
        ValueError: if the shapes of the data, labels and weights disagree.
    """
    X = np.asarray(r_data_points, dtype=float)
    if X.ndim != 2:
        raise ValueError("r_data_points must be a 2-D array of shape (m, d)")
    y = np.asarray(labels, dtype=float).reshape(-1)
    w = np.asarray(wi, dtype=float).reshape(-1, 1)
    m, d = X.shape
    if y.shape[0] != m:
        raise ValueError("labels must contain one target per data point")
    if w.shape[0] != d:
        raise ValueError("wi must contain one weight per data column")

    # Keeps log() finite when a prediction saturates at exactly 0 or 1.
    eps = 1e-12
    Jw_min = np.inf
    Jw_ar = []
    w_opt = w.copy()
    itr = 0
    if m == 0:
        return w_opt, Jw_ar, itr

    while itr < max_itr:
        i = itr % m  # sweep the samples in order, epoch after epoch
        x_i = X[i:i + 1]  # keep 2-D: shape (1, d)
        p_i = sigmoid_function(x_i.dot(w))
        # Gradient of the cross-entropy of sample i: x_i^T (sigmoid(x_i w) - y_i).
        w = w - eta * x_i.T * (p_i - y[i])

        # Mean cross-entropy of the updated weights over the full data set.
        p_all = np.clip(sigmoid_function(X.dot(w)).reshape(-1), eps, 1 - eps)
        Jw = -np.mean(y * np.log(p_all) + (1 - y) * np.log(1 - p_all))

        if Jw < Jw_min:
            Jw_min = Jw
            w_opt = w
        Jw_ar.append(float(Jw))
        itr = itr + 1
    return w_opt, Jw_ar, itr

In [None]:
def test_classification(data, w):
    clas = np.zeros(len(data))
    for i in range(len(data)):
        pred = sigmoid_function(np.dot(w.T, data[i, :])[0])
        if pred < 0.5:
            clas [i] = 2
        else:
            clas [i] = 1
    return clas

In [None]:
def error(tpl,tl):
    """Return the percentage of positions where predictions and labels differ.

    NOTE(review): a function named `error` with the same behavior is already
    defined in an earlier cell of this notebook; this later definition
    shadows it. Consider keeping only one of the two.

    Both values at each position are truncated with int() before they are
    compared.

    Args:
        tpl: indexable sequence of predicted labels; its length is the
            denominator of the rate.
        tl: indexable sequence of true labels, indexed at the same positions.

    Returns:
        The misclassification rate as a float percentage in [0, 100].
    """
    total = len(tpl)
    mismatches = sum(
        1 for idx in range(total) if int(tpl[idx]) != int(tl[idx])
    )
    return (mismatches / total) * 100

In [None]:
# Augment every training sample with a leading constant 1 so that the first
# weight acts as the bias term. The sample count is taken from the data
# instead of being hard-coded.
n_train = len(bc_train_data_points)
bc_train_data_aug = np.hstack([np.ones((n_train, 1)), bc_train_data_points])

# Reflect (negate) the class-2 samples.
bc_reflected_data_points = np.copy(bc_train_data_aug)
bc_reflected_data_points[bc_train_labels == 2] *= -1

# After reflection the target of every sample is 1: the model is trained to
# push sigmoid(w . z) towards 1 for each reflected point z.
bc_reflected_targets = np.ones(n_train)

# One initial weight per augmented feature.
w0 = 0.1 * np.ones([bc_train_data_aug.shape[1], 1])

# LR takes (data, labels, initial weights); the targets must be passed
# explicitly, otherwise w0 is bound to `labels` and the call fails.
w_o, Jw_ar, itr = LR(bc_reflected_data_points, bc_reflected_targets, w0)

print("The Optimal weights are - " + str(w_o.reshape(1, -1)))

bc_train_pred_labels = test_classification(bc_train_data_aug, w_o)

print("The Classification Error Rate for Training Data = " + str(error(bc_train_pred_labels,bc_train_labels)) + "%")

# NOTE(review): the recorded costs are sign-flipped before plotting, so the
# curve shows -J(W) -- confirm that this is intended.
itrs = len(Jw_ar)
Jw_ar = [-jw for jw in Jw_ar]

plt.plot(list(range(itrs)),Jw_ar)
plt.title("Learning Curve")
# LR records one cost value per weight update, so the x-axis counts
# iterations rather than epochs.
plt.xlabel("Iterations")
plt.ylabel("J(W)")

In [None]:
# Augment the test samples with a leading constant 1, matching the layout of
# the augmented training data. The sample count is taken from the data
# instead of being hard-coded.
n_test = len(bc_test_data_points)
bc_test_data_aug = np.hstack([np.ones((n_test, 1)), bc_test_data_points])

bc_test_pred_labels = test_classification(bc_test_data_aug, w_o)

print("The Classification Error Rate for Test Data = " + str(error(bc_test_pred_labels,bc_test_labels)) + "%")

In [None]:
# Signed score of every training sample: (w . x) / ||w||, computed for all
# rows with one matrix product.
# NOTE(review): the norm includes the bias weight w_o[0], so this is the
# distance in the augmented feature space -- confirm that this is intended.
y = bc_train_data_aug.dot(w_o) / np.linalg.norm(w_o)
# Keep one column with one row per sample, whatever the training-set size.
y = np.asarray(y).reshape(-1, 1)
c1 = y[bc_train_labels == 1]
c2 = y[bc_train_labels == 2]

plt.hist(y, label='Data', edgecolor='black')
plt.hist(c1, label='Class 1', edgecolor='black')
plt.hist(c2, label='Class 2', edgecolor='black')
plt.xlabel("Distance from the decision boundary")
plt.ylabel("Frequency")
plt.legend()