Question 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from plot_multi import plot_multiclass_histograms

In [None]:
# Load the training split and separate the feature columns from the "Class" label.
df_train = pd.read_csv('Dry_Bean_train.csv')
df_x_train = df_train.drop("Class", axis=1)
df_y_train = df_train['Class']
x_train = df_x_train.to_numpy()

# Load the test split the same way.
df_test = pd.read_csv('Dry_Bean_test.csv')
df_x_test = df_test.drop("Class", axis=1)
df_y_test = df_test['Class']
x_test = df_x_test.to_numpy()

# Fit the label -> integer mapping on the training labels only, then reuse that
# exact mapping for the test labels. Re-fitting on the test labels could assign
# different codes to the same class name if the two splits do not contain the
# same set of classes; transform() raises instead of silently mislabelling.
encoder = preprocessing.LabelEncoder()
y_train_enc = encoder.fit_transform(df_y_train)
y_test_enc = encoder.transform(df_y_test)

In [None]:
# Weight matrix for all classes: one row per encoded class label and one column
# per raw feature plus one extra column (the inputs are later augmented with a
# leading column of ones). Every entry starts at 1.
w_og = np.ones((y_train_enc.max() + 1, x_train.shape[1] + 1))

def train(X, y, cl, epochs=100, lr=1, j_start_itr=1224800):
    """Train a one-vs-rest perceptron for class ``cl`` with sequential updates.

    Rows whose label differs from ``cl`` are negated ("reflected") so that a
    correctly classified sample always satisfies ``w . x > 0``. The reflected
    rows are reshuffled at the start of every epoch with NumPy's global RNG
    (seed it beforehand for reproducible results), and ``w`` is updated with
    ``w += lr * x`` whenever ``w . x <= 0``.

    Parameters
    ----------
    X : array of shape (n_samples, d)
        Feature rows (in this notebook, already augmented with a leading 1).
        The caller's array is not modified.
    y : array-like of length n_samples
        Class labels aligned with the rows of ``X``.
    cl : label
        The class treated as the positive class.
    epochs : int
        Number of passes over the data.
    lr : float
        Step size applied to each update.
    j_start_itr : int
        Zero-based update index from which the criterion ``Jw`` is evaluated
        after every step. The default reproduces the threshold that used to be
        hard-coded here; it presumably targets the last 200 updates of a
        100-epoch run over 12,250 samples -- TODO confirm against the training
        set size. If the run never reaches this index, nothing is tracked.

    Returns
    -------
    w : ndarray of shape (1, d)
        Weights after the final update.
    J_min : number
        Smallest criterion value seen in the tracked window (the initial
        sentinel 100000000 if the window was never reached).
    w_opt : ndarray of shape (1, d)
        Snapshot of the weights that produced ``J_min`` (all ones if the
        window was never reached).
    """
    y = np.asarray(y)
    num_samples, d = X.shape
    w = np.ones((1, d))
    w_opt = np.ones((1, d))
    itr = 0
    J_min = 100000000

    # Work on a private copy so the caller's data is left untouched.
    reflected_data = np.array(X, dtype=float)
    reflected_data[y != cl] *= -1

    for epoch in range(epochs):
        # Shuffle rows and labels together; the label rides along as the last column.
        temp = np.c_[reflected_data, y]
        np.random.shuffle(temp)
        reflected_data = temp[:, :-1]
        y = temp[:, -1]
        for i in range(num_samples):
            z = np.dot(w, reflected_data[i].T)
            if z <= 0:
                w += lr*reflected_data[i]
            if itr >= j_start_itr:
                jw = Jw(reflected_data, w)
                if jw < J_min:
                    J_min = jw
                    # w is updated in place above, so keep a snapshot; storing a
                    # reference would make w_opt silently follow later updates.
                    w_opt = w.copy()
            itr = itr + 1
    return w, J_min, w_opt


def Jw(X, w):
    """Return the sum of ``-(w . x)`` over every row ``x`` of ``X`` with ``w . x <= 0``.

    Rows with a positive score contribute nothing, so the result is 0 when
    every row is on the positive side of ``w``.

    Parameters
    ----------
    X : array of shape (n_samples, d)
        Rows to score (``train`` passes the reflected samples).
    w : array of shape (d,) or (1, d)
        Weight vector.

    Returns
    -------
    A scalar for 1-D ``w``; an array of shape (1,) for ``w`` of shape (1, d).
    """
    # Score all rows in one product instead of two dot products per row in a
    # Python loop; the result has shape (n,) or (n, 1) depending on w.
    scores = np.dot(X, np.asarray(w).T)
    return np.where(scores <= 0, -scores, 0).sum(axis=0)

def classify_1(X, y, w):
    """Classify each row of ``X`` only when exactly one class score is positive.

    For every sample the scores ``w . x`` are thresholded at zero. If exactly
    one class has a positive score the sample is assigned to that class and
    counted as correct or wrong against ``y``; otherwise (no positive score,
    or more than one) it is counted as unclassified.

    Returns ``(accuracy, error_rate, unclassified_rate)``, each a count
    divided by ``y.shape[0]``.
    """
    n_correct = 0
    n_wrong = 0
    n_undecided = 0
    for idx, sample in enumerate(X):
        fired = np.dot(w, sample.T) > 0
        if np.count_nonzero(fired) != 1:
            n_undecided += 1
        elif y[idx] == np.argmax(fired):
            # argmax of the boolean mask is the index of the single positive class
            n_correct += 1
        else:
            n_wrong += 1
    total = y.shape[0]
    return n_correct/total, n_wrong/total, n_undecided/total

def classify_2(X, y, w):
    """Assign each row of ``X`` to the class with the largest raw score ``w . x``.

    Returns ``(accuracy, error_rate)``, each a count divided by ``y.shape[0]``.
    """
    n_rows = X.shape[0]
    hits = sum(
        1 for idx in range(n_rows)
        if y[idx] == np.argmax(np.dot(w, X[idx].T))
    )
    # every row is either a hit or a miss
    misses = n_rows - hits
    total = y.shape[0]
    return hits/total, misses/total

def classify_3(X, y, w):
    """Assign each row of ``X`` to the class with the largest normalised score.

    Each class score ``w[j] . x`` is divided by the Euclidean norm of
    ``w[j][1:]`` (the weights without their first component, which multiplies
    the leading column of ones in this notebook's augmented inputs) before
    taking the argmax.

    Parameters
    ----------
    X : array of shape (n_samples, d)
    y : array of length n_samples with integer class labels
    w : array of shape (n_classes, d)

    Returns
    -------
    ``(accuracy, error_rate)``, each a count divided by ``y.shape[0]``.
    """
    w = np.asarray(w)
    # The norms depend only on w, so compute them once rather than per sample.
    norms = np.linalg.norm(w[:, 1:], axis=1)
    acc = 0
    err = 0
    for i in range(X.shape[0]):
        # Divide into a fresh float array: writing the quotient back into an
        # integer score array element-by-element would truncate it.
        score = np.dot(w, X[i].T) / norms
        clas = np.argmax(score)
        if y[i] == clas:
            acc += 1
        else:
            err += 1
    return acc/y.shape[0], err/y.shape[0]

def plot_data(X,y,w,k):
    """Plot overlaid histograms of ``X . w[k]`` for class-``k`` rows and all other rows.

    ``w`` is indexed by class, so it must have a row for index ``k``.
    """
    in_class = (y == k)
    weights_k = w[k]

    scores_in = np.dot(X[in_class], weights_k)
    scores_out = np.dot(X[~in_class], weights_k)

    plt.figure(figsize=(10,6))
    # class-k samples first, then everything else, both semi-transparent
    plt.hist(scores_in, alpha=0.5, label='(Sk)', bins=50)
    plt.hist(scores_out, alpha=0.5, label= '(Sj)', bins=50)

    plt.title('plot')
    plt.xlabel('g(x)')
    plt.ylabel('Count')
    plt.legend()
    plt.show()

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
std = StandardScaler().fit(x_train)
x_train_normalized = std.transform(x_train)
x_test_normalized = std.transform(x_test)

# Augment: prepend a column of ones to each standardised matrix.
x_train_normalized_aug = np.hstack((np.ones((len(x_train_normalized), 1)), x_train_normalized))
x_test_normalized_aug = np.hstack((np.ones((len(x_test_normalized), 1)), x_test_normalized))

In [None]:
# train() shuffles with NumPy's global RNG; seed it so the learned weights and
# every figure and metric derived from them are reproducible on a re-run.
np.random.seed(0)

# Train one perceptron per class and store its tracked-best weights as row i of w_og.
for i in range(max(y_train_enc)+1):
    w, J_min, w_opt = train(x_train_normalized_aug, y_train_enc, i)
    w_og[i] = w_opt
    # plot_data indexes its weight argument by class (w[k]). w_opt has a single
    # row, so indexing it with i >= 1 raises IndexError; pass the full matrix,
    # whose row i was just filled in.
    plot_data(x_train_normalized_aug, y_train_enc, w_og, i)

In [None]:
# Method 1: a sample is assigned only when exactly one class score is positive;
# everything else is reported as unclassified.
train_acc, train_err, train_unc = classify_1(x_train_normalized_aug, y_train_enc, w_og)
test_acc, test_err, test_unc = classify_1(x_test_normalized_aug, y_test_enc, w_og)

print("Train Data Classification Accuracy for Method 1 -", 100*train_acc, "%")
print("Train Data Error Rate for Method 1 -", 100*train_err, "%")
print("Train Data Unclassified Rate for Method 1 -", 100*train_unc, "%")
print()
print("Test Data Classification Accuracy for Method 1 -", 100*test_acc, "%")
print("Test Data Error Rate for Method 1 -", 100*test_err, "%")
print("Test Data Unclassified Rate for Method 1 -", 100*test_unc, "%")

In [None]:
# Method 2: assign every sample to the class with the largest raw score.
train_acc, train_err = classify_2(x_train_normalized_aug, y_train_enc, w_og)
# This line reports the training accuracy, so it is labelled "Train" (it used to say "Test").
print("Train Data Classification Accuracy for Method 2 -", train_acc*100, "%")
print("Train Data Error Rate for Method 2 -", train_err*100, "%")
print()
test_acc, test_err= classify_2(x_test_normalized_aug, y_test_enc, w_og)
print("Test Data Classification Accuracy for Method 2 -", test_acc*100, "%")
print("Test Data Error Rate for Method 2 -", test_err*100, "%")

In [None]:
# Method 3: assign every sample to the class with the largest norm-scaled score.
train_acc, train_err = classify_3(x_train_normalized_aug, y_train_enc, w_og)
# This line reports the training accuracy, so it is labelled "Train" (it used to say "Test").
print("Train Data Classification Accuracy for Method 3 -", train_acc*100, "%")
print("Train Data Error Rate for Method 3 -", train_err*100, "%")
print()
test_acc, test_err= classify_3(x_test_normalized_aug, y_test_enc, w_og)
print("Test Data Classification Accuracy for Method 3 -", test_acc*100, "%")
print("Test Data Error Rate for Method 3 -", test_err*100, "%")

In [None]:
# Plot the test-set results with the helper imported from the local plot_multi
# module (its source is not shown here). The weights are passed transposed, so
# the helper presumably expects one column per class -- TODO confirm.
# NOTE(review): the meaning of the final 'xx' argument is not visible from this
# notebook; verify against plot_multi.
plot_multiclass_histograms(x_test_normalized_aug, w_og.T, y_test_enc, 'xx')