In [98]:
import numpy as np 
import pandas as pd 
from csv import reader
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [99]:
# Pre Processing

def getdata(filename, q_rows, q_cols):
    dataset = np.zeros((q_rows,q_cols))

    with open(filename, "r") as file:
        csv_reader = reader(file)
        i = 0
        for row in csv_reader:
            if i > 0:
                if not row or i == q_rows+1:
                    continue
                for j in range(4):
                    dataset[i-1, j] = float(row[j])
            i += 1
    return dataset

class NormStandart():
    def __init__(self, dataset, dsformat):
        self.dataset = dataset
        self.format = dsformat

    def normalize(self, dataset):
        max = dataset.max()
        min = dataset.min()
        for i in range(len(dataset.transpose()[0])):
            for j in range(4):
                dataset[i][j] = (dataset[i][j] - min)/(max-min)
        return dataset


    def mean(self, dataset):
        col_quant = len(dataset[0])
        dataset = dataset.transpose()
        means = np.zeros(col_quant)
        for i in range(col_quant):
            means[i] = np.sum(dataset[i])/float(len(dataset[i]))
        return means

    # calculate column standard deviations
    def stdev(self, dataset, means):
        col_quant = len(dataset[0])
        stdevs = np.zeros(col_quant)
        for i in range(col_quant):
            variance = [pow(row[i]-means[i], 2) for row in dataset]
            stdevs[i] = sum(variance)
        stdevs = [sqrt(x/(float(len(dataset)-1))) for x in stdevs]
        return np.array(stdevs)

    def standardize_dataset(self, dataset, means, stdevs):
        for row in dataset:
            for i in range(len(row)):
                row[i] = (row[i] - means[i]) / stdevs[i]

    def normalize_and_standartize(self):

        self.dataset = self.normalize(self.dataset)
        means = self.mean(self.dataset)
        stdevs = self.stdev(self.dataset, means)

        for row in self.dataset:
            for i in range(len(row)):
                row[i] = (row[i] - means[i]) / stdevs[i]
        if self.format == "numpy":
            return self.dataset
        elif self.format == "pandas":
            return pd.DataFrame(data=self.dataset)

ds = getdata("iris.csv", 100, 4)
ns = NormStandart(ds, "numpy")
dataset_nolabels = ns.normalize_and_standartize()
i = 0
dataset = np.zeros((100,5))
for i in range(100):
    if i <= 49:
        dataset[i] = np.append(dataset_nolabels[i], 1, axis=None)
    else:
        dataset[i] = np.append(dataset_nolabels[i], -1, axis=None)

dataset = pd.DataFrame(data=dataset)

Y = dataset.loc[:, 4]
X = dataset.iloc[:, :-1]

# # insert 1 in every row for intercept b
X.insert(loc=len(X.columns), column=4, value=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("Train Data")
print(X_train.head(5))
print(y_train.head(5))

print("Test Data")
print(X_test.head(5))
print(y_test.head(5))

Train Data
           0         1         2         3  4
55  0.356865 -0.624558  1.130697  0.909488  1
88  0.201029 -0.206793  0.854749  0.909488  1
26 -0.733990  0.628735 -0.869926 -0.683001  1
42 -1.669009  0.210971 -1.076887 -1.036887  1
69  0.201029 -1.251204  0.716775  0.555602  1
55   -1.0
88   -1.0
26    1.0
42    1.0
69   -1.0
Name: 4, dtype: float64
Test Data
           0         1         2         3  4
83  0.824375 -0.833440  1.544619  1.440318  1
53  0.045193 -1.668968  0.785762  0.909488  1
70  0.668538  0.210971  1.337658  1.794204  1
45 -1.045663 -0.206793 -1.007900 -0.859944  1
44 -0.578153  1.464264 -0.662965 -0.683001  1
83   -1.0
53   -1.0
70   -1.0
45    1.0
44    1.0
Name: 4, dtype: float64


In [100]:
def cost_svm(W, X, Y, regularization_strength=100):
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)
    
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

def calculate_cost_gradient(W, X_batch, Y_batch, regularization_strength=100):
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch]) 

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))

    for index, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[index])
        dw += di

    dw = dw/len(Y_batch)
    
    return dw
    
def sgd(features, outputs, learning_rate=0.001, epochs=2):
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01
    # stochastic gradient descent
    for epoch in range(1, epochs):
        X, Y = shuffle(features, outputs)
        for index, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[index])
            weights = weights - (learning_rate * ascent)        
        # convergence check
        if epoch == 2 ** nth or epoch == epochs:
            cost = cost_svm(weights, features, outputs)
            print("Epoch - {}, Cost - {}".format(epoch, cost))
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights



In [101]:
print("training started...")
W = sgd(X_train.to_numpy(), y_train.to_numpy())
print("training finished.")
print("weights: \n {}".format(W))

# testing the model
print("testing the model...")
y_train_predicted = np.array([])
for i in range(X_train.shape[0]):
    yp = np.sign(np.dot(X_train.to_numpy()[i], W))
    y_train_predicted = np.append(y_train_predicted, yp)

y_test_predicted = np.array([])
for i in range(X_test.shape[0]):
    yp = np.sign(np.dot(X_test.to_numpy()[i], W))
    y_test_predicted = np.append(y_test_predicted, yp)

print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
print("precision on test dataset: {}".format(precision_score(y_test, y_test_predicted)))

training started...
Epoch - 1, Cost - 2.291081339537521
training finished.
weights: 
 [-0.44644663  0.2443768  -0.72636983 -0.69808697 -0.08709706]
testing the model...
accuracy on test dataset: 1.0
recall on test dataset: 1.0
precision on test dataset: 1.0
