In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('adult.data')

In [3]:
# Removing all Columns without a continuous Attributes
popColumns = ['workclass', 'education', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'nativecountry']
for i in popColumns:
    df.pop(i)
pass

In [4]:
# Standardizing Salary Column since it is in the format of strings
# If Salary == ' <50k' == -1
# Else Salary == ' >50k' == 1
arrSalary = np.zeros((df.shape[0],1))
for i in range(df.shape[0]):
    arrSalary[i][0] = -1 if df.iloc[i][6] == ' <=50K' else 1

In [5]:
#Normalizing Data in Mean and Unit Variance
def normalization(df):
    arrNorm = np.zeros((df.shape[0],df.columns.size))
    for i in range(df.columns.size-1):
        meanVal = df.iloc[:,i].mean()
        standardDev = df.iloc[:,i].std()
        for j in range(df.shape[0]):
            arrNorm[j][i] = (df.iloc[j][i]-meanVal)/standardDev
    return arrNorm

In [6]:
arrNorm = normalization(df)

In [7]:
dfNorm = pd.DataFrame(arrNorm)
dfNorm.header = False
dfNorm[6] = arrSalary

In [8]:
accuracyList = []

In [9]:
y = dfNorm.pop(6)
X = dfNorm

In [10]:
class SVM:
    def __init__(self):
        global regStrength
        global learningRate

    def computeCost(self, W, X, Y):
        # calculate hinge loss
        M = X.shape[0]
        Y = np.array(Y)
        distances = 1 - Y.reshape(1, -1) * (np.dot(X, W))
        distances[distances < 0] = 0 
        hingeLoss = regStrength * (np.sum(distances) / M)

        cost = 1 / 2 * np.dot(np.transpose(W), W) + hingeLoss
        return cost
    
    def calcCostGradient(self, W, X_batch, Y_batch):

        Y_final= Y_batch.astype(np.float64)
        X_final = X_batch
        W_64 = W.astype(np.float64)
        distance = 1 - (Y_final * np.dot(X_final, W_64))
        dw = np.zeros(len(W_64))
        dw = dw.reshape(-1, 1)
        if all(item < 0 for item in distance):
            di = W_64
        else:
            x_test = W_64 - ((regStrength * Y_final) * X_final.reshape(-1, 1))
            di = x_test
        dw += di
        return dw
    
    def testAccuracy(self, X_test, weights):
        y_test_predicted = np.ndarray(shape=(len(X_test), 1))
        for i in range(X_test.shape[0]):
            yp = np.sign(np.dot(np.transpose(weights), X_test.to_numpy()[i])) #model
            y_test_predicted[i] = yp
        return y_test_predicted

    def validation(self, X_validate,weights, Y_validate):
        finalWeights = list(weights)
        results = self.testAccuracy(X_validate, finalWeights)
        a = accuracy_score(Y_validate, results)
        return (a / len(X_validate)) * 100

    def sgd(self, features, outputs, maxEpochs):
        weights = np.zeros(features.shape[1])
        weights = weights.reshape(-1, 1)
        epochs = 0
        previousCost = float("inf")
        costTreshold = 0.01  
        for epoch in range(1, maxEpochs):
            print(f"{epoch,}", end = " ")
            X, Y = shuffle(features, outputs)
            for i in range(0, len(X), 300):
                if (i % 30 == 0):
                    validate = self.validation(features, weights, outputs)
                    accuracyList.append(validate)
                    try:
                        ascent = self.calcCostGradient(weights, np.array(X.iloc[[i]]), Y[i])
                        weights -= (self.learningRate * ascent)
                    except:
                        continue

            if epoch == maxEpochs-1:
                cost = self.computeCost(weights, features, outputs)
                print(f"Epoch is:{epoch} and Cost is: {cost}")

                if abs(previousCost - cost) < costTreshold * previousCost:
                    return weights
                previousCost = cost
                epochs += 1
        return weights
           
    def test(self, X_test, weights):
        y_test_predicted = np.ndarray(shape=(len(X_test), 1))
        for i in range(X_test.shape[0]):
            yp = np.sign(np.dot(np.transpose(weights), X_test.to_numpy()[i])) #model
            y_test_predicted[i] = yp
        return y_test_predicted

In [11]:
model = SVM()

In [12]:
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.20, random_state=57)
X_test, X_val, y_test, y_val = train_test_split( X_test_val, y_test_val, test_size=0.50, random_state=57)

In [None]:
weights = model.sgd(X_train, y_train, maxEpochs=100)

(1,) (2,) (3,) (4,) (5,) (6,) (7,) (8,) (9,) (10,) (11,) (12,) (13,) (14,) (15,) 

In [49]:
df_a = pd.DataFrame(model.test(X_val, weights), columns=['y_test_predicted'])

In [50]:
type(df_a)

pandas.core.frame.DataFrame

In [58]:
from sklearn.metrics import confusion_matrix as cm
cm(y_val, df_a['y_test_predicted'])
pass

In [134]:
weights

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])