In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('adult.data')

In [3]:
# Removing all Columns without a continuous Attributes 
df.pop('workclass')
df.pop('education')
df.pop('maritalstatus')
df.pop('occupation')
df.pop('relationship')
df.pop('race')
df.pop('sex')
df.pop('nativecountry')
pass

In [4]:
#Standardizing Salary Column since it is in the format of strings
#If Salary == ' <50k' == -1
#Else Salary == ' >50k' == 1
np_salary = np.zeros((df.shape[0],1))
for i in range(df.shape[0]):
    np_salary[i][0] = -1 if df.iloc[i][6] == ' <=50K' else 1

In [5]:
#Normalizing Data in Mean and Unit Variance
def normalization(df):
    np_norm = np.zeros((df.shape[0],df.columns.size))
    for i in range(df.columns.size-1):
        mean_ = df.iloc[:,i].mean()
        std_ = df.iloc[:,i].std()
        for j in range(df.shape[0]):
            np_norm[j][i] = (df.iloc[j][i]-mean_)/std_
    return np_norm

In [6]:
np_norm = normalization(df)

In [7]:
df_norm = pd.DataFrame(np_norm)
df_norm.header = False
df_norm[6] = np_salary

In [8]:
class SVM:
    def __init__(self, reg_strength=1e-3, learning_rate=0.1):
        self.reg_strength=reg_strength
        self.learning_rate=learning_rate

    def computeCost(self, W, X, Y):
        # calculate hinge loss
        M = X.shape[0]
        Y = np.array(Y)
        distances = 1 - Y.reshape(1, -1) * (np.dot(X, W))
        distances[distances < 0] = 0 
        hinge_loss = self.reg_strength * (np.sum(distances) / M)

        cost = 1 / 2 * np.dot(np.transpose(W), W) + hinge_loss
        return cost
    
    def calcCostGradient(self, W, X_batch, Y_batch):

        Y_final= Y_batch.astype(np.float64)
        X_final = X_batch
        W_64 = W.astype(np.float64)
        distance = 1 - (Y_final * np.dot(X_final, W_64))
        dw = np.zeros(len(W_64))
        dw = dw.reshape(-1, 1)
        if all(item < 0 for item in distance):
            di = W_64
        else:
            x_test = W_64 - ((self.reg_strength * Y_final) * X_final.reshape(-1, 1))
            di = x_test
        dw += di
        return dw

    def sgd(self, features, outputs, max_epochs):
        weights = np.zeros(features.shape[1])
        weights = weights.reshape(-1, 1)
        epochs = 0
        prev_cost = float("inf")
        cost_thresh = 0.01  

        for epoch in range(1, max_epochs):

            X, Y = shuffle(features, outputs)
            for i in range(len(X)):
                try:
                    ascent = self.calcCostGradient(weights, np.array(X.iloc[[i]]), Y[i])
                    weights = weights - (self.learning_rate * ascent)
                except:
                    continue

            if epoch == max_epochs - 1:
                cost = self.computeCost(weights, features, outputs)
                print(f"Epoch is:{epoch} and Cost is: {cost}")

                if abs(prev_cost - cost) < cost_thresh * prev_cost:
                    return weights
                prev_cost = cost
                epochs += 1
        return weights

    # def test(self, features, labels):
    #     y_test_predicted = np.array([])
    #     for i in range(X_test.shape[0]):
    #         yp = np.sign(np.dot(W, X_test.to_numpy()[i])) #model
    #         y_test_predicted = y_test_predicted.append(yp)
    #         return y_test_predicted
        
    def test(self, X_test, weights):
        y_test_predicted = np.ndarray(shape=(len(X_test), 1))
        for i in range(X_test.shape[0]):
            yp = np.sign(np.dot(np.transpose(weights), X_test.to_numpy()[i])) #model
            y_test_predicted = np.append(y_test_predicted, yp[0])
            # y_test_predicted = y_test_predicted.append(int(yp[0]))
        return y_test_predicted

In [9]:
y = df_norm[6]
df_norm.pop(6)

0       -1.0
1       -1.0
2       -1.0
3       -1.0
4       -1.0
        ... 
32556   -1.0
32557    1.0
32558   -1.0
32559   -1.0
32560    1.0
Name: 6, Length: 32561, dtype: float64

In [10]:
X = df_norm

In [11]:
model = SVM()

In [12]:
X_train, X_test_val, y_train, y_test_val = train_test_split( X, y, test_size=0.20, random_state=57)
X_test, X_val, y_test, y_val = train_test_split( X_test_val, y_test_val, test_size=0.50, random_state=57)

In [None]:
weights = model.sgd(X_train,y_train,max_epochs=300)

In [14]:
df_a = pd.DataFrame(model.test(X_val, weights), columns=['y_test_predicted'])

In [15]:
type(df_a)

pandas.core.frame.DataFrame

In [34]:
print(weights)

[[-9.56502258e-05]
 [ 2.55902074e-04]
 [ 1.47573380e-04]
 [ 1.03522149e-04]
 [ 1.26517134e-04]
 [-1.63373356e-04]]
