# Support Vector Machine

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Source:- https://www.folkstalk.com/2022/10/python-suppress-warning-with-code
import warnings
warnings.filterwarnings("ignore")

In [2]:
df_data = pd.read_csv('adultData.csv')
df_test = pd.read_csv('adultTest.csv')

In [7]:
df = 0

In [8]:
df = df_data.append(df_test, ignore_index=True)
len(df)

48842

In [5]:
names_to_drop = ["education",
"martial-status",
"occupation",
"race",
"sex",
"native",
"work-class", "relationship"]

In [11]:
df

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,earnings
0,39,77516,13,2174,0,40,<=50K
1,50,83311,13,0,0,13,<=50K
2,38,215646,9,0,0,40,<=50K
3,53,234721,7,0,0,40,<=50K
4,28,338409,13,0,0,40,<=50K
...,...,...,...,...,...,...,...
48837,39,215419,13,0,0,36,<=50K.
48838,64,321403,9,0,0,40,<=50K.
48839,38,374983,13,0,0,50,<=50K.
48840,44,83891,13,5455,0,40,<=50K.


In [12]:
y = df['earnings']
X = df.drop('earnings', axis=1)

In [13]:
np_norm = np.zeros((df.shape[0],df.columns.size))
for i in range(X.columns.size):
    mean_ = X.iloc[:,i].mean()
    std_ = X.iloc[:,i].std()
    def normalize(x):
        return (x - mean_) / (std_)
    X.iloc[:,i] = X.iloc[:,i].apply(normalize)

In [14]:
def normLabels(x):
    if x == " <=50K.":
        return 0
    else:
        return 1
    
y = y.apply(normLabels)

In [15]:
count_0 = 0
count_1 = 0
for i in range(len(y)):
    if y[i] == 0:
        count_0 += 1
    else:
        count_1 += 1
        
print(count_0, count_1)

12435 36407


In [16]:
X_train, X_test_val, y_train, y_test_val = train_test_split( X, y, test_size=0.20, random_state=57)
X_test, X_val, y_test, y_val = train_test_split( X_test_val, y_test_val, test_size=0.50, random_state=57)

In [23]:
def computeCost(W, X, Y):
    # calculate hinge loss
    M = X.shape[0]
    Y = np.array(Y)
    distances = 1 - Y.reshape(1, -1) * (np.dot(X, W))
    distances[distances < 0] = 0 
    hinge_loss = reg_strength * (np.sum(distances) / M)

    cost = 1 / 2 * np.dot(np.transpose(W), W) + hinge_loss
    return cost
    
def calcCostGradient(W, X_batch, Y_batch):
        
    Y_final= Y_batch.astype(np.float64)
    X_final = X_batch
    W_64 = W.astype(np.float64)
    distance = 1 - (Y_final * np.dot(X_final, W_64))
    dw = np.zeros(len(W_64))
    dw = dw.reshape(-1, 1)
    if all(item < 0 for item in distance):
        di = W_64
    else:
        x_test = W_64 - ((reg_strength * Y_final) * X_final.reshape(-1, 1))
        di = x_test
    dw += di
    return dw

def sgd(features, outputs, max_epochs):
    weights = np.zeros(features.shape[1])
    weights = weights.reshape(-1, 1)
    epochs = 0
    prev_cost = float("inf")
    cost_thresh = 0.01  

    for epoch in range(1, max_epochs):

        X, Y = shuffle(features, outputs)
        for i in range(len(X)):
            try:
                ascent = calcCostGradient(weights, np.array(X.iloc[[i]]), Y[i])
                weights = weights - (learning_rate * ascent)
            except:
                continue
            
        if epoch == max_epochs - 1:
            cost = computeCost(weights, features, outputs)
            print(f"Epoch is:{epoch} and Cost is: {cost}")

            if abs(prev_cost - cost) < cost_thresh * prev_cost:
                return weights
            prev_cost = cost
            epochs += 1
    return weights
    
def test(features, labels):
    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(W, X_test.to_numpy()[i])) #model
        y_test_predicted = y_test_predicted.append(yp)
        return y_test_predicted

In [24]:
reg_strength = 1e-3
learning_rate = 0.1
model = sgd(X_train, y_train, 300)

Epoch is:299 and Cost is: [[39.07298757]]


In [86]:
print(np.shape(X_test))

(4884, 6)


In [105]:
def test(X_test, weights):
    y_test_predicted = np.ndarray(shape=(len(X_test), 1))
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(np.transpose(weights), X_test.to_numpy()[i])) #model
        y_test_predicted = np.append(y_test_predicted, yp[0])
        # y_test_predicted = y_test_predicted.append(int(yp[0]))
    return y_test_predicted

In [106]:
a = pd.DataFrame(test(X_val, model), columns=['y_test_predicted'])

In [107]:
print(type(a))

<class 'pandas.core.frame.DataFrame'>


In [108]:
a = a[1::2]
len(a)

4885

In [109]:
def normLabels(x):
    if x > 0.5:
        return 1
    else:
        return 0

a['y_test_predicted'] = a['y_test_predicted'].apply(normLabels)

In [110]:
print(a)

      y_test_predicted
1                    0
3                    0
5                    0
7                    0
9                    0
...                ...
9761                 1
9763                 0
9765                 1
9767                 0
9769                 1

[4885 rows x 1 columns]


In [111]:
from sklearn.metrics import confusion_matrix as cm

cm(y_val, a['y_test_predicted'])


array([[ 910,  307],
       [2731,  937]], dtype=int64)

In [112]:
from sklearn.metrics import classification_report as cr

cr(y_val, a['y_test_predicted'])

'              precision    recall  f1-score   support\n\n           0       0.25      0.75      0.37      1217\n           1       0.75      0.26      0.38      3668\n\n    accuracy                           0.38      4885\n   macro avg       0.50      0.50      0.38      4885\nweighted avg       0.63      0.38      0.38      4885\n'