In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('adult.data')

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationnum,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Removing all Columns without a continuous Attributes 
df.pop('workclass')
df.pop('education')
df.pop('maritalstatus')
df.pop('occupation')
df.pop('relationship')
df.pop('race')
df.pop('sex')
df.pop('nativecountry')
pass

In [6]:
df

Unnamed: 0,age,fnlwgt,educationnum,capitalgain,capitalloss,hoursperweek,salary
0,39,77516,13,2174,0,40,<=50K
1,50,83311,13,0,0,13,<=50K
2,38,215646,9,0,0,40,<=50K
3,53,234721,7,0,0,40,<=50K
4,28,338409,13,0,0,40,<=50K
...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,<=50K
32557,40,154374,9,0,0,40,>50K
32558,58,151910,9,0,0,40,<=50K
32559,22,201490,9,0,0,20,<=50K


In [7]:
#Standardizing Salary Column since it is in the format of strings
#If Salary == ' <50k' == 0
#Else Salary == ' >50k' == 1
np_salary = np.zeros((df.shape[0],1))
for i in range(df.shape[0]):
    np_salary[i][0] = 0 if df.iloc[i][6] == ' <=50K' else 1

In [8]:
#Normalizing Data in Mean and Unit Variance
def normalization(df):
    np_norm = np.zeros((df.shape[0],df.columns.size))
    for i in range(df.columns.size-1):
        mean_ = df.iloc[:,i].mean()
        std_ = df.iloc[:,i].std()
        for j in range(df.shape[0]):
            np_norm[j][i] = (df.iloc[j][i]-mean_)/std_
    return np_norm

In [9]:
# for i in range(df.columns.size-1):
#     mean_ = df.iloc[:,i].mean()
#     std_ = df.iloc[:,i].std()
#     def normalize(x):
#         return (x - mean_) / (std_)
#     df.iloc[:,i] = df.iloc[:,i].apply(normalize)
# df

In [10]:
np_norm = normalization(df)

In [11]:
np_norm

array([[ 0.03067009, -1.06359441,  1.13472134, ..., -0.2166562 ,
        -0.0354289 ,  0.        ],
       [ 0.83709613, -1.00869151,  1.13472134, ..., -0.2166562 ,
        -2.222119  ,  0.        ],
       [-0.04264137,  0.24507474, -0.42005317, ..., -0.2166562 ,
        -0.0354289 ,  0.        ],
       ...,
       [ 1.42358779, -0.3587719 , -0.42005317, ..., -0.2166562 ,
        -0.0354289 ,  0.        ],
       [-1.2156247 ,  0.11095818, -0.42005317, ..., -0.2166562 ,
        -1.65519934,  0.        ],
       [ 0.98371904,  0.9298783 , -0.42005317, ..., -0.2166562 ,
        -0.0354289 ,  0.        ]])

In [12]:
df_norm = pd.DataFrame(np_norm)
df_norm.header = False

In [13]:
df_norm[6] = np_salary

In [14]:
df_norm

Unnamed: 0,0,1,2,3,4,5,6
0,0.030670,-1.063594,1.134721,0.148451,-0.216656,-0.035429,0.0
1,0.837096,-1.008692,1.134721,-0.145918,-0.216656,-2.222119,0.0
2,-0.042641,0.245075,-0.420053,-0.145918,-0.216656,-0.035429,0.0
3,1.057031,0.425795,-1.197440,-0.145918,-0.216656,-0.035429,0.0
4,-0.775756,1.408154,1.134721,-0.145918,-0.216656,-0.035429,0.0
...,...,...,...,...,...,...,...
32556,-0.849067,0.639731,0.746028,-0.145918,-0.216656,-0.197406,0.0
32557,0.103982,-0.335428,-0.420053,-0.145918,-0.216656,-0.035429,1.0
32558,1.423588,-0.358772,-0.420053,-0.145918,-0.216656,-0.035429,0.0
32559,-1.215625,0.110958,-0.420053,-0.145918,-0.216656,-1.655199,0.0


In [15]:
#X_df, y_df = train_test_split(df_norm, test_size=0.1)

In [16]:
class SVM:

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None


    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        y_ = np.where(y <= 0, -1, 1)
        
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition.any():
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]


    def predict(self, X):
        approx = np.dot(X, self.w) - self.b
        return np.sign(approx)

In [17]:
svm = SVM()

In [18]:
zeros = 0
ones = 0
for i in range(32561):
    if df_norm.iloc[i][6] == 0.0:
        zeros += 1
    else:
        ones += 1
print(f'{zeros}  {ones}')

24720  7841


In [19]:
df_sort = df_norm.sort_values(by=6)

In [20]:
np_zeros = np.zeros((24720,6))
np_zeros = df_sort[:24720]
np_zeros

Unnamed: 0,0,1,2,3,4,5,6
0,0.030670,-1.063594,1.134721,0.148451,-0.216656,-0.035429,0.0
20177,-0.115953,0.483739,-0.420053,-0.145918,-0.216656,-0.845314,0.0
20176,1.790145,0.947064,1.523415,5.447632,-0.216656,0.774456,0.0
20175,-0.849067,0.090342,1.134721,-0.145918,-0.216656,-0.035429,0.0
20174,-1.288936,0.164137,-0.420053,-0.145918,-0.216656,-0.035429,0.0
...,...,...,...,...,...,...,...
9998,-0.115953,-0.597209,-3.140909,-0.145918,-0.216656,1.017422,0.0
9878,-0.775756,-0.039141,-0.031360,-0.145918,-0.216656,0.612479,0.0
9733,2.230014,-0.823348,-0.420053,-0.145918,-0.216656,-1.331245,0.0
13339,-1.142313,-1.268483,1.134721,-0.145918,-0.216656,-0.035429,0.0


In [21]:
np_zeros.loc[6] = -1
np_zeros

Unnamed: 0,0,1,2,3,4,5,6
0,0.030670,-1.063594,1.134721,0.148451,-0.216656,-0.035429,0.0
20177,-0.115953,0.483739,-0.420053,-0.145918,-0.216656,-0.845314,0.0
20176,1.790145,0.947064,1.523415,5.447632,-0.216656,0.774456,0.0
20175,-0.849067,0.090342,1.134721,-0.145918,-0.216656,-0.035429,0.0
20174,-1.288936,0.164137,-0.420053,-0.145918,-0.216656,-0.035429,0.0
...,...,...,...,...,...,...,...
9998,-0.115953,-0.597209,-3.140909,-0.145918,-0.216656,1.017422,0.0
9878,-0.775756,-0.039141,-0.031360,-0.145918,-0.216656,0.612479,0.0
9733,2.230014,-0.823348,-0.420053,-0.145918,-0.216656,-1.331245,0.0
13339,-1.142313,-1.268483,1.134721,-0.145918,-0.216656,-0.035429,0.0


In [22]:
np_ones = np.zeros((7841,6))
np_ones = df_sort[24720:32561]
np_ones

Unnamed: 0,0,1,2,3,4,5,6
28596,-0.555822,3.640234,1.134721,-0.145918,-0.216656,0.288525,1.0
14897,0.177293,0.342147,-0.031360,-0.145918,-0.216656,0.612479,1.0
30596,-0.335887,-0.103623,1.134721,-0.145918,-0.216656,-0.035429,1.0
2740,-0.042641,0.441617,1.134721,0.895070,-0.216656,-0.035429,1.0
16284,0.910408,-0.128265,0.746028,-0.145918,-0.216656,-0.035429,1.0
...,...,...,...,...,...,...,...
25548,0.910408,4.585369,0.357334,-0.145918,-0.216656,0.774456,1.0
5276,0.397227,-0.693002,1.523415,-0.145918,-0.216656,0.774456,1.0
5275,0.470539,0.230124,2.300802,-0.145918,-0.216656,0.774456,1.0
25526,1.276965,-0.108246,-0.031360,-0.145918,-0.216656,-0.035429,1.0


In [23]:
dict_ = {-1: np_zeros, 1:np_ones}

In [24]:
svm.fit(data=dict_)

TypeError: SVM.fit() got an unexpected keyword argument 'data'

In [25]:
svm.visualize()

AttributeError: 'SVM' object has no attribute 'visualize'

In [26]:
df_norm

Unnamed: 0,0,1,2,3,4,5,6
0,0.030670,-1.063594,1.134721,0.148451,-0.216656,-0.035429,0.0
1,0.837096,-1.008692,1.134721,-0.145918,-0.216656,-2.222119,0.0
2,-0.042641,0.245075,-0.420053,-0.145918,-0.216656,-0.035429,0.0
3,1.057031,0.425795,-1.197440,-0.145918,-0.216656,-0.035429,0.0
4,-0.775756,1.408154,1.134721,-0.145918,-0.216656,-0.035429,0.0
...,...,...,...,...,...,...,...
32556,-0.849067,0.639731,0.746028,-0.145918,-0.216656,-0.197406,0.0
32557,0.103982,-0.335428,-0.420053,-0.145918,-0.216656,-0.035429,1.0
32558,1.423588,-0.358772,-0.420053,-0.145918,-0.216656,-0.035429,0.0
32559,-1.215625,0.110958,-0.420053,-0.145918,-0.216656,-1.655199,0.0


In [27]:
df_norm_sal = df_norm.pop(6)

In [28]:
df_norm

Unnamed: 0,0,1,2,3,4,5
0,0.030670,-1.063594,1.134721,0.148451,-0.216656,-0.035429
1,0.837096,-1.008692,1.134721,-0.145918,-0.216656,-2.222119
2,-0.042641,0.245075,-0.420053,-0.145918,-0.216656,-0.035429
3,1.057031,0.425795,-1.197440,-0.145918,-0.216656,-0.035429
4,-0.775756,1.408154,1.134721,-0.145918,-0.216656,-0.035429
...,...,...,...,...,...,...
32556,-0.849067,0.639731,0.746028,-0.145918,-0.216656,-0.197406
32557,0.103982,-0.335428,-0.420053,-0.145918,-0.216656,-0.035429
32558,1.423588,-0.358772,-0.420053,-0.145918,-0.216656,-0.035429
32559,-1.215625,0.110958,-0.420053,-0.145918,-0.216656,-1.655199


In [29]:
df_norm_sal

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
32556    0.0
32557    1.0
32558    0.0
32559    0.0
32560    1.0
Name: 6, Length: 32561, dtype: float64

In [30]:
svm.fit(df_norm, df_norm_sal)

In [31]:
pred = svm.predict(df_norm)
print(pred)
print(df_norm_sal)

[-1. -1. -1. ... -1.  1. -1.]
0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
32556    0.0
32557    1.0
32558    0.0
32559    0.0
32560    1.0
Name: 6, Length: 32561, dtype: float64


In [32]:
for j in range(df_norm_sal.size):
    if df_norm_sal[j] == 0:
        df_norm_sal[j] = -1
df_norm_sal

0       -1.0
1       -1.0
2       -1.0
3       -1.0
4       -1.0
        ... 
32556   -1.0
32557    1.0
32558   -1.0
32559   -1.0
32560    1.0
Name: 6, Length: 32561, dtype: float64

In [33]:
sum = 0
for i in range(df_norm.shape[0]):
    if pred[i] == df_norm_sal[i]:
        sum += 1
print(sum)
print(f'Accuracy: {(sum/32561)*100}')

19447
Accuracy: 59.72482417616166
