# Support Vector Machines

## Introduction

## Summary of Results

---

## Setup 

First, we import the libraries used throughout this notebook.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

## Load and Clean Data

Before we can build any models, we need to import the data and clean it by converting types as necessary.

In [2]:
# Column labels for the Adult data file; they are supplied through `names=`
# instead of being read from the file itself.
ADULT_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "earning_label",
]

# skipinitialspace=True drops the whitespace that follows each delimiter,
# so string fields load without a leading blank.
df = pd.read_csv("Adult/adult.data", names=ADULT_COLUMNS, skipinitialspace=True)

df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,earning_label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
# Quick look at the raw frame: column names, the distinct values of the
# target column, and the (rows, columns) dimensions.
print("Features: ", df.columns)
print("Labels: ", df["earning_label"].unique())
print("Shape: ", df.shape)

Features:  Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'earning_label'],
      dtype='object')
Labels:  ['<=50K' '>50K']
Shape:  (32561, 15)


In [4]:
# native_country has many categories, but most rows (about 91%) are
# 'United-States', so every other value is collapsed into a single
# 'Other' category.
df.loc[df['native_country'] != 'United-States', 'native_country'] = 'Other'

# Encode the target as -1 (<=50K) / +1 (>50K). The mapping is written out
# explicitly so it does not depend on the sort order of categorical codes.
Y = np.where(df['earning_label'] == '>50K', 1, -1)
print(Y)

# 'education' is redundant because 'education_num' carries the same
# information; the label column is dropped so it is not used as a feature.
df = df.drop(columns=['education', 'earning_label'])

# Show the distinct values of each remaining categorical feature.
for feature in ['workclass', 'marital_status', 'occupation', 'relationship',
                'race', 'sex', 'native_country']:
    print(feature, df[feature].unique())

df

[-1 -1 -1 ... -1 -1  1]
workclass ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
marital_status ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
occupation ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
relationship ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
race ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
sex ['Male' 'Female']
native_country ['United-States' 'Other']


Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [5]:
# Convert categorical features to one-hot encoding
df=pd.get_dummies(df)
df

# encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
# encoder.fit(df)
# df = encoder.transform(df)
# df

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native_country_Other,native_country_United-States
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
4,28,338409,13,0,0,40,0,0,0,0,...,1,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,0,0,...,1,0,0,0,0,1,1,0,0,1
32557,40,154374,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
32558,58,151910,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
32559,22,201490,9,0,0,20,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1


In [6]:
# Finally, split the data into training and testing sets (80/20).
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
train_X, test_X, train_Y, test_Y = train_test_split(
    df.values, Y, test_size=0.2, random_state=42
)

## Create the model

In [35]:
# Fit the SVM class imported from the `svm` module (presumably a project-local
# svm.py, which is not shown here) on the training split.
# NOTE(review): binding the instance to the name `svm` shadows the
# `sklearn.svm` module imported at the top of the notebook.
from svm import SVM
# NOTE(review): 10 and .001 are positional constructor arguments; their
# meaning (presumably training hyperparameters) is defined in svm.py — confirm.
svm = SVM(10, .001)
# The value returned by fit is handed back to svm.predict in the next cell;
# presumably it is the learned weight vector — confirm against svm.py.
w = svm.fit(train_X, train_Y)
print(w)

[ 8.72731112e+02 -1.38770731e+02  2.50310964e+02  1.53784321e+04
  1.72342829e+04  7.83541923e+02 -6.88728501e+00  3.58952703e+00
  3.09428747e+00 -3.45515971e-02 -1.63851351e+01  8.24631450e+00
  1.83507371e+00  9.52088452e-01 -8.06203931e-02 -1.80973587e+01
  1.03654791e-01  8.14265971e+01 -1.85426904e+00 -5.75476044e+01
 -4.96007371e+00 -4.74124693e+00 -6.92183661e+00 -1.16438882e+01
 -2.30343980e-02 -1.91953317e+00  2.46199324e+01 -4.06941032e+00
 -6.82202088e+00 -5.91216216e+00 -1.78094287e+01 -1.05574324e+00
  2.14104730e+01  1.60089066e+00  2.90233415e+00  1.83507371e+00
 -1.86194717e+00  7.18826781e+01 -3.42559889e+01 -5.59735872e+00
 -3.11617015e+01 -1.66845823e+01  1.01466523e+01 -1.36286855e+00
  6.94871007e-01 -8.91047297e+00 -1.07493857e+00  4.98310811e+00
 -4.02449324e+01  3.45746314e+01 -3.62407862e+00 -2.04622236e+00
 -5.67030098e+00]


In [36]:
from sklearn.metrics import accuracy_score

# Classify the held-out rows, passing in the value returned by fit.
predicted_Y = svm.predict(test_X, w)

# Accuracy = fraction of test rows whose predicted label equals the true
# label, computed with the library metric rather than by hand.
print("Accuracy:")
print(accuracy_score(test_Y, predicted_Y))

Accuracy:
0.7670812221710426


In [None]:
from sklearn.svm import SVC

# Fit scikit-learn's SVC on the same training split so it can be compared
# with the custom SVM above, then print its predictions for the test rows.
clf = SVC(gamma='auto')
clf.fit(train_X, train_Y)
print(clf.predict(test_X))

In [None]:
# Predict labels for the held-out rows with the fitted SVC.
predicted_Y = clf.predict(test_X)
print(predicted_Y)

# Report the fraction of correct predictions so the result can be compared
# directly with the accuracy printed for the custom SVM.
print("Accuracy:")
print(np.mean(predicted_Y == test_Y))

## Fitting the Model

With the data now processed, we can fit an SVM to it.

## Acknowledgements

https://methods.sagepub.com/dataset/howtoguide/support-vector-machine-in-aci-1996-python