# Support Vector Machines

## Introduction

## Summary of Results

---

## Setup 

First, import the libraries used throughout the notebook.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

## Load and Clean Data

Before we can build any models, we need to import the data and clean it by converting types as necessary.

In [None]:
# The file is read without a header row: these names are assigned to its
# columns in order.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "earning_label",
]

# skipinitialspace tells pandas to skip the spaces that follow each delimiter,
# so string values are loaded without leading blanks.
df = pd.read_csv("Adult/adult.data", names=column_names, skipinitialspace=True)

df

In [None]:
# Quick overview of the loaded frame: column names, distinct label values,
# and (rows, columns).
overview = (
    ("Features: ", df.columns),
    ("Labels: ", df["earning_label"].unique()),
    ("Shape: ", df.shape),
)
for caption, value in overview:
    print(caption, value)

In [None]:
# native_country has many categories and most rows (reportedly about 91%) are
# from the US, so every value other than "United-States" is folded into "Other".
from_us = df['native_country'] == 'United-States'
df.loc[~from_us, 'native_country'] = 'Other'

# Encode the target as -1 / +1: categorical code 0 is remapped to -1 and the
# other code is kept unchanged.
# NOTE(review): presumably code 0 is "<=50K" and code 1 is ">50K" — confirm
# against the labels printed in the exploration cell.
label_codes = pd.Categorical(df['earning_label']).codes
Y = np.where(label_codes == 0, -1, label_codes)
print(Y)

# 'education' is dropped because 'education_num' carries the same information;
# the label column is dropped so it does not remain among the features.
df = df.drop(columns=['education', 'earning_label'])

# List the distinct values of each categorical feature.
categorical_features = [
    'workclass', 'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'native_country',
]
for name in categorical_features:
    print(name, df[name].unique())

df

In [None]:
# Convert categorical features to one-hot encoding.
# The dummy dtype is pinned explicitly: pandas 2.0 changed the default from
# uint8 to bool, and a frame that mixes integer and bool columns turns
# `df.values` into an object array, which the numeric code in the later cells
# is not written for. uint8 reproduces the pre-2.0 behaviour on every version.
df = pd.get_dummies(df, dtype=np.uint8)

# Show a preview only; the encoded frame is wide.
df.head()

In [None]:
# Finally, split the data into training and testing sets (80/20).
# A fixed random_state keeps the split, and every accuracy reported from it,
# identical across kernel restarts.
SPLIT_SEED = 42

# All encoded columns are used as features.
partialData = df.values
train_X, test_X, train_Y, test_Y = train_test_split(
    partialData, Y, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
# Balance the training set by undersampling: keep every +1 row and an equally
# sized random sample of the -1 rows. The sampler is seeded so that the same
# rows are selected on every run.
BALANCE_SEED = 42
balance_rng = np.random.RandomState(BALANCE_SEED)

positive_idx = np.where(train_Y == 1)[0]
negative_idx = balance_rng.permutation(np.where(train_Y == -1)[0])

# Sampled negatives first, then all positives. If there are fewer negatives
# than positives the slice simply keeps all of them.
index = np.hstack((negative_idx[:positive_idx.shape[0]], positive_idx))
train_Y = train_Y[index]
train_X = train_X[index]

# Summary instead of raw array dumps: resulting shapes and per-class counts.
print(train_X.shape, train_Y.shape)
print(np.where(train_Y == -1)[0].shape[0])
print(np.where(train_Y == 1)[0].shape[0])

In [None]:
# Print the training feature matrix as it stands after the previous cell
# re-indexed it.
print(train_X)

In [None]:
# Scatter-plot every pair among the first six columns of the encoded frame,
# colouring each point by its entry in Y. The index pair and the matching
# column names are printed above each figure.
X = df.values
n_plotted = 6
column_pairs = [
    (a, b) for a in range(n_plotted) for b in range(a + 1, n_plotted)
]
for a, b in column_pairs:
    print((a, b))
    print((df.columns[a], df.columns[b]))
    plt.scatter(X[:, a], X[:, b], c=Y)
    plt.show()
        

## Create the model

In [None]:
# Fit the SVM class imported from the top-level `svm` module (not sklearn's;
# presumably a module local to this project — confirm).
from svm import SVM
# NOTE(review): the meaning of the two positional arguments (10, .1) is defined
# by that module, which is not visible here — confirm.
# NOTE(review): this assignment rebinds `svm`, the name the setup cell bound
# with `from sklearn import svm`; from here on `svm` is this instance.
svm = SVM(10, .1)
# The value returned by fit is kept in `w`; the following cells use it as a
# weight vector whose last entry multiplies an appended column of ones.
w = svm.fit(train_X, train_Y)
print(w)

In [None]:
# Score the test set with the learned linear model: append a constant 1 to
# every row so that the last entry of w acts as the bias term, then take the
# row-wise dot product with w.
_X = np.hstack((test_X, np.ones((test_X.shape[0], 1))))
p = np.sum(_X * w, axis=1)
print(p)

# Classify by the sign of the score. A comparison is used rather than
# p / abs(p) because the division evaluates to NaN (0/0) for a row whose
# score is exactly zero; such rows are assigned to the +1 class here.
predicted_Y = np.where(p >= 0, 1.0, -1.0)
print(predicted_Y)

# Accuracy is the fraction of test rows whose prediction equals the label.
print("Accuracy:")
print(np.mean(predicted_Y == test_Y))

In [None]:
# Draw the learned separating line in the plane of two chosen feature columns,
# on top of the training points coloured by label.
X = train_X
y = train_Y
print(y)

# Column on the horizontal axis and column on the vertical axis.
x_col = 2
y_col = 3

# On the boundary, with every other feature held at zero:
#   w[x_col] * x + w[y_col] * y + w[-1] = 0
# hence y = (w[x_col] * x + w[-1]) / (-w[y_col]).
x_weight = w[x_col]
neg_y_weight = -w[y_col]
print(x_weight)
print(neg_y_weight)
intercept = w[len(w) - 1]

# Evaluate the line at the smallest and largest observed x values.
x_ends = np.array([np.min(X[:, x_col]), np.max(X[:, x_col])])
y_ends = (x_ends * x_weight + intercept) / neg_y_weight
result = np.column_stack((x_ends, y_ends))
print(result)

plt.plot(result[:, 0], result[:, 1], scaley=True)
plt.scatter(X[:, x_col], X[:, y_col], c=y)
plt.show()

In [None]:
from sklearn.svm import SVC

# Fit scikit-learn's SVC on the same training arrays used for the custom
# model. fit() returns the estimator itself, so construction and training are
# chained into one expression.
clf = SVC(gamma='auto').fit(train_X, train_Y)

print(df.values)

In [None]:
# Predict test-set labels with the fitted scikit-learn classifier. This
# rebinds predicted_Y, which an earlier cell set from the custom model.
predicted_Y = clf.predict(test_X)
print(predicted_Y)

In [None]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = np.mean(predicted_Y == test_Y)
print("Accuracy:")
print(accuracy)

## Fitting the Model

With the data now processed, it is ready for an SVM to be fitted.

## Acknowledgements

https://methods.sagepub.com/dataset/howtoguide/support-vector-machine-in-aci-1996-python