# Support Vector Machines

## Introduction

## Summary of Results

---

## Setup 

First, we import the libraries used throughout this notebook.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
# from sklearn_pandas import CategoricalImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

## Load and Clean Data

Before we can build any models, we need to import the data and clean it by converting types as necessary.

In [None]:
# Column names for the Adult data file, supplied explicitly via `names`
column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "earning_label",
]

# skipinitialspace=True drops the space that follows each comma delimiter,
# so string values are read without leading whitespace.
df = pd.read_csv("Adult/adult.data", names=column_names, skipinitialspace=True)

df

In [None]:
# Quick look at the dataset: column names, distinct target labels, and dimensions
print("Features: ", df.columns)
print("Labels: ", df["earning_label"].unique())
print("Shape: ", df.shape)

In [None]:
# NOTE: this whole cell is disabled (commented out). It sketches pairwise
# scatter plots of features grouped by target label, but it uses names
# (`labels`, `data`, `plot`, `dataset`) that are not bound in the cells shown
# here, so it would need adapting before it could be re-enabled.

# target_names = pd.Series.unique(df.earning_label)
# feature_names = df.columns

# ranges = []
# type_count = 0
# for i in range(0, len(target_names) - 1):
#     additional = np.count_nonzero(labels == i)
#     ranges.append((i, type_count, additional + type_count))
#     type_count += additional
    
# for i in range(0, 4)[0:4]:
#     for j in range(i + 1, 4):
#         for target_type, start, end in ranges:
#             plot.scatter(data[start:end, i], data[start:end ,j], label=target_names[target_type])
#         plot.xlabel(dataset.feature_names[i])
#         plot.ylabel(dataset.feature_names[j])
#         plot.legend()
#         plot.show()

In [None]:
# native_country has too many categories and most rows (91%) are from the US,
# so every value other than "United-States" is collapsed into a single
# "Other" category.
not_us = df['native_country'] != 'United-States'
df.loc[not_us, 'native_country'] = 'Other'

# Encode the target as -1 (<=50K) / 1 (>50K): the categorical codes are 0/1,
# and code 0 is remapped to -1.
label_codes = pd.Categorical(df['earning_label']).codes
Y = np.where(label_codes == 0, -1, label_codes)
print(Y)

# education is dropped because education_num carries the same information;
# earning_label is dropped because it is the target (now held in Y), not a feature.
df = df.drop(columns=['education', 'earning_label'])

# Standardize the numerical features.
# NOTE: the scaler is fitted on the full dataset, i.e. before any train/test split.
col_names = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
scaler = StandardScaler()
features = scaler.fit_transform(df[col_names].values)
df[col_names] = features

df

In [None]:
# Show the distinct values of each categorical feature
col_names = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for feature in col_names:
    print(feature, df[feature].unique())

# Impute missing values with each column's most frequent value.
# The Adult data marks missing entries with the string "?" rather than NaN,
# so the imputer is told to treat "?" as the missing marker; with the default
# (NaN) it would find nothing to impute and leave "?" as its own category.
features = df[col_names]
imp = SimpleImputer(missing_values='?', strategy='most_frequent').fit(features.values)
features = imp.transform(features.values)
df[col_names] = features

# Convert categorical features to one-hot encoding.
# dtype=float keeps every column numeric, so df.values is a float array
# regardless of the pandas version's default dummy dtype.
df = pd.get_dummies(df, dtype=float)
df

In [None]:
# Finally, split the data into training and testing sets (80/20).
# random_state pins the shuffle so the reported accuracies are reproducible
# across runs; stratify=Y keeps the class ratio the same in both splits.
train_X, test_X, train_Y, test_Y = train_test_split(
    df.values, Y, test_size=0.2, random_state=42, stratify=Y
)

## Fitting the Model

With the data now processed, we can fit an SVM to it.

In [None]:
# Train the SVM class imported from the `svm` module (not scikit-learn's SVC).
# NOTE: assigning to `svm` rebinds the name that the setup cell imported with
# `from sklearn import svm`; after this cell `svm` refers to the SVM instance.
# NOTE(review): the meaning of the constructor arguments (10, .001) is not
# visible here — confirm against the SVM class definition.
from svm import SVM
svm = SVM(10, .001)
# fit() returns `w`, which must be passed back to predict() later
w = svm.fit(train_X, train_Y)
print(w)

In [None]:
from sklearn.metrics import accuracy_score

# Predict test-set labels with the custom SVM, using the `w` returned by fit().
# NOTE(review): assumes predict() returns one label per test row, drawn from
# the same {-1, 1} encoding as test_Y — confirm against the SVM class.
predicted_Y = svm.predict(test_X, w)

# Use the library metric rather than hand-rolled array arithmetic: it
# validates that both label arrays have matching lengths instead of silently
# broadcasting mismatched shapes.
print("Accuracy:")
print(accuracy_score(test_Y, predicted_Y))

In [None]:
from sklearn.svm import SVC

# Fit scikit-learn's SVC on the same training split, for comparison with the
# custom SVM. (A stray `SVC(gamma='auto')` line — pasted cell output that
# built and discarded an unused estimator — has been removed.)
clf = SVC(gamma='auto')
clf.fit(train_X, train_Y)
print(clf.predict(test_X))

print(df.values)

In [None]:
# Predict test-set labels with the scikit-learn SVC; this overwrites the
# predicted_Y produced earlier by the custom SVM.
predicted_Y = clf.predict(test_X)
print(predicted_Y)

In [None]:
# Accuracy = fraction of test rows whose predicted label equals the true label
accuracy = np.mean(predicted_Y == test_Y)
print("Accuracy:")
print(accuracy)

## Acknowledgements

https://methods.sagepub.com/dataset/howtoguide/support-vector-machine-in-aci-1996-python