# Support Vector Machines

## Introduction

## Summary of Results

---

## Setup 

First, we import the libraries used throughout the notebook.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
# from sklearn_pandas import CategoricalImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

## Load and Clean Data

Before we can build any models, we need to import the data and clean it by converting types as necessary.

In [None]:
# Column names are supplied explicitly; with `names=` and no `header=` argument
# pandas treats the first row of the file as data rather than as a header.
ADULT_COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "earning_label",
]

# skipinitialspace drops the blanks that follow each delimiter, so string
# values are read without a leading space.
df = pd.read_csv(
    "Adult/adult.data",
    names=ADULT_COLUMNS,
    skipinitialspace=True,
)

# Show the loaded frame as the cell output.
df

In [None]:
# Quick look at the frame: its column names, the distinct label values,
# and its (rows, columns) shape.
print("Features: ", df.columns)
print("Labels: ", df.earning_label.unique())
print("Shape: ", df.shape)

In [None]:
# from itertools import combinations

# labels = combinations(df.columns, 2)
# indices = combinations(range(len(df.columns)), 2)
# for (x_label, y_label), (x1, x2) in zip(labels, indices):
#     for i, target_name in enumerate(pd.Series.unique(df.earning_label)):
#         income = df[ df.earning_label == i ]
#         plt.scatter(income[:,x1], income[:,x2], label=target_name, alpha=0.7)
#     plt.xlabel(x_label)
#     plt.ylabel(y_label)
#     plt.legend(loc='upper left')
#     plt.show()

# Summary statistics for the numeric columns.
print(df.describe())

# Plot only a small random sample so the scatter stays readable.
n = 100  # number of rows to sample (without replacement)
index = np.random.choice(df.shape[0], n, replace=False)
random = df.iloc[index, :]

# Draw one series per income label so the legend separates the classes.
# The labels are strings, so rows are selected by label value (not by the
# enumeration index), and only the selected rows are plotted.
for target_name in pd.Series.unique(df.earning_label):
    income = random[random.earning_label == target_name]
    plt.scatter(income.capital_loss, income.capital_gain, label=target_name, alpha=0.7)
# Axis labels name the columns that are actually plotted.
plt.xlabel("capital_loss")
plt.ylabel("capital_gain")
plt.legend(loc='upper left')
plt.show()

In [None]:
# native_country has many categories and most rows are "United-States"
# (91% according to the original note), so every other value is folded into
# a single "Other" category.
is_us = df['native_country'] == 'United-States'
df.loc[~is_us, 'native_country'] = 'Other'

# Encode the output label as -1 / +1. pd.Categorical assigns codes 0 and 1
# in sorted category order; code 0 is then remapped to -1.
# (presumably -1 means "<=50K" and +1 means ">50K" -- confirm against the data)
label_codes = pd.Categorical(df['earning_label']).codes
Y = np.where(label_codes == 0, -1, label_codes)
print(Y)

# `education` duplicates the information in `education_num`, and the label
# must not remain among the model inputs, so both columns are dropped.
df = df.drop(columns=['education', 'earning_label'])

# Standardise the numerical features.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook.
col_names = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
scaler = StandardScaler().fit(df[col_names].values)
features = scaler.transform(df[col_names].values)
df[col_names] = features

df

In [None]:
# Determine the unique values of each categorical feature.
col_names = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for feature in col_names:
    print(feature, pd.Series.unique(df[feature]))

# Impute missing values with the most frequent category of each column.
# The data file marks unknown entries with the string "?" (it shows up in the
# listing printed above), not with NaN. SimpleImputer only replaces NaN by
# default, so the marker has to be passed explicitly or nothing is imputed.
features = df[col_names]
imp = SimpleImputer(missing_values='?', strategy='most_frequent').fit(features.values)
features = imp.transform(features.values)
df[col_names] = features

# Convert categorical features to one-hot encoding.
# dtype=float keeps the indicator columns numeric: recent pandas versions
# default to bool indicators, which would turn `df.values` into an object
# array once mixed with the float feature columns.
df = pd.get_dummies(df, dtype=float)
df

In [None]:
# Finally, split the data into training and testing sets 80/20
#partialData = df.values[:, [0, 2]] #Enable To only fit with two features
partialData = df.to_numpy()
# No random_state is given, so the split differs from run to run.
split = train_test_split(partialData, Y, test_size=0.2)
train_X, test_X, train_Y, test_Y = split

In [None]:
# Balance the training set: keep every +1 (>50k) row and an equally sized
# random subset of the -1 (<=50k) rows.
positive_rows = np.flatnonzero(train_Y == 1)

# Shuffle the -1 row positions so that the rows kept are a random subset.
negative_rows = np.random.permutation(np.flatnonzero(train_Y == -1))

# Row positions to keep: the first len(positive_rows) shuffled negatives,
# followed by all positives.
index = np.concatenate((negative_rows[:positive_rows.shape[0]], positive_rows))
train_X = train_X[index]
train_Y = train_Y[index]


In [None]:
# Scatter every pair of the first six columns against each other, coloured by
# label, to look for a pair that separates the classes well.
X = df.values
n_basic = 6  # only the first six columns are compared
for i in range(n_basic):
    for j in range(i + 1, n_basic):
        # Print the column indices and names so each figure can be identified.
        print((i, j))
        print((df.columns[i], df.columns[j]))
        plt.scatter(X[:, i], X[:, j], c=Y)
        plt.show()
        

## Fitting the Model

With the data now processed, we can fit an SVM to it.

In [None]:
#Fit with linear SVM
# `SVM` is imported from a top-level module named `svm`
# (presumably a project-local file, not scikit-learn -- confirm).
from svm import SVM
# NOTE(review): this rebinds the name `svm`, hiding the `from sklearn import svm`
# import made in the setup cell.
# The two constructor arguments are presumably an iteration count and a
# learning rate -- confirm against the SVM class.
svm = SVM(50, .001)
# The value returned by fit is kept as `w`; later cells pass it to
# svm.predict and index into it when plotting.
w = svm.fit(train_X, train_Y)
print(w)

In [None]:
# Score the hand-written SVM on the held-out data.
from sklearn.metrics import confusion_matrix

# predict takes the fitted value `w` in addition to the samples.
predicted_Y = svm.predict(test_X, w)

print("Accuracy:")
count = test_Y.shape[0]
# A prediction is correct when its difference from the true label is zero.
n_correct = (predicted_Y - test_Y == 0).sum()
print(n_correct / count)

cm = confusion_matrix(test_Y, predicted_Y)
print("Confusion Matrix")
print(cm)

In [None]:
# Draw the separating line of the hand-written SVM over a scatter of two features.
import matplotlib.pyplot as plt

X = partialData
y = Y

# Columns used for the horizontal and vertical axes.
independant_index = 0
dependant_index = 1

independant = X[:, independant_index]

# Solve w_x * x + w_y * y + b = 0 for y, i.e. y = (w_x * x + b) / (-w_y).
# NOTE(review): this treats the last entry of `w` as the bias term -- confirm
# against the SVM implementation.
multBy = w[independant_index]
divBy = -w[dependant_index]
addBy = 0 + w[-1]

# Extend the line a little beyond the observed range of the x feature.
i1 = np.min(independant) - 5
i2 = np.max(independant) + 5

# Two (x, y) end points of the line.
result = np.array([[x0, (x0 * multBy + addBy) / divBy] for x0 in (i1, i2)])

fig = plt.gcf()
fig.set_size_inches(10, 10)

plt.scatter(X[:, independant_index], X[:, dependant_index], c=y)
# scalex/scaley=False: the line must not change the axis limits set by the scatter.
plt.plot(result[:, 0], result[:, 1], scaley=False, scalex=False)
# NOTE(review): these labels match the data only when partialData holds just
# the two columns from the commented-out `[0, 2]` selection; with the full
# feature matrix column 1 is presumably fnlwgt -- confirm.
plt.xlabel("Age", size=24)
plt.ylabel("Education", size=24)
plt.show()

In [None]:
#Fit with sklearn's SVM implementation
from sklearn.svm import SVC
# Polynomial kernel; probability=True is needed because a later cell calls
# clf.predict_proba to draw the ROC curve.
clf = SVC(kernel='poly', probability=True)
# Train on the balanced training set; as the last expression of the cell,
# the fitted estimator is also shown as the cell output.
clf.fit(train_X, train_Y)

In [None]:
# Predict the test data
# Overwrites the `predicted_Y` produced earlier by the hand-written SVM.
predicted_Y = clf.predict(test_X)
print(predicted_Y)

In [None]:
# Report accuracy and the confusion matrix for sklearn's SVC on the test set.
print("Accuracy:")
# Fraction of test rows whose prediction equals the true label.
print(np.mean(predicted_Y - test_Y == 0))

# `confusion_matrix` was imported in the earlier statistics cell.
cm = confusion_matrix(test_Y, predicted_Y)
print("Confusion Matrix")
print(cm)

In [None]:
# Plot the ROC curve of sklearn's SVC on the test set.
from sklearn.metrics import auc, roc_curve

import matplotlib.pyplot as plt

# The second column of predict_proba is used as the score passed to roc_curve.
positive_scores = clf.predict_proba(test_X)[:, 1]
fpr, tpr, threshold = roc_curve(test_Y, positive_scores)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Plot the decision regions of sklearn's fitted SVC.
from mlxtend.plotting import plot_decision_regions

# NOTE(review): plot_decision_regions presumably needs a two-feature X (or
# explicit filler values for the other features); this cell appears to be
# written for the commented-out two-column `partialData` -- confirm.
plot_decision_regions(X=partialData, y=Y, clf=clf, legend=None)

# NOTE(review): labels assume the two plotted columns are age and education_num.
plt.xlabel("Age", size=14)
plt.ylabel("Education", size=14)

# Repeat the test-set accuracy next to the figure.
print("Accuracy:")
n_correct = (predicted_Y - test_Y == 0).sum()
print(n_correct / test_Y.shape[0])

## Acknowledgements

https://methods.sagepub.com/dataset/howtoguide/support-vector-machine-in-aci-1996-python