In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
# from sklearn import cross_validation
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
warnings.simplefilter(action='ignore')

In [2]:

# Input file containing data
input_file = '../aiwp-data/income_data.txt'

# Read the data, keeping at most `max_datapoints` rows per income class.
# Each kept row is the full list of comma-separated fields, label included.
X = []
count_class1 = 0
count_class2 = 0
max_datapoints = 25000

with open(input_file, 'r') as f:
    # Iterate over the file lazily instead of loading every line up front
    for line in f:
        if count_class1 >= max_datapoints and count_class2 >= max_datapoints:
            break
        # Skip any line containing '?' (presumably the data set's marker for
        # a missing value -- confirm against the data description)
        if '?' in line:
            continue
        # Strip only the newline: slicing with [:-1] would cut the last
        # character of a final line that has no trailing newline and the
        # row would then be silently dropped by the label checks below.
        data = line.rstrip('\n').split(', ')
        if data[-1] == '<=50K' and count_class1 < max_datapoints:
            X.append(data)
            count_class1 += 1
        elif data[-1] == '>50K' and count_class2 < max_datapoints:
            X.append(data)
            count_class2 += 1

# Convert to numpy array (2-D array of strings, one row per datapoint)
X = np.array(X)
if X.size == 0:
    raise ValueError("No usable rows were read from " + input_file)

# Convert string data to numerical data.
# The first row decides the column type: all-digit fields are copied as
# numbers, every other column gets its own LabelEncoder. Encoders are stored
# in column order, so the last one belongs to the label column.
label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        encoder = preprocessing.LabelEncoder()
        X_encoded[:, i] = encoder.fit_transform(X[:, i])
        label_encoder.append(encoder)

# Features are every column except the last; the last column is the target
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

In [3]:
# Inspect the integer-encoded feature matrix (all encoded columns except the last)
X

array([[    39,      5,  77516, ...,      0,     40,     38],
       [    50,      4,  83311, ...,      0,     13,     38],
       [    38,      2, 215646, ...,      0,     40,     38],
       ...,
       [    58,      2, 151910, ...,      0,     40,     38],
       [    22,      2, 201490, ...,      0,     20,     38],
       [    52,      3, 287927, ...,      0,     40,     38]])

In [4]:
# Inspect the integer-encoded target vector (last column of the encoded data)
y

array([0, 0, 0, ..., 0, 0, 1])

In [5]:
# X.shape, y.shape

In [6]:
# Hold out 20% of the samples for testing (this is a single train/test split,
# not cross validation); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

# Create the SVM classifier and fit it once, on the training split only, so
# the held-out samples stay unseen by the model that is evaluated below.
# NOTE(review): the features are passed to LinearSVC unscaled, and the imports
# cell silences all warnings, so a convergence warning would go unnoticed --
# consider standardizing the features; confirm before relying on the scores.
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)

# Predicted labels for the held-out samples
y_test_pred = classifier.predict(X_test)

In [7]:
# Shape of the target vector (already 1-D, so no ravel() is needed)
y.shape

(30162,)

In [8]:
# Weighted F1 score of the SVM classifier, averaged over 3 cross-validation folds
f1 = cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3)
f1_percent = round(100 * f1.mean(), 2)
print("F1 score: " + str(f1_percent) + "%")

F1 score: 56.15%


In [9]:
# Test datapoint to classify, given as raw string values (features only, no label)
input_data = [
    '38', 'Private', '215646', 'HS-grad', '9', 'Divorced',
    'Handlers-cleaners', 'Not-in-family', 'White', 'Male',
    '0', '0', '40', 'United-States',
]
# Placeholder for the encoded datapoint: one -1 slot per feature
input_data_encoded = [-1 for _ in input_data]
input_data_encoded

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]

In [10]:
# Encode the test datapoint: all-digit fields become ints, every other field
# is mapped through the next encoder in `label_encoder` (`count` tracks which
# encoder belongs to the current text field).
# The result is built in a fresh list and `input_data` is never rebound, so
# this cell can be re-run safely.
count = 0
encoded_values = []
for item in input_data:
    if item.isdigit():
        encoded_values.append(int(item))
    else:
        # transform() takes a 1-D sequence of labels: wrap the single label in
        # a list and pick the one result, rather than calling int() on a
        # 1-element array (deprecated in recent NumPy releases).
        encoded_values.append(int(label_encoder[count].transform([item])[0]))
        count += 1
input_data_encoded = np.array(encoded_values)
input_data_encoded, np.shape(input_data_encoded)

(array([    38,      2, 215646,     11,      9,      0,      5,      1,
             4,      1,      0,      0,     40,     38]),
 (14,))

In [11]:
# Turn the 1-D datapoint into a single-row 2-D array (one sample, all features)
input_data_encoded = input_data_encoded.reshape(1, -1)

In [12]:
# Run the classifier on the encoded datapoint; the result is an array of
# encoded class labels (nothing is printed here -- it is decoded in the next cell)
predicted_class = classifier.predict(input_data_encoded)

In [13]:
# Map the predicted integer back to its original string label. The encoders
# were appended in column order, so the last one is the label column's encoder.
label_encoder[-1].inverse_transform(predicted_class)[0]

'<=50K'