Salary binary classifier
========================

Based on dataset from [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/). The goal is predict salary level **<50K** or **>50K** using 14 input variables such as _age, education, sex and home country_

This script doesn't use KerasClassifier but manual implementation which simply round predictions to integer 


In [1]:
import numpy as np
import os.path
import urllib, csv
import pandas 
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from sklearn.preprocessing import LabelEncoder
from keras.layers.normalization import BatchNormalization


# Dataset folder located  at thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
dataset_filename = "adult.data"

eval_percent = 20

np.random.seed(4)

Using TensorFlow backend.
  if 'order' in inspect.getargspec(np.copy)[0]:


In [2]:
# Loading dataset
if not os.path.isfile(dataset_filename):
    print("Downloading dataset from {}".format(dataset_url))
    urllib.request.urlretrieve(dataset_url, dataset_filename)

headers = ["age", "type_employer", "fnlwgt", "education", 
                "education_num","marital", "occupation", "relationship", "race","sex",
                "capital_gain", "capital_loss", "hr_per_week","country", "income"]
df = pandas.read_csv(dataset_filename, header = 0, skipinitialspace = True, skip_blank_lines=True, names = headers)

# Cleanup data
df = df[df.country.notnull()]
df = df[df.income.notnull()]

# Encode labels to numbers using LabelEncoder
mapping = {}
for col in df.select_dtypes(include=['object']).columns.values.tolist():
    mapping[col] = LabelEncoder()
    mapping[col].fit(df[col])
    df[col] = mapping[col].transform(df[col]).astype(np.int32)

# Shuffle it
df = df.reindex(np.random.permutation(df.index))

# Get eval dataset
edf = df[0:int(len(df)*eval_percent/100.0)] 
[Xeval, Yeval] = [edf[edf.columns.difference(['income'])].values, edf['income'].values]

# Get training dataset
df = df[len(edf):]
[X, Y] = [df[df.columns.difference(['income'])].values , df['income'].values]

print("Got shapes for inputs for training {} and evaluation {}".format(X.shape, Xeval.shape))

Got shapes for inputs for training (14072, 14) and evaluation (3518, 14)


In [3]:
# Build the model
model = Sequential()
model.add(Dense(64, input_dim=14, activation='relu', init='normal'))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='mse', optimizer='adamax', metrics=['accuracy'])

# Train it
model.fit(X, Y, batch_size=64, nb_epoch=5)

# Evaluate trained model on another data
metrics = model.evaluate(Xeval, Yeval)
print("")
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loss: 0.15777606581035428
acc: 0.7970437747026587


In [4]:
# Take a sample and look on prediction results
test = edf.sample(100)
Xtest = test[test.columns.difference(['income'])].values
test['income_predicted'] = model.predict(Xtest)
r = np.stack([
        test.income_predicted.values, test.income, 
        np.array(test.income_predicted.values.round() == test.income, dtype="bool")]
    , axis=1)


r = pandas.DataFrame({
        'prediction' : pandas.Series(test.income_predicted.values),
        'expected':  pandas.Series(test.income.values),
        'correct':  pandas.Series(np.array(test.income_predicted.values.round() == test.income, dtype="bool")),
    })

print("Prediction accuracy: {}%".format(r.correct[r.correct == True].count() / r.correct.count() * 100))
print("")
print("Sample predictions: ")
print(r.sample(30))


Prediction accuracy: 83.0%

Sample predictions: 
   correct  expected  prediction
0    False         1    0.185096
78    True         0    0.120879
9     True         0    0.215378
53    True         0    0.225522
90   False         1    0.227264
51    True         0    0.100754
40   False         1    0.176723
93    True         0    0.229553
54    True         0    0.217546
19    True         0    0.179391
58    True         0    0.204256
81    True         0    0.148152
87    True         0    0.231750
96   False         1    0.371935
20    True         0    0.195555
57    True         0    0.093977
23    True         0    0.132624
60    True         0    0.196099
66    True         0    0.163362
39    True         0    0.232309
43    True         0    0.218273
95    True         0    0.210139
12    True         0    0.225714
59    True         0    0.173467
97    True         0    0.201519
84    True         0    0.114512
65    True         0    0.227582
94    True         0    0.2