In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from recomm.classifier import ClassifierNN, ClassifierDNN

In [2]:
# Set up plotly's offline notebook mode before any figure is shown with
# `iplot` further down.
# NOTE(review): `connected=True` presumably makes the rendered output load
# plotly.js from a CDN instead of embedding it — confirm against the plotly docs.
init_notebook_mode(connected=True)

In [3]:
# Column names assigned to the CSV files when they are read (handed to
# `pd.read_csv` through `names=`). The names are applied positionally, so the
# order below must match the column order in the files.
CSV_COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

In [4]:
def _read_adult_csv(path):
    """Read one of the adult CSV files into a DataFrame.

    The first line of the file is skipped (`skiprows=1`) and the columns are
    named from CSV_COLUMNS instead of from the file's own contents.
    """
    return pd.read_csv(path, names=CSV_COLUMNS, skiprows=1)


train_data = _read_adult_csv("data/adult_train.csv")
test_data = _read_adult_csv("data/adult_test.csv")

# Stack the training rows on top of the test rows. `pd.concat` is used because
# `DataFrame.append` was deprecated and later removed from pandas. The original
# row labels are kept (no `ignore_index`), exactly as `append` kept them.
total_data = pd.concat([train_data, test_data])

# Unify the income_bracket labels by removing every "." character.
# `regex=False` forces "." to be treated as a literal dot: the default value of
# `regex` differs between pandas versions, and as a regular expression "."
# would match any character.
total_data["income_bracket"] = total_data["income_bracket"].str.replace(
    ".", "", regex=False
)

In [5]:
# Columns that get replaced by one indicator (dummy) column per distinct value.
# "income_bracket" is deliberately last: its indicator columns therefore end up
# as the right-most columns of the encoded frame.
CATEGORICAL_COLUMNS = [
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "gender", "native_country", "income_bracket",
]

# Build every piece first and concatenate once, instead of rebuilding the whole
# frame on each loop iteration. The resulting column order is unchanged: the
# untouched columns first, then the indicator columns of each categorical
# column in the order listed above.
#
# `dtype=np.uint8` pins the indicator columns to a numeric type. Without it the
# dtype depends on the pandas version (boolean in pandas >= 2.0), and boolean
# columns cannot be used in arithmetic such as subtraction.
#
# NOTE(review): the indicator columns carry no prefix, so a value that occurs
# in more than one categorical column produces duplicate column names —
# confirm this is acceptable for downstream use.
encoded_parts = [total_data.drop(columns=CATEGORICAL_COLUMNS)]
encoded_parts.extend(
    pd.get_dummies(total_data[name], dtype=np.uint8)
    for name in CATEGORICAL_COLUMNS
)
dummy_feature_data = pd.concat(encoded_parts, axis=1)

In [6]:
def _min_max_scale(column):
    """Rescale one column linearly so its minimum maps to 0 and its maximum to 1.

    A constant column (maximum equal to minimum) is mapped to all zeros
    instead of being divided by zero, which would fill it with NaN.
    """
    lowest = column.min()
    span = column.max() - lowest
    shifted = column - lowest
    return shifted / span if span != 0 else shifted


# Cast to float before scaling so that indicator columns of any numeric or
# boolean dtype can be subtracted and divided.
dummy_feature_data = dummy_feature_data.astype(float).apply(_min_max_scale)

# The label is the block of right-most columns: "income_bracket" is the last
# column to be one-hot encoded, so its indicator columns sit at the far right.
# NOTE(review): assumes income_bracket has exactly two distinct values after
# the "." clean-up — confirm.
N_LABEL_COLUMNS = 2

# `total_data` was built as the training rows followed by the test rows, so the
# train/test boundary is the number of training rows. Deriving it from
# `train_data` keeps the split correct whatever the size of the CSV files,
# rather than relying on a hard-coded row count.
n_train_rows = len(train_data)

train_features = dummy_feature_data.iloc[:n_train_rows, :-N_LABEL_COLUMNS].values
train_labels = dummy_feature_data.iloc[:n_train_rows, -N_LABEL_COLUMNS:].values
test_features = dummy_feature_data.iloc[n_train_rows:, :-N_LABEL_COLUMNS].values
test_labels = dummy_feature_data.iloc[n_train_rows:, -N_LABEL_COLUMNS:].values

In [59]:
def _fit_dnn(hidden_layers, objective_options, estimate_stages):
    """Build a ClassifierDNN on the training split and run its estimation stages.

    hidden_layers     -- forwarded to `build_network(hidden_layers=...)`.
    objective_options -- keyword arguments forwarded to `set_objective`
                         (an empty dict calls it without arguments).
    estimate_stages   -- sequence of keyword-argument dicts; `estimate` is
                         called once per dict, in order, each call made on the
                         object returned by the previous call.

    Returns whatever the last `estimate` call returns.
    """
    model = ClassifierDNN(train_features, train_labels)
    model = model.build_network(hidden_layers=hidden_layers)
    model = model.set_objective(**objective_options)
    model = model.optimize()
    for stage in estimate_stages:
        model = model.estimate(**stage)
    return model


# Objective configurations. The empty one leaves `set_objective` on its default.
# NOTE(review): the default objective is presumably cross entropy, going by the
# legend used when the losses are plotted — confirm in ClassifierDNN.
_L2_OBJECTIVE = {"method": "l2_loss"}
_DEFAULT_OBJECTIVE = {}

# Two consecutive `estimate` calls: a first pass with the larger learning rate,
# then a second pass with the smaller one and `init=False`.
_TWO_STAGE_SCHEDULE = [
    dict(batch_size=4000, iter_max=200, learning_rate=1e-1),
    dict(batch_size=4000, iter_max=300, learning_rate=1e-4, init=False),
]
# A single `estimate` call.
_SINGLE_STAGE_SCHEDULE = [
    dict(batch_size=1000, iter_max=500, learning_rate=1e-4),
]

# Order matters: later cells address these estimators by position.
dnn_features_estimator = [
    _fit_dnn([100], _L2_OBJECTIVE, _TWO_STAGE_SCHEDULE),
    _fit_dnn([1000], _L2_OBJECTIVE, _TWO_STAGE_SCHEDULE),
    _fit_dnn([100], _DEFAULT_OBJECTIVE, _SINGLE_STAGE_SCHEDULE),
    _fit_dnn([1000], _DEFAULT_OBJECTIVE, _SINGLE_STAGE_SCHEDULE),
]

In [64]:
# One legend label per estimator, in the same order as `dnn_features_estimator`.
# Each label names both the objective and the hidden-layer size so that the
# four curves can be told apart in the legend.
trace_labels = [
    "L2 Loss, hidden_layers=[100]",
    "L2 Loss, hidden_layers=[1000]",
    "Cross Entropy, hidden_layers=[100]",
    "Cross Entropy, hidden_layers=[1000]",
]

data = []
for trace_label, fitted in zip(trace_labels, dnn_features_estimator):
    loss_curve = np.array(fitted.loss)
    data.append(
        go.Scatter(
            # The x axis is derived from the recorded loss itself, so it always
            # matches the number of recorded values.
            x=np.arange(len(loss_curve)),
            y=loss_curve,
            mode="lines",
            name=trace_label,
        )
    )

# The title and y-axis label are kept generic because the figure mixes two
# objectives that were trained with different batch sizes and learning rates.
layout = go.Layout(
    title='Training loss by objective and hidden-layer size',
    xaxis=dict(
        title='iteration steps',
    ),
    yaxis=dict(
        title='loss'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [61]:
# Score every trained estimator on the held-out test split; `accuracy` ends up
# with one entry per estimator, in the order of `dnn_features_estimator`.
accuracy = []
for estimator in dnn_features_estimator:
    # Each step is called on the object returned by the previous step.
    predicted = estimator.predict(test_features)
    labelled = predicted.activate_label()
    scored = labelled.get_accuracy(test_labels)
    accuracy.append(scored.accuracy)

In [63]:
# Display the collected accuracies as an array, one entry per estimator.
# NOTE(review): the saved output below shows three values although
# `dnn_features_estimator` holds four estimators, and the cell execution counts
# are out of order — the output looks stale; re-run the notebook top to bottom.
np.array(accuracy)

array([ 0.83906634,  0.76968673,  0.7277027 ])