In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from recomm.classifier import ClassifierNN, ClassifierDNN

In [2]:
init_notebook_mode(connected=True)

In [3]:
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"
]

In [4]:
train_data = pd.read_csv("data/adult_train.csv", 
                         names=CSV_COLUMNS, 
                         skiprows=1)
test_data = pd.read_csv("data/adult_test.csv", 
                        names=CSV_COLUMNS, 
                        skiprows=1)
total_data = train_data.append(test_data)
# to unify the column income_bracket
total_data["income_bracket"] = total_data["income_bracket"].str.replace(".", "")

In [5]:
dummy_feature_data = pd.DataFrame(total_data.copy())
for col in ["workclass", "education", "marital_status", "occupation", "relationship", "race", "gender", "native_country", "income_bracket"]:
    dummy_col = dummy_feature_data[col]
    dummy_feature_data = pd.concat(
        [dummy_feature_data.drop(col, axis=1), pd.get_dummies(dummy_col)],
        axis=1)

In [6]:
dummy_feature_data = dummy_feature_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
train_features = dummy_feature_data.iloc[:16281,:-2].values
train_labels = dummy_feature_data.iloc[:16281,-2:].values
test_features = dummy_feature_data.iloc[16281:,:-2].values
test_labels = dummy_feature_data.iloc[16281:,-2:].values

When there is no hidden layer, the accuracy obtained is around 0.8396.

In [7]:
accuracy = ClassifierNN(train_features, train_labels)\
            .build_network()\
            .set_objective(method="l2_loss")\
            .optimize()\
            .estimate(batch_size=1000, learning_rate=1e-2)\
            .predict(test_features)\
            .activate_label()\
            .get_accuracy(test_labels)\
            .accuracy

In [8]:
accuracy

0.84014127764127766

In [9]:
dnn_features_estimator = [
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[100])\
        .set_objective(method="l2_loss")\
        .optimize()\
        .estimate(batch_size=4000, iter_max=200, learning_rate=1e-1)\
        .estimate(batch_size=4000, iter_max=300,learning_rate=1e-4, init=False),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[1000])\
        .set_objective(method="l2_loss")\
        .optimize()\
        .estimate(batch_size=4000, iter_max=200, learning_rate=1e-1)\
        .estimate(batch_size=4000, iter_max=300,learning_rate=1e-4, init=False),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[100])\
        .set_objective()\
        .optimize()\
        .estimate(batch_size=1000, iter_max=500, learning_rate=1e-4),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[1000])\
        .set_objective()\
        .optimize()\
        .estimate(batch_size=1000, iter_max=500, learning_rate=1e-4),
]

The figure shown below indicates the cases conditioned on the differents obejctives and the number of neutons of a hidden layer. When the number of neuron is 100, the fidelity is better than the one has 1000 neurons. Furthermore, we have to switch the objective to L2 loss from cross entropy, because the minimum of L2 loss provides us the higher accuracy. Otherwise, the fidelity of red line is better than the green line, when the the negative cross entropy of green line is smaller.

In [10]:
data = [go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[0].loss),
                   mode="lines",
                   name="L2 Loss"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[1].loss),
                   mode="lines",
                   name="L2 Loss"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[2].loss),
                   mode="lines",
                   name="Cross Entropy"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[3].loss),
                   mode="lines",
                   name="Cross Entropy"),
       ]
layout = go.Layout(
    title='Compare Results from Cross Entropy and L2 Loss',
    xaxis=dict(
        title='iteration steps',
    ),
    yaxis=dict(
        title='Loss',
        range=[0, 2000]
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [11]:
accuracy = []
for estimator in dnn_features_estimator:
    accuracy.append(estimator\
                    .predict(test_features)\
                    .activate_label()\
                    .get_accuracy(test_labels)\
                    .accuracy)

In [12]:
np.array(accuracy)

array([ 0.83952703,  0.83470516,  0.49179975,  0.79296683])

It is shown the results are independent of number of neurons under the condition we choose. Probably, we are almost able to say it is not proper to pick less than 12 neurons which is tiny worse than others.

In [13]:
accuracy = []
neuron_num = np.arange(1, 100, 1)
for neurons in neuron_num:
    accuracy.append(
        ClassifierDNN(train_features, train_labels)\
            .build_network(hidden_layers=[neurons])\
            .set_objective(method="l2_loss")\
            .optimize()\
            .estimate(batch_size=4000, iter_max=200, learning_rate=1e-1)\
            .estimate(batch_size=4000, iter_max=300,learning_rate=1e-4, init=False)\
            .predict(test_features)\
            .activate_label()\
            .get_accuracy(test_labels)\
            .accuracy
    )

In [14]:
data = [go.Scatter(x=neuron_num,
                   y=accuracy,
                   mode="lines",
                   name="L2 Loss")
       ]
layout = go.Layout(
    title='Batch Size: 4000, Learning Rate: 1e-1 -> 1e-4',
    xaxis=dict(
        title='Number of Neurons',
    ),
    yaxis=dict(
        title='L2 Loss',
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [15]:
x = []
y = []
accuracy = []
layer_num = np.arange(5, 20, 1)
for neurons_0 in layer_num:
    for neurons_1 in layer_num:
        x.append(neurons_0)
        y.append(neurons_1)
        accuracy.append(
            ClassifierDNN(train_features, train_labels)\
                .build_network(hidden_layers=[neurons_0, neurons_1])\
                .set_objective(method="l2_loss")\
                .optimize()\
                .estimate(batch_size=4000, iter_max=200, learning_rate=1e-1)\
                .estimate(batch_size=4000, iter_max=300,learning_rate=1e-4, init=False)\
                .predict(test_features)\
                .activate_label()\
                .get_accuracy(test_labels)\
                .accuracy
        )

Whem there are double layers, the accracy is slightly worse. Moreover, it collapse once the number of neurons grow up.

In [28]:
data = [
    go.Mesh3d(x=x,
              y=y,
              z=np.array(accuracy),
              colorbar = go.ColorBar(
                    title='z'
                ),
              colorscale = [['0', 'rgb(255, 0, 0)'], ['0.5', 'rgb(0, 255, 0)'], ['1', 'rgb(0, 0, 255)']],
              showscale = True)
]
layout = go.Layout(
                    scene = dict(
                        xaxis = dict(
                            title='1st Layer'),
                        yaxis = dict(
                            title='2nd Layer'),
                        zaxis = dict(
                            title='Fidelity'),),
                  )
fig = go.Figure(data=data, layout=layout)
iplot(fig)