In [13]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from recomm.classifier import ClassifierNN, ClassifierDNN
from sklearn.decomposition import PCA

In [2]:
# Switch plotly into offline notebook mode so the later `iplot(fig)` calls can
# render their figures inline.
# NOTE(review): `connected=True` presumably loads plotly.js from the CDN
# instead of embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

In [3]:
# Column names assigned to the adult_*.csv files, in positional order.
# They are handed to `pd.read_csv(..., names=CSV_COLUMNS)` when loading.
CSV_COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

In [4]:
# Load both CSV files with the shared column names. `skiprows=1` discards the
# first line of each file.
# NOTE(review): confirm that both files really start with a non-data line;
# if one of them has no header, a genuine record is silently dropped here.
train_data = pd.read_csv("data/adult_train.csv",
                         names=CSV_COLUMNS,
                         skiprows=1)
test_data = pd.read_csv("data/adult_test.csv",
                        names=CSV_COLUMNS,
                        skiprows=1)

# Stack the two frames (train rows first), keeping each frame's own row index,
# exactly as `train_data.append(test_data)` did. `DataFrame.append` was
# deprecated and then removed in pandas 2.0, so `pd.concat` is used instead.
total_data = pd.concat([train_data, test_data])

# to unify the column income_bracket: remove every literal "." from the label
# strings. `regex=False` makes the pattern an explicit literal so the result
# does not depend on the pandas version's default for `regex`.
total_data["income_bracket"] = total_data["income_bracket"].str.replace(
    ".", "", regex=False)

In [5]:
# One-hot encode every categorical column of `total_data`.
#
# Resulting column order: the untouched numeric columns first (in their
# original order), followed by the indicator columns of each categorical
# column in the order listed below. "income_bracket" is deliberately last so
# that its indicator columns end up as the final columns of the frame.
categorical_columns = [
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "gender", "native_country", "income_bracket",
]

numeric_part = total_data.drop(categorical_columns, axis=1)

# `dtype=np.uint8` pins the historical default of `get_dummies`. Newer pandas
# returns boolean indicators, and the min-max scaling applied later
# (`x - x.min()`) is not defined for boolean columns.
indicator_parts = [
    pd.get_dummies(total_data[col], dtype=np.uint8)
    for col in categorical_columns
]

# Indicator columns are named after the raw category values (no prefix), so
# values shared by several source columns yield duplicate column labels; only
# positional (`iloc`) access is reliable on the result.
# A single concat builds a fresh frame and avoids re-copying it once per column.
dummy_feature_data = pd.concat([numeric_part] + indicator_parts, axis=1)

In [6]:
# Preview the first rows of the one-hot encoded frame.
dummy_feature_data.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,?,Federal-gov,Local-gov,Never-worked,...,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,<=50K,>50K
0,25,226802,7,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,38,89814,9,0,0,50,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,28,336951,12,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
3,44,160323,10,7688,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,18,103497,10,0,0,30,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [7]:
# Min-max rescale every column to [0, 1]. The minimum and maximum are taken
# over all rows of the combined frame, i.e. over train and test rows together.
dummy_feature_data = dummy_feature_data.apply(
    lambda column: (column - column.min()) / (column.max() - column.min())
)

# Rows before `split_row` form the training set, the remaining rows the test set.
# NOTE(review): 16281 is hard-coded; confirm it equals len(train_data).
split_row = 16281

# The last two columns are the income-bracket indicators (labels); everything
# before them is a feature.
feature_frame = dummy_feature_data.iloc[:, :-2]
label_frame = dummy_feature_data.iloc[:, -2:]

train_features = feature_frame.iloc[:split_row].values
train_labels = label_frame.iloc[:split_row].values
test_features = feature_frame.iloc[split_row:].values
test_labels = label_frame.iloc[split_row:].values

In [8]:
# Sanity check: (number of training rows, number of feature columns).
train_features.shape

(16281, 108)

In [9]:
# Build, train and evaluate the network on the raw (min-max scaled) features
# in one fluent chain. The final object is kept because later cells read its
# `.loss` and `.accuracy` attributes.
# NOTE(review): the meaning of each step is defined in recomm.classifier and
# is not visible here; the chain presumably returns the classifier at every
# step — confirm.
classifier = (
    ClassifierNN(train_features, train_labels)
    .build_network(activate=True)
    .set_objective(method="l2_loss")
    .optimize()
    .estimate(batch_size=1000, learning_rate=1e-2, iter_max=3000)
    .predict(test_features)
    .activate_label()
    .get_accuracy(test_labels)
)

In [12]:
# Plot the natural log of the recorded training loss against its step index.
loss_log = np.log(np.array(classifier.loss))

# The x axis is derived from the number of recorded loss values instead of a
# hard-coded 1000, so x and y always have the same length (training above is
# run with iter_max=3000).
# NOTE(review): this assumes one loss value is recorded per plotted step —
# confirm how `ClassifierNN` fills `.loss`.
data = [go.Scatter(x=np.arange(len(loss_log)),
                   y=loss_log,
                   mode="lines",
                   name="layer_number: 10"),]
layout = go.Layout(
    title='Learning Rate: 1e-2',
    xaxis=dict(
        title='iteration steps',
    ),
    yaxis=dict(
        title='L2 Loss (Log Scale)'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [11]:
# Test-set accuracy of the classifier trained on the untransformed features.
classifier.accuracy

0.85024570024570034

# Effect of PCA

In [31]:
# Split the normalised frame into features (all but the last two columns) and
# labels (the last two columns), then rotate the features with PCA.
prepared_data = dummy_feature_data.values
prepared_features = dummy_feature_data.iloc[:,:-2].values
prepared_labels = dummy_feature_data.iloc[:,-2:].values

# Keep as many components as there are features (a full-rank transform).
# Deriving the count from the data replaces the hard-coded 108 and stays
# correct if the set of encoded columns changes.
# NOTE(review): PCA is fitted on train and test rows together.
pca = PCA(n_components=prepared_features.shape[1])
converted_features = pca.fit_transform(prepared_features)

In [32]:
# Re-create the train/test split on the PCA-transformed features, using the
# same row boundary as for the untransformed data. This rebinds the
# `train_*` / `test_*` names defined earlier in the notebook.
split_row = 16281
train_features, test_features = (
    converted_features[:split_row, :],
    converted_features[split_row:, :],
)
train_labels, test_labels = (
    prepared_labels[:split_row, :],
    prepared_labels[split_row:, :],
)

In [33]:
# Preview the first rows of the PCA-transformed training features.
pd.DataFrame(train_features).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.542693,-1.100359,0.097066,-0.569593,0.433713,-0.365392,1.079754,0.327009,0.048154,-0.07735,...,5.840465e-15,-1.105077e-14,1.007759e-15,2.082435e-15,1.250977e-15,-5.412099e-16,-2.555704e-16,-1.547559e-16,4.078896e-15,-8.860755000000001e-17
1,-1.041227,-0.047328,0.760333,-0.004448,0.031959,-0.083002,-0.218514,-0.11589,0.014926,-0.079085,...,7.850693e-16,-8.010031e-17,-4.411678e-16,-1.753287e-16,2.291727e-16,2.599948e-16,1.069324e-16,-3.781612e-16,3.620075e-17,1.030721e-16
2,-1.072945,0.253407,-0.486813,0.211918,0.44089,-0.006888,-0.067408,0.019378,-0.168191,-0.161883,...,5.099675e-15,1.084159e-14,3.190929e-14,-2.417159e-15,4.12698e-15,-7.026262e-16,-1.321792e-15,7.001092e-16,1.280041e-15,-2.513239e-16
3,-0.857315,-0.013311,-0.033276,-0.79766,-0.046583,0.512451,1.223664,-0.369073,0.108332,-0.247765,...,8.011069e-16,-4.010857e-16,5.150816e-17,-5.816854000000001e-17,5.576851e-16,4.0569380000000006e-17,-1.006974e-16,-1.257743e-16,1.718855e-17,8.157939000000001e-17
4,1.224312,-0.003541,-0.889534,-0.616042,1.273498,0.259553,-0.571024,-0.093356,0.161279,0.400145,...,1.157074e-16,9.61493e-16,2.509392e-16,-2.7065e-16,3.228641e-16,4.691639e-17,-1.763799e-16,4.5694170000000004e-17,-2.230183e-16,-1.54717e-17


In [34]:
# Same build / train / evaluate chain as before, now fed with the
# PCA-transformed features. Rebinds `classifier`, whose `.loss` and
# `.accuracy` attributes are read by the following cells.
# NOTE(review): the meaning of each step is defined in recomm.classifier and
# is not visible here — confirm.
classifier = (
    ClassifierNN(train_features, train_labels)
    .build_network(activate=True)
    .set_objective(method="l2_loss")
    .optimize()
    .estimate(batch_size=1000, learning_rate=1e-2, iter_max=3000)
    .predict(test_features)
    .activate_label()
    .get_accuracy(test_labels)
)

In [35]:
# Plot the natural log of the training loss recorded for the PCA-feature run.
loss_log = np.log(np.array(classifier.loss))

# The x axis is derived from the number of recorded loss values instead of a
# hard-coded 1000, so x and y always have the same length (training above is
# run with iter_max=3000).
# NOTE(review): this assumes one loss value is recorded per plotted step —
# confirm how `ClassifierNN` fills `.loss`.
data = [go.Scatter(x=np.arange(len(loss_log)),
                   y=loss_log,
                   mode="lines",
                   name="layer_number: 10"),]
layout = go.Layout(
    title='Learning Rate: 1e-2',
    xaxis=dict(
        title='iteration steps',
    ),
    yaxis=dict(
        title='L2 Loss (Log Scale)'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

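The accuracy is essentially independent of the PCA transformation: with all components kept, it is almost the same as with the untransformed features (about 0.850 in both cases).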

In [36]:
# Test-set accuracy of the classifier trained on the PCA-transformed features.
classifier.accuracy

0.84990786240786242

In [None]:
# Ablation: retrain while dropping the last `deselect` PCA components and
# record the test accuracy of each run. scikit-learn's PCA orders components
# by decreasing explained variance, so the trailing columns removed here are
# the lowest-variance ones.
#
# The upper bound comes from the feature matrix itself rather than a
# hard-coded 108, so `deselect` runs from 1 to (n_features - 1) and at least
# one component is always kept.
n_components_total = train_features.shape[1]

# A plain loop (not a comprehension) is kept on purpose: every run is a full
# training, and partial results stay in `accuracy` if the cell is interrupted.
accuracy = []
for deselect in range(1, n_components_total):
    accuracy.append(
        ClassifierNN(train_features[:, :-deselect], train_labels)
        .build_network(activate=True)
        .set_objective(method="l2_loss")
        .optimize()
        .estimate(batch_size=1000, learning_rate=1e-2, iter_max=3000)
        .predict(test_features[:, :-deselect])
        .activate_label()
        .get_accuracy(test_labels)
        .accuracy
    )

Finally, the accuracy drops once the number of selected features becomes small. We can also see that the curve is almost flat when the number of features is larger than 54. In other words, almost half of the features are not dominant, so the dimension can be reduced in this case.

In [49]:
# Plot test accuracy against the number of PCA components kept.
# `accuracy[i]` was measured with (i + 1) trailing components removed, so the
# x values count down from the largest retained dimension to 1. The range is
# derived from `len(accuracy)` instead of a hard-coded 107 so that x and y
# always have the same length.
data = [go.Scatter(x=np.arange(len(accuracy), 0, -1),
                   y=np.array(accuracy),
                   mode="lines",
                   name=""),]
layout = go.Layout(
    title='Accuracy vs. number of PCA components kept',
    xaxis=dict(
        title='Dimension',
    ),
    yaxis=dict(
        title='L2 Accuracy'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)