In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from recomm.features import index_single_feature
from recomm.classifier import ClassifierNN

In [2]:
# Put plotly into offline notebook mode before the iplot() call further down.
init_notebook_mode(connected=True)

In [3]:
# Column names handed to pd.read_csv(names=...) for the adult_*.csv files,
# in file order. The last entry is the one later split off as the label.
CSV_COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

In [4]:
# Both files are read the same way: the first line of each file is skipped
# and CSV_COLUMNS supplies the header instead.
train_data, test_data = (
    pd.read_csv(csv_path, names=CSV_COLUMNS, skiprows=1)
    for csv_path in ("data/adult_train.csv", "data/adult_test.csv")
)

In [5]:
# Preview the first five rows of the raw training frame.
train_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


# Single Indices
Some features are text labels, so we convert them into integers to tell the categories apart. For example, an employee's education is one of Assoc-acdm, Some-college, and so on, and these values are encoded as 0, 1, 2, ...

In [6]:
# Text-valued columns that get replaced by integer codes.
CATEGORICAL_COLUMNS = [
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "gender", "native_country", "income_bracket",
]


def _index_text_columns(frame, columns):
    """Return a copy of `frame` with every column in `columns` integer-coded.

    The codes are element [0] of whatever `index_single_feature` returns for
    the column's values; `frame` itself is left untouched.
    """
    indexed = frame.copy()
    for column in columns:
        codes = index_single_feature(indexed[column].values)[0]
        # Replace the column outright. `indexed.loc[:, column] = codes` writes
        # into the existing object-dtype column on pandas >= 2.0, which would
        # leave the integer codes stored as `object` and break the numeric
        # rescaling / `.values` conversion that follows.
        indexed[column] = codes
    return indexed


# NOTE(review): train and test are indexed independently, so the same text
# value may receive a different code in each frame unless index_single_feature
# guarantees a stable mapping -- confirm against recomm.features.
indiced_single_feature_data = _index_text_columns(train_data, CATEGORICAL_COLUMNS)
indiced_single_feature_test = _index_text_columns(test_data, CATEGORICAL_COLUMNS)

In [7]:
# Preview the training frame after the text columns were replaced by integer codes.
indiced_single_feature_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,25,0,226802,0,7,0,0,0,0,0,0,0,40,0,0
1,38,0,89814,1,9,1,1,1,1,0,0,0,50,0,0
2,28,1,336951,2,12,1,2,1,1,0,0,0,40,0,1
3,44,0,160323,3,10,1,0,1,0,0,7688,0,40,0,1
4,18,2,103497,3,10,0,3,0,1,1,0,0,30,0,0


Next, we have to rescale each feature to prevent bias. More specifically, the mean and variance of fnlwgt are huge and would swamp the other features. Therefore, all features are rescaled to the range 0 to 1 so that they share a uniform scale, which means every feature starts out with equal weight.

In [8]:
def _min_max_scale(frame, lower, upper):
    """Column-wise (frame - lower) / (upper - lower).

    `lower` and `upper` are per-column Series (indexed by column name). A
    column whose range is zero is divided by 1 instead, so it becomes all
    zeros rather than NaN from 0/0.
    """
    span = (upper - lower).replace(0, 1)
    return (frame - lower) / span


# Take the scaling statistics from the training frame only and reuse them for
# the test frame, so that one raw value maps to the same scaled value in both.
# Scaling the test frame by its own min/max would shift its features relative
# to what the model was trained on. Test values outside the training range
# therefore land slightly outside [0, 1].
train_min = indiced_single_feature_data.min()
train_max = indiced_single_feature_data.max()

indiced_single_feature_data = _min_max_scale(indiced_single_feature_data, train_min, train_max)
indiced_single_feature_test = _min_max_scale(indiced_single_feature_test, train_min, train_max)

# Every column but the last is a feature; the last column is the label.
indiced_single_features = indiced_single_feature_data[indiced_single_feature_data.columns[:-1]].values
indiced_single_labels = indiced_single_feature_data[indiced_single_feature_data.columns[-1]].values
indiced_single_features_test = indiced_single_feature_test[indiced_single_feature_test.columns[:-1]].values
indiced_single_labels_test = indiced_single_feature_test[indiced_single_feature_test.columns[-1]].values

In [9]:
# Preview the first five rows of the training frame after rescaling.
indiced_single_feature_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,0.109589,0.0,0.14443,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397959,0.0,0.0
1,0.287671,0.0,0.051677,0.066667,0.533333,0.166667,0.071429,0.2,0.25,0.0,0.0,0.0,0.5,0.0,0.0
2,0.150685,0.125,0.219011,0.133333,0.733333,0.166667,0.142857,0.2,0.25,0.0,0.0,0.0,0.397959,0.0,1.0
3,0.369863,0.0,0.099418,0.2,0.6,0.166667,0.0,0.2,0.0,0.0,0.076881,0.0,0.397959,0.0,1.0
4,0.013699,0.25,0.060942,0.2,0.6,0.0,0.214286,0.0,0.25,1.0,0.0,0.0,0.295918,0.0,0.0


Finally, the training data is fed into a single neuron with multiple inputs and a single output. It is not certain whether a single output can classify samples with multiple inputs, but since the output is continuous between 0 and 1, it should have the potential to do so.

In [10]:
# One estimator per batch_size value, built and run in list order; every
# estimator gets the same training arrays and the same learning rate.
indiced_single_features_estimator = [
    ClassifierNN(indiced_single_features, indiced_single_labels)
    .optimize(learning_rate=1e-3)
    .estimate(batch_size=size)
    for size in (50, 100, 200)
]

(16281,)
(16281, 1)
(16281, 1)


In [11]:
# Loss values recorded by the first estimator in the list.
loss_history = np.array(indiced_single_features_estimator[0].loss)

# Derive the x axis from the number of recorded loss values instead of a
# hard-coded 1000, so the trace stays aligned if the step count changes.
data = [go.Scatter(x=np.arange(len(loss_history)),
                   y=loss_history,
                   mode="lines",
                   name="cross entropy")]
layout = go.Layout(
    title='Learning Rate: 1e-3',
    xaxis=dict(
        title='iteration steps',
    ),
    yaxis=dict(
        title='negative cross entropy'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [12]:
# import tensorflow as tf

In [13]:
# features = tf.placeholder(tf.float32, shape=[None, 14], name="sample_features")
# labels = tf.placeholder(tf.float32, shape=[None, 1], name="sample_labels")
# w = tf.Variable(tf.random_normal([14, 1]), name="neural_net_weight")
# b = tf.Variable(tf.random_normal([1]), name="neural_net_bias")
# estimated_labels = tf.add(tf.matmul(features, w), b)
# # activation = tf.sigmoid(estimated_labels)
# # logit = tf.log(activation)
# entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=estimated_labels)
# # objective = tf.reduce_mean(entropy)

In [14]:
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     s = sess.run([entropy], feed_dict={features:indiced_single_features, labels:indiced_single_labels})
#     print(s)

In [15]:
# Number of dimensions of the label array.
# NOTE(review): the saved output for this cell is 2, whereas `.values` on a
# single column yields a 1-D array -- presumably the label array was replaced
# or reshaped by code not shown in this notebook; confirm on a fresh kernel.
indiced_single_labels.ndim

2