In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from recomm.classifier import ClassifierNN, ClassifierDNN

In this section, we are going to study the performance of neural network when there are noisy features which are not essential dimensions corresponding to interesting labels. A toy model is constructed and the features simply exist 0 or 1 on the corresponding dimension. Here, labels are set equal to the features and an identity weight with zero mean bias must be obtained from this model.

# Features and Labels

In [11]:
features = np.ceil(5 * np.random.rand(50000))
features = pd.get_dummies(features)
labels = pd.DataFrame(features.copy())

In [12]:
features.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0
0,1,0,0,0,0
1,0,0,0,0,1
2,0,0,1,0,0
3,0,0,0,0,1
4,0,0,0,1,0


In [13]:
labels.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0
0,1,0,0,0,0
1,0,0,0,0,1
2,0,0,1,0,0
3,0,0,0,0,1
4,0,0,0,1,0


In [14]:
train_features = features.iloc[:40000].values
train_labels = labels.iloc[:40000].values
test_features = features.iloc[40000:].values
test_labels = labels.iloc[40000:].values

In [15]:
accuracy = ClassifierNN(train_features, train_labels)\
            .build_network()\
            .set_objective(method="l2_loss")\
            .optimize()\
            .estimate(batch_size=1000, learning_rate=1e-2)\
            .predict(test_features)\
            .activate_label()\
            .get_accuracy(test_labels)\
            .accuracy

Finally, the perfect result is obtained definitely.

In [16]:
accuracy

1.0

# Add Noisy Features

Next step, additional noisy feature is built with uniform random distribution between 0 and 1.

In [34]:
features_wi_noise = pd.DataFrame(features.copy())
noise_dim = np.arange(10)
accuracy = []
for idx in noise_dim:
    features_wi_noise.loc[:, "noise_{}".format(idx)] = np.random.rand(features_wi_noise.shape[0])
    train_features = features_wi_noise.iloc[:40000].values
    train_labels = labels.iloc[:40000].values
    test_features = features_wi_noise.iloc[40000:].values
    test_labels = labels.iloc[40000:].values
    accuracy.append(ClassifierNN(train_features, train_labels)\
            .build_network()\
            .set_objective(method="l2_loss")\
            .optimize()\
            .estimate(batch_size=1000, learning_rate=1e-2)\
            .predict(test_features)\
            .activate_label()\
            .get_accuracy(test_labels)\
            .accuracy)

In [35]:
features_wi_noise.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,noise_0,noise_1,noise_2,noise_3,noise_4,noise_5,noise_6,noise_7,noise_8,noise_9
0,1,0,0,0,0,0.991015,0.916851,0.302856,0.150913,0.71351,0.191185,0.439378,0.445062,0.810402,0.225242
1,0,0,0,0,1,0.357655,0.244297,0.263289,0.591254,0.565913,0.474834,0.620605,0.937182,0.227848,0.500562
2,0,0,1,0,0,0.727514,0.082685,0.036956,0.596959,0.61409,0.14117,0.949408,0.992566,0.621637,0.75262
3,0,0,0,0,1,0.246399,0.657806,0.306668,0.73,0.975433,0.324179,0.652441,0.628402,0.633587,0.533036
4,0,0,0,1,0,0.700609,0.206877,0.737523,0.977467,0.095448,0.415794,0.423756,0.954515,0.617185,0.625899


Results indicate we are still able to obtain perfect labels, and additional noisy features disturb nothing.

In [36]:
np.array(accuracy).T

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])