In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from recomm.features import index_single_feature
from recomm.classifier import ClassifierNN, ClassifierDNN

In [2]:
# Set up plotly's offline notebook mode before any iplot(...) call further down.
init_notebook_mode(connected=True)

# Introduction

It is difficult to label a series of uniform random numbers with a neural network. However, it would be interesting to cluster a series of continuous variables into groups and then look for the correlation between each group and the label. In other words, we suppose that features should be grouped together whenever the result can be discriminated with discrete vectors.

# Uniform Distribution

Before jumping into the topic, we review the earlier work of splitting a series of continuous numbers into 4 levels with a DNN.

In [3]:
# Draw 50000 uniform samples; each sample is its own single-column feature row.
generator = np.random.rand(50000)
features = pd.DataFrame(data=generator)
# Scaling by 4 and rounding up buckets the samples into levels 1.0 .. 4.0,
# which get_dummies then one-hot encodes (one column per level).
labels = pd.get_dummies(np.ceil(generator * 4))

In [4]:
# Preview the first rows of the uniform feature column.
features.head()

Unnamed: 0,0
0,0.868464
1,0.178726
2,0.39577
3,0.589656
4,0.608748


In [5]:
# Preview the one-hot level labels (columns 1.0 .. 4.0) for the same rows.
labels.head()

Unnamed: 0,1.0,2.0,3.0,4.0
0,0,0,0,1
1,1,0,0,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0


In [6]:
# First 40000 rows train the network; every row after that is held out for testing.
train_features, test_features = features.iloc[:40000].values, features.iloc[40000:].values
train_labels, test_labels = labels.iloc[:40000].values, labels.iloc[40000:].values

# Three estimate() stages with a shrinking learning rate (1e-1 -> 1e-2 -> 1e-5).
# The later stages pass init=False -- presumably so they continue from the
# previous stage's weights instead of re-initialising; confirm in ClassifierDNN.
classifier = (
    ClassifierDNN(train_features, train_labels)
    .build_network(hidden_layers=[2, 4, 8], activate=True)
    .set_objective(method="l2_loss")
    .optimize()
    .estimate(batch_size=2000, learning_rate=1e-1, iter_max=500)
    .estimate(batch_size=2000, learning_rate=1e-2, iter_max=4500, init=False)
    .estimate(batch_size=2000, learning_rate=1e-5, iter_max=5000, init=False)
    .predict(test_features)
    .activate_label()
    .get_accuracy(test_labels)
)

In [7]:
# Plot log(classifier.loss) against each entry's position in the loss history.
data = [
    go.Scatter(
        x=np.arange(len(classifier.loss)),
        y=np.log(np.array(classifier.loss)),
        mode="lines",
        name="layer_number: 10",
    )
]
layout = go.Layout(
    xaxis={"title": "iteration steps"},
    yaxis={"title": "L2 Loss (Log Scale)"},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

We are now able to obtain a high-accuracy result, although the convergence is complicated. This is probably because we previously did not use an activation function at the end of each layer, so the weights and biases diverged after passing through the layers. Now the topic changes to comparing the pros and cons of clustering, versus not clustering, before applying the neural network.

In [8]:
# Show the test accuracy -- presumably populated by the get_accuracy(test_labels) call above.
classifier.accuracy

0.99970000000000003

# Split Gaussian Distribution

Now we consider the situation where there are three groups along each feature axis (nine clusters in total), each following a Gaussian distribution. The limitations of the neural network become interesting once the resolution is poor. The results are compared with and without clustering before learning.

In [9]:
# Build a 3 x 3 grid of unit-variance Gaussian clusters. Each cluster is centred
# at (distance * d0, distance * d1) and carries its centre in the l0 / l1 columns.
distance = 5
discriminator = [-1, 0, 1]
samples_per_cluster = 50000

# Collect the per-cluster frames and concatenate once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copies the growing
# frame on every iteration.
clusters = []
for d0 in discriminator:
    for d1 in discriminator:
        cluster = pd.DataFrame({
            "f0": np.random.randn(samples_per_cluster) + distance * d0,
            "f1": np.random.randn(samples_per_cluster) + distance * d1,
            "l0": distance * d0 * np.ones(samples_per_cluster),
            "l1": distance * d1 * np.ones(samples_per_cluster),
        })
        clusters.append(cluster)

# Keep each cluster's own 0..N-1 index (no ignore_index), exactly as the
# append-based version did, then shuffle all rows.
samples = pd.concat(clusters)
samples = samples.sample(frac=1)

features = samples[["f0", "f1"]]
# One-hot encode the cluster centre, written as the string "<l0>, <l1>".
labels = pd.get_dummies(samples.l0.astype("str") + ", " + samples.l1.astype("str"))

In [10]:
# Preview the shuffled two-column (f0, f1) feature frame.
features.head()

Unnamed: 0,f0,f1
5140,-4.27468,4.603357
27283,3.373483,5.031437
42777,-0.654233,1.189932
45651,-1.346884,5.254765
35547,-7.179302,0.043631


In [11]:
# Preview the one-hot labels; each column name is a cluster centre "<l0>, <l1>".
labels.head()

Unnamed: 0,"-5.0, -5.0","-5.0, 0.0","-5.0, 5.0","0.0, -5.0","0.0, 0.0","0.0, 5.0","5.0, -5.0","5.0, 0.0","5.0, 5.0"
5140,0,0,1,0,0,0,0,0,0
27283,0,0,0,0,0,0,0,0,1
42777,0,0,0,0,1,0,0,0,0
45651,0,0,0,0,0,1,0,0,0
35547,0,1,0,0,0,0,0,0,0


In [19]:
# First 40000 shuffled rows train the network; all remaining rows are the test set.
# NOTE(review): the frame built above has 9 clusters x 50000 rows, so this split
# leaves 410000 rows for testing -- confirm that ratio is intended.
train_features, test_features = features.iloc[:40000].values, features.iloc[40000:].values
train_labels, test_labels = labels.iloc[:40000].values, labels.iloc[40000:].values

# Single hidden layer of 32 units, trained in one estimate() stage.
classifier = (
    ClassifierDNN(train_features, train_labels)
    .build_network(hidden_layers=[32], activate=True)
    .set_objective(method="l2_loss")
    .optimize()
    .estimate(batch_size=2000, learning_rate=1e-2, iter_max=2000)
    .predict(test_features)
    .activate_label()
    .get_accuracy(test_labels)
)

In [20]:
# Plot log(classifier.loss) for the Gaussian-cluster run, one point per loss entry.
data = [
    go.Scatter(
        x=np.arange(len(classifier.loss)),
        y=np.log(np.array(classifier.loss)),
        mode="lines",
        name="layer_number: 10",
    )
]
layout = go.Layout(
    xaxis={"title": "iteration steps"},
    yaxis={"title": "L2 Loss (Log Scale)"},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [21]:
# Show the test accuracy for the Gaussian-cluster run -- presumably set by get_accuracy(test_labels).
classifier.accuracy

0.98114390243902438