In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from recomm.classifier import ClassifierNN, ClassifierDNN

In [2]:
init_notebook_mode(connected=True)

In [3]:
tx_data = pd.read_csv("data/tx_data.csv")

In [4]:
# To construct 1 min k-line
tx_data = tx_data.sort_values(by="create_time")
tx_data.loc[:, "trade_time"] = pd.to_datetime(tx_data["create_time"].astype("str").apply(lambda x: x[:-2]), format="%Y%m%d%H%M")
tx_data.loc[:, "high"] = tx_data.price
tx_data.loc[:, "low"] = tx_data.price
tx_1min_kline = tx_data.groupby(["trade_time"]).agg({"high": pd.Series.max, "low": pd.Series.min, "volume": pd.Series.sum})
tx_1min_kline.loc[:, "open"] = tx_data.drop_duplicates(["trade_time"]).price.values
tx_1min_kline.loc[:, "close"] = tx_data.drop_duplicates(["trade_time"], keep="last").price.values

# Data Pre-process
The idea of the model is to pick an interval of the time series with open, high, low, close, and volume as fetures. More detail, we will consider the data from n-th k-line to (n+300)-th k-line as features and the growth rate of the (n+301)-th k-line as labels.

In [5]:
feature_range = 300

It is found the ratio between open and close is symmetry, and it is therefore grouped into 6 levels with boundarues 180e-6, 90e-6, 0, -90e-6, and -180e-6.

In [6]:
iplot([go.Histogram(x=(tx_1min_kline.close - tx_1min_kline.open) / tx_1min_kline.open)])

In [7]:
train_data = []
label_data = []
for idx in range(feature_range, tx_1min_kline.shape[0]-1):
    range_data = tx_1min_kline.iloc[(idx-feature_range):idx]
    label = tx_1min_kline.iloc[idx+1]
    label = (label.close - label.open) / label.open
    train_data.append(range_data.values.reshape(-1))
    if (label > 180e-6):
        label_data.append([1, 0, 0, 0, 0, 0])
    elif(label > 90e-6):
        label_data.append([0, 1, 0, 0, 0, 0])
    elif(label > 0):
        label_data.append([0, 0, 1, 0, 0, 0])
    elif(label > -90e-6):
        label_data.append([0, 0, 0, 1, 0, 0])
    elif(label > -180e-6):
        label_data.append([0, 0, 0, 0, 1, 0])
    else:
        label_data.append([0, 0, 0, 0, 0, 1])
train_features = np.array(train_data)
train_labels = np.array(label_data)

In [8]:
# To rescale features to values between 0 and 1
train_features = np.apply_along_axis(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)), 1, train_features)

In [9]:
test_features = train_features[20000:]
test_labels = train_labels[20000:]
train_features = train_features[:20000]
train_labels = train_labels[:20000]

In [10]:
simple_features_estimator = [
    ClassifierNN(train_features, train_labels)\
        .build_network()\
        .set_objective()\
        .optimize(learning_rate=1e-1)\
        .estimate(batch_size=1000),
    ClassifierNN(train_features, train_labels)\
        .build_network()\
        .set_objective()\
        .optimize(learning_rate=1e-2)\
        .estimate(batch_size=1000),
]

Previously, the batch size help us reduce the fluctuation. However, the part of learning rate is different in this case, and it reduces the slow oscillation when the value is small, although the convergence rate is not as fast as the large one.

In [11]:
data = [go.Scatter(x=np.arange(1000),
                   y=np.array(simple_features_estimator[0].loss),
                   mode="lines",
                   name="learning_rate: 1e-1"),
        go.Scatter(x=np.arange(1000),
                   y=np.array(simple_features_estimator[1].loss),
                   mode="lines",
                   name="learning_rate: 1e-2"),
       ]
layout = go.Layout(
    title='Batch Size: 1000',
    xaxis=dict(
        title='iteration steps',
    ),
    yaxis=dict(
        title='negative cross entropy'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [12]:
accuracy = simple_features_estimator[1]\
            .predict(test_features)\
            .activate_label()\
            .get_accuracy(test_labels)\
            .accuracy

In [13]:
accuracy

0.52672989868506148

# Single Hidden Layer

In [14]:
dnn_features_estimator = [
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[100])\
        .set_objective()\
        .optimize(learning_rate=1e-4)\
        .estimate(batch_size=1000, iter_max=500),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[500])\
        .set_objective()\
        .optimize(learning_rate=1e-4)\
        .estimate(batch_size=1000, iter_max=500),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[1000])\
        .set_objective()\
        .optimize(learning_rate=1e-4)\
        .estimate(batch_size=1000, iter_max=500),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[2000])\
        .set_objective()\
        .optimize(learning_rate=1e-4)\
        .estimate(batch_size=1000, iter_max=500),
]

We add a single hiddeen layer with neurons 100, 500, 1000, and 2000. Minimum of entropy is achieved but it is not able to present the fidelity. It is found the minimum of the negative entropy is obtained when the number of neuron is the smallest. However, the best fidelity is obtained when the number of neuron is 2000 which is the largest one. Furthermore, the worst case is searched when the number of neuron is 1000. It is probably we does not choose the proper objective or processed features can not be reduced to unit labels.

In [15]:
data = [go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[0].loss),
                   mode="lines",
                   name="hidden_layers: 100"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[1].loss),
                   mode="lines",
                   name="hidden_layers: 500"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[2].loss),
                   mode="lines",
                   name="hidden_layers: 1000"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[3].loss),
                   mode="lines",
                   name="hidden_layers: 2000"),
       ]
layout = go.Layout(
    title='Batch Size: 1000, Learning Rate: 1e-4',
    xaxis=dict(
        title='iteration steps',
    ),
    yaxis=dict(
        title='negative cross entropy'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [16]:
accuracy = []
for estimator in dnn_features_estimator:
    accuracy.append(estimator\
                    .predict(test_features)\
                    .activate_label()\
                    .get_accuracy(test_labels)\
                    .accuracy)

In [17]:
np.array(accuracy).T

array([ 0.46647984,  0.38704462,  0.48609614,  0.48663505])

In [18]:
dnn_features_estimator = [
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[100], activate=True)\
        .set_objective()\
        .optimize(learning_rate=1e-4)\
        .estimate(batch_size=1000, iter_max=500),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[500], activate=True)\
        .set_objective()\
        .optimize(learning_rate=1e-4)\
        .estimate(batch_size=1000, iter_max=500),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[1000], activate=True)\
        .set_objective()\
        .optimize(learning_rate=1e-4)\
        .estimate(batch_size=1000, iter_max=500),
    ClassifierDNN(train_features, train_labels)\
        .build_network(hidden_layers=[2000], activate=True)\
        .set_objective()\
        .optimize(learning_rate=1e-4)\
        .estimate(batch_size=1000, iter_max=500),
]

In [19]:
data = [go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[0].loss),
                   mode="lines",
                   name="hidden_layers: 100"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[1].loss),
                   mode="lines",
                   name="hidden_layers: 500"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[2].loss),
                   mode="lines",
                   name="hidden_layers: 1000"),
        go.Scatter(x=np.arange(2000),
                   y=np.array(dnn_features_estimator[3].loss),
                   mode="lines",
                   name="hidden_layers: 2000"),
       ]
layout = go.Layout(
    title='Batch Size: 1000, Learning Rate: 1e-4',
    xaxis=dict(
        title='iteration steps',
    ),
    yaxis=dict(
        title='negative cross entropy'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [20]:
accuracy = []
for estimator in dnn_features_estimator:
    accuracy.append(estimator\
                    .predict(test_features)\
                    .activate_label()\
                    .get_accuracy(test_labels)\
                    .accuracy)

In [21]:
np.array(accuracy).T

array([  5.21879715e-01,   2.22044605e-16,  -1.00365717e-01,
         8.95687121e-02])