## Load dataset

In [None]:
import math
import random
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
import torch
import torch.nn as nn
from matplotlib import pyplot

# Seed every random number generator the notebook has in scope. The models
# below are built and trained with TensorFlow/Keras, so seeding only torch and
# NumPy leaves weight initialisation, dropout and batch shuffling unseeded.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
tf.random.set_seed(SEED)


In [2]:
# Read the four competition CSVs from the working directory, in the same
# order as the names on the left-hand side.
train_x, train_y, test_x, submission = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_x_df.csv',
        'train_y_df.csv',
        'test_x_df.csv',
        'sample_submission.csv',
    )
)

In [3]:
# Preview the first five rows of the input-window frame.
train_x.head(n=5)

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,7,1.010004,1.010004,1.009612,1.010004,838287.5,43160.632812,451.157288,732683.4,37725.183594
1,0,1,7,1.009808,1.009808,1.009808,1.009808,162242.0,8352.220703,39.231071,0.0,0.0
2,0,2,7,1.009808,1.0102,1.009808,1.0102,16649.67,857.377808,58.846603,16649.67,857.377808
3,0,3,7,1.0102,1.011181,1.0102,1.011181,2586971.0,133310.34375,431.541779,2189147.0,112811.046875
4,0,4,7,1.010985,1.010985,1.0102,1.0102,1129996.0,58216.867188,176.53981,0.0,0.0


In [4]:
# Preview the first five rows of the target-window frame.
train_y.head(n=5)

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,7,1.000392,1.000588,1.000392,1.000588,830511.9,42356.179688,137.308746,830511.9,42356.179688
1,0,1,7,1.000588,1.001177,1.000392,1.001177,532006.6,27140.638672,294.233032,488273.8,24909.861328
2,0,2,7,1.001177,1.001177,1.001177,1.001177,511377.0,26100.681641,58.846603,511377.0,26100.681641
3,0,3,7,1.001177,1.001373,1.001177,1.001373,1134853.0,57929.410156,137.308746,1095514.0,55921.15625
4,0,4,7,1.000981,1.000981,0.999804,1.000196,5801173.0,295872.34375,666.928162,991123.9,50528.589844


In [5]:
# (rows, columns) of each loaded frame, in load order.
tuple(frame.shape for frame in (train_x, train_y, test_x, submission))

((10159560, 12), (883440, 12), (730020, 12), (529, 3))

In [6]:
def df2d_to_array3d(df_2d):
    """Reshape a long-format frame into a (sample, time, feature) array.

    The first two columns are dropped and every remaining column is treated
    as a feature. The sample and time dimensions are the numbers of distinct
    values in the `sample_id` and `time` columns. Rows are used in their
    existing order; nothing here sorts them, so each sample's rows must
    already be contiguous and time-ordered for the result to be meaningful.
    """
    features = df_2d.iloc[:, 2:]
    n_samples = df_2d["sample_id"].nunique()
    n_steps = df_2d["time"].nunique()
    n_features = features.shape[1]
    return features.values.reshape(n_samples, n_steps, n_features)


# Convert each long-format frame into a (sample, time, feature) array.
x_train, y_train, x_test = (
    df2d_to_array3d(frame) for frame in (train_x, train_y, test_x)
)

# Display the resulting array shapes.
tuple(arr.shape for arr in (x_train, y_train, x_test))

((7362, 1380, 10), (7362, 120, 10), (529, 1380, 10))

In [7]:
# Position of the `close` column on the feature axis. df2d_to_array3d drops
# the first two columns, so the features start at `coin_index` (0), followed
# by open (1), high (2), low (3) and close (4).
CLOSE_IDX = 4

# Slicing one feature already yields a 2-D (sample, time) array, so no reshape
# to a hard-coded window length is needed.
y_train_close = y_train[:, :, CLOSE_IDX]
x_train_close = x_train[:, :, CLOSE_IDX]
x_test_close = x_test[:, :, CLOSE_IDX]

In [8]:
# Shapes of the close-price matrices: (sample, time).
tuple(arr.shape for arr in (x_train_close, y_train_close, x_test_close))

((7362, 1380), (7362, 120), (529, 1380))

In [9]:
# Training labels derived from the target window of each sample:
#   sell_times     -> index of the highest close price in the target window
#                     (i.e. when to sell)
#   buy_quantities -> 1.0 when that peak is above the last close of the input
#                     window, otherwise 0.0
sell_times = np.argmax(y_train_close, axis=1)
peak_close = y_train_close[np.arange(y_train_close.shape[0]), sell_times]
buy_quantities = (peak_close > x_train_close[:, -1]).astype(float)

# Keras expects one label row per sample, hence the trailing unit axis.
buy_quantities = buy_quantities.reshape((-1, 1))
sell_times = sell_times.reshape((-1, 1))

## Define model

In [24]:
def _bilstm_trunk(input_shape):
    """Build the feature extractor shared by both models.

    Layers: BatchNormalization -> BiLSTM(128, full sequence) -> Dropout(0.25)
    -> BiLSTM(128, last state only) -> Dropout(0.25) -> Dense(64, relu)
    -> BatchNormalization.

    Returns:
        (inputs, features): the Keras input tensor and the trunk's output
        tensor, so each model can attach its own output layer.
    """
    inputs = Input(shape=input_shape)
    x = BatchNormalization()(inputs)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.25)(x)
    x = Bidirectional(LSTM(128, return_sequences=False))(x)
    x = Dropout(0.25)(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    return inputs, x


def buy_quantity_LSTM(input_shape=(1380, 10)):
    """Binary classifier deciding whether to buy.

    Args:
        input_shape: (time steps, features) of one sample. The default matches
            the training arrays in this notebook and uses all 10 features.

    Returns:
        A compiled Keras model with one sigmoid output, trained with binary
        cross-entropy and the Adam optimizer.
    """
    inputs, features = _bilstm_trunk(input_shape)
    outputs = Dense(1, activation='sigmoid')(features)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model


def sell_time_LSTM(input_shape=(1380, 10), n_sell_steps=120):
    """Multi-class classifier predicting the time step at which to sell.

    Args:
        input_shape: (time steps, features) of one sample.
        n_sell_steps: number of candidate sell steps, i.e. the softmax width.
            The default matches the 120-step target window in this notebook.

    Returns:
        A compiled Keras model with an `n_sell_steps`-way softmax output. It
        uses sparse categorical cross-entropy, so labels are integer step
        indices rather than one-hot vectors.
    """
    inputs, features = _bilstm_trunk(input_shape)
    outputs = Dense(n_sell_steps, activation='softmax')(features)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return model

In [28]:
# Fresh, untrained instances: buy/no-buy classifier first, then sell-time classifier.
model_bq, model_st = buy_quantity_LSTM(), sell_time_LSTM()

## Train buy quantity model 

In [29]:
# Checkpoint file name; Keras fills in the epoch number and validation loss.
model_path = 'bq_LSTM_epoch_{epoch:03d}_val_{val_loss:.3f}.h5'

# All three callbacks watch the validation loss.
early_stopping = EarlyStopping(patience=10, monitor='val_loss')
checkpoint = ModelCheckpoint(
    filepath=model_path,
    save_best_only=True,
    monitor='val_loss',
    verbose=1,
)
learning_rate_reduction = ReduceLROnPlateau(
    factor=0.5,
    patience=2,
    monitor='val_loss',
    verbose=1,
)

# Fit the buy/no-buy classifier on the full 10-feature input windows.
history = model_bq.fit(
    x=x_train,
    y=buy_quantities,
    validation_split=0.2,
    batch_size=32,
    epochs=200,
    callbacks=[learning_rate_reduction, checkpoint, early_stopping],
)

Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.24132, saving model to bq_LSTM_epoch_001_val_0.241.h5
Epoch 2/200

Epoch 00002: val_loss improved from 0.24132 to 0.23739, saving model to bq_LSTM_epoch_002_val_0.237.h5
Epoch 3/200

Epoch 00003: val_loss did not improve from 0.23739
Epoch 4/200

Epoch 00004: val_loss improved from 0.23739 to 0.22096, saving model to bq_LSTM_epoch_004_val_0.221.h5
Epoch 5/200

Epoch 00005: val_loss improved from 0.22096 to 0.21939, saving model to bq_LSTM_epoch_005_val_0.219.h5
Epoch 6/200

Epoch 00006: val_loss improved from 0.21939 to 0.21901, saving model to bq_LSTM_epoch_006_val_0.219.h5
Epoch 7/200

Epoch 00007: val_loss did not improve from 0.21901
Epoch 8/200

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00008: val_loss did not improve from 0.21901
Epoch 9/200

Epoch 00009: val_loss improved from 0.21901 to 0.21797, saving model to bq_LSTM_epoch_009_val_0.218.h5
Epoch 10/200

Epoch 00010: val_loss

## Train sell time model

In [30]:
# Checkpoint file name; Keras fills in the epoch number and validation loss.
model_path = 'st_LSTM_epoch_{epoch:03d}_val_{val_loss:.3f}.h5'

# All three callbacks watch the validation loss.
early_stopping = EarlyStopping(patience=10, monitor='val_loss')
checkpoint = ModelCheckpoint(
    filepath=model_path,
    save_best_only=True,
    monitor='val_loss',
    verbose=1,
)
learning_rate_reduction = ReduceLROnPlateau(
    factor=0.5,
    patience=2,
    monitor='val_loss',
    verbose=1,
)

# Fit the sell-time classifier; labels are integer step indices.
history = model_st.fit(
    x=x_train,
    y=sell_times,
    validation_split=0.2,
    batch_size=32,
    epochs=200,
    callbacks=[learning_rate_reduction, checkpoint, early_stopping],
)

Epoch 1/200

Epoch 00001: val_loss improved from inf to 4.86305, saving model to st_LSTM_epoch_001_val_4.863.h5
Epoch 2/200

Epoch 00002: val_loss improved from 4.86305 to 4.78159, saving model to st_LSTM_epoch_002_val_4.782.h5
Epoch 3/200

Epoch 00003: val_loss improved from 4.78159 to 4.73250, saving model to st_LSTM_epoch_003_val_4.732.h5
Epoch 4/200

Epoch 00004: val_loss did not improve from 4.73250
Epoch 5/200

Epoch 00005: val_loss improved from 4.73250 to 4.68943, saving model to st_LSTM_epoch_005_val_4.689.h5
Epoch 6/200

Epoch 00006: val_loss improved from 4.68943 to 4.68726, saving model to st_LSTM_epoch_006_val_4.687.h5
Epoch 7/200

Epoch 00009: val_loss did not improve from 4.68519
Epoch 10/200

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00010: val_loss did not improve from 4.68519
Epoch 11/200

Epoch 00011: val_loss did not improve from 4.68519
Epoch 12/200

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.000250000011

## Make Prediction

In [10]:
# Reload the saved checkpoints chosen for inference: buy-quantity model first,
# then sell-time model.
best_bq, best_st = (
    load_model(checkpoint_file)
    for checkpoint_file in (
        'bq_LSTM_epoch_012_val_0.217.h5',
        'st_LSTM_epoch_008_val_4.685.h5',
    )
)

In [13]:
# Buy scores for every test window.
predicted_bq = best_bq.predict(x=x_test)

In [14]:
# Per-step sell scores for every test window.
predicted_st = best_st.predict(x=x_test)

In [19]:
# For each test sample keep the index of the highest-scoring output unit.
exact_times = [np.argmax(sell_time) for sell_time in predicted_st]

In [20]:
# Turn the list of per-sample argmax indices into a NumPy array.
exact_times = np.asarray(exact_times)

In [24]:
# Fill the submission by column name rather than position, so the assignment
# does not depend on column order. predict() returns a 2-D array (presumably
# (n_samples, 1) for the single sigmoid unit), so it is flattened to one value
# per row before being stored.
submission['buy_quantity'] = np.ravel(predicted_bq)
submission['sell_time'] = exact_times

In [25]:
# Preview the first five rows of the filled-in submission.
submission.head(n=5)

Unnamed: 0,sample_id,buy_quantity,sell_time
0,0,0.934824,0
1,1,0.94102,0
2,2,0.937923,0
3,3,0.933961,0
4,4,0.919518,0


In [26]:
# Write the submission without the DataFrame index column.
submission.to_csv(path_or_buf='sample_lstm.csv', index=False)

## Visualize result

In [108]:
def plot_series(x_series, y_series):
    """Plot an input series and its output series end to end on one figure.

    The output series is drawn starting at x = len(x_series), so it visually
    continues where the input series stops.

    Args:
        x_series: sequence of input values, plotted at positions 0..len-1.
        y_series: sequence of output values, plotted directly after the input.
    """
    # The notebook imports matplotlib as `from matplotlib import pyplot`; no
    # `plt` alias exists, so the module is referenced by its imported name.
    n_input = len(x_series)
    output_positions = np.arange(n_input, n_input + len(y_series))
    pyplot.plot(x_series, label='input_series')
    pyplot.plot(output_positions, y_series, label='output_series')
    # pyplot.axhline(1, c = 'red')
    pyplot.legend()
