## Load dataset

In [15]:
import numpy as np 
import pandas as pd 
from tensorflow.keras.models import *
from tensorflow.keras.layers import *  
from tensorflow.keras.callbacks import *
import torch 
import torch.nn as nn
from tqdm import tqdm
import time
import random
import math
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
# Seed every random number generator this notebook can draw from.
# The models below are built and trained with tf.keras, so TensorFlow's global
# seed has to be set as well; seeding only torch and NumPy leaves Keras weight
# initialisation and dropout unseeded.
import tensorflow as tf  # TensorFlow is already a dependency of this notebook (tensorflow.keras imports above)

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
tf.random.set_seed(0)


In [2]:
# Read the three competition tables plus the submission template, in this order:
# model inputs, model targets, test inputs, submission skeleton.
train_x, train_y, test_x, submission = map(
    pd.read_csv,
    ('train_x_df.csv', 'train_y_df.csv', 'test_x_df.csv', 'sample_submission.csv'),
)

In [3]:
# Preview the first rows of the input table (one row per sample_id / time step).
train_x.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,7,1.010004,1.010004,1.009612,1.010004,838287.5,43160.632812,451.157288,732683.4,37725.183594
1,0,1,7,1.009808,1.009808,1.009808,1.009808,162242.0,8352.220703,39.231071,0.0,0.0
2,0,2,7,1.009808,1.0102,1.009808,1.0102,16649.67,857.377808,58.846603,16649.67,857.377808
3,0,3,7,1.0102,1.011181,1.0102,1.011181,2586971.0,133310.34375,431.541779,2189147.0,112811.046875
4,0,4,7,1.010985,1.010985,1.0102,1.0102,1129996.0,58216.867188,176.53981,0.0,0.0


In [4]:
# Preview the first rows of the target table; it has the same columns as train_x.
train_y.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,7,1.000392,1.000588,1.000392,1.000588,830511.9,42356.179688,137.308746,830511.9,42356.179688
1,0,1,7,1.000588,1.001177,1.000392,1.001177,532006.6,27140.638672,294.233032,488273.8,24909.861328
2,0,2,7,1.001177,1.001177,1.001177,1.001177,511377.0,26100.681641,58.846603,511377.0,26100.681641
3,0,3,7,1.001177,1.001373,1.001177,1.001373,1134853.0,57929.410156,137.308746,1095514.0,55921.15625
4,0,4,7,1.000981,1.000981,0.999804,1.000196,5801173.0,295872.34375,666.928162,991123.9,50528.589844


In [5]:
# Sanity check: (rows, columns) of every loaded table.
train_x.shape, train_y.shape, test_x.shape, submission.shape 

((10159560, 12), (883440, 12), (730020, 12), (529, 3))

In [6]:
def df2d_to_array3d(df_2d):
    """Convert a long-format frame into a (sample, time, feature) array.

    The first two columns are dropped and every remaining column becomes one
    feature channel. The frame must expose ``sample_id`` and ``time`` columns.
    (Assumes those two are the leading columns, as in the competition CSVs —
    confirm for any other input.)

    Rows are put into (sample_id, time) order before reshaping, so a shuffled
    or re-indexed frame yields the same array as a sorted one instead of
    silently mixing samples and time steps.

    Parameters
    ----------
    df_2d : pandas.DataFrame
        One row per (sample_id, time) pair.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, n_time_steps, n_features).

    Raises
    ------
    ValueError
        If the row count is not n_samples * n_time_steps (ragged samples).
    """
    sample_size = df_2d["sample_id"].nunique()
    time_size = df_2d["time"].nunique()

    if len(df_2d) != sample_size * time_size:
        raise ValueError(
            "expected %d samples x %d time steps = %d rows, got %d"
            % (sample_size, time_size, sample_size * time_size, len(df_2d))
        )

    # Only pay for a sort (and the copy it makes) when the rows are not already
    # in strictly increasing (sample_id, time) order.
    sample_ids = df_2d["sample_id"].to_numpy()
    times = df_2d["time"].to_numpy()
    next_sample = sample_ids[1:] > sample_ids[:-1]
    same_sample_later = (sample_ids[1:] == sample_ids[:-1]) & (times[1:] > times[:-1])
    if not np.all(next_sample | same_sample_later):
        df_2d = df_2d.sort_values(["sample_id", "time"])

    features = df_2d.iloc[:, 2:]
    return features.to_numpy().reshape([sample_size, time_size, features.shape[1]])


# Turn each long-format table into a (sample, time, feature) array.
x_train, y_train, x_test = map(df2d_to_array3d, (train_x, train_y, test_x))

x_train.shape, y_train.shape, x_test.shape

((7362, 1380, 10), (7362, 120, 10), (529, 1380, 10))

In [8]:
scaling_values = []

## Standardize one feature channel at a time.
## The mean/std of each channel are pooled over x_train and y_train together, then
## applied to x_train, y_train and x_test. The three arrays are overwritten in place,
## and the (mu, std) pairs are kept in scaling_values, indexed by feature.
n_features = x_train.shape[2]
for feat in tqdm(range(n_features), position=0, leave=True):
    pooled = np.concatenate([x_train[:, :, feat], y_train[:, :, feat]], axis=1)
    mu, std = np.mean(pooled), np.std(pooled)
    for split in (x_train, y_train, x_test):
        split[:, :, feat] = (split[:, :, feat] - mu) / std
    scaling_values.append((mu, std))
    


100%|██████████| 10/10 [00:01<00:00,  6.91it/s]


In [9]:
# Feature axis order once df2d_to_array3d has dropped the first two columns:
# coin_index, open, high, low, close, ...  ->  `close` is at index 4 (the fifth feature).
close_idx = 4
y_train_close = y_train[:, :, close_idx]
x_train_close = x_train[:, :, close_idx].reshape((-1, 1380))
x_test_close = x_test[:, :, close_idx].reshape((-1, 1380))

In [10]:
# Sanity check: shapes of the close-price slices.
x_train_close.shape, y_train_close.shape, x_test_close.shape 

((7362, 1380), (7362, 120), (529, 1380))

In [11]:
# Build the two training targets from the future close prices.
#
# buy_quantities: 1.0 when at least half of the future steps close above the last
#                 observed input close, otherwise 0.0.
# sell_times:     for "buy" samples, the step at which the future close is highest;
#                 for all other samples, 0.
# NOTE(review): a label of 0 therefore means both "do not buy" and "sell at step 0"
# for the sell-time model — confirm this is intended.
last_input_close = x_train_close[:, -1:]  # kept 2-D so it broadcasts across the future steps
win_counts = (y_train_close > last_input_close).sum(axis=1)
win_count_threshold = int(y_train_close.shape[1] * 0.5)
should_buy = win_counts >= win_count_threshold

buy_quantities = should_buy.astype(float).reshape((-1, 1))
sell_times = np.where(should_buy, np.argmax(y_train_close, axis=1), 0).reshape((-1, 1))

## Define model

In [12]:
def _bilstm_trunk(input_shape):
    """Build the feature extractor shared by both models.

    Layers, in order: BatchNormalization -> two stacked bidirectional LSTMs
    (128 units each, dropout 0.25 after each) -> Dense(64, relu) ->
    BatchNormalization.

    Returns the Keras input tensor and the trunk's output tensor so that a
    caller can attach its own output head.
    """
    inputs = Input(input_shape)
    x = BatchNormalization()(inputs)
    # First recurrent layer keeps the full sequence so the second one can consume it.
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.25)(x)
    # Second recurrent layer collapses the sequence to a single vector.
    x = Bidirectional(LSTM(128, return_sequences=False))(x)
    x = Dropout(0.25)(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    return inputs, x


def buy_quantity_LSTM(input_shape=(1380, 10)):
    """Return a compiled binary classifier for the buy / don't-buy decision.

    Parameters
    ----------
    input_shape : tuple, optional
        (time_steps, features) of one sample. Defaults to (1380, 10), the
        shape this notebook trains on.

    Returns
    -------
    A compiled Keras Model with a single sigmoid output, trained with
    binary cross-entropy and the Adam optimizer.
    """
    inputs, features = _bilstm_trunk(input_shape)
    outputs = Dense(1, activation='sigmoid')(features)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model


def sell_time_LSTM(input_shape=(1380, 10), n_classes=120):
    """Return a compiled multi-class classifier over candidate sell steps.

    Parameters
    ----------
    input_shape : tuple, optional
        (time_steps, features) of one sample. Defaults to (1380, 10).
    n_classes : int, optional
        Number of softmax outputs, one per candidate sell step. Defaults to 120.

    Returns
    -------
    A compiled Keras Model with a softmax output. The loss is sparse
    categorical cross-entropy, so targets are integer class indices rather
    than one-hot vectors.
    """
    inputs, features = _bilstm_trunk(input_shape)
    outputs = Dense(n_classes, activation='softmax')(features)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return model

In [13]:
# Fresh, untrained instances: buy/no-buy classifier first, then the sell-time classifier.
model_bq, model_st = buy_quantity_LSTM(), sell_time_LSTM()

## Train buy quantity model 

In [None]:
# Checkpoint file names embed the epoch number and the validation loss of the saved model.
model_path = 'bq_LSTM_epoch_{epoch:03d}_val_{val_loss:.3f}.h5'

# All three callbacks watch the validation loss.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,   # halve the learning rate ...
    patience=2,   # ... after 2 epochs without improvement
    verbose=1,
)
checkpoint = ModelCheckpoint(
    filepath=model_path,
    monitor='val_loss',
    save_best_only=True,  # write a file only when val_loss improves
    verbose=1,
)
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# Fit the buy/no-buy classifier; 20% of the training data is held out for validation.
history = model_bq.fit(
    x_train,
    buy_quantities,
    batch_size=32,
    epochs=200,
    validation_split=0.2,
    callbacks=[learning_rate_reduction, checkpoint, early_stopping],
)

Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.18072, saving model to bq_LSTM_epoch_001_val_0.181.h5
Epoch 2/200

Epoch 00002: val_loss improved from 0.18072 to 0.08389, saving model to bq_LSTM_epoch_002_val_0.084.h5
Epoch 3/200

Epoch 00003: val_loss improved from 0.08389 to 0.04666, saving model to bq_LSTM_epoch_003_val_0.047.h5
Epoch 4/200

Epoch 00004: val_loss did not improve from 0.04666
Epoch 5/200

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00005: val_loss did not improve from 0.04666
Epoch 6/200

Epoch 00006: val_loss improved from 0.04666 to 0.03387, saving model to bq_LSTM_epoch_006_val_0.034.h5
Epoch 7/200

Epoch 00007: val_loss improved from 0.03387 to 0.03031, saving model to bq_LSTM_epoch_007_val_0.030.h5
Epoch 8/200

Epoch 00008: val_loss did not improve from 0.03031
Epoch 9/200

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 00009: val_loss did not improve from 0.03031
Epoch 

## Train sell time model

In [None]:
# Checkpoint file names embed the epoch number and the validation loss of the saved model.
model_path = 'st_LSTM_epoch_{epoch:03d}_val_{val_loss:.3f}.h5'

# All three callbacks watch the validation loss.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,   # halve the learning rate ...
    patience=2,   # ... after 2 epochs without improvement
    verbose=1,
)
checkpoint = ModelCheckpoint(
    filepath=model_path,
    monitor='val_loss',
    save_best_only=True,  # write a file only when val_loss improves
    verbose=1,
)
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# Fit the sell-time classifier; targets are the integer step indices in sell_times.
history = model_st.fit(
    x_train,
    sell_times,
    batch_size=32,
    epochs=200,
    validation_split=0.2,
    callbacks=[learning_rate_reduction, checkpoint, early_stopping],
)

Epoch 1/200

Epoch 00001: val_loss improved from inf to 3.62800, saving model to st_LSTM_epoch_001_val_3.628.h5
Epoch 2/200

Epoch 00002: val_loss improved from 3.62800 to 3.10442, saving model to st_LSTM_epoch_002_val_3.104.h5
Epoch 3/200

Epoch 00003: val_loss did not improve from 3.10442
Epoch 4/200

Epoch 00004: val_loss improved from 3.10442 to 3.08479, saving model to st_LSTM_epoch_004_val_3.085.h5
Epoch 5/200
  4/185 [..............................] - ETA: 43s - loss: 3.2124

## Make Prediction

In [17]:
# NOTE(review): these checkpoint names come from one particular training run (epoch and
# val_loss are baked into the file name). A fresh run will produce different names, so
# update both paths after retraining.
bq_checkpoint = 'bq_LSTM_epoch_018_val_0.022.h5'
st_checkpoint = 'st_LSTM_epoch_006_val_3.069.h5'

best_bq = load_model(bq_checkpoint)
best_st = load_model(st_checkpoint)

In [18]:
# Run the loaded buy-quantity model on the standardized test inputs.
predicted_bq = best_bq.predict(x_test)

In [19]:
# Run the loaded sell-time model on the standardized test inputs.
predicted_st = best_st.predict(x_test)

In [20]:
# For every prediction row, keep the index of its highest-scoring entry.
exact_times = [np.argmax(row_scores) for row_scores in predicted_st]

In [21]:
# Convert the list of arg-max indices into a NumPy array before writing it to the submission.
exact_times = np.asarray(exact_times)

In [22]:
# Fill the submission template by column position:
#   column 1 <- buy-quantity predictions, written unrounded (the displayed table
#               shows values such as 0.999779, not 0/1)
#   column 2 <- arg-max sell step for each test sample
# NOTE(review): predicted_bq comes straight from predict(); presumably shape (n, 1) —
# confirm the column assignment behaves as expected on the installed pandas version.
submission.iloc[:,1] = predicted_bq 
submission.iloc[:,2] = exact_times

In [24]:
# Display the filled-in submission table.
submission

Unnamed: 0,sample_id,buy_quantity,sell_time
0,0,0.999779,0
1,1,0.999002,0
2,2,0.999143,0
3,3,0.999407,0
4,4,0.999646,0
...,...,...,...
524,524,0.997583,0
525,525,0.999038,0
526,526,0.999740,0
527,527,0.998601,0


In [25]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('lstm_threshold.csv',index=False)

## Visualize result

In [108]:
def plot_series(x_series, y_series):
    """Plot an input series and its output series as one continuous line chart.

    The input series is drawn at x positions 0 .. len(x_series) - 1 and the
    output series directly after it, so the two appear joined end to end.
    Draws on the current matplotlib figure and adds a legend; returns nothing.

    Parameters
    ----------
    x_series : sequence of numbers
        The observed (input) values.
    y_series : sequence of numbers
        The values that follow the input series.
    """
    # The notebook imports matplotlib as `from matplotlib import pyplot`; the
    # previous body referenced `plt`, which is never defined and raised NameError.
    input_len = len(x_series)
    pyplot.plot(x_series, label='input_series')
    pyplot.plot(
        np.arange(input_len, input_len + len(y_series)),
        y_series,
        label='output_series',
    )
    pyplot.legend()
