## Load the training set

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
# Directory that holds the competition CSV files read by the cells below
# (train.csv, asset_details.csv, example_test.csv).
data_folder = "../input/g-research-crypto-forecasting/"

In [None]:
# Read the raw training rows and the per-asset metadata table.
crypto_df = pd.read_csv(f"{data_folder}train.csv")
asset_details = pd.read_csv(f"{data_folder}asset_details.csv")

In [None]:
# Preview the first 10 rows of the training data.
crypto_df.head(10)

In [None]:
# Split the combined frame into one timestamp-indexed frame per asset.
# The list position equals the Asset_ID value used for filtering.
crypto_assets = [
    crypto_df.loc[crypto_df["Asset_ID"] == asset_id].set_index("timestamp")
    for asset_id in range(len(asset_details))
]

# Preprocessing

## Dealing with missing data


In [None]:
# Print the per-column count of missing values for every asset frame.
for asset_frame in crypto_assets:
    print(asset_frame.isna().sum())

In [None]:
# Print the first and last timestamp of every asset as datetimes.
# The end of the range is the LAST index entry (index[-1]); the previous
# version printed index[1], i.e. the second row, as the end date.
for idx, asset_frame in enumerate(crypto_assets):
    first_ts = asset_frame.index[0].astype('datetime64[s]')
    last_ts = asset_frame.index[-1].astype('datetime64[s]')
    print('Assets', idx ,'data goes from ', first_ts, 'to ', last_ts)

In [None]:
# Put every asset on a gap-free 60-second grid spanning its first to last
# timestamp. Rows missing from the original index are padded from the previous
# row, then any remaining NaN cells are forward-filled column by column.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas 2.x.
for idx in range(len(crypto_assets)):
    asset_frame = crypto_assets[idx]
    minute_grid = range(asset_frame.index[0], asset_frame.index[-1] + 60, 60)
    crypto_assets[idx] = asset_frame.reindex(minute_grid, method='pad').ffill()

In [None]:
# Distribution of the gaps between consecutive timestamps for the asset at
# list position 1 (a sanity check after reindexing).
asset_index = crypto_assets[1].index
timestamp_gaps = asset_index[1:] - asset_index[:-1]
timestamp_gaps.value_counts().head()

In [None]:
# Log-return helper used for the price-based features.
def log_return(series, periods=1):
    """Return the difference of log(series) over `periods` steps.

    The first `periods` entries of the result are NaN because they have no
    earlier value to compare against.
    """
    logged = np.log(series)
    return logged - logged.shift(periods)

# Building your prediction model

In [None]:
# Select some input features from the trading data:
# 5 min log return, abs(1 min log return), upper shadow, and lower shadow.
def upper_shadow(asset):
    """Return High minus the larger of Close and Open, element-wise.

    `asset` must expose `High`, `Close` and `Open` attributes (e.g. a
    DataFrame with those columns).
    """
    return asset.High - np.maximum(asset.Close, asset.Open)


def lower_shadow(asset):
    """Return the smaller of Close and Open minus Low, element-wise.

    `asset` must expose `Low`, `Close` and `Open` attributes (e.g. a
    DataFrame with those columns).
    """
    return np.minimum(asset.Close, asset.Open) - asset.Low

# Per-asset feature frames (four columns: VWAP log return over 5 steps,
# absolute VWAP log return over 1 step, upper shadow, lower shadow) and the
# matching Target series.
X_assets = [
    pd.concat(
        [
            log_return(asset_frame.VWAP, periods=5),
            log_return(asset_frame.VWAP, periods=1).abs(),
            upper_shadow(asset_frame),
            lower_shadow(asset_frame),
        ],
        axis=1,
    )
    for asset_frame in crypto_assets
]
y_target = [asset_frame.Target for asset_frame in crypto_assets]

In [None]:
import calendar
import time


def totimestamp(s):
    """Convert a "dd/mm/YYYY" date string to epoch seconds at 00:00 UTC.

    `calendar.timegm` interprets the parsed date as UTC, so the result does
    not depend on the machine's local timezone (unlike `time.mktime`). The
    value is returned as a plain Python int, which avoids the 32-bit overflow
    of `np.int32` for dates after January 2038.
    """
    return calendar.timegm(datetime.strptime(s, "%d/%m/%Y").timetuple())


# [start, end] epoch-second bounds of each split; both ends are used as
# inclusive label-slice bounds by the cell that builds the raw splits.
train_window = [totimestamp("01/09/2021"), totimestamp("15/09/2021")]
val_window = [totimestamp("16/09/2021"), totimestamp("18/09/2021")]
test_window = [totimestamp("19/09/2021"), totimestamp("21/09/2021")]

def _window_values(frame, window):
    """Rows of `frame` with index labels in [window[0], window[1]], NaN -> 0, as ndarray."""
    first, last = window
    return frame.loc[first:last].fillna(0).to_numpy()  # filling NaN's with zeros


# One array per asset and split, in the same order as X_assets.
asset_positions = range(len(X_assets))

X_train_raw = [_window_values(X_assets[pos], train_window) for pos in asset_positions]
y_train_raw = [_window_values(y_target[pos], train_window) for pos in asset_positions]

X_val_raw = [_window_values(X_assets[pos], val_window) for pos in asset_positions]
y_val_raw = [_window_values(y_target[pos], val_window) for pos in asset_positions]

X_test_raw = [_window_values(X_assets[pos], test_window) for pos in asset_positions]
y_test_raw = [_window_values(y_target[pos], test_window) for pos in asset_positions]

In [None]:
# Inspect the raw (unscaled) training feature array of the asset at list position 1.
X_train_raw[1]

We now standardize the input data. Standardization is the process of putting different variables on the same scale. In regression analysis, it is often crucial to standardize your independent variables or you may risk obtaining misleading results.

In [None]:
from sklearn.preprocessing import StandardScaler
# simple preprocessing of the data
# Each asset gets its own scaler, fitted on that asset's TRAINING rows only.
# Validation and test rows are transformed with the training statistics;
# re-fitting on them (as before) leaks their distribution into the features
# and scales every split differently.
scalers = []  # fitted scaler per asset, same order as X_train_raw

X_train_scaled = []
X_val_scaled = []
X_test_scaled = []
for idx in range(len(X_train_raw)):
    scaler = StandardScaler()
    X_train_scaled.append(scaler.fit_transform(X_train_raw[idx]))
    X_val_scaled.append(scaler.transform(X_val_raw[idx]))
    X_test_scaled.append(scaler.transform(X_test_raw[idx]))
    scalers.append(scaler)

In [None]:
def windowingData(X_list, y_list, start, end, window_size, dataset, label):
    """Cut `dataset` into overlapping windows and pair each with a target.

    For every position `i` with `start <= i` and `i + window_size <= end`, the
    window is `dataset[i : i + window_size]` and its target is
    `label[i + window_size - 1]`, i.e. the label aligned with the last (most
    recent) row of that window.

    The previous implementation ignored `label` and used feature column 0 of
    the following row of `dataset` as the target.

    Parameters
    ----------
    X_list, y_list : list
        Output lists; one array of windows / one array of targets is appended
        to each (nothing is returned).
    start, end : int
        Half-open row range `[start, end)` of `dataset` to window over.
    window_size : int
        Number of consecutive rows per window.
    dataset : array-like
        Feature rows, indexable by slice along the first axis.
    label : array-like
        Targets, indexable by integer position, aligned row-for-row with
        `dataset`.
    """
    data, labels = [], []
    for first in range(start, end - window_size + 1):
        last = first + window_size
        # sequence of length window_size
        data.append(dataset[first:last])
        # target belonging to the last row inside the window
        labels.append(label[last - 1])
    X_list.append(np.array(data))
    y_list.append(np.array(labels))

# Windowed inputs and targets per split; each list receives one array per asset.
X_train_all, y_train_all = [], []
X_val_all, y_val_all = [], []
X_test_all, y_test_all = [], []

# (output windows, output targets, scaled features per asset, raw targets per asset)
split_jobs = (
    (X_train_all, y_train_all, X_train_scaled, y_train_raw),
    (X_val_all, y_val_all, X_val_scaled, y_val_raw),
    (X_test_all, y_test_all, X_test_scaled, y_test_raw),
)

for X_out, y_out, scaled_sets, raw_targets in split_jobs:
    for idx in range(len(scaled_sets)):
        windowingData(
            X_out,
            y_out,
            start=0,
            end=len(scaled_sets[idx]),
            window_size=14,
            dataset=scaled_sets[idx],
            label=raw_targets[idx],
        )

In [None]:
# Shapes of the windowed arrays for the asset at list position 1,
# in the order: train X/y, validation X/y, test X/y.
for per_asset_arrays in (X_train_all, y_train_all,
                         X_val_all, y_val_all,
                         X_test_all, y_test_all):
    print(per_asset_arrays[1].shape)

In [None]:
# Number of windowed training sets (one array was appended per asset).
len(X_train_all)

## GRU model

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.utils import to_categorical
# from keras.layers import Conv1D, GRU, LSTM, Dropout, Dense, CuDNNGRU
from keras.layers import Activation, Dense, Dropout, LSTM, BatchNormalization, CuDNNLSTM, CuDNNGRU

from keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Bidirectional

In [None]:
# # Fitting to the training set
# learning_rate = 0.0001
# earlyStop = keras.callbacks.EarlyStopping(min_delta=0.01, monitor='val_loss', patience=5, restore_best_weights=True)

# regressors = []
# hist = []
# for idx in range(len(X_train_all)):
#     regressors.append(tf.keras.models.Sequential([
#         tf.keras.layers.Input(shape=X_train_all[0].shape[-2:]),
#         tf.compat.v1.keras.layers.CuDNNLSTM(128, return_sequences=True),
#         tf.keras.layers.Dropout(0.1),
#         tf.compat.v1.keras.layers.CuDNNLSTM(60, return_sequences=False),
#         tf.keras.layers.Dropout(0.1),
#         tf.keras.layers.Dense(32, activation='relu'),
#         tf.keras.layers.Dense(1)
#     ]))
#     regressors[idx].summary()
#     # Compiling the RNN
#     regressors[idx].compile(optimizer=tf.optimizers.Adam(learning_rate),loss='mae', metrics=['mse'])
#     xva, yva = X_val_all[idx],y_val_all[idx]
#     history = regressors[idx].fit(
#         X_train_all[idx],
#         y_train_all[idx],
#         epochs=100,
#         callbacks=[earlyStop],
#         batch_size=256,
#         validation_data=(xva,yva)
#     )
#     hist.append(history)

In [None]:
# Fitting to the training set: one independent GRU regressor per asset.
learning_rate = 0.0001
# NOTE(review): min_delta=0.01 is an absolute improvement in validation MAE;
# confirm it is not larger than the loss values actually reached, otherwise
# training always stops after `patience` epochs.
earlyStop = keras.callbacks.EarlyStopping(min_delta=0.01, monitor='val_loss', patience=5, restore_best_weights=True)

regressors = []  # trained model per asset, same order as X_train_all
hist = []        # Keras History object per asset
for idx in range(len(X_train_all)):
    # tf.keras.layers.GRU replaces tf.compat.v1.keras.layers.CuDNNGRU: with
    # its default arguments it selects the cuDNN kernel on a GPU and still
    # runs on CPU, whereas the compat.v1 layer is GPU-only legacy API.
    model = tf.keras.models.Sequential([
        # (window_size, n_features) of this asset's windows
        tf.keras.layers.Input(shape=X_train_all[idx].shape[-2:]),
        tf.keras.layers.GRU(128, return_sequences=True),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.GRU(60, return_sequences=False),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1)
    ])
    regressors.append(model)
    model.summary()
    # Compiling the RNN
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mae')
    xva, yva = X_val_all[idx], y_val_all[idx]
    history = model.fit(
        X_train_all[idx],
        y_train_all[idx],
        epochs=100,
        callbacks=[earlyStop],
        batch_size=256,
        validation_data=(xva, yva)
    )
    hist.append(history)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import math

# Evaluate every per-asset model on its test windows.
predict_list = []  # raw model output per asset, shape as returned by predict()
mse = 0            # running sum of per-asset MSE
mape = 0           # running sum of per-asset MAPE (percent)
for idx in range(len(X_test_all)):
    predict_list.append(regressors[idx].predict(X_test_all[idx]))
    # Flatten both sides: predict() returns (n, 1) while the targets are (n,),
    # and (n,) - (n, 1) would broadcast to an (n, n) matrix.
    y_true = np.asarray(y_test_all[idx]).ravel()
    y_pred = np.asarray(predict_list[idx]).ravel()
    mse += mean_squared_error(y_true, y_pred)
    # MAPE is undefined where the target is exactly 0 (e.g. NaN targets that
    # were filled with 0), so those rows are left out of the percentage error.
    nonzero = y_true != 0
    if nonzero.any():
        mape += np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

# Square root of the mean per-asset MSE, hence reported as RMSE.
print("rmse: %s" % math.sqrt(mse/len(X_test_all)))
print("mape: %s" % (mape/len(X_test_all)))


In [None]:
import matplotlib.pyplot as plt

# One figure per trained model: training loss as dots, validation loss as a line.
for idx_plot in range(len(regressors)):
    curves = hist[idx_plot].history
    train_curve = curves['loss']
    val_curve = curves['val_loss']
    epoch_axis = range(1, len(train_curve) + 1)

    plt.plot(epoch_axis, train_curve, 'bo', label='Training loss')
    plt.plot(epoch_axis, val_curve, 'b', label='Validation loss')

    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
def line_plot(line1, line2, idx, label1=None, label2=None, title='', lw=2):
    """Draw two series on one new 13x7 figure.

    `line1`/`line2` are plotted with legend labels `label1`/`label2` and line
    width `lw`; `idx` is appended to the y-axis label and `title` is used as
    the axes title. Nothing is returned.
    """
    fig, ax = plt.subplots(1, figsize=(13,7))
    for series, series_label in ((line1, label1), (line2, label2)):
        ax.plot(series, label=series_label, linewidth=lw)
    ax.set_ylabel('price diff ' + str(idx), fontsize=14)
    ax.set_title(title, fontsize=16)
    ax.legend(loc='best', fontsize=16)

# Compare actual test targets with model predictions, one figure per asset.
for idx, predicted in enumerate(predict_list):
    line_plot(y_test_all[idx], predicted, idx, 'actual', 'prediction', lw=3)

In [None]:
# Inspect the first test window of the asset at list position 0.
X_test_all[0][0]

In [None]:
def get_features(row):
    """Append one test row to its asset's history and return the updated feature window.

    `row` may be a one-row DataFrame or the Series produced by
    `DataFrame.iterrows()`; it must contain `Asset_ID`, `timestamp` and the
    columns read by `log_return`, `upper_shadow` and `lower_shadow`
    (VWAP, Open, High, Low, Close).

    Relies on two module-level containers that must exist before the first
    call — NOTE(review): they are not created anywhere in the visible
    notebook; confirm where they are initialised:
      * `X_train_ref[asset_id]`: history frame indexed by timestamp.
      * `last_feature_set[asset_id]`: 2-D array holding the most recent
        feature rows (presumably one model window — TODO confirm).

    Returns the updated `last_feature_set[asset_id]`: the oldest row is
    dropped and the features of `row` are appended. The features are NOT
    standardized here.
    """
    # iterrows() yields a Series; turn it into a one-row frame so that the
    # column access and set_index below work for both input kinds.
    if isinstance(row, pd.Series):
        row = row.to_frame().T.infer_objects()
    asset_id = int(row['Asset_ID'].values[0])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # (The previous version also referenced an undefined name `test_row`.)
    X_train_ref[asset_id] = pd.concat([X_train_ref[asset_id], row.set_index("timestamp")])
    asset_history = X_train_ref[asset_id]

    raw_feature = pd.concat([log_return(asset_history.VWAP, periods=5),
                             log_return(asset_history.VWAP, periods=1).abs(),
                             upper_shadow(asset_history),
                             lower_shadow(asset_history)], axis=1).to_numpy()
    # scaled_feature = scaler.fit_transform([raw_feature[-1]])

    # Slide the window: drop the oldest row, append the newest feature row.
    reduce_set = np.delete(last_feature_set[asset_id], (0), axis=0)
    last_feature_set[asset_id] = np.append(reduce_set, [raw_feature[-1]], axis=0)
    return last_feature_set[asset_id]

In [None]:
# Load the example test file and print it row by row.
df_test = pd.read_csv(data_folder + 'example_test.csv')
for idx in range(len(df_test)):
    # Positional row access; df_test[idx] would look up a COLUMN named idx
    # and raise KeyError.
    print(df_test.iloc[idx])
#     regressors[df_test[idx].Asset_ID].predict(df_test)

In [None]:
import traceback

import gresearch_crypto
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# For every test batch, predict each row with its asset's model and submit.
# NOTE(review): get_features has its scaling step commented out, while the
# models were trained on standardized inputs — confirm this is intended.
for i, (df_test, df_pred) in enumerate(iter_test):
    for j, row in df_test.iterrows():
        row_mask = df_pred['row_id'] == row['row_id']
        # iterrows() upcasts the row, so Asset_ID may arrive as a float;
        # a list index must be an int.
        asset_id = int(row['Asset_ID'])
        model = regressors[asset_id]

        if model is not None:
            try:
                X_test = np.asarray(get_features(row))
                # predict() expects a leading batch axis: (1, window, features).
                if X_test.ndim == 2:
                    X_test = X_test[np.newaxis, ...]
                y_pred = model.predict(X_test)
                # predict() returns an array; store the single scalar value.
                df_pred.loc[row_mask, 'Target'] = float(np.ravel(y_pred)[0])
            except Exception:
                # Best effort: fall back to a neutral prediction but show why.
                df_pred.loc[row_mask, 'Target'] = 0
                traceback.print_exc()
        else:
            df_pred.loc[row_mask, 'Target'] = 0

    env.predict(df_pred)

In [None]:
# import math
# from sklearn.metrics import mean_squared_error
# # print('Test score for LSTM: BTC', np.corrcoef(predicted_btc, y_test.reshape(len(y_test),1))[0,1])
# rmse = math.sqrt(mean_squared_error(y_test, predicted_btc.flatten()))
# print(rmse)