In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training CSV into memory; later cells filter it by Asset_ID and timestamp.
df_train = pd.read_csv('../input/g-research-crypto-forecasting/train.csv')

In [None]:
#all imports
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential, layers, callbacks
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import LearningRateScheduler


For training the models, my initial plan was to run for 10 epochs. However, the dataset is huge, so training for fewer epochs with learning rate scheduling is a better compromise and, most importantly, it saves time.
To learn more about learning rate scheduling, see:
https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/

In [None]:
#lr scheduling 
import math
def step_decay(epoch):
    """Return the step-decayed learning rate for the given epoch.

    The schedule is
    ``initial_lrate * drop ** floor((1 + epoch) / epochs_drop)``:
    the rate starts at ``initial_lrate`` and is multiplied by ``drop`` each
    time ``1 + epoch`` crosses a multiple of ``epochs_drop``.

    The function deliberately takes a single positional argument. Keras'
    ``LearningRateScheduler`` may also try to pass the current learning rate
    as a second argument, so extra positional parameters must not be added.

    Args:
        epoch: Epoch index for which the learning rate is requested.

    Returns:
        The learning rate as a float.
    """
    initial_lrate = 0.1
    # Multiplicative factor applied at every step; 0.5 halves the rate.
    drop = 0.5
    # Number of epochs between two consecutive drops.
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
    return lrate

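This function limits the influence of outliers using quantile capping: in every column, values below the 1st percentile or above the 90th percentile are replaced by that percentile.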

In [None]:
def quantile_capping(df, lower=0.01, upper=0.90):
    """Cap every column of ``df`` to its ``lower``/``upper`` quantiles.

    Values at or below the lower quantile are set to that quantile and values
    at or above the upper quantile are set to the upper quantile. Each column
    is capped independently using its own quantiles.

    The columns are reassigned with ``Series.clip`` instead of chained
    indexing (``df[col][mask] = value``), which raises
    ``SettingWithCopyWarning`` and does not modify ``df`` at all when pandas
    copy-on-write is active.

    Args:
        df: DataFrame of numeric columns. It is modified in place.
        lower: Lower quantile in [0, 1]; defaults to 0.01.
        upper: Upper quantile in [0, 1]; defaults to 0.90.

    Returns:
        The same ``df`` object, with capped columns.
    """
    for col in df.columns:
        low_cap, high_cap = df[col].quantile([lower, upper]).values
        # NaN entries are left untouched by clip, matching a masked comparison.
        df[col] = df[col].clip(lower=low_cap, upper=high_cap)
    return df

The log_return function takes a series, i.e. one particular column of our input data, computes the logarithm of its values and returns the difference over the given number of periods. It is the same implementation as the one shown in the tutorial notebook, except that I apply np.abs(series) first so that negative values do not produce undefined logarithms.

In [None]:
#log difference
def log_return(series, periods=1):
    """Return the ``periods``-step difference of the log of ``|series|``.

    Args:
        series: pandas Series of values; the absolute value is taken before
            the logarithm.
        periods: Number of rows to difference over; defaults to 1.

    Returns:
        A Series of log differences; the first ``periods`` entries are NaN.
    """
    log_magnitude = np.log(np.abs(series))
    return log_magnitude.diff(periods=periods)

create_model_0to9 - Models for asset_id 0 to 9

create_model_10to13 - Models for asset_id 10 to 13

Why different models? The last set of models (asset IDs 10 to 13) did not converge. I made some changes, such as to the activation functions and adding gradient clipping, but the loss value was still NaN. If you can suggest some fixes, please write them in the comment section below.

In [None]:
#lstm model
def create_model_0to9(X):
    """Build and compile the two-layer LSTM regressor used for asset ids 0-9.

    Args:
        X: Array whose ``shape[1]`` and ``shape[2]`` give the LSTM input
            shape (timesteps, features per timestep).

    Returns:
        A compiled Keras ``Sequential`` model with a single linear output,
        trained with MSE loss and the Adam optimizer.
    """
    timesteps, n_features = X.shape[1], X.shape[2]
    stacked_lstm = Sequential([
        # First LSTM layer returns the full sequence so it can feed the second one.
        LSTM(64, return_sequences=True, input_shape=[timesteps, n_features]),
        Dropout(0.2),
        # Second LSTM layer keeps only its final output.
        LSTM(64),
        Dropout(0.2),
        Dense(units=1),
    ])
    stacked_lstm.compile(loss='mse', optimizer='adam')
    return stacked_lstm

In [None]:
def create_model_10to13(X):
    """Build and compile the two-layer LSTM regressor used for asset ids 10-13.

    The architecture is a stack of two 64-unit LSTM layers with dropout and a
    single linear output. It is trained with MSE loss and SGD with momentum
    and gradient value clipping.

    The LSTM layers keep the Keras default activations. Passing
    ``activation=None`` / ``recurrent_activation=None`` makes the gates linear
    and unbounded, which lets the cell state grow without limit and is a
    common cause of NaN losses.

    Args:
        X: Array whose ``shape[1]`` and ``shape[2]`` give the LSTM input
            shape (timesteps, features per timestep).

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # First LSTM layer returns the full sequence so it can feed the second one.
    model.add(LSTM(64, return_sequences=True,
                   input_shape=[X.shape[1], X.shape[2]]))
    model.add(Dropout(0.2))
    # Second LSTM layer keeps only its final output.
    model.add(LSTM(64))
    model.add(Dropout(0.2))
    model.add(Dense(units=1, activation=None))
    # `learning_rate` replaces the deprecated `lr` keyword; clipvalue bounds
    # each gradient component to [-5, 5].
    opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, clipvalue=5.0)
    model.compile(loss='mse', optimizer=opt)
    return model

Feature extraction - same as implemented in the tutorial notebook.

In [None]:
def upper_shadow(asset):
    """Return High minus the larger of Close and Open for each row of ``asset``."""
    body_top = np.maximum(asset.Close, asset.Open)
    return asset.High - body_top


def lower_shadow(asset):
    """Return the smaller of Close and Open minus Low for each row of ``asset``."""
    body_bottom = np.minimum(asset.Close, asset.Open)
    return body_bottom - asset.Low

Here we go!
The training commences for each and every asset_id
Summary of the process-
1. Train data is restricted to timestamps before 1623542400. The example test data starts from there, so this avoids data leakage.
2. Removing missing values and replacing 0 with 1 to avoid the undefined log(0) error.
3. Scaler to normalise the input values.
4. Reshape - LSTM models take 3D inputs. To know more, visit: https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/
5. Save the model for future use. Copy the link from "Copy File Path" in the Output section beside "/kaggle/working".


In [None]:
#build one model for each asset_id

# Train and save one model per asset id, for asset ids 0 through 9.
for i in range(0, 10):
    train_data = df_train[df_train["Asset_ID"] == i]
    # Keep only rows before the cutoff timestamp so the held-out period is never seen.
    train_data = train_data[train_data["timestamp"] < 1623542400]

    # Fill missing targets by interpolation without mutating the filtered slice.
    # limit_direction="both" also fills gaps at the very start of the series;
    # a single NaN target would otherwise turn the MSE loss into NaN.
    y_c = train_data["Target"].interpolate(limit_direction="both").to_numpy()

    X_c = pd.concat([log_return(train_data.VWAP, periods=5),
                     log_return(train_data.VWAP, periods=1).abs(),
                     upper_shadow(train_data), lower_shadow(train_data)], axis=1)
    X_c.columns = ['VWAP-5', 'VWAP-1', 'UpperShadow', 'LowerShadow']
    # log(0) yields -inf and differencing it yields +/-inf or NaN; treat all of
    # these as missing and fill them with 1, the same fill value used for NaN.
    X_c = X_c.replace([np.inf, -np.inf], np.nan).fillna(1).reset_index(drop=True)
    X_c = quantile_capping(X_c)

    scaler = StandardScaler()
    X_c_scaled = scaler.fit_transform(X_c)
    # Reshape to (samples, 4, 1): each of the four features becomes one
    # timestep carrying a single value, which is the 3-D layout the LSTM expects.
    X_c_scaled = X_c_scaled.reshape(X_c_scaled.shape[0], X_c_scaled.shape[1], 1)

    model = create_model_0to9(X_c_scaled)
    lrate = LearningRateScheduler(step_decay)
    callbacks_list = [lrate]
    # Pass the flat callback list; shuffle=False keeps the chronological order,
    # so validation_split takes the most recent 20% of rows.
    model.fit(X_c_scaled, y_c, epochs=5, validation_split=0.2,
              batch_size=32, shuffle=False, callbacks=callbacks_list)
    model.save('./model_asset_id{}.h5'.format(i))
    
    

In [None]:
# Train and save one model per asset id, for asset ids 10 through 13.
for i in range(10, 14):
    train_data = df_train[df_train["Asset_ID"] == i]
    # Keep only rows before the cutoff timestamp so the held-out period is never seen.
    train_data = train_data[train_data["timestamp"] < 1623542400]

    # Fill missing targets by interpolation without mutating the filtered slice.
    # limit_direction="both" also fills gaps at the very start of the series;
    # a single NaN target would otherwise turn the MSE loss into NaN.
    y_c = train_data["Target"].interpolate(limit_direction="both").to_numpy()

    X_c = pd.concat([log_return(train_data.VWAP, periods=5),
                     log_return(train_data.VWAP, periods=1).abs(),
                     upper_shadow(train_data), lower_shadow(train_data)], axis=1)
    X_c.columns = ['VWAP-5', 'VWAP-1', 'UpperShadow', 'LowerShadow']
    # log(0) yields -inf and differencing it yields +/-inf or NaN; treat all of
    # these as missing and fill them with 1, the same fill value used for NaN.
    X_c = X_c.replace([np.inf, -np.inf], np.nan).fillna(1).reset_index(drop=True)
    X_c = quantile_capping(X_c)

    scaler = StandardScaler()
    X_c_scaled = scaler.fit_transform(X_c)
    # Reshape to (samples, 4, 1): each of the four features becomes one
    # timestep carrying a single value, which is the 3-D layout the LSTM expects.
    X_c_scaled = X_c_scaled.reshape(X_c_scaled.shape[0], X_c_scaled.shape[1], 1)

    model = create_model_10to13(X_c_scaled)
    lrate = LearningRateScheduler(step_decay)
    callbacks_list = [lrate]
    # Pass the flat callback list; shuffle=False keeps the chronological order,
    # so validation_split takes the most recent 20% of rows.
    model.fit(X_c_scaled, y_c, epochs=5, validation_split=0.2,
              batch_size=32, shuffle=False, callbacks=callbacks_list)
    model.save('./model_asset_id{}.h5'.format(i))
    

In [None]:
# Load the example test CSV; the prediction loop further down filters it by Asset_ID
# and builds the same four features that are used for training.
df_test=pd.read_csv('../input/g-research-crypto-forecasting/example_test.csv')
#df_test['Target']=0

In [None]:
# Load the example submission file; its group_num and row_id columns are copied into the output frame.
df_submission=pd.read_csv('../input/g-research-crypto-forecasting/example_sample_submission.csv')

Creating a copy of df_submission because the original file is in read only format. 

In [None]:
# Start from an empty frame with the three output columns, then copy the
# identifier columns over from the sample submission; Target stays unfilled.
submission_df_op = pd.DataFrame(columns=['group_num', 'row_id', 'Target'])
for id_column in ('group_num', 'row_id'):
    submission_df_op[id_column] = df_submission[id_column]

Testing starts here-

In [None]:
# Predict with each saved per-asset model and write the predictions into
# submission_df_op, matching rows by row_id.
for i in range(0, 14):
    test_data = df_test[df_test["Asset_ID"] == i]
    if test_data.empty:
        # Nothing to predict for this asset; the scaler cannot be fit on zero rows.
        continue

    X_c = pd.concat([log_return(test_data.VWAP, periods=5),
                     log_return(test_data.VWAP, periods=1).abs(),
                     upper_shadow(test_data), lower_shadow(test_data)], axis=1)
    X_c.columns = ['VWAP-5', 'VWAP-1', 'UpperShadow', 'LowerShadow']
    # log(0) yields -inf and differencing it yields +/-inf or NaN; treat all of
    # these as missing and fill them with 1, the same fill value used for NaN.
    X_c = X_c.replace([np.inf, -np.inf], np.nan).fillna(1).reset_index(drop=True)

    # NOTE(review): the scaler is re-fit on the test rows instead of reusing the
    # statistics seen during training - confirm this is intended.
    scaler = StandardScaler()
    X_c_scaled = scaler.fit_transform(X_c)
    X_c_scaled = X_c_scaled.reshape(X_c_scaled.shape[0], X_c_scaled.shape[1], 1)

    model = keras.models.load_model('./model_asset_id{}.h5'.format(i))
    y = model.predict(X_c_scaled)

    # Map every row_id of this asset to its own prediction. Looking predictions
    # up by row_id keeps them aligned even if the submission rows are ordered
    # differently from the test rows, and avoids a list scan per submission row.
    pred_by_row_id = dict(zip(test_data["row_id"].to_list(), y[:, 0].astype(float)))
    matched = submission_df_op["row_id"].isin(list(pred_by_row_id))
    submission_df_op.loc[matched, "Target"] = (
        submission_df_op.loc[matched, "row_id"].map(pred_by_row_id)
    )
    
    
    
    
    
    
    

Merge the dataframes and obtain rmse score

In [None]:
#sample code
#df_predandactual=pd.merge(df_pred_dftest,df_train,on='timestamp',how='inner')
#from sklearn.metrics import mean_squared_error
#mean_squared_error(df_predandactual['Target_x'], df_predandactual['Target_y'])

The comparison was done between the predictions of models 0 to 9 and 13 and the actual values.
Received RMSE score: 3.4427147667227486e-05

In [None]:

# Write the filled submission frame to ./output.csv without the index column.
submission_df_op.to_csv('./output.csv',index=False)
