In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only input tree and print the full path of every file in it
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

## Install external pandas_ta package

Originally, this worked and I calculated the technical indicators using the pandas_ta package. However, overnight it stopped working due to an error that I cannot trace and that seems to be Kaggle-related. This is why I now calculate the indicators manually.


1. Download the tar.gz package file from PyPI
2. Rename the file extension from .tar.gz to .xyz to prevent Kaggle from unpacking the archive automatically
3. Upload the .xyz file to your Kaggle datasets and add that dataset to this notebook
4. Run the code below

In [None]:
#! mkdir -p /tmp/pip/cache/
#! cp ../input/pandas-ta/pandas-ta-0.3.14.xyz /tmp/pip/cache/pandas-ta-0.3.14.tar.gz
#! pip install --no-index --find-links /tmp/pip/cache/ pandas-ta

## Import data

In [None]:
import os
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import gc

directory = '/kaggle/input/g-research-crypto-forecasting/'

# Column -> dtype mapping; its keys also select which columns are read from train.csv
dtypes = {'timestamp': np.int64, 'Asset_ID': np.int8, 'Count': np.int32}
dtypes.update({
    price_column: np.float64
    for price_column in ('Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Target')
})

# Read the training data, index it by timestamp and sort it by that index
file_path = os.path.join(directory, 'train.csv')
data = (
    pd.read_csv(file_path, dtype=dtypes, usecols=list(dtypes))
    .set_index('timestamp')
    .sort_index()
)

# Read the asset details table as is
file_path = os.path.join(directory, 'asset_details.csv')
details = pd.read_csv(file_path)

## Define functions

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

# a function to generate technical indicators and return features, targets, and an index
def prepare_features(df_in, length = 15, with_target = True):
    """Compute SMA, RSI and ATR indicators and return them as model features.

    The input frame is copied, so the caller's data is left untouched. Rows for
    which any of the three indicators is NaN are removed before returning.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame with the columns 'Close', 'High' and 'Low' (and 'Target' when
        ``with_target`` is True).
    length : int, default 15
        Window used for the rolling mean (SMA), the exponentially weighted
        means (RSI, ``com = length - 1``) and the rolling true-range sum (ATR).
    with_target : bool, default True
        Whether to also return the 'Target' column of the remaining rows.

    Returns
    -------
    X : pandas.DataFrame
        Columns 'sma', 'rsi' and 'atr' of the remaining rows.
    y : pandas.Series or None
        The 'Target' column of the remaining rows, or None if ``with_target``
        is False.
    index : pandas.Index
        Index of the remaining rows.
    """
    indicator_columns = ['sma', 'rsi', 'atr']

    frame = df_in.copy()
    close = frame['Close']
    high = frame['High']
    low = frame['Low']
    previous_close = close.shift()

    # simple moving average of the close
    frame['sma'] = close.rolling(window = length).mean()

    # rsi: split close-to-close changes into gains and (positive) losses ...
    change = close.diff()
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).mul(-1)

    # ... smooth both with the same exponentially weighted mean ...
    smoothing = dict(com = length - 1, adjust=True, min_periods = length)
    average_gain = gains.ewm(**smoothing).mean()
    average_loss = losses.ewm(**smoothing).mean()

    # ... and map their ratio onto the 0-100 scale
    frame['rsi'] = 100 - (100/(1 + average_gain / average_loss))

    # atr: the true range is the row-wise maximum of the three candidate ranges
    candidate_ranges = pd.concat(
        [high - low, (high - previous_close).abs(), (low - previous_close).abs()],
        axis=1,
    )
    true_range = candidate_ranges.max(axis=1)
    frame['atr'] = true_range.rolling(length).sum() / length

    # keep only rows for which every indicator is available
    frame = frame.dropna(subset = indicator_columns)
    X = frame[indicator_columns]
    y = frame.Target if with_target else None

    return X, y, frame.index

# a function to define the recurrent network
def make_rnn_model(inpt_shape):
    """Build the (uncompiled) recurrent network used for every asset.

    The network is a Keras ``Sequential`` stack: an input layer of shape
    ``inpt_shape``, a GRU layer with 10 units, a dropout layer with rate 0.25
    and a dense layer with a single output unit.

    Parameters
    ----------
    inpt_shape : tuple
        Shape passed to ``InputLayer(input_shape=...)``.

    Returns
    -------
    tf.keras.Sequential
        The assembled model.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.InputLayer(input_shape = inpt_shape))
    model.add(layers.GRU(10))
    model.add(layers.Dropout(0.25))
    model.add(layers.Dense(1))

    return model

In [None]:
# The models are generated on my local machine using this code

#import os
#import numpy as np
#import pandas as pd
#pd.options.mode.chained_assignment = None
#import gc
#from datetime import datetime
#from sklearn.preprocessing import MinMaxScaler
#import tensorflow as tf
#from pickle import dump

#directory = '/Users/ralfkellner/Library/Mobile Documents/com~apple~CloudDocs/Python/Module/DLTA/datasets/cryptoPrediction/'
#file_path = os.path.join(directory, 'train.csv')
#dtypes = {
#    'timestamp': np.int64,
#    'Asset_ID': np.int8,
#     'Count': np.int32,
#     'Open': np.float64,
#     'High': np.float64,
#     'Low': np.float64,
#    'Close': np.float64,
#     'Volume': np.float64,
#     'VWAP': np.float64,
#    'Target': np.float64,
#}

#data = pd.read_csv(file_path, dtype=dtypes, usecols=list(dtypes.keys()))

#file_path = os.path.join(directory, 'asset_details.csv')
#details = pd.read_csv(file_path)

#data.set_index('timestamp', inplace = True)
#data.sort_index(inplace = True)

#train_start = datetime.timestamp(datetime(2021,1,1))
#train_end = datetime.timestamp(datetime(2021,5,1))
#test_end = datetime.timestamp(datetime(2021,6,1))

#train_data = data.loc[train_start:train_end]
#test_data = data.loc[train_end:test_end]

#def prepare_features(df_in, length = 15, with_target = True):

#    df_tmp = df_in.copy()
    # calculate sma
#    df_tmp['sma'] = df_tmp.Close.rolling(window = length).mean()

    # calculate rsi with ewma
#    close_delta = df_tmp['Close'].diff()

    # Make two series: one for lower closes and one for higher closes
#    up = close_delta.clip(lower=0)
#    down = -1 * close_delta.clip(upper=0)

    # Use exponential moving average
#    ma_up = up.ewm(com = length - 1, adjust=True, min_periods = length).mean()
#    ma_down = down.ewm(com = length - 1, adjust=True, min_periods = length).mean()

#    rsi = ma_up / ma_down
#    df_tmp['rsi'] = 100 - (100/(1 + rsi))

    # calculate atr
#    high_low = df_tmp['High'] - df_tmp['Low']
#    high_close = np.abs(df_tmp['High'] - df_tmp['Close'].shift())
#    low_close = np.abs(df_tmp['Low'] - df_tmp['Close'].shift())

#    ranges = pd.concat([high_low, high_close, low_close], axis=1)
#    true_range = np.max(ranges, axis=1)

#    df_tmp['atr'] = true_range.rolling(length).sum() / length
    
    
#    df_tmp.dropna(subset = ['sma', 'rsi', 'atr'], inplace = True)
#    X = df_tmp[['sma', 'rsi', 'atr']]
    
#    if with_target:
#        y = df_tmp.Target
#    else:
#        y = None
        
#    return X, y, df_tmp.index


#for id_ in train_data.Asset_ID.unique():
#    train_asset = train_data[train_data.Asset_ID == id_]
#    test_asset = test_data[test_data.Asset_ID == id_]

#    X_train, y_train, idx = prepare_features(train_asset)
#    X_test, y_test, idx = prepare_features(test_asset)

#    X_scaler = MinMaxScaler()
#    X_scaler.fit(X_train)

#    dump(X_scaler, open(f'network_scalers/{id_}.pkl', 'wb'))
#    print(f'Scaler for asset {id_} has been exported.')

#    X_train_, X_test_ = X_scaler.transform(X_train), X_scaler.transform(X_test)

    # build a recurrent network
#    lookback = 30

#    X_train_rnn = []
#    y_train_rnn = []

#    for t in range(len(X_train_) - lookback):
#        X_train_rnn.append(X_train_[t:(t + lookback)])
#        y_train_rnn.append(y_train.values[(t + lookback)])

#    X_test_rnn = []
#    y_test_rnn = []

#    for t in range(len(X_test_) - lookback):
#        X_test_rnn.append(X_test_[t:(t + lookback)])
#        y_test_rnn.append(y_test.values[(t + lookback)])

#    X_train_rnn = np.array(X_train_rnn)
#    X_test_rnn = np.array(X_test_rnn)

#    y_train_rnn = np.array(y_train_rnn)
#    y_test_rnn = np.array(y_test_rnn)

#    model = tf.keras.Sequential([
#        tf.keras.layers.InputLayer(input_shape = (X_train_rnn.shape[1], X_train_rnn.shape[2])),
#        tf.keras.layers.GRU(10),
#        tf.keras.layers.Dropout(0.25),
#        tf.keras.layers.Dense(1)
#    ])

#    model.compile(loss = 'mean_absolute_error', optimizer = 'adam')
#    model.fit(X_train_rnn, y_train_rnn, epochs = 1, validation_data = (X_test_rnn, y_test_rnn))

#    print(np.corrcoef(model.predict(X_train_rnn).flatten(), y_train_rnn)[0, 1])
#    print(np.corrcoef(model.predict(X_test_rnn).flatten(), y_test_rnn)[0, 1])

#    model.save_weights(f'network_weights/rnn/{id_}.h5', save_format = 'h5')
#    print(f'Model for asset {id_} has been exported.')

## Load scalers, models and recent data

Scalers and models are built using this notebook: https://www.kaggle.com/rkellner/easy-gru-network-with-technical-indicators

In [None]:
from pickle import load
from datetime import datetime

# Most recent training rows (without the target), used to warm up the indicator windows.
# The slice runs from 2021-06-01 to timestamp 1623542340 (2021-06-12 23:59:00 UTC).
# NOTE(review): datetime.timestamp() interprets a naive datetime in the machine's local
# timezone, so the start of this slice depends on the kernel's timezone setting.
data_recent = data.drop(['Target'], axis = 1).loc[datetime.timestamp(datetime(2021,6,1)):1623542340]
# training rows carry no row_id; the column is added as NaN
data_recent.loc[:, 'row_id'] = np.nan

scalers = {}
models = {}
data_memory = {}

length = 15
lookback = 30

for id_ in details.Asset_ID:
    # Use a context manager so the scaler file is closed again right after loading
    # (the bare load(open(...)) leaked one open file handle per asset).
    # NOTE: unpickling can execute arbitrary code - only load scaler files you trust.
    with open(f'/kaggle/input/scalers/{id_}.pkl', 'rb') as scaler_file:
        scalers[id_] = load(scaler_file)

    # rebuild the network architecture and restore the exported weights for this asset
    models[id_] = make_rnn_model((lookback, scalers[id_].n_features_in_))
    models[id_].load_weights(f'/kaggle/input/rnn-weights/{id_}.h5')

    # keep the last 50 rows per asset as rolling memory (more than length + lookback = 45)
    data_memory[id_] = data_recent[data_recent.Asset_ID == id_].iloc[-50:, :]

## Make predictions

In [None]:
import gresearch_crypto
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

start_time = datetime.now()

for (test_df, sample_predictions) in iter_test:

    # set the timestamp as index for new data
    test_df.set_index('timestamp', inplace = True)

    # Per-asset prediction frames of this iteration. Starting from a fresh list on every
    # iteration guarantees that nothing from the previous iteration is carried over and
    # avoids using bare `except:` clauses as control flow, which would also hide real errors.
    prediction_frames = []

    for id_ in details.Asset_ID:

        # extract data for a specific id
        test_df_tmp = test_df[test_df.Asset_ID == id_]
        # append new data information to the id data memory
        data_memory[id_] = pd.concat([data_memory[id_], test_df_tmp])
        # get features for old and new observations (target and index are not needed here)
        X_data, _, _ = prepare_features(data_memory[id_], with_target = False)
        # scale features
        X_data_ = scalers[id_].transform(X_data)

        # prepare for rnn model: one window of `lookback` consecutive feature rows per prediction
        X_data_rnn = np.array([X_data_[t:(t + lookback)] for t in range(len(X_data_) - lookback)])

        # make prediction and add this to the id specific data sheet
        # The leading NaN padding covers the rows without a prediction. The lengths only
        # line up when prepare_features dropped exactly `length` rows from the memory frame.
        data_memory[id_].loc[:, 'Target_hat'] = np.concatenate([np.array([np.nan] * (length + lookback)), models[id_](X_data_rnn).numpy().flatten()])

        # collect the rows of this iteration (only new rows carry a row_id)
        prediction_frames.append(data_memory[id_].dropna(subset = ['row_id']))

        # delete past observations to avoid the data memory getting too large
        data_memory[id_] = data_memory[id_].iloc[len(test_df_tmp):, :]
        data_memory[id_].loc[: , 'row_id'] = np.nan
        data_memory[id_].loc[: , 'Target_hat'] = np.nan

    # build the current predictions dataframe for all ids at this iteration in one go
    current_predictions = pd.concat(prediction_frames)

    # make sample prediction
    sample_predictions = sample_predictions.merge(current_predictions, on = 'row_id', how = 'inner')
    sample_predictions = sample_predictions[['row_id', 'Target_hat']].rename(columns={"Target_hat": "Target"})

    env.predict(sample_predictions)
    
time_elapsed = datetime.now() - start_time
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))

Notebook timeout: Unfortunately, every approach that takes longer than approximately 0.8 seconds per test iteration fails on submission and results in a runtime error. While the code above can surely be optimized in terms of run-time, I find this limitation quite significant. The model above is very simple and far from realistic; still, together with the data handling, it takes too long. Quite frustrating!