# Data import


In [None]:
import os
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import gc

# Location of the competition files and the column types used for loading.
directory = '/kaggle/input/g-research-crypto-forecasting/'
dtypes = {'timestamp': np.int64, 'Asset_ID': np.int8, 'Count': np.int32}
dtypes.update(dict.fromkeys(
    ('Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Target'),
    np.float64,
))

# Load only the declared columns of the training file.
file_path = os.path.join(directory, 'train.csv')
data = pd.read_csv(file_path, usecols=list(dtypes), dtype=dtypes)

# `timestamp` holds epoch seconds; keep a datetime version next to it.
data['Time'] = pd.to_datetime(data['timestamp'], unit='s')

# Attach the per-asset details to every row (left join on Asset_ID).
file_path = os.path.join(directory, 'asset_details.csv')
details = pd.read_csv(file_path)
data = data.merge(details, how='left', on='Asset_ID')

print(data.head())
print(details)

# Visualize the data and choose a train-test-split

In [None]:
import matplotlib.pylab as plt


# Select the rows of asset 1 and index them by timestamp. Sorting happens
# *before* the reindex because the 'pad' fill needs a monotonic index; the
# chained, non-in-place calls also avoid modifying a slice of `data`.
btc = data[data.Asset_ID == 1].set_index('timestamp').sort_index()

# Put the series on a regular 60-second grid; grid points without an
# observation are filled with the last available row.
btc = btc.reindex(range(btc.index[0], btc.index[-1] + 60, 60), method='pad')

# Padded rows would otherwise keep the `Time` of the row they were copied
# from, so rebuild `Time` from the (now regular) epoch-second index.
btc['Time'] = pd.to_datetime(btc.index, unit='s')

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(btc.Time, btc.Close)
plt.show()

The price level of Bitcoin changed drastically over time, so let's take a closer look at how it evolved in different time periods...

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Period boundaries: eight half-year marks counted from the first
# observation, followed by the time of the last observation.
starting_date = btc.Time.iloc[0]
timesplits = [starting_date + i * relativedelta(months=6) for i in range(8)]
timesplits.append(btc.Time.iloc[-1])

import matplotlib.pyplot as plt

# Subplots are organized in a Rows x Cols grid, one panel per period.
Tot = len(timesplits) - 1
Cols = 2

# Ceiling division: the smallest number of rows that holds Tot panels.
# (Adding Tot % Cols to the quotient over-allocates rows whenever the
# remainder is larger than one.)
Rows = -(-Tot // Cols)

# Create main figure
fig = plt.figure(1)
fig.set_figheight(30)
fig.set_figwidth(20)

# add_subplot positions are 1-based
for position, (period_start, period_end) in enumerate(zip(timesplits, timesplits[1:]), start=1):
    # `btc` is indexed by epoch seconds. pandas' Timestamp.timestamp() reads
    # naive timestamps as UTC, which matches how `Time` was derived; the
    # unbound datetime.timestamp() would apply the machine's local time zone.
    first_second = int(pd.Timestamp(period_start).timestamp())
    last_second = int(pd.Timestamp(period_end).timestamp())
    btc_tmp = btc.loc[first_second:last_second]
    ax = fig.add_subplot(Rows, Cols, position)
    ax.plot(btc_tmp.Time, btc_tmp.Close)

plt.show()

As we can see, the subsamples exhibit different market conditions. As an example, I will use the first half of 2021 as training data and the remaining data as test data. As laid out in a previous notebook, I do not think that the original variables (High, Low, Close, Volume, etc.) will serve well as predictive features, which is why I will give technical indicators a chance. To select the ones best suited for the job, I calculate all technical indicators of the ta package and select the 20 indicators with the highest absolute rank correlation with the target variable.

In [None]:
!pip install ta

In [None]:
import ta

def _to_epoch_seconds(moment):
    """Return `moment` as whole epoch seconds, reading naive values as UTC.

    pandas' Timestamp.timestamp() is used on purpose: the unbound
    datetime.timestamp() interprets naive values in the local time zone.
    """
    return int(pd.Timestamp(moment).timestamp())


def upper_shadow(asset):
    """Distance from the candle high down to the larger of open and close."""
    return asset.High - np.maximum(asset.Close, asset.Open)


def lower_shadow(asset):
    """Distance from the smaller of open and close down to the candle low."""
    return np.minimum(asset.Close, asset.Open) - asset.Low


def _add_features(frame):
    """Return a copy of `frame` extended with lagged differences, candle
    shadows and every indicator of the `ta` package.

    Working on a copy keeps the new columns off a slice of `btc`.
    """
    frame = frame.copy()
    # 1-, 15- and 60-row differences of Close, Count and Volume
    for source, prefix in (('Close', 'close'), ('Count', 'count'), ('Volume', 'volume')):
        for lag in (1, 15, 60):
            frame[f'{prefix}_{lag}'] = frame[source].diff(lag)
    frame['upper_shadow'] = upper_shadow(frame)
    frame['lower_shadow'] = lower_shadow(frame)
    return ta.add_all_ta_features(frame,
                                  open='Open',
                                  high='High',
                                  low='Low',
                                  close='Close',
                                  volume='Volume',
                                  fillna=False)


# Training spans the second-to-last period, testing the last one.
start_train, end_train = _to_epoch_seconds(timesplits[-3]), _to_epoch_seconds(timesplits[-2])
start_test, end_test = _to_epoch_seconds(timesplits[-2]), _to_epoch_seconds(timesplits[-1])

# The test window starts where the training window ends; its first 3600 rows
# are skipped, which leaves a gap between the two sets.
train_data = _add_features(btc.loc[start_train:end_train])
test_data = _add_features(btc.loc[start_test:end_test][3600:])

# delete variables with many missing values
# The decision is made once for both sets so that they keep identical
# columns; a feature chosen on the training set is then guaranteed to exist
# in the test set as well.
too_sparse = (train_data.isnull().sum() > 100) | (test_data.isnull().sum() > 100)
sparse_columns = too_sparse.index[too_sparse]
train_data = train_data.drop(columns=sparse_columns)
test_data = test_data.drop(columns=sparse_columns)

# Rank the columns by absolute Spearman correlation with Target.
# - select_dtypes removes the remaining non-numeric columns, which newer
#   pandas versions no longer drop silently inside corr().
# - corrwith computes only the correlations with Target instead of the full
#   column-by-column matrix.
numeric_train = train_data.drop(['Asset_ID', 'Time', 'Weight'], axis=1).select_dtypes(include='number')
find_corr_features = (
    numeric_train.corrwith(numeric_train['Target'], method='spearman')
    .abs()
    .sort_values(ascending=False)
    .rename('Target')
)
# Position 0 is expected to be Target itself (correlation 1), so show the 20
# entries after it.
find_corr_features.iloc[1:21]

Let us delete rows with missing values and visualize correlations among feature variables and the target variable.

In [None]:
import seaborn as sns

# Remove every row that still has a missing value in any column.
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

# `find_corr_features` is sorted by absolute correlation with Target, so its
# first entry is expected to be Target itself. Taking 21 entries therefore
# yields Target plus the 20 top-ranked features (a slice of 20 would keep
# only 19 features).
use_features = list(find_corr_features.index[:21])

# Absolute Spearman correlations among the selected columns:
# training set on the left, test set on the right.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
sns.heatmap(train_data[use_features].corr(method='spearman').abs(), ax=axs[0])
sns.heatmap(test_data[use_features].corr(method='spearman').abs(), ax=axs[1])
plt.show()

Let us further visualize the behavior of feature variables and the target variable over time as well as the pairwise relationship between feature variables and the target variable.

In [None]:
import matplotlib.pyplot as plt

# Subplots are organized in a Rows x Cols grid, one panel per selected column.
Tot = len(use_features)
Cols = 5

# Ceiling division: the smallest number of rows that holds Tot panels.
# (Adding Tot % Cols to the quotient over-allocates rows whenever the
# remainder is larger than one.)
Rows = -(-Tot // Cols)

# Create main figure
fig = plt.figure(1)
fig.set_figheight(30)
fig.set_figwidth(20)

# add_subplot positions are 1-based
for position, column in enumerate(use_features, start=1):
    ax = fig.add_subplot(Rows, Cols, position)
    # Select the column directly instead of first copying all selected
    # columns on every iteration.
    ax.plot(train_data['Time'], train_data[column].values)
    ax.set_title(column)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Subplots are organized in a Rows x Cols grid, one panel per selected column.
Tot = len(use_features)
Cols = 5

# Ceiling division: the smallest number of rows that holds Tot panels.
# (Adding Tot % Cols to the quotient over-allocates rows whenever the
# remainder is larger than one.)
Rows = -(-Tot // Cols)

# Create main figure
fig = plt.figure(1)
fig.set_figheight(30)
fig.set_figwidth(20)

# add_subplot positions are 1-based
for position, column in enumerate(use_features, start=1):
    ax = fig.add_subplot(Rows, Cols, position)
    # Select the column directly instead of first copying all selected
    # columns on every iteration.
    ax.plot(test_data['Time'], test_data[column].values)
    ax.set_title(column)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Subplots are organized in a Rows x Cols grid, one panel per selected column.
Tot = len(use_features)
Cols = 5

# Ceiling division: the smallest number of rows that holds Tot panels.
# (Adding Tot % Cols to the quotient over-allocates rows whenever the
# remainder is larger than one.)
Rows = -(-Tot // Cols)

# Create main figure
fig = plt.figure(1)
fig.set_figheight(30)
fig.set_figwidth(20)

# Target is read once; every panel plots it against one selected column.
target_values = train_data['Target'].values

# add_subplot positions are 1-based
for position, column in enumerate(use_features, start=1):
    ax = fig.add_subplot(Rows, Cols, position)
    ax.scatter(target_values, train_data[column].values)
    ax.set_title(column)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Subplots are organized in a Rows x Cols grid, one panel per selected column.
Tot = len(use_features)
Cols = 5

# Ceiling division: the smallest number of rows that holds Tot panels.
# (Adding Tot % Cols to the quotient over-allocates rows whenever the
# remainder is larger than one.)
Rows = -(-Tot // Cols)

# Create main figure
fig = plt.figure(1)
fig.set_figheight(30)
fig.set_figwidth(20)

# Target is read once; every panel plots it against one selected column.
target_values = test_data['Target'].values

# add_subplot positions are 1-based
for position, column in enumerate(use_features, start=1):
    ax = fig.add_subplot(Rows, Cols, position)
    ax.scatter(target_values, test_data[column].values)
    ax.set_title(column)

plt.show()

To be honest, I do not think that this looks promising (at least from a pairwise perspective). Anyway, let us scale the data and train a plain feed-forward neural network.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Split the selected columns into predictors (everything except Target)
# and labels.
X_train = train_data[use_features].drop(columns='Target')
X_test = test_data[use_features].drop(columns='Target')
y_train = train_data['Target'].to_numpy()
y_test = test_data['Target'].to_numpy()

# Learn the [0, 1] min-max scaling from the training predictors only, then
# apply that same fitted scaling to both sets.
X_scaler = MinMaxScaler(feature_range=(0, 1))
X_scaler.fit(X_train)
X_train_ = X_scaler.transform(X_train)
X_test_ = X_scaler.transform(X_test)

In [None]:
import tensorflow as tf

# Plain feed-forward regressor: two SELU hidden layers, each followed by
# dropout, and a single linear output unit.
model = tf.keras.Sequential([
    # input_shape has to be a tuple; without the trailing comma the
    # parentheses merely wrap a bare int.
    tf.keras.layers.InputLayer(input_shape=(X_train_.shape[1],)),
    tf.keras.layers.Dense(20, activation='selu'),
    tf.keras.layers.Dropout(0.50),
    tf.keras.layers.Dense(10, activation='selu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(1),
])

# Train on mean absolute error; the test set is passed as validation data so
# its loss is recorded after every epoch.
model.compile(loss='mean_absolute_error', optimizer='adam')
history = model.fit(X_train_, y_train, epochs=5, validation_data=(X_test_, y_test))

plt.plot(history.history['loss'], label='training')
plt.plot(history.history['val_loss'], label='test')
# The curve labels only become visible once a legend is drawn.
plt.legend()
plt.show()

Let us check correlation between predictions and target realizations visually and quantitatively.

In [None]:
# Run inference once per data set and reuse the predictions for both the
# scatter plots and the correlation figures.
pred_train = model.predict(X_train_).flatten()
pred_test = model.predict(X_test_).flatten()

# Predictions (x) against realized targets (y): training left, test right.
fig, axs = plt.subplots(1, 2, figsize=(12, 8))
axs[0].scatter(pred_train, y_train)
axs[1].scatter(pred_test, y_test)
plt.show()

# Correlation coefficient between predictions and realized targets
print(np.corrcoef(pred_train, y_train)[0, 1])
print(np.corrcoef(pred_test, y_test)[0, 1])

Not that good at all, but in my experience quite typical for financial data. Furthermore, we can see something that I consider critical about using correlation as the performance metric: it is sensitive to outliers, so a few extreme observations can dominate it. This will become clearer with the results below for the recurrent network architecture.

Now, let us find out which of the feature variables are most important for the predictions. The approach is gradient-based and was adapted from another user's notebook (just search for "gradient" and "feature").

In [None]:
import matplotlib.pyplot as plt

# Gradient-based feature importance: the mean absolute gradient of the model
# output with respect to each input column, normalized to sum to one.
X = tf.convert_to_tensor(X_train_)
with tf.GradientTape() as tape:
    # A plain tensor is only tracked by the tape when watched explicitly.
    tape.watch(X)
    pred = model(X)
grad = tf.reduce_mean(tf.abs(tape.gradient(pred, X)), axis=0)
feature_importance = grad.numpy() / grad.numpy().sum()

# Sort once and use the same order for the bar labels and the bar lengths.
order = np.argsort(feature_importance)

plt.figure(figsize=(10, 20))
plt.barh(X_train.columns[order], feature_importance[order])
plt.title('Feature importance')
plt.show()

Recurrent networks may be a better choice for autocorrelated time series data like this, so let us find out. First, we need to reshape the data such that each sample consists of a series covering a certain "lookback" time frame.

In [None]:
def _make_windows(features, targets, lookback):
    """Cut `features` into overlapping windows of `lookback` consecutive rows.

    Window `t` holds rows `t .. t + lookback - 1` of `features` and is paired
    with `targets[t + lookback]`.

    Returns `(windows, labels)`. `windows` has shape
    `(n, lookback, *features.shape[1:])` with
    `n = max(len(features) - lookback, 0)`, so inputs that are too short
    yield empty arrays of the right rank.

    NOTE(review): the label belongs to the row right *after* the window's
    last row, i.e. the most recent feature row is not part of the window.
    Confirm that this one-step offset is intended.
    """
    features = np.asarray(features)
    n_windows = max(len(features) - lookback, 0)
    # Row t of `row_index` lists the feature rows that make up window t.
    row_index = np.arange(n_windows)[:, None] + np.arange(lookback)[None, :]
    windows = features[row_index]
    labels = np.array(targets[lookback:lookback + n_windows])
    return windows, labels


lookback = 30

X_train_rnn, y_train_rnn = _make_windows(X_train_, y_train, lookback)
X_test_rnn, y_test_rnn = _make_windows(X_test_, y_test, lookback)

In [None]:
import tensorflow as tf

# Recurrent regressor: one GRU layer over the lookback window, dropout, and
# a single linear output unit.
model = tf.keras.Sequential([
    # (time steps, features) of a single sample
    tf.keras.layers.InputLayer(input_shape=X_train_rnn.shape[1:]),
    # Optional stacked variant: enable the next two lines to put a second
    # recurrent layer ahead of the final GRU.
    #tf.keras.layers.GRU(10, return_sequences = True),
    #tf.keras.layers.Dropout(0.25),
    tf.keras.layers.GRU(10),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(1),
])

# Train on mean absolute error; the test windows are passed as validation
# data so their loss is recorded after every epoch.
model.compile(loss='mean_absolute_error', optimizer='adam')
history = model.fit(X_train_rnn, y_train_rnn, epochs=3, validation_data=(X_test_rnn, y_test_rnn))

plt.plot(history.history['loss'], label='training')
plt.plot(history.history['val_loss'], label='test')
# The curve labels only become visible once a legend is drawn.
plt.legend()
plt.show()

In [None]:
# Run inference once per data set and reuse the predictions for both the
# scatter plots and the correlation figures.
pred_train_rnn = model.predict(X_train_rnn).flatten()
pred_test_rnn = model.predict(X_test_rnn).flatten()

# Predictions (x) against realized targets (y): training left, test right.
fig, axs = plt.subplots(1, 2, figsize=(12, 8))
axs[0].scatter(pred_train_rnn, y_train_rnn)
axs[1].scatter(pred_test_rnn, y_test_rnn)
plt.show()

# Correlation coefficient between predictions and realized targets
print(np.corrcoef(pred_train_rnn, y_train_rnn)[0, 1])
print(np.corrcoef(pred_test_rnn, y_test_rnn)[0, 1])

The result improved, but this seems to be related to our model's ability to make a few good predictions of negative price developments. As stated before, correlation is typically sensitive to a few extreme realizations.

Anyway, if you see any mistakes or room for improvement, let me know! Cheers!