In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential
import tensorflow.keras.backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
from gc import collect
import pprint
from colorama import Fore, Style, init;
from warnings import filterwarnings;
filterwarnings('ignore');

In [3]:
# nice helper function from https://www.kaggle.com/code/ravi20076/optiver-baseline-models
def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    """Print ``text`` preceded by the given colorama style and colour codes.

    The pieces are emitted in the order style, colour, text, and the output is
    terminated with ``Style.RESET_ALL`` so later prints are not affected.

    Parameters
    ----------
    text : str
        Message to print (typically an f-string built by the caller).
    color : str
        colorama foreground code; defaults to ``Fore.BLUE``.
    style : str
        colorama style code; defaults to ``Style.BRIGHT``.
    """
    print("".join((style, color, text, Style.RESET_ALL)))

In [4]:
# Seed every random number generator this notebook can draw from, using one constant,
# so that a fresh "Restart & Run All" reproduces the same weights and results.
SEED = 42
np.random.seed(SEED)      # NumPy's legacy global generator
tf.random.set_seed(SEED)  # TensorFlow global seed (weight initialisation, dropout, ...)

In [7]:
# Load the raw Bitcoin price CSV (the path is relative to the current working directory)
# and display a preview of it.
# NOTE(review): the name `train` is rebound later by the train/test split cell, so after
# that cell has run it no longer refers to this raw frame.
train = pd.read_csv(r'crypto/train_model/data/bitcoin_2017_to_2023.csv')
train

Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume
0,2023-08-01 13:19:00,28902.48,28902.49,28902.48,28902.49,4.686580,1.354538e+05,258,0.893910,25836.224836
1,2023-08-01 13:18:00,28902.48,28902.49,28902.48,28902.49,4.775890,1.380351e+05,317,2.245460,64899.385195
2,2023-08-01 13:17:00,28908.52,28908.53,28902.48,28902.49,11.522630,3.330532e+05,451,2.708730,78290.170121
3,2023-08-01 13:16:00,28907.41,28912.74,28907.41,28908.53,15.896100,4.595556e+05,483,10.229810,295738.166916
4,2023-08-01 13:15:00,28896.00,28907.42,28893.03,28907.41,37.746570,1.090761e+06,686,16.504520,476955.246611
...,...,...,...,...,...,...,...,...,...,...
3125995,2017-08-17 04:04:00,4261.48,4261.48,4261.48,4261.48,0.140796,5.999993e+02,1,0.140796,599.999338
3125996,2017-08-17 04:03:00,4261.48,4261.48,4261.48,4261.48,0.012008,5.117185e+01,3,0.012008,51.171852
3125997,2017-08-17 04:02:00,4280.56,4280.56,4280.56,4280.56,0.261074,1.117543e+03,2,0.261074,1117.542921
3125998,2017-08-17 04:01:00,4261.48,4261.48,4261.48,4261.48,0.000000,0.000000e+00,0,0.000000,0.000000


In [12]:
# Keep the timestamp plus the market columns that the time series model is built on.
# (The commented-out sort below is a leftover from the stock dataset this notebook was adapted from.)
# train.sort_values(by=['stock_id', 'date_id', 'seconds_in_bucket'], inplace=True)
feature_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume',
                   'quote_asset_volume', 'number_of_trades', 'taker_buy_base_asset_volume',
                   'taker_buy_quote_asset_volume']

# The CSV is stored newest-first (see the preview of `train` above). Sort oldest-first so that
# sliding windows and the unshuffled train/validation/test split run forward in time instead of
# "predicting" the past from the future. The timestamp column itself keeps its original dtype;
# it is only parsed to datetimes for the purpose of ordering.
targets_df = (
    train[feature_columns]
    .sort_values('timestamp', key=pd.to_datetime, kind='stable')
    .reset_index(drop=True)
)

In [None]:
# We can also make a revealed targets dataframe (similar to in the test case)
# We get all the targets for the previous days, to ensure there is no leakage we enable a date access column so we can't access dates for the same day
# unique_dates = test.date_id.unique()
# revealed_targets_test = pd.DataFrame()
# targets, dates, access_dates, stocks = [], [], [], []

# for date in unique_dates:
#     targets.append(train.loc[train.date_id == date]['target'])
#     dates.append(train.loc[train.date_id == date - 1]['date_id'])
#     access_dates.append(train.loc[train.date_id == date]['date_id'])
#     stocks.append(train.loc[train.date_id == date]['stock_id'])

# # flatten the n x 3 arrays so they can be added to the dataframe
    
# revealed_targets_test['revealed_target'] = np.array(targets).flatten()
# revealed_targets_test['date_id'] = np.array(dates).flatten()
# revealed_targets_test['access_date_id'] = np.array(access_dates).flatten()
# revealed_targets_test['stock_id'] = np.array(stocks).flatten()
# collect();

>**In a previous notebook:** I investigated the stationarity of stock 0's time series for the target data. The conclusion from the Dickey-Fuller test is that the series is stationary and therefore needs no trend or differencing adjustment. See the notebook [here](https://www.kaggle.com/archiecarpenter/baseline-lstm-not-submission-capable).

## Multivariate Data Preprocessing
##### Previously, in the univariate case, the LSTM was applied to a series *independent of the stock type*. In this notebook the aim is to build a model that can capture the differences between the series of each stock and predict accordingly. This means the dataframe needs to be adjusted to reflect this; in the next section I will develop a method to split each stock into its own time series.

In [None]:
# changing the date to be multivariate where the features are stocks
# def split_stocks(df, target_col='target'):
#     y = pd.DataFrame()
#     # iterate over each stock in the dataframe and assign them to a new df y
#     for stock in df.stock_id.unique():
#         y[f'stock_{stock}'] = df.loc[df.stock_id == stock][target_col]
#     return y
# collect();

In [None]:
# note we cannot run this as there are missing values for certain stocks
#y = split_stocks(targets_df)

In [13]:
%%time
# second method

# can we create a new dataframe that includes NaN values when a stock doesn't have a value for that date?

# this method is quicker but still slow (any suggestions to improve this are welcome)
def rearrange_df(df, target_col='target'):
    """Reshape a long (date, seconds, stock) table into one row per time point.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with ``date_id``, ``stock_id``, ``seconds_in_bucket`` and
        ``target_col`` columns.
    target_col : str, default 'target'
        Column whose values fill the per-stock columns.

    Returns
    -------
    pandas.DataFrame
        One column per unique ``stock_id`` (in order of first appearance), followed by
        ``date_id`` and ``seconds_in_bucket``. Row 0 is an all-zero placeholder row, kept
        so the result has the same layout as before (callers are expected to drop it).
        Each following row is one (date_id, seconds_in_bucket) combination, iterating over
        every unique date and, within it, every unique seconds value; stocks without a
        value at that time point are NaN. If a stock appears more than once at the same
        time point, the last value is used.
    """
    # start by getting the unique values of each date, stock and seconds
    unique_dates = df.date_id.unique()
    unique_stocks = df.stock_id.unique()
    unique_seconds = df.seconds_in_bucket.unique()

    cols = list(unique_stocks) + ['date_id', 'seconds_in_bucket']

    # Placeholder first row, sized from the data rather than a hard-coded 202 columns
    # (which only worked when there were exactly 200 stocks).
    records = [dict.fromkeys(cols, 0.0)]

    for date in unique_dates:
        # rows belonging to this date
        date_df = df.loc[df.date_id == date]
        for seconds in unique_seconds:
            # rows belonging to this date and this seconds_in_bucket value
            seconds_df = date_df[date_df.seconds_in_bucket == seconds]

            # map each stock present at this time point to its target value
            record = dict(zip(seconds_df.stock_id.values, seconds_df[target_col].values))
            record['date_id'] = date
            record['seconds_in_bucket'] = seconds
            records.append(record)

    # Build the frame once from all records: growing a DataFrame with pd.concat inside
    # the loop copies every existing row on each iteration (quadratic time).
    return pd.DataFrame(records, columns=cols)

# NOTE(review): targets_df is built from the Bitcoin CSV columns (timestamp, open, high, ...)
# and has no date_id / stock_id / seconds_in_bucket columns, so this call raises the
# AttributeError shown in the cell output below. The reshaping step needs to be adapted to
# (or skipped for) this dataset before the following cells can run.
d = rearrange_df(targets_df)

# drop the row used in the initialisation
# NOTE(review): the drop is commented out, so the all-zero first row returned by
# rearrange_df stays in `d` and flows into the cells that follow.
# d.drop(0, axis=0, inplace=True)
# print(d.head(3))
# collect();

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/dojm0727/venv/lib/python3.8/site-packages/IPython/core/magics/execution.py", line 1325, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 41, in <module>
  File "<timed exec>", line 8, in rearrange_df
  File "/home/dojm0727/venv/lib/python3.8/site-packages/pandas/core/generic.py", line 5989, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'date_id'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/dojm0727/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2105, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/home/dojm0727/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1396, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/home/dojm0727/venv/lib/python3.8/site-packages/IPython/core/ultratb.py"

### Investigating missing values
##### We already know there are missing values for stocks at specific time points. Can we see whether they are distributed evenly across stocks or concentrated in one specific stock?

In [None]:
# Keep only the per-stock value columns; the two time-index columns are not targets.
numerical_d = d.drop(columns=['date_id', 'seconds_in_bucket'])
PrintColor('---Percentage of targets that are missing for each stock---')
# Fraction of rows that are NaN in each column (0.0 = complete, 1.0 = entirely missing)
missing_share = numerical_d.isna().sum() / len(numerical_d)
plt.plot(missing_share)
plt.show()
collect();

In [None]:
# Placeholder imputation: replace every missing value with 0.
# A more considered imputation strategy should replace this in future.
numerical_d = numerical_d.fillna(0)

### Normalisation

In [None]:
# Single shared scaler used by the helper functions in this cell: normalise_and_fit fits it,
# the other helpers only apply it. Each column is mapped to the range [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
# Normalise and fit the data (only to be used on the training set)
def normalise_and_fit(df):
    """Fit the shared ``scaler`` on ``df`` and return a scaled copy.

    The input frame is left untouched; every column of the returned copy holds the
    values produced by ``scaler.fit_transform``. Because this call (re)fits the scaler,
    it is meant for the training data only.
    """
    scaled = df.copy()
    all_columns = scaled.columns
    scaled[all_columns] = scaler.fit_transform(scaled[all_columns])
    return scaled

# Just normalise the data, used in test and validation
def normalize_df(df):
    """Return a copy of ``df`` scaled with the already-fitted shared ``scaler``.

    Unlike ``normalise_and_fit`` this does not refit the scaler, so it is the variant to
    use for validation and test data. The input frame is not modified.
    """
    scaled = df.copy()
    all_columns = scaled.columns
    scaled[all_columns] = scaler.transform(scaled[all_columns])
    return scaled

# different helper function for just 1D arrays
def normalize_1D_array(arr, n_samples=1, n_features=200):
    """Scale a flat array with the already-fitted shared ``scaler``.

    Parameters
    ----------
    arr : array-like
        Values to scale; must contain ``n_samples * n_features`` elements.
    n_samples : int, default 1
        Number of rows to reshape ``arr`` into.
    n_features : int, default 200
        Number of columns to reshape ``arr`` into; must match the number of
        features the scaler was fitted on.

    Returns
    -------
    The result of ``scaler.transform`` on the ``(n_samples, n_features)`` reshaped input.
    """
    # Fixed: the original reshaped an undefined global ``X`` instead of the ``arr`` argument.
    return scaler.transform(np.asarray(arr).reshape(n_samples, n_features))

In [None]:
# NOTE(review): the scaler is fitted here on the full dataset, before the train/validation/test
# split in the Training section, so the min/max of the validation and test rows influence the
# scaling of the training data. Consider splitting first and fitting on the training split only.
normalised_numerical_df = normalise_and_fit(numerical_d)

### Training

In [None]:
# Chronological (unshuffled) split: 15% is held out for test, then 12/17 of the remaining 85%
# goes to training, i.e. 60% train, 25% validation, 15% test.
# NOTE(review): `train` here overwrites the raw dataframe loaded earlier under the same name;
# after this cell it refers to the combined train+validation portion.
train, test_df = train_test_split(normalised_numerical_df, train_size = .85, random_state=42, shuffle=False)
train_df, val_df = train_test_split(train, train_size=12/17, random_state=42, shuffle=False)

In [None]:
# Windowing and batching configuration.
# NOTE(review): `length` is not referenced anywhere else in the visible notebook.
length = 55
# number of consecutive rows in each input window
input_series_length = 55
# number of consecutive rows in each target window
output_series_length = 55
# NOTE(review): hard-coded; assumes the normalised dataframe has exactly 200 columns — TODO confirm
n_features = 200
batch_size = 128
# Number of full batches in `train`, which at this point is the combined train+validation
# split produced by the first train_test_split call (not `train_df`).
n_samples = len(train) // batch_size

In [None]:
# as seen in https://www.kaggle.com/code/nicapotato/keras-timeseries-multi-step-multi-output/notebook
def multivariate_multioutput_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    """Cut a series into (history window, label) pairs with a sliding window.

    Parameters
    ----------
    dataset : numpy.ndarray
        Source rows for the input windows, indexed along axis 0.
    target : numpy.ndarray
        Source rows for the labels, indexed along axis 0.
    start_index : int
        First row that may be used as history; the first window ends
        ``history_size`` rows later.
    end_index : int or None
        Exclusive upper bound for the window anchor. ``None`` means
        ``len(dataset) - target_size``.
    history_size : int
        Number of rows looked back from each anchor.
    target_size : int
        Length of the label window, or the offset of the label when ``single_step``.
    step : int
        Stride used when sampling rows inside the history window.
    single_step : bool, default False
        If True each label is the single row ``target[anchor + target_size]``;
        otherwise it is the slice ``target[anchor:anchor + target_size]``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, labels)`` stacked along a new leading axis.
    """
    first_anchor = start_index + history_size
    last_anchor = len(dataset) - target_size if end_index is None else end_index

    windows, targets = [], []
    for anchor in range(first_anchor, last_anchor):
        # rows [anchor - history_size, anchor) sampled every `step` rows
        windows.append(dataset[range(anchor - history_size, anchor, step)])
        if single_step:
            targets.append(target[anchor + target_size])
        else:
            targets.append(target[anchor:anchor + target_size])

    return np.array(windows), np.array(targets)

In [None]:
# Build (input window, target window) arrays for each split with identical settings:
# the same array serves as both source and target, windows start at row 0, run to the
# end of the split, and sample every row (step 1).
(train_X, train_y), (val_X, val_y), (test_X, test_y) = (
    multivariate_multioutput_data(split.to_numpy(), split.to_numpy(), 0, None,
                                  input_series_length, output_series_length, 1)
    for split in (train_df, val_df, test_df)
)

In [None]:
# Sanity check: array shapes for the train/validation splits plus the shape of one sample
shape_report = [
    train_X.shape,
    train_y.shape,
    val_X.shape,
    val_y.shape,
    'Single window of past history : {}'.format(train_X[0].shape),
    'Target window to predict : {}'.format(train_y[0].shape),
]
print(*shape_report, sep='\n')

In [None]:
# Training and validation pipelines: (window, label) pairs, batched and repeated
# indefinitely, so the number of steps per epoch must be given explicitly when fitting.
train_data_multi = tf.data.Dataset.from_tensor_slices((train_X, train_y)).batch(batch_size).repeat()
val_data_multi = tf.data.Dataset.from_tensor_slices((val_X, val_y)).batch(batch_size).repeat()

# Test pipeline: input windows only (no labels), batched once without repeating.
test_data_multi = tf.data.Dataset.from_tensor_slices(test_X).batch(batch_size)
collect();

In [None]:
class Time2Vec(Layer):
    """Learned element-wise affine transform with a sine applied to all but the last time step.

    For an input of shape ``(batch, sequence_length, n_features)`` the layer computes
    ``x * W + P`` with two trainable weights of shape ``(sequence_length, n_features)``.
    The sine of that result is taken for every time step except the last one, which is
    passed through unchanged; the two parts are concatenated back along the time axis,
    so the output has the same shape as the input.

    Parameters
    ----------
    output_dim : int
        Stored on the layer and included in its config. It is not used when building
        the weights or computing the output, so it does not affect the output shape.
    **kwargs
        Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, output_dim, **kwargs):
        # Initialise the base layer before setting attributes on it.
        super(Time2Vec, self).__init__(**kwargs)
        self.output_dim = output_dim

    def build(self, input_shape):
        """Create the ``W`` (scale) and ``P`` (offset) weights from the input shape."""
        _, self.sequence_length, self.n_features = input_shape

        self.W = self.add_weight(name='W',
                                shape=(self.sequence_length, self.n_features),
                                initializer='uniform',
                                trainable=True)

        self.P = self.add_weight(name='P',
                                shape=(self.sequence_length, self.n_features),
                                initializer='uniform',
                                trainable=True)

        super(Time2Vec, self).build(input_shape)

    def call(self, x):
        """Apply the affine transform, then a sine on every time step but the last."""
        affine = x * self.W + self.P
        periodic = tf.sin(affine[:, :-1, :])
        # `-1:` keeps the time axis, so the last (linear) step can be concatenated directly
        linear = affine[:, -1:, :]
        return tf.concat([periodic, linear], axis=1)

    def get_config(self):
        """Return the layer config including ``output_dim`` so the layer can be re-created
        from its config (needed when saving/loading or cloning a model that contains it)."""
        config = super(Time2Vec, self).get_config()
        config.update({'output_dim': self.output_dim})
        return config

In [None]:
# Time2Vec embedding followed by a sequence-to-sequence LSTM.
model = Sequential()

# The input shape comes from the windowing configuration instead of hard-coded 55 / 200,
# so changing input_series_length or n_features only has to be done in one place.
model.add(Time2Vec(output_dim=n_features, input_shape=(input_series_length, n_features)))

# return_sequences=True yields one output vector per input time step. The number of units
# must equal n_features because the labels are windows over all feature columns, and the
# number of output steps equals input_series_length, so this architecture requires
# output_series_length == input_series_length.
model.add(LSTM(n_features, activation='tanh', return_sequences=True))

# Regression set-up: MAE loss with MSE tracked alongside. 'accuracy' was dropped from the
# metrics because it is a classification metric and is not meaningful for continuous targets.
model.compile(optimizer='adam', loss='mae', metrics=['mse'])

# Print model summary
model.summary()

In [None]:
# Stop once the validation loss has not improved for 5 consecutive epochs and
# roll the model back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Both datasets repeat indefinitely, so the step counts define what one "epoch" is.
# Derive them from the number of windows actually fed to the model so that one epoch is
# exactly one pass over the training windows and each validation run is exactly one pass
# over the validation windows. (Previously both used a count based on the number of rows
# in the combined train+validation split, which matches neither dataset.)
steps_per_epoch = max(1, int(np.ceil(len(train_X) / batch_size)))
validation_steps = max(1, int(np.ceil(len(val_X) / batch_size)))

# Keep only the per-epoch metrics dictionary from the returned History object.
history = model.fit(
    train_data_multi,
    validation_data=val_data_multi,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
).history

In [None]:
# Training vs validation loss per epoch, drawn on an explicit Axes object
fig, ax = plt.subplots()
ax.plot(history['loss'])
ax.plot(history['val_loss'])
ax.set_title('model train vs validation loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper right')
plt.show()

### Making Predictions

In [None]:
# Predict on the test windows, then undo the scaling. The scaler works on 2-D
# (rows, features) input, so the window and time-step axes are flattened into one
# row axis before the inverse transform.
predictions = model.predict(test_X)
flat_shape = (predictions.shape[0] * predictions.shape[1], predictions.shape[2])
scaled_pred = scaler.inverse_transform(predictions.reshape(flat_shape))
scaled_true = scaler.inverse_transform(test_y.reshape(flat_shape))

In [None]:
# Mean absolute error in the normalised space the model was trained in ...
normalised_mae = mae(test_y.flatten(), predictions.flatten())
PrintColor(f'MAE on normalised test data: {normalised_mae}')
# ... and after the inverse transform back to the original units
original_units_mae = mae(scaled_true, scaled_pred)
PrintColor(f'MAE on test data: {original_units_mae}')