**G-Research Crypto Forecasting**
* Load Data
* Add Feature Cols
* Scale
* Build & Fit NN
* Evaluate Results
* Submit results via G-Research API

In [None]:
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import warnings
import datetime as dt
from statsmodels.graphics.tsaplots import plot_pacf
import copy
from IPython.display import Image
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
import os
# walk the Kaggle input directory and print the full path of every file found
for root, _, fileList in os.walk('/kaggle/input'):
    for fileName in fileList:
        print(os.path.join(root, fileName))

**Load Data & Explore Structure**
- load training data set
- load supplemental train (this will be replaced with Sept - Dec once the comp starts)
- merge

In [None]:
def loadData(file):
    """Read a CSV and index it by (timestamp, Asset_ID).

    The 'timestamp' column is interpreted as seconds since the Unix epoch and
    converted to datetimes before it becomes the first level of the MultiIndex.

    file: anything accepted by pd.read_csv (path or file-like object).
    Returns the resulting DataFrame.
    """
    return (
        pd.read_csv(file)
        .assign(timestamp = lambda raw: pd.to_datetime(raw['timestamp'], unit = 's'))
        .set_index(['timestamp', 'Asset_ID']) # make multi-index
    )

In [None]:
# read data, format, filter time
data = loadData('/kaggle/input/g-research-crypto-forecasting/train.csv')
data = data[data.index.get_level_values('timestamp') > '2020-12-30'] # keep only rows after 2020-12-30
data.info(show_counts = True) # info() prints its report itself and returns None, so it is not wrapped in print()
data.head()

In [None]:
# get supp train data
suppData = loadData('../input/g-research-crypto-forecasting/supplemental_train.csv')
suppData.info(show_counts = True) # info() prints its report itself and returns None, so it is not wrapped in print()
suppData.head()

In [None]:
# stack dataframes without overlapping index

overlapDate = suppData.index.get_level_values('timestamp').min() # earliest time in the supplemental data
data = data[data.index.get_level_values('timestamp') < overlapDate] # drop original rows from that time onward so the frames don't overlap

# both frames already carry the (timestamp, Asset_ID) MultiIndex, so a plain concat is enough;
# the former `levels` argument only has an effect together with `keys` and was dropped
stacked = pd.concat([data, suppData], ignore_index = False)

# count index entries that occur more than once after stacking
# (a plain row-count difference would always be 0 for a concat)
douplicateRows = int(stacked.index.duplicated().sum())
print(f"There are {douplicateRows} duplicate rows")

In [None]:
# get the asset details into dictionaries

file = '../input/g-research-crypto-forecasting/asset_details.csv'

# one row per asset, ordered by Asset_ID with a fresh 0..n-1 row index
assetDetails = pd.read_csv(file)
assetDetails = assetDetails.sort_values(by = ['Asset_ID']).reset_index(drop = True)

# both lookups are keyed by Asset_ID
names = {assetDetails.at[row, 'Asset_ID']: assetDetails.at[row, 'Asset_Name'] for row in assetDetails.index}
weights = {assetDetails.at[row, 'Asset_ID']: assetDetails.at[row, 'Weight'] for row in assetDetails.index}

print(names)
print(weights)

**Define feature cols to be added**

In [None]:
# create functions to add in feature cols
    
def FeatureCols(df):
    """Add the engineered feature columns to df in place and return it.

    df must have the columns High, Low, Open, Close, Volume and Count and an
    index level named 'timestamp' holding datetimes.

    Columns added (in this order):
      hlDiff         - High minus Low, a simple volatility measure
      avgSize        - Volume floor-divided by Count (average trade size)
      uShadow        - High minus the larger of Close and Open
      bShadow        - the smaller of Close and Open, minus Low
      minSin, minCos - sine/cosine encoding of the minute of the hour (period 60)
      daySin, dayCos - sine/cosine encoding of the day of the month (period 31)
    """
    stamps = df.index.get_level_values('timestamp')
    bodyTop = np.maximum(df['Close'], df['Open'])
    bodyBottom = np.minimum(df['Close'], df['Open'])

    df['hlDiff'] = df['High'] - df['Low']
    df['avgSize'] = df['Volume'] // df['Count']

    # shadows
    df['uShadow'] = df['High'] - bodyTop
    df['bShadow'] = bodyBottom - df['Low']

    # cyclical encodings, computed straight from the index so no helper columns are needed
    for prefix, values, period in (('min', stamps.minute, 60), ('day', stamps.day, 31)):
        angle = np.asarray(values) * (2. * np.pi / period)
        df[prefix + 'Sin'] = np.sin(angle)
        df[prefix + 'Cos'] = np.cos(angle)

    return df

In [None]:
final = FeatureCols(stacked) # add the engineered feature columns to the whole dataset
orderedCols = final.columns.drop('Target').tolist() + ['Target'] # every other column first, Target last
final = final[orderedCols]
final.head(20)

In [None]:
# print the column dtypes and non-null counts of the combined frame
final.info(show_counts = True) # check dataset after feature cols were added

**Build pipeline to scale data**
- Select cols to be used as features
- Save the fitted scaler so the same scaling can be applied to the test data

In [None]:
# get libraries for preprocessing 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, FunctionTransformer

In [None]:
# select cols, grouped by how they are scaled
noScale_features = [
    'minSin',
    'minCos',
    'daySin',
    'dayCos',
]
highVol_features = [
    'Volume',
]
cont_features = [
    'Close',
    'avgSize',
    'uShadow',
    'bShadow',
]

# full feature list, in the order the scaled columns are produced
featureCols = [*noScale_features, *highVol_features, *cont_features]

# set up pipeline for different data types
def ScaleData(inputDF, noScale_features = noScale_features, highVol_features = highVol_features, cont_features = cont_features):
    """Impute and scale the feature columns of inputDF.

    Three column groups are handled by one ColumnTransformer:
      noScale_features - most-frequent imputation only, no scaling
      highVol_features - missing values set to 1, then RobustScaler on the 20-80 quantile range
      cont_features    - mean imputation, then MinMaxScaler

    Returns a two-element list: a DataFrame of the transformed columns (same
    index as inputDF, columns ordered noScale + highVol + cont) and the fitted
    ColumnTransformer so the same transformation can be re-applied later.
    """
    cols = noScale_features + highVol_features + cont_features # columns to transform, in output order

    preprocessor = ColumnTransformer(transformers = [
        ('noScale',
         Pipeline(steps = [('imputer', SimpleImputer(strategy = 'most_frequent'))]),
         noScale_features),
        ('negPos',
         Pipeline(steps = [('imputer', SimpleImputer(strategy = 'constant', fill_value = 1)),
                           ('encoder', RobustScaler(quantile_range = (20.0, 80.0)))]),
         highVol_features),
        ('cont',
         Pipeline(steps = [('imputer', SimpleImputer(strategy = 'mean')),
                           ('encoder', MinMaxScaler())]),
         cont_features)])

    selected = inputDF[cols]
    fitScaler = preprocessor.fit(selected) # fit() returns the transformer itself
    scaledValues = fitScaler.transform(selected)

    scaledDF = pd.DataFrame(scaledValues, columns = cols).set_index(inputDF.index)
    return [scaledDF, fitScaler]

**Split assets into individual tables**
- Missing timestamps are filled using 'pad'
- Data is scaled using the pipeline built above
- Pairplot and Autocorrelations are plotted for the scaled data

In [None]:
# Split each table into a df, fill missing values, create feature cols, scale

assets = [] # scaled frame per asset, in the iteration order of `names`
assetScalers = {} # fitted scaler per Asset_ID

for asset in names:
    df = final.xs(asset, level = 'Asset_ID')
    
    timeStamps = df.index
    
    # set index so there's no missing times; rows for missing minutes are forward-filled
    minDate = timeStamps.min()
    maxDate = timeStamps.max()
    df = df.reindex(index = pd.date_range(minDate, maxDate, freq = 'min'), method = 'pad')

    # scale data
    scaledDF, fitScaler = ScaleData(df)
    
    # fill na's for target
    scaledDF['Target'] = df['Target'].fillna(0)
    
    assets.append(scaledDF) # save transformed df
    assetScalers[asset] = fitScaler # save scaler 
    
    # visualize data
    print(names[asset])
    
    # pairplot builds its own figure, so no plt.figure() call is made beforehand;
    # the sample size is capped so assets with fewer than 10000 rows don't raise
    sampleSize = min(10000, len(scaledDF))
    sns.pairplot(scaledDF.sample(sampleSize, random_state = 10, ignore_index = True))
    plt.show()
    
    plot_pacf(scaledDF['Target'].to_list(), lags = 50)
    plt.show()
    
del data # we no longer need the table. Free up memory.

**Prepare data for input into model**
- Class created to store attributes for each asset
- Build the lagged input windows (the current minute plus the previous `numLags` minutes) to feed into the model
- Train/test split is set
- Check the shape of each asset's inputs

In [None]:
# create class to store data
class Asset:
    """Holds the train/test arrays of one asset plus its model and fit result.

    xTrain, xTest, yTrain, yTest are stored exactly as passed in. builtModel
    and fitModel default to None and are meant to be filled in after creation.
    """

    def __init__(self, xTrain, xTest, yTrain, yTest, builtModel = None, fitModel = None):
        self.xTrain, self.xTest = xTrain, xTest
        self.yTrain, self.yTest = yTrain, yTest
        self.builtModel, self.fitModel = builtModel, fitModel

In [None]:
numLags = 15 # set the number of lags we want to feed to each array (i.e. were going to feed in the current minute + 15 previous minutes)
assetNames = names.keys()
trainPct = 0.8 # fraction of samples (in time order) used for training; the rest is the test set

preppedData = [] # store a instance for each asset
assetShapes = [] # store the shape of each assets df

# go through each asset and save details
for asset in assets:
    # the first numLags rows have no complete history, so their targets are skipped
    y = asset['Target'].to_numpy(copy = True)[numLags:]
    asset.drop('Target', axis = 1, inplace = True)
    
    # Each sample is the current row plus the numLags rows before it. The frames were
    # reindexed to a gap-free one-minute index when they were built, so "the previous
    # numLags minutes" are simply the previous numLags rows and the windows can be cut
    # positionally in one vectorized step instead of one .loc slice per timestamp.
    windows = np.lib.stride_tricks.sliding_window_view(asset.to_numpy(), numLags + 1, axis = 0)
    # sliding_window_view appends the window axis last: (samples, features, steps);
    # reorder to (samples, steps, features) and materialize a writable, contiguous array
    x = np.ascontiguousarray(windows.transpose(0, 2, 1))
    
    # append a class instance with the training and testing data
    trainIndex = int(len(x) * trainPct)
    prepped = Asset(x[:trainIndex], x[trainIndex:], y[:trainIndex], y[trainIndex:])
    preppedData.append(prepped)
    
    # append shapes
    assetShapes.append({
        'xTrain_shape': np.shape(prepped.xTrain),
        'xTest_shape': np.shape(prepped.xTest),
        'yTrain_shape': np.shape(prepped.yTrain),
        'yTest_shape': np.shape(prepped.yTest),
    })
    
shapes = pd.DataFrame(assetShapes, index = assetNames)
shapes # i.e. asset 0 has a xTrain shape of 305268 timestamps, each with 16 times, each with 9 columns

**Build Model and Fit**
- Use the Keras functional API (add layers, set activation functions, early stopping, etc.)
- Save the built model and its fit history for each asset to the class instance


In [None]:
# get libraries for the model
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# reset Keras' global state before the model is defined below
K.clear_session() 

In [None]:
# timeSteps, features = shapes.at[0, 'xTrain_shape'][1], shapes.at[0, 'xTrain_shape'][2] # set shape

# # variables to adjust
# act = 'tanh'
# init = tf.keras.initializers.he_uniform()

# # define layers
# X = keras.Input(shape = (timeSteps, features), name = 'X_Inputs')
# L1 = keras.layers.LSTM(64, activation = act, kernel_initializer = init, return_sequences = True, 
#                        name = 'Layer_1')(inputs = X)
# L2 = keras.layers.BatchNormalization(name = 'Layer_2')(inputs = L1)
# L3 = keras.layers.LSTM(64, activation = act, kernel_initializer = init, return_sequences = True, 
#                        name = 'Layer_3')(inputs = L2)
# L4 = keras.layers.BatchNormalization(name = 'Layer_4')(inputs = L3)
# L5 = keras.layers.LSTM(32, activation = act, kernel_initializer = init, return_sequences = False, 
#                        name = 'Layer_5')(inputs = L4)
# L6 = keras.layers.BatchNormalization(name = 'Layer_6')(inputs = L5)
# y_proba = keras.layers.Dense(1, name = 'Y_Outputs')(inputs = L6) 

In [None]:
# speed testing: a single LSTM layer feeding a one-unit dense output
xTrainShape = shapes.at[0, 'xTrain_shape'] # shape tuple recorded for asset 0
timeSteps, features = xTrainShape[1], xTrainShape[2] # set shape

# variables to adjust
act = 'tanh'
init = tf.keras.initializers.he_uniform()

# define layers
X = keras.Input(shape = (timeSteps, features), name = 'X_Inputs')
lstmLayer = keras.layers.LSTM(
    64,
    activation = act,
    kernel_initializer = init,
    return_sequences = False,
    name = 'Layer_1',
)
L1 = lstmLayer(X)
outputLayer = keras.layers.Dense(1, name = 'Y_Outputs')
y_proba = outputLayer(L1)

In [None]:
# variables to adjust
esPatience = 4
optLr = 0.02
numEpochs = 1
batchSize = 1000

# fit models
# NOTE(review): early stopping watches the training loss although a validation split is
# used below; monitoring 'val_loss' may be what is wanted - confirm before changing.
es = EarlyStopping(monitor = 'loss', mode = 'min', patience = esPatience, restore_best_weights = True, verbose = 1)

# Architecture template. Wrapping the same X / y_proba tensors in a new Model for every
# asset would reuse the very same layer objects, so all assets would share (and keep
# overwriting) one set of weights. Each asset therefore gets a clone with its own layers.
template = tf.keras.Model(inputs = [X], outputs = [y_proba], name = 'Individual_Asset_Model')

for i, asset in enumerate(preppedData):
    # assemble an independent copy: same architecture, newly created layers and weights
    model = tf.keras.models.clone_model(template)
    
    # define optimizer and compile; mean absolute error is reported next to the MSE loss
    # because classification accuracy carries no meaning for a regression target
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = optLr), 
                  loss = 'mean_squared_error', metrics = ['mae'])
    if i == 0:
        model.summary() # summary() prints itself and returns None
        
    print("Fitting to", names[i]) # relies on Asset_IDs being 0..n-1 in the order of preppedData
    asset.builtModel = model # save
    asset.fitModel = asset.builtModel.fit(asset.xTrain, asset.yTrain, validation_split = 0.1, epochs = numEpochs, 
                                           batch_size = batchSize, verbose = 1, callbacks = [es]) # fit; keeps the returned history object

**Explore Results**
- Plot Model Loss Values
- Create y_pred for each asset and evaluate

In [None]:
# plot loss values

# list of all plot locations as [row, col] pairs, row by row
rows, cols = 3, 5
plotList = [[row, col] for row in range(rows) for col in range(cols)]

# loss values by epoch
fig, axis = plt.subplots(rows, cols, figsize = (10, 10))

for i, asset in enumerate(preppedData):
    modelHist = asset.fitModel
    pltRow, pltCol = plotList[i]
    ax = axis[pltRow, pltCol] # subplot assigned to this asset
    ax.plot(modelHist.history['loss'], label = 'tL') # tL = training loss
    ax.plot(modelHist.history['val_loss'], label = 'vL') # vL = validation loss
    ax.legend(loc = 1)
    ax.set_title(names[i], y = 1.05)
    ax.ticklabel_format(style = 'sci')

fig.suptitle('Loss Values by Epoch')
fig.tight_layout() 

In [None]:
# plot metrics for each models performance

# list of all plot locations as [row, col] pairs, row by row
rows, cols = len(names), 3
plotList = [[row, col] for row in range(rows) for col in range(cols)]
        
fig, axis = plt.subplots(rows, cols, figsize = (10, 40))

correlations = {} # correlation between predictions and test targets, keyed by asset name

for i, asset in enumerate(preppedData):
    assetName = names[i]
    ypred = asset.builtModel.predict(asset.xTest).flatten()
    inputs = asset.yTest
    
    # df of results: prediction, actual value and their difference
    result = pd.DataFrame(list(zip(ypred, inputs)), columns = ['ypred', 'inputs'])
    result['diff'] = result['ypred'] - result['inputs']
    
    correlations[assetName] = result['ypred'].corr(result['inputs']) # add correlations 
    
    # one row of three subplots per asset
    scatterAx, diffAx, sampleAx = axis[i, 0], axis[i, 1], axis[i, 2]
    
    scatterAx.scatter(result.ypred, result.inputs)
    scatterAx.set_title("yPred vs yTest: " + assetName)
    
    diffAx.plot(result['diff'])
    diffAx.set_title("yPred - yTest: " + assetName)
    
    sample = result.sample(100, random_state = 99).sort_index() # just plotting a portion of the dataset
    sampleAx.plot(sample['ypred'], alpha = 0.5, label = 'yP') # yP = y_pred
    sampleAx.plot(sample['inputs'], alpha = 0.5, label = 'yT') # yT = y_test
    sampleAx.set_title("yPred vs yTest: " + assetName)
    sampleAx.legend(loc = 1)

fig.suptitle('Metrics by Asset')
fig.tight_layout() 

pd.DataFrame.from_dict(correlations, orient = 'index', columns = ['Correlation'])

**Submit Predictions**
- Uses G-Research's API developed for the competition

In [None]:
# get historical data from last dataset: the final numLags rows of every asset frame,
# keyed by the asset's position in `assets`
lagData = {i: asset.iloc[-numLags:] for i, asset in enumerate(assets)}

In [None]:
import gresearch_crypto

In [None]:
# competition environment object; used below to iterate over the test data and to submit predictions
env = gresearch_crypto.make_env()

In [None]:
# iterator consumed by the submission loop; each item is unpacked there as (test_df, sample_prediction_df)
iter_test = env.iter_test()

In [None]:
# create function to predict results for each row
def Predict(x):
    """Predict the target for one row of the test frame.

    x is a row (Series) whose name is the (Asset_ID, timestamp) index tuple and
    which contains at least the columns listed in featureCols.

    The row is scaled with the scaler fitted for its asset, appended to that
    asset's lag history in lagData, and the history is re-cut to the window
    [timestamp - numLags minutes, timestamp] at one-minute steps (minutes that
    are not present take the nearest available row). lagData is updated as a
    side effect so the next call continues from this window.

    Returns the prediction as np.float16; 0 is returned without calling the
    model when more than a quarter of the window's values are exactly 0.
    """
    asset, time = x.name[0], x.name[1] # asset id and current time
    refTime = time - dt.timedelta(minutes = numLags) # timestamp of the earliest lag
    
    # scale the single row with the scaler fitted for this asset
    rawRow = pd.DataFrame(x[featureCols].values, index = featureCols).T
    scaledRow = assetScalers[asset].transform(rawRow)
    preppedRow = pd.DataFrame(scaledRow, columns = featureCols)
    preppedRow['timestamp'] = time
    preppedRow.set_index('timestamp', drop = True, inplace = True)
    
    # add the new row to the asset's history
    workingDF = pd.concat([lagData[asset], preppedRow])
    # reindex(method=...) needs unique labels: if this timestamp is already stored,
    # keep only the newest row (the one just appended) before sorting
    workingDF = workingDF[~workingDF.index.duplicated(keep = 'last')].sort_index()
        
    # cut the history to exactly the window the model expects
    lagData[asset] = workingDF.reindex(index = pd.date_range(refTime, time, freq = 'min'), method = 'nearest')
        
    window = lagData[asset].to_numpy()
        
    if np.count_nonzero(window == 0) > 0.25 * window.size: # i.e. more than a quarter of the elements are 0
        y_pred = 0
    else:
        batch = np.expand_dims(window, axis = 0) # add the batch axis expected by predict()
        y_pred = preppedData[asset].builtModel.predict(batch)[0][0]

    return np.float16(y_pred)

In [None]:
for (test_df, sample_prediction_df) in iter_test:
    
    # clean input df, set index (Predict reads asset and time from this index)
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'], unit = 's')
    test_df.set_index(['Asset_ID', 'timestamp'], inplace = True)

    # The former `test_df.fillna(0)` discarded its result and never changed the data.
    # It was removed rather than "fixed": missing feature values are left as NaN here
    # and are filled by the imputers inside the per-asset scalers that Predict applies.
    test_df = FeatureCols(test_df) # add in feature cols
    
    test_df['Target'] = test_df.apply(Predict, axis = 1) # predict results row by row
    
    prediction_df = test_df[['row_id', 'Target']].reset_index(drop = True) # get row and target
    
    # remove outliers; assigned back because an in-place clip on a column selection
    # is not guaranteed to modify prediction_df
    prediction_df['Target'] = prediction_df['Target'].clip(-0.4, 0.4)

    env.predict(prediction_df) # submit