In [1]:
import numpy as np
import pandas as pd

import os
import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
from collections import OrderedDict
import random as rnd
rnd.seed(100)

import matplotlib.pyplot as plt
import seaborn as sns

import gresearch_crypto

import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
tf.random.set_seed(200)
print('tf version:', tf.__version__)

# Check GPU Availability in Tensorflow
# Prints one line per GPU device TensorFlow reports; prints nothing when the list is empty.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

# List Devices including GPU's with Tensorflow
# NOTE(review): the return value is discarded -- this is not the last expression
# of the cell, so the device list is never displayed.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

# Check GPU in Tensorflow
# NOTE(review): the result is discarded here as well. The TF 2.x docs mark
# tf.test.is_gpu_available as deprecated in favour of
# tf.config.list_physical_devices('GPU') -- confirm and consider dropping the call.
tf.test.is_gpu_available()
    

# Scratch directory `dev` under the current working directory (referenced by
# the commented-out model save/load cell further down); created on first run.
FOLDER = os.path.join(os.getcwd(), 'dev')
if not os.path.isdir(FOLDER):
    os.mkdir(FOLDER)
    print('created', FOLDER)

In [2]:
def weighted_correlation(a, b, weights):
    """Weighted Pearson correlation between ``a`` and ``b``.

    All three inputs are flattened first, so any shapes with the same number
    of elements are accepted.

    Parameters
    ----------
    a, b : array-like
        The two series to correlate.
    weights : array-like
        Non-negative weight of each element.

    Returns
    -------
    float
        Weighted covariance divided by the product of the weighted standard
        deviations. No guard against zero variance: the division then yields
        nan/inf.
    """
    x = np.ravel(a)
    y = np.ravel(b)
    w = np.ravel(weights)

    total_w = np.sum(w)

    # Weighted first moments.
    x_bar = np.sum(x * w) / total_w
    y_bar = np.sum(y * w) / total_w

    # Weighted second central moments.
    x_var = np.sum(w * np.square(x - x_bar)) / total_w
    y_var = np.sum(w * np.square(y - y_bar)) / total_w

    # E_w[xy] - E_w[x] * E_w[y]
    covariance = np.sum(x * y * w) / total_w - x_bar * y_bar

    return covariance / np.sqrt(x_var * y_var)

In [3]:
# Asset metadata: one row per asset with its Asset_ID, Weight and Asset_Name.
dtype = {'Asset_ID': 'int8', 'Weight': float, 'Asset_Name': str}
asset_details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv', dtype=dtype)
# Sort by Asset_ID so that position k of `weights` belongs to the k-th smallest
# Asset_ID -- presumably IDs are the contiguous range 0..13; TODO confirm.
asset_details = asset_details.sort_values(by=['Asset_ID']).reset_index(drop=True)
# float32 copy of the Weight column; passed to build_model / custom_loss below.
weights = asset_details['Weight'].values.astype('float32')
asset_details

In [4]:
# Explicit dtypes keep the (large) combined frame compact and avoid
# per-chunk dtype inference.
dtype = {'timestamp': 'int64', 'Asset_ID': 'int8', 'Count': 'int32', 
         'Open': 'float64', 'High': 'float64', 'Low': 'float64', 'Close': 'float64',
         'Volume': 'float64', 'VWAP': 'float64', 'Target': 'float64'}

# Read every file, then concatenate once. DataFrame.append (used previously)
# is deprecated and removed in pandas 2.0, and it re-copied the accumulated
# frame on every iteration. Row labels of each file are kept as-is, exactly as
# append did.
df = pd.concat(
    [pd.read_csv(f'../input/g-research-crypto-forecasting/{fname}', low_memory=False, dtype=dtype)
     for fname in ['train.csv', 'supplemental_train.csv']]
)

# [2018-01-01, 2021-09-21], [2021-09-21, 2022-01-10]
dt = pd.to_datetime(df['timestamp'], unit='s')
print([dt.min(), dt.max()])

print(df.shape)
df.head()

In [5]:
def time_bounds(dfs, left_shift=0, right_shift=0):
    """Return the time window shared by every frame in ``dfs``.

    Parameters
    ----------
    dfs : mapping of key -> DataFrame
        Frames whose index supports ``min()`` / ``max()``.
    left_shift : int
        Months added to the common start.
    right_shift : int
        Months subtracted from the common end.

    Returns
    -------
    (start, end)
        ``start`` is the latest first index value across frames plus
        ``left_shift`` months; ``end`` is the earliest last index value minus
        ``right_shift`` months.
    """
    first_stamps = [frame.index.min() for frame in dfs.values()]
    last_stamps = [frame.index.max() for frame in dfs.values()]

    latest_start = max(first_stamps)
    earliest_end = min(last_stamps)

    return (latest_start + relativedelta(months=left_shift),
            earliest_end - relativedelta(months=right_shift))

def prep_asset(df_):
    """Clean one asset's rows into a gap-free, minute-indexed frame.

    Steps, in order: drop ``Asset_ID``; index by the datetime derived from the
    ``timestamp`` column (seconds since epoch); sort; keep the first row of any
    duplicated timestamp; turn +/-inf into NaN; reindex onto a complete
    one-minute grid between the first and last timestamp; fill every NaN by
    linear interpolation (extending to both ends).

    The input frame is not modified.
    """
    frame = df_.drop(columns='Asset_ID').copy()
    frame['datetime'] = pd.to_datetime(frame['timestamp'], unit='s')
    frame = frame.set_index('datetime').sort_index()

    # Keep only the first occurrence of each timestamp.
    first_occurrence = ~frame.index.duplicated()
    frame = frame.loc[first_occurrence]

    # Infinite values become gaps so that interpolation can fill them.
    infinite = frame.isin([np.inf, -np.inf])
    frame[infinite] = np.nan

    minute_grid = pd.date_range(start=frame.index.min(), end=frame.index.max(), freq='min')
    frame = frame.reindex(minute_grid)
    #frame['interpolated'] = frame['timestamp'].isnull().astype(float)
    return frame.interpolate(method='linear', limit_direction='both', axis=0)

# One cleaned, minute-resampled frame per asset, keyed by Asset_ID 0..13.
dfs = OrderedDict([(i, prep_asset(df[df['Asset_ID']==i])) for i in range(14)])
# The combined raw frame is not used after this point; drop the name so its
# memory can be reclaimed. (Re-running this cell alone therefore fails until
# the loading cell is run again.)
del df

# Window covered by every asset: latest first timestamp, earliest last timestamp.
start, end = time_bounds(dfs)
print(start, end)

dfs[0].head()

In [6]:
#print(dfs[0].shape)
#dfs[0].describe().transpose()

In [7]:
def find_outliers(data):
    """Flag values outside the 1.5 * IQR fences.

    Parameters
    ----------
    data : array-like
        Numeric values (NumPy array or pandas Series).

    Returns
    -------
    mask : same container type as ``data``, boolean
        True where the value lies strictly below ``lb`` or strictly above ``ub``.
    lb, ub : float
        Lower fence (Q1 - 1.5 * IQR) and upper fence (Q3 + 1.5 * IQR).
    """
    lower_quartile = np.percentile(data, 25)
    upper_quartile = np.percentile(data, 75)

    # Distance from each quartile to its fence.
    fence = (upper_quartile - lower_quartile) * 1.5
    lb, ub = lower_quartile - fence, upper_quartile + fence

    too_low = data < lb
    too_high = data > ub
    return np.logical_or(too_low, too_high), lb, ub

"""
# Process Target
from sklearn.preprocessing import MinMaxScaler

tscalers = {i: MinMaxScaler(feature_range=(-1, 1)) for i in range(14)}

for i in range(14):    
    target_smoothed = dfs[i]['Target'].ewm(span=5).mean()
    dfs[i]['Target'] = tscalers[i].fit_transform(target_smoothed.values.reshape(-1,1))"""

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# One [-1, 1] min-max scaler per asset. They are kept in a dict -- presumably so
# predictions can later be mapped back with inverse_transform (see the TODO in
# make_predictions); verify.
tscalers = {i: MinMaxScaler(feature_range=(-1, 1)) for i in range(14)}

# Per asset: clip Target to the 1.5*IQR fences, smooth it with an exponentially
# weighted mean (span=5), then rescale to [-1, 1].
# The result overwrites dfs[i]['Target'], so re-running this cell transforms the
# already-transformed target again.
for i in range(14):  
    data = dfs[i]['Target']
    # `mask` is not used below; only the two fences are needed for clipping.
    mask, lb, ub = find_outliers(data)
    data = np.maximum(lb, np.minimum(data, ub))
    smooth = data.ewm(span=5).mean()
    # NOTE(review): the scaler is fitted on the whole series -- no train /
    # validation split has happened at this point in the notebook. Confirm
    # that is intended.
    dfs[i]['Target'] = tscalers[i].fit_transform(smooth.values.reshape(-1,1))

In [8]:
#dfs[1][:1000].Target.plot()
#dfs[1][:1000].Target.ewm(span=5).mean().plot()
#plt.legend(['original', 'smoothing'], loc='upper right')
#plt.ylabel('value')
#plt.xlabel('data')

"""
from statsmodels.graphics.tsaplots import plot_acf
fig = plot_acf(dfs[1][:1000].Target, lags=10)
plt.title("Autocorrelation")
plt.show()"""
#dfs[1][:1000].Target.diff().plot()

In [9]:
def trim_dfs(dfs):
    """Drop, in place, every row that precedes the common start of all frames.

    The common start is the first element returned by ``time_bounds(dfs)``.
    Only the start is trimmed; rows after the common end are kept. Each value
    of ``dfs`` is replaced by the sliced frame; nothing is returned.
    """
    common_start, _ = time_bounds(dfs)
    for key in list(dfs):
        frame = dfs[key]
        dfs[key] = frame[common_start:]

trim_dfs(dfs)

In [10]:
#d = dfs[1][:50000]
#idx = d.index.hour==22 #d.index.weekday==5
#d[idx].Target.plot()

# Keep only the rows stamped at minute 30 of each hour. The frames were
# reindexed to one-minute frequency in prep_asset, so this leaves one sample
# per hour per asset.
# Values are rebound while iterating; that is safe because the key set does
# not change. Re-running the cell is a no-op on already-filtered frames.
for i, df in dfs.items():
    idx = df.index.minute==30
    dfs[i] = df[idx]

In [11]:
def add_lagged(dfs):
    """Append one-step-lagged copies of the feature columns to every frame.

    For each frame, all columns except ``timestamp`` and ``Target`` are shifted
    down by one row, suffixed with ``_lag1`` and concatenated column-wise. The
    first row (which has no predecessor) is dropped.

    ``dfs`` is updated in place and also returned.
    """
    for key in list(dfs):
        frame = dfs[key]
        previous = (
            frame.drop(columns=['timestamp', 'Target'])
                 .shift(1)
                 .add_suffix('_lag1')
        )
        dfs[key] = pd.concat([frame, previous], axis=1).iloc[1:]
    return dfs

#dfs = add_lagged(dfs)
#dfs[0].head(3)

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer


def timestamp_func(df):
    """Encode the ``timestamp`` column as daily and yearly sine/cosine pairs.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``timestamp`` column in seconds.

    Returns
    -------
    ndarray of float32, shape (len(df), 4)
        Columns: day_sin, day_cos, year_sin, year_cos. A year is taken as
        365.2425 days.
    """
    seconds = df['timestamp'].values

    day = 24*60*60
    year = (365.2425)*day

    # Phase angle of each timestamp within the daily / yearly cycle.
    day_phase = seconds * (2 * np.pi / day)
    year_phase = seconds * (2 * np.pi / year)

    columns = (
        np.sin(day_phase),   # day_sin
        np.cos(day_phase),   # day_cos
        np.sin(year_phase),  # year_sin
        np.cos(year_phase),  # year_cos
    )
    return np.stack(columns, axis=1).astype('float32')


def extractor_func(df):
    """Return the element-wise natural log of every column of ``df``.

    Values are floored at ``eps = 1e-5`` before the log so that zeros and
    negatives map to ``log(1e-5)`` instead of -inf / NaN.

    The output width follows the input (it used to be hard-coded to 7 columns,
    which zero-padded narrower inputs and raised IndexError on wider ones).

    Parameters
    ----------
    df : DataFrame
        Numeric columns only.

    Returns
    -------
    ndarray of float32, shape (len(df), number of columns of df)
    """
    eps = 1e-5
    # Compute in float64 and cast once at the end, as the per-column loop did.
    raw = df.to_numpy(dtype='float64')
    return np.log(np.maximum(eps, raw)).astype('float32')


def enricher_func(df):
    """Derive five candle-shape features from the OHLC / Volume / Count columns.

    Returns
    -------
    ndarray of float32, shape (len(df), 5)
        Columns, in order:
        0. Upper_Shadow      -- High - max(Close, Open)
        1. Lower_Shadow      -- min(Close, Open) - Low
        2. spread            -- High - Low
        3. mean_trade        -- Volume / Count
        4. log_price_change  -- log(Close / Open)

    No guard is applied to the two divisions; a zero Count or Open yields
    inf/NaN in the output.
    """
    open_ = df['Open'].values
    high = df['High'].values
    low = df['Low'].values
    close = df['Close'].values

    body_top = np.maximum(close, open_)
    body_bottom = np.minimum(close, open_)

    derived = [
        high - body_top,                            # Upper_Shadow
        body_bottom - low,                          # Lower_Shadow
        high - low,                                 # spread
        df['Volume'].values / df['Count'].values,   # mean_trade
        np.log(close / open_),                      # log_price_change
    ]
    return np.column_stack(derived).astype('float32')
    

def build_pipeline():
    """Build the (unfitted) per-asset preprocessing ColumnTransformer.

    Output column layout after ``transform``:

    * 4 columns  -- cyclical time encoding from ``timestamp_func``
    * 7 columns  -- standardised log of the raw numeric columns
    * 5 columns  -- standardised derived features from ``enricher_func``

    The module-level functions are passed to ``FunctionTransformer`` directly
    rather than through ``lambda`` wrappers: the transform is identical, but
    lambdas cannot be pickled, which made fitted pipelines impossible to
    persist with pickle/joblib.
    """
    # time features
    timestamp_trans = FunctionTransformer(func=timestamp_func)

    # numerical features
    extractor_trans = FunctionTransformer(func=extractor_func)

    # derived features
    enricher_trans = FunctionTransformer(func=enricher_func)

    numerical_cols = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']

    # A lagged step (the same log + scaling applied to the '<col>_lag1' columns
    # produced by add_lagged) was previously sketched here as commented-out
    # code; it is not part of the pipeline.
    preprocessor = ColumnTransformer([
        ('timestamp', timestamp_trans, ['timestamp']),
        ('numerical', make_pipeline(extractor_trans, StandardScaler()), numerical_cols),
        ('enricher', make_pipeline(enricher_trans, StandardScaler()), numerical_cols),
    ])
    return preprocessor

# Fit one preprocessing pipeline per asset on that asset's full frame
# (Target column excluded).
pipelines = {i: build_pipeline().fit(_df.drop('Target', axis=1)) for i, _df in dfs.items()}
# Width of the model's feature input: every transformed column except the
# first four, which are the time encodings emitted by the 'timestamp'
# transformer (listed first in the ColumnTransformer).
n_features = pipelines[0].transform(dfs[0]).shape[1] - 4

# testing
#arr = build_pipeline().fit_transform(dfs[1])
#print(arr.shape)

In [13]:
def data_generator(dfs, start, end, pipelines, batch_size=256, shuffle=True, epochs=1):
    """Yield fixed-size training batches covering all assets.

    Sample timestamps are taken from ``dfs[0][start:end]``; every other frame
    is looked up at those same index labels with ``.loc``.

    Parameters
    ----------
    dfs : mapping of asset id -> DataFrame
        Frames that contain a 'Target' column plus the pipeline input columns.
    start, end
        Slice bounds applied to ``dfs[0]`` to select the candidate timestamps.
    pipelines : mapping of asset id -> fitted transformer
        ``transform`` output is split as: first 4 columns -> time encoding,
        remaining columns -> features.
    batch_size : int
    shuffle : bool
        Shuffle the timestamp list (with the module-level ``rnd``) before the
        first pass and again at every wrap-around.
    epochs : int
        Number of passes over the timestamp list before the generator stops.

    Yields
    ------
    ([features, encodings], targets)
        ``features``  -- (batch_size, 14, n_features) float32
        ``encodings`` -- (batch_size, 14, 4 + 14) float32: time encoding then
                         a one-hot asset id
        ``targets``   -- (batch_size, 14) float32

    Notes
    -----
    * Reads the module-level ``n_features``.
    * The three arrays are allocated once and overwritten on every iteration;
      the same objects are yielded each time, so a consumer that keeps batches
      around must copy them.
    * Every batch has exactly ``batch_size`` rows: when the timestamp list is
      exhausted mid-batch, the batch is completed from the start of the
      (re-shuffled) list. That padded batch is still yielded before the
      generator stops.
    * NOTE(review): when the number of timestamps is an exact multiple of
      ``batch_size``, the wrap is only detected at the start of the next batch,
      so one extra full batch of wrapped samples is produced.
    """    
    
    # create an array with the indexes that can be shuffled
    indexes = dfs[0][start:end].index.tolist()
    length = len(indexes)
    
    # shuffle the indexes
    if shuffle:
        rnd.shuffle(indexes)
    
    # init 
    idx = 0 # current location
    batch_indexes = [0] * batch_size    
    epoch = 0
    flag = False  # set once the requested number of epochs has been completed
    
    # Output buffers, reused for every batch (see Notes in the docstring).
    targets = np.zeros((batch_size,14), dtype='float32')
    features = np.zeros((batch_size,14,n_features), dtype='float32')
    time_encoding = np.zeros((batch_size,14,4), dtype='float32') 
    # Identity matrix repeated per sample: row k is the one-hot code of asset k.
    asset_encoding = np.tile(np.expand_dims(np.eye(14, dtype='float32'), axis=0), (batch_size,1,1))
    encodings = np.concatenate([time_encoding, asset_encoding], axis=-1)
    
    while True:
        
        if flag:
            break
        
        # Pick the timestamps of this batch, wrapping around when needed.
        for i in range(batch_size):
            if idx >= length:
                epoch += 1
                flag = epoch>=epochs # determine if continue after pass through data
                idx = 0
                if shuffle:
                    rnd.shuffle(indexes)                    
            batch_indexes[i] = indexes[idx]            
            idx += 1        
        
        # Fill the buffers asset by asset for the selected timestamps.
        for i, df in dfs.items():
            values = pipelines[i].transform(df.loc[batch_indexes, :])
            
            features[:,i,:] = values[:,4:]
            encodings[:,i,:4] = values[:,:4]
            
            targets[:,i] = df.loc[batch_indexes,'Target']
        
        inputs = [features, encodings]
                
        yield inputs, targets

# testing
# Smoke test: pull a single batch over the full common window. This rebinds the
# module-level names start, end, generator, inputs and targets.
start, end = time_bounds(dfs)
generator = data_generator(dfs, start, end, pipelines)
inputs, targets = next(generator)

In [30]:
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a two-layer
    feed-forward net, each wrapped in dropout, a residual add and layer
    normalisation.

    Adapted from
    https://keras.io/examples/nlp/text_classification_with_transformer
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """
        Parameters
        ----------
        embed_dim : int
            Size of the attention key dimension and of the block output.
        num_heads : int
            Number of attention heads.
        ff_dim : int
            Hidden width of the feed-forward net.
        rate : float
            Dropout rate used after attention and after the feed-forward net.
        """
        super().__init__()
        layers = tf.keras.layers

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        # Feed-forward net: expand to ff_dim with ReLU, project back to embed_dim.
        expand = layers.Dense(ff_dim, activation="relu")
        project = layers.Dense(embed_dim)
        self.ffn = tf.keras.Sequential([expand, project])

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block; ``training`` switches the dropout layers on/off."""
        # Self-attention sub-layer (query = value = inputs).
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)

        # Position-wise feed-forward sub-layer.
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)


def custom_loss(weights):
    """Build a Keras-compatible loss weighted per asset.

    Parameters
    ----------
    weights : ndarray
        One weight per output column; normalised here to sum to 1.

    Returns
    -------
    callable ``(y_true, y_pred) -> scalar tensor``
        For each output column, takes the largest weighted squared error
        over the batch axis, then sums those maxima over the columns.
    """
    normalised = weights.reshape(1,-1) / weights.sum()
    w = tf.constant(normalised)

    def wrapper(y_true, y_pred):
        weighted_sq_err = tf.square(y_true - y_pred) * w
        # Alternative kept for reference: weighted sum over columns, mean over
        # the batch.
        #se = tf.reduce_sum(weighted_sq_err, axis=-1)
        #loss = tf.reduce_mean(se)
        worst_per_column = tf.math.reduce_max(weighted_sq_err, axis=0)
        return tf.reduce_sum(worst_per_column)

    return wrapper


def build_model(nx, weights):
    """Build and compile the cross-asset attention regressor.

    Architecture: three linear projections of the per-asset features feed a
    dot-product ``Attention`` layer across assets; its output (dropout + layer
    norm) is concatenated with the time / asset encodings and passed through a
    128-unit ReLU layer to one output per asset.

    Parameters
    ----------
    nx : int
        Number of features per asset.
    weights : ndarray
        Per-asset loss weights; its size also fixes the number of assets
        (previously hard-coded to 14).

    Returns
    -------
    tf.keras.Model
        Inputs ``[features (n_assets, nx), encodings (n_assets, 4 + n_assets)]``,
        output ``(n_assets,)``; compiled with Adam(1e-4) and ``custom_loss``.
    """
    layers = tf.keras.layers
    n_assets = int(np.size(weights))

    features = layers.Input(shape=(n_assets, nx))
    encodings = layers.Input(shape=(n_assets, 4 + n_assets))

    # Earlier experiments (a dense embedding of the features, stacked
    # TransformerBlock layers) were left here as commented-out code; the model
    # uses the single attention layer below.

    query = layers.Dense(16)(features)
    value = layers.Dense(32)(features)
    key = layers.Dense(16)(features)
    # Keras Attention expects its inputs in the order [query, value, key].
    attention = layers.Attention()([query, value, key])

    out_attention = layers.Dropout(0.1)(attention)
    out_attention = layers.LayerNormalization(epsilon=1e-6)(out_attention)

    # Use a Keras layer rather than the raw tf.concat op: calling TF ops on
    # symbolic Keras tensors is not supported by every Keras version.
    hidden = layers.Concatenate(axis=-1)([out_attention, encodings])

    dense = layers.Dense(128, activation="relu")(hidden)
    dense = layers.Dropout(0.1)(dense)
    outputs = layers.Dense(1)(dense)
    # (batch, n_assets, 1) -> (batch, n_assets)
    outputs = layers.Flatten()(outputs)

    model = tf.keras.models.Model(inputs=[features, encodings], outputs=outputs)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss=custom_loss(weights))

    return model

# build
model = build_model(n_features, weights)

model.summary()

In [40]:
#model.predict([np.random.randn(1, 14, 12), np.random.randn(1, 14, 18)]).ravel()

In [41]:
def _mean_validation_loss(model, val_start, val_end):
    """Average the compiled loss over one unshuffled pass of the validation window."""
    total = 0.0
    n_batches = 0
    for batch_inputs, batch_targets in data_generator(dfs, val_start, val_end, pipelines, shuffle=False):
        total += model.test_on_batch(batch_inputs, batch_targets)
        n_batches += 1
    return total / n_batches if n_batches else float('nan')


def train_model(model, epochs=10, log_every=1000, validate=False, lr_decay_every=None, lr_decay=0.75):
    """Train ``model`` batch by batch and record the running loss.

    The last three months of the common window are held out: training uses
    ``[common start, common end - 3 months]`` and validation (when enabled)
    uses the remainder.

    Parameters
    ----------
    model : compiled tf.keras.Model
    epochs : int
        Number of passes over the training window (default 10, as before).
    log_every : int
        Every ``log_every`` steps, print and record the square root of the
        mean batch loss since the previous log.
    validate : bool
        Also compute the validation loss at every log point. Off by default:
        this branch was previously disabled with ``if False`` and, as written,
        called ``custom_loss`` with the wrong signature and an undefined ``w``.
        It now uses ``model.test_on_batch``.
    lr_decay_every : int or None
        If set, multiply the learning rate by ``lr_decay`` every this many
        steps (previously disabled with ``if False``; default keeps it off).
    lr_decay : float

    Returns
    -------
    dict
        ``{'train_loss': [...], 'val_loss': [...]}``, one entry per log point.

    Notes
    -----
    Reads the module-level ``dfs`` and ``pipelines``. The batch loop is a plain
    ``for`` over the generator, so only generator exhaustion ends an epoch;
    errors raised by the data pipeline or by the model now propagate instead
    of being swallowed by a bare ``except`` and silently ending the epoch.
    ``step`` counts trained batches only.
    """
    train_start, train_end = time_bounds(dfs, right_shift=3)
    _, val_end = time_bounds(dfs)

    history = {'train_loss': [], 'val_loss': []}
    train_n = train_loss = 0
    step = 0
    tic = time.time()

    for _ in range(epochs):

        train_generator = data_generator(dfs, train_start, train_end, pipelines, batch_size=256, shuffle=True)

        for batch_inputs, batch_targets in train_generator:
            step += 1

            train_loss += model.train_on_batch(batch_inputs, batch_targets)
            train_n += 1

            if lr_decay_every and step % lr_decay_every == 0:
                lr = model.optimizer.learning_rate.numpy() * lr_decay
                model.optimizer.learning_rate.assign(lr)

            if step % log_every != 0:
                continue

            train_loss = np.sqrt(train_loss / train_n)
            print(f'step: {step} ------------------------')
            print('train_loss: {:.4f} train_time: {}'.format(train_loss, round((time.time()-tic)/60)))
            history['train_loss'].append(train_loss)
            train_n = train_loss = 0
            tic = time.time()

            if validate:
                val_loss = np.sqrt(_mean_validation_loss(model, train_end, val_end))
                print('val_loss: {:.4f} val_time: {}'.format(val_loss, round((time.time()-tic)/60)))
                history['val_loss'].append(val_loss)
                tic = time.time()

    return history

history = train_model(model)

In [42]:
#model.save(f'{FOLDER}/model')
#model = tf.keras.models.load_model(f'{FOLDER}/model')

In [43]:
def show_performance(asset_id=1):
    """Plot predictions against targets for one batch of held-out data.

    Draws one batch (default generator settings, i.e. shuffled) from the last
    three months of the common window, runs the module-level ``model`` on it
    and plots the predicted and true target of one asset.

    Parameters
    ----------
    asset_id : int
        Output column to plot (default 1, the value previously hard-coded).

    Notes
    -----
    Reads the module-level ``dfs``, ``pipelines`` and ``model``. A first
    generator over the training window used to be created and immediately
    overwritten without being consumed; that dead assignment is removed.
    """
    _, train_end = time_bounds(dfs, right_shift=3)
    _, val_end = time_bounds(dfs)

    # Validation window: [train_end, val_end].
    generator = data_generator(dfs, train_end, val_end, pipelines)

    batch_inputs, batch_targets = next(generator)
    batch_predictions = model.predict(batch_inputs)

    # Values are on the scaled [-1, 1] target scale (no inverse transform).
    y_pred = batch_predictions[:, asset_id]
    y_true = batch_targets[:, asset_id]

    plt.plot(y_pred)
    plt.plot(y_true)
    plt.title('model performance')
    plt.ylabel('value')
    plt.xlabel('data')
    plt.legend(['y_pred', 'y_true'], loc='upper right')
    plt.show()

show_performance()

In [44]:
"""
plt.plot(history['train_loss'])
plt.plot(history['val_loss'])
plt.title('evolution of loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train_loss', 'val_loss'], loc='upper right')
plt.show()"""

In [45]:
def make_input(pipelines, df_test, dfs_asset, features, encodings):
    """Refresh the model inputs in place from one batch of test rows.

    For every row of ``df_test`` the single-row state frame of that asset is
    overwritten with the row's non-missing values (missing values keep the
    previous state), transformed with the asset's pipeline and written into
    ``features``. The time encoding of the last processed row is then copied
    to every asset in ``encodings``.

    Parameters
    ----------
    pipelines : mapping of asset id -> fitted transformer
        ``transform`` output: first 4 columns time encoding, rest features.
    df_test : DataFrame
        Must contain 'Asset_ID' and every column of the state frames.
    dfs_asset : mapping of asset id -> single-row DataFrame
        Latest known inputs per asset; updated in place.
    features : ndarray, shape (1, n_assets, n_features)
        Updated in place for the assets present in ``df_test``.
    encodings : ndarray, shape (1, n_assets, 4 + ...)
        First four columns updated in place for all assets.

    Notes
    -----
    Fixes over the previous version:

    * The state row is addressed by position. The frames are indexed by
      datetime, so the former ``df_asset.loc[0, col] = ...`` did not update the
      row but appended a new one labelled ``0``; ``values[0]`` then still held
      the stale training row and the test data never reached the model.
    * An empty ``df_test`` no longer fails on an unbound ``values``; the
      encodings are simply left unchanged.
    * The time encoding is broadcast over the asset axis instead of being
      tiled to a hard-coded 14 rows.
    """
    values = None
    for _, row in df_test.iterrows():
        i = int(row['Asset_ID'])
        df_asset = dfs_asset[i]
        for pos, col in enumerate(df_asset.columns):
            new_value = row[col]
            if pd.notna(new_value):
                df_asset.iloc[0, pos] = new_value
        # feature transformation
        values = pipelines[i].transform(df_asset)
        features[0, i, :] = values[0, 4:]

    if values is not None:
        # All rows of one test batch share a timestamp, so the time encoding of
        # the last processed row is used for every asset.
        encodings[0, :, :4] = values[0, :4]
    

def make_predictions(model, pipelines, dfs):
    """Run the competition inference loop and submit one prediction per row.

    The model inputs are initialised from the last row of every asset frame
    and then refreshed from each test batch with ``make_input``. Whatever
    happens inside a batch, ``env.predict`` is always called: on failure the
    batch is submitted with all targets set to 0.

    Parameters
    ----------
    model : tf.keras.Model
    pipelines : mapping of asset id -> fitted transformer
    dfs : mapping of asset id -> DataFrame
        Training frames; only the last row of each is used.

    Notes
    -----
    Reads the module-level ``n_features``. The number of assets is taken from
    ``dfs`` instead of being hard-coded to 14. The fallback handler catches
    ``Exception`` (not a bare ``except``), so KeyboardInterrupt / SystemExit
    are no longer swallowed, and it prints the error so failures are visible.
    """
    n_assets = len(dfs)

    features = np.zeros((1, n_assets, n_features), dtype='float32')
    time_encoding = np.zeros((1, n_assets, 4), dtype='float32')
    asset_encoding = np.expand_dims(np.eye(n_assets, dtype='float32'), axis=0)
    encodings = np.concatenate([time_encoding, asset_encoding], axis=-1)

    # Latest known inputs per asset (single-row frames, Target removed).
    dfs_asset = {i: df.loc[[df.index[-1]],:].drop('Target', axis=1).copy() for i, df in dfs.items()}
    values = np.concatenate([pipelines[i].transform(dfs_asset[i]) for i in range(n_assets)], axis=0)

    features[0,:,:] = values[:,4:]

    env = gresearch_crypto.make_env()
    iter_test = env.iter_test()

    for df_test, df_pred in iter_test:

        try:
            make_input(pipelines, df_test, dfs_asset, features, encodings)

            predictions = model.predict([features, encodings]).ravel()
            print('success', predictions) # TODO: tscalers.inverse_transform()

            # predictions[k] belongs to Asset_ID k.
            df_test['Target'] = predictions[df_test['Asset_ID'].to_numpy()]
            df_pred = df_pred.drop('Target', axis=1).merge(df_test[['row_id', 'Target']], on='row_id', how='left')
        except Exception as exc:
            # Best-effort fallback: submit zeros, but report what went wrong.
            df_pred['Target'] = 0
            print('failure', repr(exc))
        finally:
            # The API requires exactly one predict() call per batch.
            env.predict(df_pred)

make_predictions(model, pipelines, dfs)