In [None]:
import numpy as np
import pandas as pd

import os
import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
from collections import OrderedDict

import matplotlib.pyplot as plt
import seaborn as sns

import gresearch_crypto

import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
# Fix TensorFlow's global random seed so repeated runs start from the same
# random state.
tf.random.set_seed(200)

# Accelerator selector. Only the value "TPU" changes behaviour in this
# notebook (here, in build_model and in build_global_model); any other value
# builds models without a distribution strategy.
DEVICE = 'CPU' # CPU GPU TPU

if DEVICE == "TPU":
    # detect and init the TPU
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

    # instantiate a distribution strategy
    # NOTE: `tpu_strategy` only exists when DEVICE == "TPU"; the model
    # builders reference it under the same condition.
    # NOTE(review): `tf.distribute.experimental.TPUStrategy` looks like the
    # older experimental alias - confirm against the installed TF version.
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    
    print('Running on TPU ', tpu.master())

# Print the name and type of every GPU TensorFlow can see (prints nothing
# when the list is empty).
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

# Enumerate all local devices known to TensorFlow. The returned list is not
# assigned or printed here.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

# Ask TensorFlow whether a GPU is usable. The boolean result is not assigned
# or printed here.
# NOTE(review): presumably deprecated in recent TF 2.x releases in favour of
# `tf.config.list_physical_devices('GPU')` - confirm before upgrading.
tf.test.is_gpu_available()
    

# Working directory for artefacts: a 'dev' folder under the current working
# directory. It is created (and the creation reported) only when it does not
# exist yet, so re-running the cell is harmless.
FOLDER = os.path.join(os.getcwd(), 'dev')
if not os.path.isdir(FOLDER):
    os.mkdir(FOLDER)
    print('created', FOLDER)

In [None]:
# Explicit column dtypes for the training CSV so pandas does not have to
# infer them: integer ids/counts, float prices/volumes/targets.
dtype = {'timestamp': np.dtype('int64'), 'Asset_ID': int, 'Count': int, 
         'Open': float, 'High': float, 'Low': float, 'Close': float,
         'Volume': float, 'VWAP': float, 'Target': float}

# Load the full training set; the path is relative to the notebook's working
# directory.
df = pd.read_csv('../input/g-research-crypto-forecasting/train.csv', dtype=dtype)

# The datetime index is not built here; prep_asset_func derives it per asset.
#df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
#df = df.set_index('datetime')#.drop('timestamp', axis=1)

# Report the (rows, columns) shape and show the first rows as the cell output.
print(df.shape)
df.head()

In [None]:
def prep_asset_func(df):
    """Put one asset's rows on a gap-free, one-minute datetime grid.

    Steps, in order:
      1. derive a datetime index from the Unix-seconds ``timestamp`` column;
      2. sort by time and keep the first row of any duplicated timestamp;
      3. turn +/-inf into NaN;
      4. reindex onto every minute between the first and last timestamp;
      5. flag the rows that had to be created (``interpolated`` = 1.0);
      6. linearly interpolate every column (filling in both directions).

    The input frame is left untouched; a new frame is returned.

    This function is also the first step of the preprocessing pipeline, so it
    is routinely applied to frames it has already prepared. In that case the
    existing ``interpolated`` flags are kept (combined with any newly created
    gaps) instead of being reset to zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single asset with a numeric ``timestamp`` column expressed
        in seconds. Must contain at least one row.

    Returns
    -------
    pandas.DataFrame
        Minute-frequency frame indexed by datetime, with an ``interpolated``
        column (float 0.0/1.0) and no NaN left by gaps.
    """
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a 'datetime' column.
    df = df.assign(datetime=pd.to_datetime(df['timestamp'], unit='s'))
    df = df.set_index('datetime').sort_index()
    df = df.loc[~df.index.duplicated()]
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.reindex(pd.date_range(start=df.index.min(), end=df.index.max(), freq='min'))

    # Rows created by the reindex have no timestamp yet: those are the gaps.
    gap_flag = df['timestamp'].isnull().astype(float)
    if 'interpolated' in df.columns:
        # Re-application: keep flags from the earlier pass (their timestamps
        # have been interpolated since, so they no longer look like gaps).
        gap_flag = np.maximum(gap_flag, df['interpolated'].fillna(0.0))
    df['interpolated'] = gap_flag

    df = df.interpolate(method='linear', limit_direction='both', axis=0)
    return df

# Build one prepared frame per asset, keyed by Asset_ID in ascending order.
# Each asset's rows are copied first so prep_asset_func works on its own
# frame rather than on a slice of `df`.
dfs = OrderedDict()
for asset_id in sorted(df['Asset_ID'].unique()):    
    df_asset = df[df['Asset_ID'] == asset_id].copy()   
    dfs[asset_id] = prep_asset_func(df_asset)

# The raw frame is no longer needed; drop the name so its memory can be
# reclaimed. NOTE: this makes the cell non-re-runnable without reloading `df`.
del df

In [None]:
# Summary statistics of the prepared frame stored under key 1, transposed so
# that each original column becomes a row.
dfs[1].describe().transpose()

In [None]:
#for df_ in dfs.values():
    #print(sum(df_['timestamp'].diff() != 60))
    #print(df_['timestamp'].diff().max() / 60)
    #print(df_.interpolated.sum())

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer


def timestamp_func(df):
    """Encode Unix timestamps as daily and yearly sine/cosine signals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column holding seconds.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 4)`` whose columns are, in order:
        day sine, day cosine, year sine, year cosine.
    """
    seconds_per_day = 24*60*60
    seconds_per_year = (365.2425)*seconds_per_day

    stamps = df['timestamp'].values

    # One (sin, cos) pair per period, each kept as a column vector.
    columns = []
    for period in (seconds_per_day, seconds_per_year):
        phase = stamps * (2 * np.pi / period)
        columns.append(np.sin(phase).reshape(-1, 1))
        columns.append(np.cos(phase).reshape(-1, 1))

    return np.concatenate(columns, axis=1)


def add_features_func(df):
    """Add candle-shape and trade-size features to ``df`` and return it.

    The columns are written onto the frame that was passed in (no copy is
    made). Added columns, in order: ``Upper_Shadow``, ``Lower_Shadow``,
    ``spread``, ``mean_trade``, ``log_price_change``.
    """
    open_price, close_price = df['Open'], df['Close']
    high_price, low_price = df['High'], df['Low']

    # Top and bottom of the candle body (element-wise, NaN-propagating).
    body_top = np.maximum(close_price, open_price)
    body_bottom = np.minimum(close_price, open_price)

    df['Upper_Shadow'] = high_price - body_top
    df['Lower_Shadow'] = body_bottom - low_price
    df['spread'] = high_price - low_price
    df['mean_trade'] = df['Volume'] / df['Count']
    df['log_price_change'] = np.log(close_price / open_price)

    return df

def values_extractor_func(df):
    """Return the raw values of ``df`` as a 2-D array with one row per record."""
    n_rows = len(df)
    return df.values.reshape((n_rows, -1))
    

def build_pipeline():
    """Build the (unfitted) per-asset preprocessing pipeline.

    Stages, in order:
      1. ``prep_asset_func``   - minute grid, gap flag, interpolation;
      2. ``add_features_func`` - derived candle / trade-size columns;
      3. a ``ColumnTransformer`` that outputs, side by side:
         * 4 cyclical time columns from ``timestamp_func``,
         * 12 standard-scaled numerical columns (``Count`` plus prices,
           volume and the derived features),
         * the ``interpolated`` flag passed through unchanged.

    That is 4 + 12 + 1 = 17 output columns, which is the feature width the
    per-asset models are created with. Columns that are not listed (for
    example ``Target``) are not forwarded under ColumnTransformer's default
    ``remainder`` setting.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # The module-level functions are handed over directly rather than wrapped
    # in lambdas: behaviour is the same, but lambdas cannot be pickled, which
    # would prevent saving a fitted pipeline with pickle/joblib.
    prep_asset = FunctionTransformer(func=prep_asset_func)
    add_features = FunctionTransformer(func=add_features_func)
    timestamp_trans = FunctionTransformer(func=timestamp_func)
    values_extractor = FunctionTransformer(func=values_extractor_func)

    # standard scaler for every numerical column
    scaler = StandardScaler()

    numlst = ['Open', 'High', 'Low', 'Close', 'Volume', 'VWAP',
              'Upper_Shadow', 'Lower_Shadow', 'spread', 'mean_trade', 'log_price_change']

    preprocessor = ColumnTransformer([('timestamp', timestamp_trans, ['timestamp']),
                                      ('numerical', scaler, ['Count']+numlst),
                                      ('values', values_extractor, ['interpolated'])])

    return make_pipeline(prep_asset, add_features, preprocessor)

#preprocessor = build_pipeline()

In [None]:
#arr = preprocessor.fit_transform(dfs[1])
#plt.plot(arr[:,2])
#plt.plot(arr[:,3])
#plt.xlabel('Time [h]')
#plt.title('Time of day signal')
#print(arr.shape)
#arr[:1]

In [None]:
def build_model(Tx, nx):
    """Build and compile one per-asset sequence regressor.

    Architecture: Input(Tx, nx) -> GRU(32) -> GRU(8) -> Dense(1, linear), with
    both GRUs returning full sequences and using dropout 0.2. The model is
    compiled with MSE loss and Adam (learning rate 1e-4).

    When the global ``DEVICE`` is "TPU" everything is created inside
    ``tpu_strategy.scope()``.

    Returns
    -------
    tuple
        ``(model, input_tensor, second_gru_output)``; the last two are the
        symbolic tensors reused to assemble the global model.
    """

    def assemble():
        # symbolic input and stacked recurrent layers
        seq_in = tf.keras.layers.Input(shape=(Tx, nx))
        hidden = tf.keras.layers.GRU(32, return_sequences=True, dropout=0.2)(seq_in)
        features = tf.keras.layers.GRU(8, return_sequences=True, dropout=0.2)(hidden)
        prediction = tf.keras.layers.Dense(1, activation='linear')(features)

        net = tf.keras.models.Model(inputs=seq_in, outputs=prediction)
        net.compile(loss='mse',
                    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))

        return net, seq_in, features

    if DEVICE != "TPU":
        return assemble()

    with tpu_strategy.scope():
        return assemble()

    
def create_models(dfs, Tx=20, nx=17):
    """Create one freshly initialised per-asset model for every key of ``dfs``.

    Parameters
    ----------
    dfs : mapping
        Only its keys (asset ids) are used.
    Tx : int, default 20
        Sequence length the models are built for.
    nx : int, default 17
        Number of input features per time step (the width produced by the
        preprocessing pipeline).

    Returns
    -------
    dict
        ``{asset_id: {'model': ..., 'inp': ..., 'out': ...}}`` holding what
        ``build_model`` returns for each asset.
    """
    models = {}
    for asset_id in dfs:
        model, inp, out = build_model(Tx=Tx, nx=nx)
        models[asset_id] = {'model': model, 'inp': inp, 'out': out}
    return models


def create_pipelines(dfs):
    """Fit one preprocessing pipeline per asset.

    Each pipeline is built with ``build_pipeline`` and fitted on the whole
    frame stored for that asset. Returns ``{asset_id: fitted_pipeline}``.
    """
    return {asset_id: build_pipeline().fit(frame)
            for asset_id, frame in dfs.items()}
    

def train_model(train_X, train_Y, model, epochs, max_steps=1):
    """Train ``model`` batch by batch and print one summary line per epoch.

    Parameters
    ----------
    train_X, train_Y : iterables
        Yield input batches and target batches in the same order; they are
        zipped together.
    model : object with ``train_on_batch(x, y)``
        Must return the batch loss as a number.
    epochs : int
        Number of passes to run.
    max_steps : int or None, default 1
        Maximum number of batches consumed per epoch. The default of 1 keeps
        the previous smoke-test behaviour (a single batch per epoch); pass
        ``None`` to train on every batch.

    Notes
    -----
    The printed ``loss`` is the sum of the batch losses seen in the epoch.
    """
    for epoch in range(epochs):

        tic = time.time()
        train_loss = 0

        # Nothing is drawn from the datasets when no step is requested.
        if max_steps is None or max_steps > 0:
            batches = zip(train_X, train_Y)
        else:
            batches = ()

        for step, (batch_x, batch_y) in enumerate(batches, start=1):
            train_loss += model.train_on_batch(batch_x, batch_y)
            # Stop right after the last allowed batch so no extra batch is
            # fetched; never true when max_steps is None.
            if step == max_steps:
                break

        print("epoch: {} loss: {:.4f} time: {:.2f}".format(epoch+1, train_loss, time.time()-tic))


def pretrain(dfs, models, pipelines, end):
    """Pre-train every per-asset model on its own history up to ``end``.

    For each asset the frame is cut at ``end``, transformed by the asset's
    fitted pipeline, and turned into two aligned window datasets (features
    and ``Target``), both with 20-step windows, stride 1, batches of 1024 and
    no shuffling. ``train_model`` is then run for one epoch.
    """
    window_params = dict(sequence_length=20, sequence_stride=1, sampling_rate=1,
                         batch_size=1024, shuffle=False)

    for asset_id, frame in dfs.items():

        history = frame[:end]
        features = pipelines[asset_id].transform(history)
        targets = history['Target'].values

        # Same windowing for inputs and targets keeps the two streams aligned.
        feature_windows = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=features, targets=None, **window_params)
        target_windows = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=targets, targets=None, **window_params)

        print('Asset_ID:', asset_id)

        train_model(feature_windows, target_windows, models[asset_id]['model'], epochs=1)


# Common training window: it starts at the latest first-timestamp across
# assets and ends three months before the earliest last-timestamp, so the
# final three months of every asset stay out of training.
start = max([df_.index.min() for df_ in dfs.values()])
end = min([df_.index.max() for df_ in dfs.values()]) - relativedelta(months=3)
print(start, end) 

#m, _, _ = build_model(Tx=20, nx=17); m.summary()

# One model and one fitted preprocessing pipeline per asset.
# NOTE: each pipeline is fitted on the asset's whole frame, i.e. including
# the rows after `end`.
models = create_models(dfs)
pipelines = create_pipelines(dfs)

# Pre-train every per-asset model on its own history up to `end`
# (`start` is not used by pretrain).
pretrain(dfs, models, pipelines, end)

In [None]:
"""
X = np.arange(100)
#Y = X*2
Y = np.concatenate([(X*2).reshape(-1,1),(X*2).reshape(-1,1)], axis=1)

input_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
  X, None, batch_size=4, sequence_length=10)

target_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
  Y, None, batch_size=4, sequence_length=1, start_index=9)

for inputs, targets in zip(input_dataset, target_dataset):
    print(inputs)
    print(targets)
    break
""" 
    
""" 
input_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
  X, None, batch_size=4, sequence_length=10)

target_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
  Y, None, batch_size=4, sequence_length=10)

for inputs, targets in zip(input_dataset, target_dataset):
    print(inputs)
    print(targets)
    break"""   
#print(i)

In [None]:

def make_train_outputs(dfs, start, end, SEQ_LENGTH=20):
    """Build the target dataset for the global model.

    The ``Target`` column of every asset is cut to ``[start, end]`` and the
    columns are placed side by side (one column per asset, in ``dfs`` order,
    float32). The dataset yields windows of length 1 starting at row
    ``SEQ_LENGTH-1``, in batches of 1024, so that target ``i`` lines up with
    the last step of input window ``i``.
    """
    target_columns = []
    for frame in dfs.values():
        column = frame[start:end]['Target'].values
        target_columns.append(column.reshape(-1, 1).astype(np.float32))

    stacked = np.concatenate(target_columns, axis=1)

    return tf.keras.preprocessing.timeseries_dataset_from_array(
        data=stacked, targets=None,
        sequence_length=1, batch_size=1024, start_index=SEQ_LENGTH-1)


def make_train_inputs(dfs, pipelines, start, end, SEQ_LENGTH=20):
    """Build one input window dataset per asset for the global model.

    Each asset's frame is cut to ``[start, end]``, transformed by its fitted
    pipeline and windowed into sequences of ``SEQ_LENGTH`` steps (batches of
    1024). The datasets are returned as a list in ``dfs`` order.
    """

    def windows_for(asset_id, frame):
        features = pipelines[asset_id].transform(frame[start:end])
        return tf.keras.preprocessing.timeseries_dataset_from_array(
            data=features, targets=None,
            sequence_length=SEQ_LENGTH, batch_size=1024)

    return [windows_for(asset_id, frame) for asset_id, frame in dfs.items()]


# Same common training window as used for pre-training: from the latest
# first-timestamp across assets to three months before the earliest
# last-timestamp.
start = max([df_.index.min() for df_ in dfs.values()])
end = min([df_.index.max() for df_ in dfs.values()]) - relativedelta(months=3)
print('train window:', start, end)  

# Target dataset: one column per asset, aligned with the end of each input
# window.
outputs = make_train_outputs(dfs, start, end)
#for o in outputs:
#    print(tf.squeeze(o))
#    break
    
# Input datasets: a list with one windowed dataset per asset.
inputs = make_train_inputs(dfs, pipelines, start, end)
#for batch in inputs:
#    lst = list(batch)
#    print(len(lst))
#    print(lst[0])
#    break

In [None]:
def build_global_model(models):
    """Assemble the multi-asset model on top of the per-asset networks.

    The second-GRU output of every per-asset model is concatenated along the
    feature axis, fed to a GRU(128) whose final state is kept, and mapped by
    a linear Dense layer to one prediction per asset.

    Parameters
    ----------
    models : dict
        ``{asset_id: {'model': ..., 'inp': ..., 'out': ...}}`` as returned by
        ``create_models``. Iteration order defines both the order of the
        model inputs and the order of the output columns.

    Returns
    -------
    tf.keras.Model
        Uncompiled model taking one input per asset and returning a tensor
        with ``len(models)`` columns. Built inside ``tpu_strategy.scope()``
        when the global ``DEVICE`` is "TPU".
    """

    def wrapper(models):

        inps = [entry['inp'] for entry in models.values()]
        outs = [entry['out'] for entry in models.values()]
        x = tf.concat(outs, -1)

        # return_state=True yields (output, final_state); only the final
        # state is used downstream.
        _, dense = tf.keras.layers.GRU(128, return_sequences=False, return_state=True, dropout=0.2)(x)

        # One output unit per per-asset model (previously hard-coded to 14).
        y = tf.keras.layers.Dense(len(models), activation='linear')(dense)

        return tf.keras.models.Model(inputs=inps, outputs=y)

    if DEVICE == "TPU":
        with tpu_strategy.scope():
            model = wrapper(models)
    else:
        model = wrapper(models)

    return model


def loss_fn(y_true, y_pred):
    """Mean of the squared differences between targets and predictions, taken over the last axis."""
    residual = y_true - y_pred
    return tf.reduce_mean(tf.square(residual), axis=-1)


def train_global_model(model, inputs, outputs, epochs=1, max_steps=1):
    """Train the global model with a manual gradient-tape loop.

    Parameters
    ----------
    model : tf.keras.Model
        Model returned by ``build_global_model``.
    inputs : list
        One batched dataset per asset (see ``make_train_inputs``).
    outputs : dataset
        Batched targets (see ``make_train_outputs``).
    epochs : int, default 1
        Number of passes over the data.
    max_steps : int or None, default 1
        Maximum number of batches per epoch; ``None`` uses every batch.

    Notes
    -----
    * The defaults perform a single gradient step, which is what the
      previous version effectively did: its loops were cut short by debug
      ``break`` statements, one of which also made the epoch summary
      unreachable. Pass e.g. ``epochs=4, max_steps=None`` for real training.
    * Targets built with ``sequence_length=1`` arrive as ``(batch, 1, n)``
      while the model predicts ``(batch, n)``. Subtracting the two would
      broadcast to ``(batch, batch, n)`` and compare every sample with every
      other one, so the length-1 time axis is squeezed out first.
    * The loss is the per-sample sum of squared errors, averaged over the
      batch; the printed epoch loss is the sum over the batches seen.
    """
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

    # zip(*dataset) yields tuples (asset_0_batch, ..., asset_k_batch, targets)
    dataset = inputs + [outputs]

    for epoch in range(epochs):

        tic = time.time()
        train_loss = 0

        # Nothing is drawn from the datasets when no step is requested.
        if max_steps is None or max_steps > 0:
            batches = zip(*dataset)
        else:
            batches = ()

        for step, batch in enumerate(batches, start=1):

            batch_x = list(batch[:-1])
            batch_y = batch[-1]

            # (batch, 1, n) -> (batch, n) so it matches the prediction shape
            if batch_y.shape.rank == 3 and batch_y.shape[1] == 1:
                batch_y = tf.squeeze(batch_y, axis=1)

            with tf.GradientTape() as tape:
                # compute predictions
                batch_pred = model(batch_x, training=True)

                # compute loss
                errors = tf.square(batch_y - batch_pred)
                se = tf.reduce_sum(errors, axis=-1)
                loss = tf.reduce_mean(se)

            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))

            if step % 10 == 0:
                print(step, loss.numpy())

            train_loss += loss.numpy()

            # Stop right after the last allowed batch; never true when
            # max_steps is None.
            if step == max_steps:
                break

        print("epoch: {}, loss: {}, time: {}".format(epoch, train_loss, time.time()-tic))

# Assemble the global model from the per-asset models and print its layer
# summary.
model = build_global_model(models); model.summary()

# Train it on the per-asset input datasets and the stacked target dataset.
train_global_model(model, inputs, outputs)

In [None]:
#filename = './dev/model_v0'
# Save the weights
#model.save_weights(filename)

# Create a new model instance
#model = build_global_model(models)

# Restore the weights
#model.load_weights(filename)

In [None]:
def weighted_correlation(a, b, weights):
    """Weighted Pearson correlation between ``a`` and ``b``.

    All three arguments are flattened first, so any matching shapes are
    accepted. Returns ``cov_w(a, b) / sqrt(var_w(a) * var_w(b))`` where every
    moment is weighted by ``weights``.
    """
    w = np.ravel(weights)
    x = np.ravel(a)
    y = np.ravel(b)

    total_weight = np.sum(w)

    def weighted_mean(values):
        return np.sum(values * w) / total_weight

    mean_x = weighted_mean(x)
    mean_y = weighted_mean(y)

    var_x = weighted_mean(np.square(x - mean_x))
    var_y = weighted_mean(np.square(y - mean_y))

    covariance = weighted_mean(x * y) - mean_x * mean_y

    return covariance / np.sqrt(var_x * var_y)

In [None]:
"""
history = models[0]['hist']

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('evolution of loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper right')
plt.show()"""

"""
for asset_id, model in models.items():
    plt.plot(model['hist'].history['loss'])
    plt.title('Asset_ID: ' + str(asset_id))
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train'], loc='upper right')
    plt.show()"""

In [None]:
def make_predictions(model, pipelines, dfs, SEQ_LENGTH=20):
    """Run the competition inference loop with the global model.

    For every batch served by the ``gresearch_crypto`` environment:
      1. the new rows are added to the per-asset history in ``dfs``;
      2. the last ``SEQ_LENGTH`` minutes of every asset are pushed through
         that asset's fitted pipeline to form the model inputs;
      3. the model predicts one value per asset;
      4. the predictions are written into ``df_pred['Target']`` by
         ``row_id`` and submitted with ``env.predict``.

    Parameters
    ----------
    model : tf.keras.Model
        Global model with one input per asset and one output per asset, both
        in the iteration order of ``dfs``.
    pipelines : dict
        ``{asset_id: fitted_pipeline}``.
    dfs : dict
        ``{asset_id: history_frame}`` indexed by datetime. Updated in place:
        each entry grows by the rows received for that asset.
    SEQ_LENGTH : int, default 20
        Number of time steps fed to the model.

    Notes
    -----
    Changes relative to the previous version of this function:
      * new rows are actually stored (``DataFrame.append`` returns a new
        frame, and its result used to be discarded, so every prediction was
        made from the same stale history);
      * ``timestamp`` is kept on the incoming rows because the pipelines
        read it;
      * predictions are matched to assets through the keys of ``dfs``
        instead of assuming the ids are ``0..n-1``;
      * rows without a prediction get 0.0 instead of NaN.
    """
    env = gresearch_crypto.make_env()
    iter_test = env.iter_test()

    # Fixed asset order: defines both the input order and the meaning of each
    # output column of the model.
    asset_ids = list(dfs.keys())

    for df_test, df_pred in iter_test:

        # Index the new rows by minute like the history frames; 'timestamp'
        # stays as a column.
        df_test = df_test.assign(datetime=pd.to_datetime(df_test['timestamp'], unit='s'))
        df_test = df_test.set_index('datetime')

        # Extend each asset's history with the rows just received.
        for asset_id, new_rows in df_test.groupby('Asset_ID'):
            if asset_id not in dfs:
                # no history/model for this asset; it falls back to 0.0 below
                continue
            history = pd.concat([dfs[asset_id], new_rows])
            # A re-sent minute replaces the stored one, and the index must be
            # sorted for the time-based slice below.
            history = history[~history.index.duplicated(keep='last')]
            if not history.index.is_monotonic_increasing:
                history = history.sort_index()
            dfs[asset_id] = history

        inputs = []
        for asset_id in asset_ids:
            df = dfs[asset_id]

            tmp_df = df[df.index[-1]-relativedelta(minutes=SEQ_LENGTH-1):]

            values = pipelines[asset_id].transform(tmp_df)[-SEQ_LENGTH:]
            if len(values) < SEQ_LENGTH:
                # Too little recent history: repeat the oldest available row
                # so the model still receives SEQ_LENGTH steps.
                values = np.pad(values, ((SEQ_LENGTH - len(values), 0), (0, 0)), mode='edge')

            inputs.append(values[np.newaxis, ...]) # shape = (1, SEQ_LENGTH, n_features)

        predictions = model.predict(inputs).ravel() # one value per asset

        # asset id -> prediction, then row_id -> prediction for this batch
        pred_by_asset = dict(zip(asset_ids, predictions))
        pred_by_row = dict(zip(df_test['row_id'], df_test['Asset_ID'].map(pred_by_asset)))
        df_pred['Target'] = df_pred['row_id'].map(pred_by_row).fillna(0.0)

        env.predict(df_pred)

# Keep only the last month of history per asset before inference (the model
# only looks at the most recent minutes). `dfs` is rebound to a plain dict
# built in the same key order.
dfs = {asset_id: df[df.index[-1]-relativedelta(months=1):] for asset_id, df in dfs.items()}
# Run the competition inference loop with the trained global model.
make_predictions(model, pipelines, dfs)