# Crypto Forecasting - Basic NN + Feature importance

A basic NN example, optimising the competition target, calibrated on folds from on-line feature engineering (see: https://www.kaggle.com/lucasmorin/on-line-feature-engineering). The on-line feature engineering allows for submission with features. For the corresponding LGBM baseline see: https://www.kaggle.com/lucasmorin/online-fe-lgbm-feval-importances (no custom loss - not submitting). 

The notebook also includes:
- some changes to standard architecture to use the custom weighted loss in the model.
- the architecture of the model is rather simple and regularized thanks to a ton of noise. 
- for the moment the convergence is rather poor: +/-1% inside a fold. Very different results across folds. Ensembling seems to improve the overall LB result a bit. 
- a feature importance solution (from @cdeotte's work in the Google Ventilator Pressure Prediction Challenge) based on shuffling features. The solution can be rather slow, so it is possible to deactivate it easily. A similar feature importance through SHAP approximation of Shapley values. A feature importance plot for a nice visualisation of importance across folds (from @nyanp's currently winning solution from the Optiver volatility forecasting competition).
- some practical tests: normalisation of targets, post processing (removing the market average), removing the worst models from fold 4. Not really working.  

# Import and options

In [None]:
import gresearch_crypto

env = gresearch_crypto.make_env()


import pandas as pd
import numpy as np
import os
import gc
import pickle

import time
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

import tensorflow as tf
from tensorflow import keras
import numpy as np
from tensorflow.keras import backend as K

# Global seed value.
# NOTE(review): the training loop reuses the name `seed` as its loop variable,
# so this value is overwritten once training starts.
seed = 2021

# Run switches read by the cells below.
DEBUG = False  # True: 2 seeds / 10 epochs and only the first 5% of timestamps per fold
TARGET_NORM = False  # True: standardise Target with the train-fold mean and std
SHAP = True  # True: compute SHAP feature importances with GradientExplainer
EXPLAIN = False  # True: compute shuffle-based feature importance after each fold
LAST_FOLD = False  # True: train on fold 4 only instead of every fold
Market_Correction = False  # True: subtract the weighted market prediction (reassigned in the submission cell)
Filter_Bad_Models = False  # True: only ensemble models scoring above `level` (reassigned in the submission cell)

n_fold = 5  # number of folds the training loop iterates over

# Some definitions

In [None]:
# One fold is loaded up front only to derive the column layout shared by all folds.
fold = 4
path = '../input/on-line-feature-engineering/'
train = pd.read_parquet(f'{path}train_fold_{fold}.parquet')

mean = pd.read_parquet(f'{path}mean_fold_{fold}.parquet')
std = pd.read_parquet(f'{path}std_fold_{fold}.parquet')

# Every column that is not an identifier, a target or a weight is a numerical feature.
non_feature_columns = {'timestamp', 'Target', 'Target_M', 'weights', 'Asset_ID'}
numerical_columns = [col for col in train.columns if col not in non_feature_columns]
categorical_columns = ['Asset_ID']
target_columns = ['Target']
cols = numerical_columns + categorical_columns

asset_nunique = train['Asset_ID'].nunique()
print('asset_nunique:',asset_nunique)

asset_details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')

# Map the first column of asset_details (asset id) to the second one (weight).
dict_weights = dict(zip(asset_details.iloc[:, 0].to_numpy(),
                        asset_details.iloc[:, 1].to_numpy()))

# model definition

In [None]:
# Width of each hidden Dense layer of base_model, first to last.
hidden_units = (256,64,16,4)
# Standard deviations for the GaussianNoise layers of base_model
# (0.5, 0.2, 0.1, 0.05, 0.025).
# NOTE(review): base_model reads index 0 for the input noise and indices 0..3
# for the hidden layers, so the fifth value is never used - confirm the
# intended mapping.
hidden_noise = 0.1*np.array((5,2,1,0.5,0.25))

from functools import partial

def corr_loss(y_true, y_pred):
    """Return the negative Pearson correlation between targets and predictions.

    Both inputs are cast to float32 and every reduction runs over all
    elements.  The correlation is clipped to [-1, 1] before being negated, so
    minimising the returned value maximises the correlation.
    """
    truth = tf.cast(y_true, tf.float32)
    preds = tf.cast(y_pred, tf.float32)

    truth_mean = K.mean(truth)
    preds_mean = K.mean(preds)
    truth_dev = truth - truth_mean
    preds_dev = preds - preds_mean

    cross_sum = K.sum(tf.multiply(truth_dev, preds_dev))
    truth_sq_sum = K.sum(K.square(truth_dev))
    preds_sq_sum = K.sum(K.square(preds_dev))
    scale = K.sqrt(tf.multiply(truth_sq_sum, preds_sq_sum))

    correlation = cross_sum / scale
    correlation = K.maximum(K.minimum(correlation, 1.0), -1.0)
    return - correlation

def wcorr_loss(y_true, y_pred, w):
    """Return the negative weighted Pearson correlation of targets and predictions.

    ``w`` holds one weight per element.  All three inputs are cast to float32.
    Weighted means, variances and covariance are each divided by the sum of
    the weights; the resulting correlation is clipped to [-1, 1] and negated.
    """
    truth = tf.cast(y_true, tf.float32)
    preds = tf.cast(y_pred, tf.float32)
    weights = tf.cast(w, tf.float32)

    def weighted_mean(values):
        # sum(values * w) / sum(w)
        return K.sum(tf.multiply(values, weights)) / K.sum(weights)

    def weighted_cross(left, right):
        # sum((left * w) * right) / sum(w)
        return K.sum(tf.multiply(tf.multiply(left, weights), right)) / K.sum(weights)

    truth_dev = truth - weighted_mean(truth)
    preds_dev = preds - weighted_mean(preds)

    cov_xy = weighted_cross(truth_dev, preds_dev)
    var_x = weighted_cross(truth_dev, truth_dev)
    var_y = weighted_cross(preds_dev, preds_dev)

    correlation = cov_xy / K.sqrt(tf.multiply(var_x, var_y))
    correlation = K.maximum(K.minimum(correlation, 1.0), -1.0)
    return - correlation

def wcorr_fn(w):
    """Build a two-argument weighted-correlation function bound to weights ``w``.

    Returns a callable ``wcorr_eval(y_true, y_pred)`` that evaluates
    ``wcorr_loss`` with the captured weights.
    """
    def wcorr_eval(y_true, y_pred):
        return wcorr_loss(y_true, y_pred, w)

    # The closure used to be built and then dropped, so callers received None.
    return wcorr_eval

def base_model():
    """Build and compile the dense network trained on the weighted correlation.

    The training model takes three inputs, in this order: the per-row weights,
    the true target and the numerical features.  Weights and target are only
    inputs so that the loss, attached with ``add_loss``, can read them; the
    prediction itself depends on the numerical features alone.

    Returns:
        A compiled ``keras.Model`` whose only loss is
        ``wcorr_loss(y_true, prediction, weights)``.
    """
    # Auxiliary inputs consumed by the loss only.
    weights_input = keras.Input(shape=(1,), name='weigths')
    y_true = keras.Input(shape=(1,), name='true')
    # Numerical features: the only input the prediction depends on.
    num_input = keras.Input(shape=(len(numerical_columns),), name='num_data')

    out = keras.layers.BatchNormalization()(num_input)
    out = keras.layers.GaussianNoise(stddev=hidden_noise[0])(out)

    # Hidden stack: Dense -> BatchNormalization -> GaussianNoise per layer.
    # NOTE(review): the noise index restarts at 0 here, so hidden_noise[0] is
    # applied twice and hidden_noise[4] is never read - confirm whether
    # hidden_noise[n + 1] was intended.
    for n, units in enumerate(hidden_units):
        out = keras.layers.Dense(units, activation='swish')(out)
        out = keras.layers.BatchNormalization()(out)
        out = keras.layers.GaussianNoise(stddev=hidden_noise[n])(out)

    # Single linear output: the predicted target.
    out = keras.layers.Dense(1, activation='linear', name='prediction')(out)

    model = keras.Model(
        inputs=[weights_input, y_true, num_input],
        outputs=out,
    )

    model.add_loss(wcorr_loss(y_true, out, weights_input))

    # No compiled metric on purpose.  The previous
    # ``metrics=[wcorr_fn(weights_input)]`` evaluated to ``[None]`` because
    # wcorr_fn returned nothing, so no metric was ever tracked.  Passing the
    # real closure would give Keras a metric that captures a symbolic input
    # (presumably not evaluable on concrete batches - verify before
    # re-adding).  The add_loss term already reports the same quantity as
    # ``loss`` / ``val_loss``.
    model.compile(
        keras.optimizers.Adam(learning_rate=0.00005),
        loss=None,
    )

    return model

# Model calibration

In [None]:
# Trained inference models keyed by (seed, fold).
models = {}
# One list of shuffle-importance deltas per fold (filled only when EXPLAIN).
importances = []
# (fold, seed, best validation score) for every trained model.
df_scores = []

n_seed = 2 if DEBUG else 5
n_epochs = 10 if DEBUG else 300
folds = [4] if LAST_FOLD else range(n_fold-1, -1, -1)

for fold in folds:
    print('Fold: '+str(fold))

    train = pd.read_parquet(path+'train_fold_'+str(fold)+'.parquet')
    test = pd.read_parquet(path+'test_fold_'+str(fold)+'.parquet')

    mean = pd.read_parquet(path+'mean_fold_'+str(fold)+'.parquet')
    std = pd.read_parquet(path+'std_fold_'+str(fold)+'.parquet')

    # Standardise the numerical features with this fold's statistics; NaN
    # results (missing inputs, zero std) become 0.
    center = mean.transpose()[numerical_columns].iloc[0].values.squeeze()
    scale = std.transpose()[numerical_columns].iloc[0]
    train[numerical_columns] = (train[numerical_columns]-center).div(scale).fillna(0)
    test[numerical_columns] = (test[numerical_columns]-center).div(scale).fillna(0)

    if DEBUG:
        # Keep only the first 5% of timestamps.  Builtin int() replaces the
        # np.int alias, which was removed in NumPy 1.24.
        train_timestamps = train.timestamp.unique()
        test_timestamps = test.timestamp.unique()
        timestamp_sample_train = train_timestamps[:int(len(train_timestamps)*0.05)]
        timestamp_sample_test = test_timestamps[:int(len(test_timestamps)*0.05)]
        train = train[train.timestamp.isin(timestamp_sample_train)]
        test = test[test.timestamp.isin(timestamp_sample_test)]

    # Rows without a target cannot contribute to the loss.
    train = train[~train.Target.isna()]
    test = test[~test.Target.isna()]

    train['weights'] = train.Asset_ID.map(dict_weights).astype('float32')
    test['weights'] = test.Asset_ID.map(dict_weights).astype('float32')

    if TARGET_NORM:
        target_mean = np.mean(train['Target'])
        target_std = np.std(train['Target'])
        train['Target'] = (train['Target']-target_mean)/target_std
        test['Target'] = (test['Target']-target_mean)/target_std

    weights_train = train[['weights']]
    weights_test = test[['weights']]

    for seed in range(n_seed):

        tf.random.set_seed(seed)

        print('Fold: '+str(fold)+ ' - seed: '+str(seed))

        model = base_model()

        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-05, patience=25, verbose=1,mode='min',restore_best_weights=True)

        plateau = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.5, patience=10, verbose=1,
            mode='min')

        # The target is passed both as a model input (read by the add_loss
        # term) and as y.
        hist = model.fit([weights_train, train['Target'],train[numerical_columns]],
                  train['Target'],
                  batch_size=1024,
                  epochs=n_epochs,
                  validation_data=([weights_test, test['Target'], test[numerical_columns]], test['Target']),
                  callbacks=[plateau,es],
                  shuffle=True,verbose = 0)

        plt.plot(hist.history['val_loss'], label= 'fold '+str(fold)+' seed '+str(seed))
        # val_loss is the negative weighted correlation, so the score is its
        # negated minimum.
        df_scores.append((fold, seed, -min(hist.history['val_loss'])))

        # Inference model: numerical features in, prediction out.
        final_model = keras.Model(model.input[2],model.output)

        final_model.save('model_'+str(fold)+ '_seed_'+str(seed))

        models[(seed,fold)] = final_model

    plt.legend(loc="upper left", bbox_to_anchor=(1, 0.5))
    plt.show()

    # Only the last seed's model of each fold is explained.
    if EXPLAIN:
        results = []
        # Baseline loss without any shuffling.
        oof_preds = model.predict([weights_test, test['Target'],test[numerical_columns]], verbose=0).squeeze()
        baseline_loss = wcorr_loss(test['Target'], oof_preds, test['weights'])

        for k in cols:
            # Shuffle feature k.  An explicit permutation + assignment does
            # not depend on `.values` being a writable view of the frame.
            save_col = test[k].copy()
            test[k] = np.random.permutation(save_col.values)

            # Loss with feature k shuffled, relative to the baseline.
            oof_preds = model.predict([weights_test, test['Target'],test[numerical_columns]], verbose=0).squeeze()
            loss  = wcorr_loss(test['Target'], oof_preds, test['weights'])
            # Stored as a plain float rather than a tensor.
            results.append(float(loss - baseline_loss))
            test[k] = save_col

            del save_col, oof_preds
            gc.collect()

        importances.append(results)

# Results

In [None]:
# Fold x seed table of validation scores, with a mean row and a mean column.
score_table = pd.DataFrame(df_scores, columns=['fold', 'seed', 'score'])
df_results = score_table.pivot(index='fold', columns='seed', values='score')

# Row of per-seed means first, then a column of per-row means (which therefore
# also covers the 'seed_mean' row).
df_results.loc['seed_mean'] = df_results.mean(axis=0, numeric_only=True)
df_results.loc[:, 'fold_mean'] = df_results.mean(axis=1, numeric_only=True)
df_results

# SHAP - explainability 2

In [None]:
import shap

if SHAP:
    # One mean-|SHAP| vector per (fold, seed) model, in visiting order.
    SHAP_values = []
    for j in folds:
        for i in range(n_seed):
            # NOTE(review): `train` and `test` are whatever the training loop
            # loaded last, so the same 1000-row samples are used for every
            # fold's models - confirm this is intended.
            explainer = shap.GradientExplainer(models[(i,j)], train[numerical_columns].iloc[:1000].values)
            shap_values = explainer.shap_values(test[numerical_columns].iloc[:1000].values)
            # Mean absolute attribution over the first axis.
            # NOTE(review): assumes shap_values is a list holding one array per
            # model output - TODO confirm for the installed shap version.
            SHAP_values.append(np.mean(np.abs(shap_values[0]),axis=0))

The feature importance plot below is adapted from nyanp's Optiver solution.

In [None]:
def plot_importance(importances, features_names = cols, PLOT_TOP_N = 20, figsize=(10, 10)):
    """Show a horizontal box plot of the most important features.

    Args:
        importances: 2-D data with one row per importance run and one column
            per feature.
        features_names: column labels for ``importances``.
        PLOT_TOP_N: number of features to display, ranked by median importance
            in descending order.
        figsize: figure size handed to ``plt.subplots``.
    """
    frame = pd.DataFrame(data=importances, columns=features_names)

    # Rank features by their median importance across runs.
    ranking = frame.median(axis=0).sort_values(ascending=False).index
    ordered = frame.loc[:, ranking]
    top_features = ordered.columns[:PLOT_TOP_N]

    _, ax = plt.subplots(figsize=figsize)
    ax.grid()
    ax.set_ylabel('Feature')
    ax.set_xlabel('Importance')
    sns.boxplot(data=ordered[top_features], orient='h', ax=ax)
    plt.show()

# Shuffle-based importances have one column per entry of `cols`.
if EXPLAIN:
    plot_importance(np.array(importances),cols, PLOT_TOP_N = 20, figsize=(10, 20))
    
# SHAP importances are computed on the numerical model inputs only.
if SHAP:
    plot_importance(np.array(SHAP_values),numerical_columns, PLOT_TOP_N = 20, figsize=(10, 20))

In [None]:
#pickle.dump(models, open('NN_models.pkl', 'wb'))
# Context managers make sure each file is flushed and closed once written.
with open('importances.pkl', 'wb') as importances_file:
    pickle.dump(importances, importances_file)
# NOTE(review): 'featrures.pkl' looks like a typo for 'features.pkl'; the name
# is kept because other code may load this file - confirm before renaming.
with open('featrures.pkl', 'wb') as features_file:
    pickle.dump(cols, features_file)

# Submission

In [None]:
# Re-key the (fold, seed, score) records as (seed, fold) -> score, which is the
# key order used by `models`.
dict_score = {
    (seed_id, fold_id): score
    for fold_id, seed_id, score in df_scores
}

In [None]:
class RunningMean:
    """Rolling mean over the last ``WIN_SIZE`` pushed values.

    A running sum is updated on every push; once more than ``WIN_SIZE`` values
    have been seen, the oldest one is subtracted again.  NaN entries of a
    pushed value are replaced by the matching entries of the previously pushed
    value before it is accumulated.

    Args:
        WIN_SIZE: number of most recent values the mean covers.
        n_size: length of the zero vector reported before anything was pushed.
    """

    def __init__(self, WIN_SIZE=20, n_size = 1):
        # Local import: this file never imports `collections` at module level.
        import collections

        self.n = 0
        self.n_size = n_size
        self.mean = np.zeros(n_size)
        self.cum_sum = 0
        self.past_value = 0
        self.WIN_SIZE = WIN_SIZE
        # One extra slot so the oldest value is still available for
        # subtraction right after a new one has been appended.
        self.windows = collections.deque(maxlen=WIN_SIZE+1)

    def clear(self):
        """Forget every pushed value."""
        self.n = 0
        # The running sum must be reset too, otherwise the values pushed
        # before the reset leak into every later mean.
        self.cum_sum = 0
        self.windows.clear()

    def push(self, x):
        """Add ``x`` to the window and refresh the mean."""
        # NaNs are currently filled with the previous value; might change.
        x = fillna_npwhere(x, self.past_value)
        self.past_value = x

        self.windows.append(x)
        self.cum_sum += x

        if self.n < self.WIN_SIZE:
            # Window still filling up: average over what was seen so far.
            self.n += 1
            self.mean = self.cum_sum / float(self.n)

        else:
            # Window full: drop the oldest value from the running sum.
            self.cum_sum -= self.windows.popleft()
            self.mean = self.cum_sum / float(self.WIN_SIZE)

    def get_mean(self):
        """Return the current mean, or zeros when nothing has been pushed."""
        if self.n:
            return self.mean
        # Instances unpickled from files written by an older version of this
        # class have no `n_size` attribute; fall back to the stored mean's shape.
        n_size = getattr(self, 'n_size', None)
        if n_size is None:
            return np.zeros_like(self.mean)
        return np.zeros(n_size)

    def __str__(self):
        return "Current window values: {}".format(list(self.windows))

# njit is temporarily disabled because it caused many bugs down the line,
# mainly around data types; the types need constraining before re-enabling it.
#@njit
def fillna_npwhere(array, values):
    """Replace the NaN entries of ``array`` with the matching entries of ``values``.

    ``array`` is returned unchanged (same object) when its sum is not NaN;
    otherwise a new array built with ``np.where`` is returned.
    """
    if not np.isnan(array.sum()):
        return array
    return np.where(np.isnan(array), values, array)

In [None]:
Market_Correction = False
Filter_Bad_Models = True
# Minimum validation score a model needs to be kept when Filter_Bad_Models is set.
level = 0.045

iter_test = env.iter_test()

# Running-statistic state saved by the feature-engineering notebook.
# NOTE: pickle can execute arbitrary code on load - only point these paths at
# trusted files.
with open('../input/on-line-feature-engineering/dict_RM_4.pkl', 'rb') as pkl_file:
    dict_RM = pickle.load(pkl_file)
with open('../input/on-line-feature-engineering/dict_RM_M_4.pkl', 'rb') as pkl_file:
    dict_RM_M = pickle.load(pkl_file)
with open('../input/on-line-feature-engineering/dict_MM_4.pkl', 'rb') as pkl_file:
    dict_MM = pickle.load(pkl_file)
with open('../input/on-line-feature-engineering/dict_MR_4.pkl', 'rb') as pkl_file:
    dict_Mr = pickle.load(pkl_file)


import os
from random import random

sampling = 0.05

# Window lengths of the moving averages and of the rolling betas.
MA_lags = [2,5,15,30,60,120,300,1800,3750,2*3750,7*24*60]
beta_lags = [15,30,60,120,300,600,1800,3750,2*3750,7*24*60]

# Feature names, in the order the feature blocks are concatenated at inference.
Features_names = ['log_ret','log_ret_H','log_ret_L','log_ret_VWAP','GK_vol','RS_vol','log_Count','log_Volume','log_Dollars','log_Volume_per_trade','log_Dollars_per_trade']
Market_Features_names = [s+'_M' for s in Features_names]
Time_Features_names = ['sin_month','cos_month','sin_day','cos_day','sin_hour','cos_hour','sin_minute','cos_minute']
MA_Features_names = [s+'_'+str(lag) for lag in MA_lags for s in Features_names ]
MA_Features_M_names = [s+'_'+str(lag) for lag in MA_lags for s in Market_Features_names]
betas_names = ['betas_'+str(lag) for lag in beta_lags]

All_names = Features_names + Market_Features_names + Time_Features_names + MA_Features_names + MA_Features_M_names + betas_names
#df_values = pd.DataFrame(values, columns = All_names)

# Built once here so the weights are not rebuilt on every loop iteration.
asset_details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')
dict_weights = dict(zip(asset_details.iloc[:, 0].to_numpy(),
                        asset_details.iloc[:, 1].to_numpy()))
# Weight vector indexed by asset id 0..13.
weigths = np.array([dict_weights[i] for i in range(14)])

# only needed when saving ?
# (the duplicated 'Count' key of the original literal was dropped; the
# resulting dict is identical)
dtype={'Asset_ID': 'int8', 'Count': 'int32', 'row_id': 'int32',
       'Open': 'float32', 'High': 'float32', 'Low': 'float32', 'Close': 'float32',
       'Volume': 'float32', 'VWAP': 'float32'}
#test_df = test_df.astype(dtype)

#refactoring functions:

def timestamp_to_date(timestamp):
    """Convert a POSIX timestamp (seconds) to a naive datetime expressed in UTC.

    The conversion is pinned to UTC so the derived calendar features do not
    depend on the timezone of the machine running the notebook.  The result
    carries no tzinfo, like the value returned before.
    """
    from datetime import timezone

    return datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)

def Clean_df(x):
    """Pad one timestamp's rows to 14 assets and return the columns.

    ``x`` is a 2-D array with the timestamp in column 0 and the asset id in
    column 1.  For every asset id in 0..13 that is absent, a row holding the
    first row's timestamp, that id and eight NaNs is appended.  Rows are then
    sorted by asset id.

    Returns:
        A generator yielding each column of the padded, sorted array.
    """
    asset_ids = x[:, 1]
    stamp = x[0, 0]

    if len(asset_ids) < 14:
        absent = [asset for asset in range(14) if asset not in asset_ids]
        for asset in absent:
            filler = np.array((stamp, asset) + (np.nan,) * 8)
            x = np.concatenate((x, filler[np.newaxis, :]))

    order = np.argsort(x[:, 1])
    x = x[order]
    return (x[:, column] for column in range(x.shape[1]))

def Base_Feature_fn(timestamp,Asset_ID,Count,O,H,L,C,Volume,VWAP):
    """Compute the eleven base features for every row of one timestamp.

    ``timestamp`` and ``Asset_ID`` are accepted but not used.  Prices are
    first divided by the close.  The result has one row per input row and the
    columns log_ret, log_ret_H, log_ret_L, log_ret_VWAP, GK_vol, RS_vol,
    log_Count, log_Volume, log_Dollars, log_Volume_per_trade and
    log_Dollars_per_trade, in that order.
    """
    # Infinite VWAP values are replaced by the open/close midpoint.
    VWAP = np.where(np.isinf(VWAP),(C+O)/2,VWAP)

    # Express every price relative to the close.
    close_price = C
    O, H, L, C, VWAP = (O/close_price, H/close_price, L/close_price,
                        C/close_price, VWAP/close_price)

    dollars = Volume * close_price
    volume_per_trade = Volume/Count
    dollars_per_trade = dollars/Count

    log_ret = np.log(C/O)
    log_ret_H = np.log(H/C)
    log_ret_L = np.log(C/L)
    log_ret_VWAP = np.log(C/VWAP)

    # Volatility terms built from the same log ratios.
    GK_vol = 1 / 2 * np.log(H/L) ** 2 - (2 * np.log(2) - 1) * log_ret ** 2
    RS_vol = log_ret_H*np.log(H/O) + np.log(L/C)*np.log(L/O)

    activity_logs = [np.log(quantity) for quantity in
                     (Count, Volume, dollars, volume_per_trade, dollars_per_trade)]

    feature_rows = [log_ret, log_ret_H, log_ret_L, log_ret_VWAP, GK_vol, RS_vol] + activity_logs
    return np.array(feature_rows).T

def Time_Feature_fn(timestamp):
    """Encode month, day, hour and minute of ``timestamp`` as sine/cosine pairs.

    Each component is scaled by its period (12, 31, 24 and 60 respectively).
    Returns an array of eight values ordered sin_month, cos_month, sin_day,
    cos_day, sin_hour, cos_hour, sin_minute, cos_minute.
    """
    cycles = (
        (timestamp.month, 12),
        (timestamp.day, 31),
        (timestamp.hour, 24),
        (timestamp.minute, 60),
    )

    encoded = []
    for value, period in cycles:
        angle = 2 * np.pi * value/period
        encoded.extend((np.sin(angle), np.cos(angle)))

    return np.array(encoded)

# Preprocessing per fold: every model was trained on inputs standardised with
# its own fold's statistics, so one (mean, std) pair is kept per fold instead
# of reusing whichever value the variable `fold` was left with.
fold_mean = {}
fold_std = {}
for j in folds:
    fold_mean[j] = pd.read_parquet(path+'mean_fold_'+str(j)+'.parquet').transpose()[numerical_columns].values
    fold_std[j] = pd.read_parquet(path+'std_fold_'+str(j)+'.parquet').transpose()[numerical_columns].values

# Models taking part in the ensemble; this does not change between iterations.
all_models = [(i, j) for i in range(n_seed) for j in folds]
if Filter_Bad_Models:
    selected_models = [key for key in all_models if dict_score[key] > level]
    if not selected_models:
        # A median over zero models is undefined; use every model instead.
        print('No model scores above level '+str(level)+' - using all models')
        selected_models = all_models
else:
    selected_models = all_models

for (test_df, sample_prediction_df) in iter_test:

    # One array per column, padded to 14 assets and sorted by asset id.
    timestamp,Asset_ID,Count,O,H,L,C,Volume,VWAP,row_id = Clean_df(test_df.values)

    Features = Base_Feature_fn(timestamp,Asset_ID,Count,O,H,L,C,Volume,VWAP)

    # Weights become NaN where data is missing so those assets do not appear
    # in the market aggregate.
    weigths_curr = np.where(np.isnan(O),O,weigths)
    Market_Features = np.nansum(Features*np.expand_dims(weigths_curr,axis=1)/np.nansum(weigths_curr),axis=0)

    # Named `current_time` so the imported `time` module is not shadowed.
    current_time = timestamp_to_date(timestamp[0])
    Time_Features = Time_Feature_fn(current_time)

    MA_Features = []
    MA_Features_M  = []

    # Feed the running means with the raw features, then read them back.
    for lag in MA_lags:
        dict_RM[lag].push(Features.copy())
        dict_RM_M[lag].push(Market_Features.copy())

        MA_Features.append(dict_RM[lag].get_mean())
        MA_Features_M.append(dict_RM_M[lag].get_mean())

    # Standardise with the 3750 lag: its running mean is subtracted from the
    # last six feature columns, for the raw features and every moving average.
    ref = 3750
    MA_ref = dict_RM[ref].get_mean().copy()
    MA_M_ref = dict_RM_M[ref].get_mean().copy()

    Features[:,-6:] = (Features[:,-6:] - MA_ref[:,-6:]).copy()
    Market_Features[-6:] = (Market_Features[-6:] - MA_M_ref[-6:]).copy()

    for i in range(len(MA_lags)):
        MA_Features[i][:,-6:] = (MA_Features[i][:,-6:] - MA_ref[:,-6:]).copy()
        MA_Features_M[i][-6:] = (MA_Features_M[i][-6:] - MA_M_ref[-6:]).copy()

    MA_Features_agg = np.concatenate(MA_Features,axis=1)
    MA_Features_M_agg = np.concatenate(MA_Features_M)

    # Rolling betas: mean(market * asset) / mean(market ** 2), both taken on
    # the first feature column.
    betas = []

    for lag in beta_lags:
        dict_MM[lag].push(Market_Features[0]**2)
        dict_Mr[lag].push(Market_Features[0]*Features[:,0])
        betas.append(np.expand_dims(dict_Mr[lag].get_mean()/dict_MM[lag].get_mean(),axis=1))

    betas = np.concatenate(betas,axis=1)
    betas = np.nan_to_num(betas, nan=0., posinf=0., neginf=0.)

    # Market-wide and time features are repeated for each of the 14 assets.
    values = np.concatenate((Features,np.tile(Market_Features,(14,1)),np.tile(Time_Features,(14,1)),MA_Features_agg,np.tile(MA_Features_M_agg,(14,1)),betas),axis=1)

    # Standardise once per fold, then take the median prediction over models.
    fold_inputs = {j: np.nan_to_num((values - fold_mean[j])/fold_std[j]) for j in folds}
    preds = np.median(np.array([models[(i,j)].predict(fold_inputs[j]) for (i,j) in selected_models]),axis=0).flatten()

    if Market_Correction:
        # weigths_curr holds NaN for missing assets; nansum keeps those NaNs
        # from turning every prediction into NaN.
        market_pred = np.nansum(preds * weigths_curr)/np.nansum(weigths_curr)
        preds -= market_pred

    # Map predictions back to the requested rows through row_id.
    sample_prediction_df['Target'] = [preds[(row_id == rid)][0] for rid in sample_prediction_df.row_id.values]
    env.predict(sample_prediction_df)