# Setup

In [1]:
# Setup
import os
import time
import gc
import tqdm
import copy
import json
import pickle

import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_addons as tfa
from keras import backend as K

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from collections import deque

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# To train a model to predict target
training = True
prepare_dataset = True

# To save models and dataset
save = False

# Submission
submission = True

# To load submission models or used trained ones
load_submission_model = False

# Seed
seed = 42

# Verbosity
verbose = 1


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



# General Functions

In [2]:
def load_data_from_file(path='/kaggle/input/optiver-trading-at-the-close/train.csv'):
    return pd.read_csv(path)


def load_model(model_path, compile=False):
    return tf.keras.models.load_model(model_path, compile=compile)


def load_input_scaler(input_scaler_path='/kaggle/working/input_scaler.pkl'):
    with open(input_scaler_path, "rb") as f:
        scaler = pickle.load(f)
    return scaler


def fill_sequence(sequence, step, n_features):
    # A function to fill sequence with the mask value  
    new_sequence = copy.deepcopy(sequence)        
    for i in range(step - len(sequence)):
        new_sequence.append(np.array([mask_value]*(n_features)).reshape(1, -1))
    return np.array(new_sequence)


def fill_targets(targets, step, n_features):
    # A function to fill sequence with the mask value  
    new_targets = copy.deepcopy(targets)        
    for i in range(step - len(targets)):
        new_targets.append(0.0)
    return np.array(new_targets)


def features_engineering(df, runtime=False):   
    if not runtime:
        # Drop NaN values
        df = df.dropna()
        
        # Drop unused columns
        df = df.drop(columns=['far_price', 'near_price', 'row_id', 'time_id'])
    else:
        # Drop unused columns
        df = df.drop(columns=['far_price', 'near_price'])
                
    df["bid_ask_imbalance"] = df.eval("(bid_size - ask_size) / (bid_size + ask_size)")    
    df["ask_size_x_price"] = df.eval("ask_size * ask_price")
    df["bid_size_x_price"] = df.eval("bid_size * bid_price")
    df["complex_imbalance"] = df.eval("(bid_size_x_price - ask_size_x_price) / (bid_size_x_price + ask_size_x_price)")
    df["prices_average"] = df.eval('(reference_price + ask_price + bid_price + wap)/4')
    
    prices = ['reference_price', 'ask_price', 'bid_price', 'wap']
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_spread"] = df.eval(f"{c[0]}-{c[1]}")
        df[f"{c[0]}_{c[1]}_urgency"] = df[f"{c[0]}_{c[1]}_spread"] * df["bid_ask_imbalance"]
        df[f'{c[0]}_{c[1]}_imbalance'] = df.eval(f'({c[0]}-{c[1]})/({c[0]}+{c[1]})')
        df[f'{c[0]}_x_{c[1]}'] = df.eval(f'{c[0]} * {c[1]}')
    
    # Drop spred columns
    columns_to_keep = [col for col in df.columns if 'spread' not in col]
    df = df[columns_to_keep]
    
    df = df.drop(columns=['ask_size_x_price', 'bid_size_x_price'])
    
     # Drop NaN values
    df = df.dropna()
    return df


def prepare_data(df,
                 mask_value=np.float32(999999999999.0),
                 step=4,
                 save=True,
                 sequences_file_name='/kaggle/working/sequences.npy',
                 targets_file_name='/kaggle/working/targets.npy',
                 input_scaler_file_name='/kaggle/working/input_scaler.pkl'):
     
    # Features Engineering
    df = features_engineering(df, runtime=False)

    # Data standardization
    features_to_remove_for_scaler = ['stock_id', 'date_id', 'target']
    input_scaler = StandardScaler()
    input_scaler.fit(df.drop(features_to_remove_for_scaler, axis=1, inplace=False))
    
    # Number of features in the final dataset
    n_features = len(df.columns) - len(features_to_remove_for_scaler)
    
    # Group the data by stock_id and date_id
    grouped = df.groupby(['stock_id', 'date_id'])
    
    # Lists to fill
    data_sequences = []
    targets_sequences = []
    
    # Loop over data
    for group_name, group_data in tqdm.tqdm(grouped):
        
        # Get names
        stock_id, date_id = group_name
        
        # Order data by 'seconds_in_bucket'
        group_data = group_data.sort_values(by='seconds_in_bucket')
        
        # Deque to fill
        sequence = deque(maxlen=step)
        targets = deque(maxlen=step)
                
        # Iterate over a grup of data
        for i in range(len(group_data)):
            
            # Get the row
            row = group_data.iloc[i]
               
            # Get the target
            target = row['target']
            
            # Standardize the data
            data = row.drop(features_to_remove_for_scaler).to_numpy().reshape(1, -1)
            standardized_data = input_scaler.transform(data)
            
            # Fill deque
            sequence.append(standardized_data)
            targets.append(target)
            
            # Fill up to steps
            new_sequence = fill_sequence(sequence, step, n_features)
            new_targets = fill_targets(targets, step, n_features)
              
            # Fill data sequence list
            data_sequences.append(np.squeeze(new_sequence, axis=1))
            
            # Fill targets list
            targets_sequences.append(new_targets)
    
    # Convert to numpy array
    data_sequences = np.array(data_sequences, dtype='float32')
    
    # Convert to numpy array
    targets_sequences = np.array(targets_sequences, dtype='float32')
    
    if save:
        print(f"Saving sequences to {sequences_file_name}")
        np.save(sequences_file_name, data_sequences, allow_pickle=True)
    
        print(f"Saving targets to {targets_file_name}")
        np.save(targets_file_name, targets_sequences, allow_pickle=True)
        
        print(f"Saving the input scaler {input_scaler_file_name}")
        pickle.dump(input_scaler, open(input_scaler_file_name, 'wb'))
    
    # Run the garbage collector
    gc.collect()
    
    return data_sequences, targets_sequences, input_scaler


def load_dataset_and_scaler(sequences_file_name='/kaggle/working/sequences.npy',
                            targets_file_name='/kaggle/working/targets.npy',
                            input_scaler_file_name='/kaggle/working/input_scaler.pkl'):
    # Sequences
    with open(sequences_file_name, "rb") as f:
        sequences = np.load(f, allow_pickle=True)
         
    # Targets
    with open(targets_file_name, "rb") as f:
        targets = np.load(f, allow_pickle=True)
        
    # Input scaler
    with open(input_scaler_file_name, "rb") as f:
        input_scaler = pickle.load(f)
            
    return sequences, targets, input_scaler



class CleanCallback(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            K.clear_session()
            gc.collect()


def Ranger(sync_period=6,
           slow_step_size=0.5,
           learning_rate=0.001,
           beta_1=0.9,
           beta_2=0.999,
           epsilon=1e-7,
           weight_decay=0.,
           amsgrad=False,
           sma_threshold=5.0,
           total_steps=0,
           warmup_proportion=0.1,
           min_lr=0.,
           name="Ranger"):
    """
        function returning a tf.keras.optimizers.Optimizer object
        returned optimizer is a Ranger optimizer
        Ranger is an optimizer combining RAdam (https://arxiv.org/abs/1908.03265) and Lookahead (https://arxiv.org/abs/1907.0861)
        returned optimizer can be fed into the model.compile method of a tf.keras model as an optimizer
        ...
        Attributes
        ----------
        learning_rate : float
            step size to take for RAdam optimizer (depending on gradient)
        beta_1 : float
            parameter that specifies the exponentially moving average length for momentum (0<=beta_1<=1)
        beta_2 : float
            parameter that specifies the exponentially moving average length for variance (0<=beta_2<=1)
        epsilon : float
            small number to cause stability for variance division
        weight_decay : float
            number with which the weights of the model are multiplied each iteration (0<=weight_decay<=1)
        amsgrad : bool
            parameter that specifies whether to use amsgrad version of Adam (https://arxiv.org/abs/1904.03590)
        total_steps : int
            total number of training steps
        warmup_proportion : float
            the proportion of updated over which the learning rate is increased from min learning rate to learning rate (0<=warmup_proportion<=1)
        min_lr : float
            learning rate at which the optimizer starts
        k : int
            parameter that specifies after how many steps the lookahead step backwards should be applied
        alpha : float
            parameter that specifies how much in the direction of the fast weights should be moved (0<=alpha<=1)
    """
    # create RAdam optimizer
    inner = tfa.optimizers.RectifiedAdam(learning_rate, beta_1, beta_2, epsilon, weight_decay, amsgrad, sma_threshold, total_steps, warmup_proportion, min_lr, name)
    # feed RAdam optimizer into lookahead operation
    optim = tfa.optimizers.Lookahead(inner, sync_period, slow_step_size, name)
    return optim

# Training

### Constants

In [3]:
# Mask value
mask_value = np.float32(999999999999.0)

# Constants for the dataset
step = 4
shuffle_factor = 1 # 1
take_factor = 1 # 1 

# Constants for the model
units_type = "gru"
epochs = 24 # 256
batch_size = 512
n_units = 12
optimizer = "adamw"
weight_decay = 0.025  # 0.015 
loss = "mae"
learning_rate = 0.001 # 0.005 
dropout = 0.25

# Callbacks to use
use_early_stopping = False # True
use_lr_reduce = False # True
min_delta = 0.0001
early_stopping_patience = 6
reduce_lr_patience = 4

use_lr_schedule = not use_lr_reduce and True # False
lr_schedule_step = 6


if save:
    # Experiments number
    experiment_number = 16
    
    # Path to save models
    save_model_path = f'/kaggle/working/Model_{experiment_number}'

### Dataset

In [4]:
if training:
    if prepare_dataset:
        # Loat data from file
        df = load_data_from_file()

        # Prepare the dataset
        sequences, targets, input_scaler = prepare_data(df, mask_value=mask_value, step=step, save=save)
    else:
        # If the datase is already available, load it
        sequences, targets, input_scaler = load_dataset_and_scaler()
    
    # Add a dimension
    targets = targets[:, :, None]
    
    # Split the dataset
    X_train, X_temp, y_train, y_temp = train_test_split(sequences, targets, train_size=0.7, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=seed)
    
    # Dataset Pipeline
    training_set = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    training_set = training_set.shuffle(int(X_train.shape[0] * shuffle_factor))
    training_set = training_set.take(int(X_train.shape[0] * take_factor))
    training_set = training_set.batch(batch_size, drop_remainder=True)
    training_set = training_set.prefetch(tf.data.AUTOTUNE)

    validation_set = tf.data.Dataset.from_tensor_slices((X_val, y_val))
    validation_set = validation_set.batch(batch_size, drop_remainder=True)
    validation_set = validation_set.prefetch(tf.data.AUTOTUNE)

100%|██████████| 95212/95212 [33:46<00:00, 46.99it/s]


### Model Fit

In [5]:
if training:
    # Optimizer
    if optimizer == "sgd":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    elif optimizer == "rmsprop":
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer == "adamw":
        optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate, weight_decay=weight_decay)
    elif optimizer == "ranger":
        optimizer = Ranger(learning_rate=learning_rate, weight_decay=weight_decay)

    # Callbacks
    callbacks = [] # To fill
    
    clean = CleanCallback()
    callbacks.append(clean)
    
    if use_early_stopping:
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=min_delta,
            patience=early_stopping_patience,
            verbose=verbose,
            restore_best_weights=True)
        callbacks.append(early_stopping)

    if use_lr_reduce:
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',                                
            min_delta=min_delta,                            
            factor=0.5,
            patience=reduce_lr_patience,
            verbose=verbose)
        callbacks.append(reduce_lr)
        
    if use_lr_schedule:
        def scheduler(epoch, lr):
            if epoch == 0:
                return lr

            if epoch % lr_schedule_step == 0:
                return lr / 2
            else:
                return lr
            
        lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)
        callbacks.append(lr_schedule)
    
    
    # Model definition
    model_input = tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2]))
    mask = tf.keras.layers.Masking(mask_value=mask_value)(model_input)
    
    if units_type == "gru":
        hidden = tf.keras.layers.GRU(n_units, return_sequences=True, dropout=dropout, recurrent_dropout=dropout)(mask)
    else:
        hidden = tf.keras.layers.LSTM(n_units, return_sequences=True, dropout=dropout, recurrent_dropout=dropout)(mask)
    
    attention = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=n_units, activation='softmax'))(hidden)
    hidden = tf.keras.layers.Multiply()([hidden, attention])
    
    model_output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=1, activation='linear'))(hidden)
    model = tf.keras.Model(inputs=model_input, outputs=model_output)

    # Model summary
    print("Model Summary")
    model.summary()
    
    # Model compile
    model.compile(optimizer=optimizer, loss=loss, metrics=None)
    
    # Model Fit
    print("\nModel Fit")
    model.fit(
            training_set,
            epochs=epochs,
            verbose=verbose,
            callbacks=callbacks,
            validation_data=validation_set)
    
    if save:
        # Save the model to the path
        print(f"\nModel Saving at: {save_model_path}")
        model.save(save_model_path, save_format='tf')

        # Evaluate the model 
        test_set = tf.data.Dataset.from_tensor_slices((X_test, y_test))
        test_set = test_set.batch(batch_size, drop_remainder=True)
        test_set = test_set.prefetch(tf.data.AUTOTUNE)

        print("\nModel Evaluation")
        model.evaluate(test_set, verbose=verbose)

        print("\nModel Prediction")
        results = model.predict(test_set, verbose=verbose)
        
# Clear
K.clear_session()
gc.collect()

Model Summary
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4, 31)]      0           []                               
                                                                                                  
 masking (Masking)              (None, 4, 31)        0           ['input_1[0][0]']                
                                                                                                  
 gru (GRU)                      (None, 4, 12)        1620        ['masking[0][0]']                
                                                                                                  
 time_distributed (TimeDistribu  (None, 4, 12)       156         ['gru[0][0]']                    
 ted)                                                                           

111

# Submission

### Functions

In [6]:
def zero_sum(prices, volumes):  
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    return out


def prepare_submission_data(df,
                            data_history,
                            input_scaler,
                            step=4):
    
    # Features Engineering
    df = features_engineering(df, runtime=True)
                    
    # Constants
    features_to_remove_for_scaler = ['stock_id', 'date_id', 'row_id']
    
    # Number of features in the final dataset
    n_features = len(df.columns) - len(features_to_remove_for_scaler)
    
    # Group the data by stock_id and date_id
    grouped = df.groupby(['stock_id', 'date_id'])
    
    # Loop over data
    for group_name, group_data in grouped:
        
        # Get names
        stock_id, date_id = group_name

        # Access the data
        row = group_data.iloc[0]

        if stock_id not in data_history:            
            # Init the deque
            data_deque = deque(maxlen=step)
            
            # Get row_id
            row_id = row['row_id']
                        
            # Standardize the data
            data = row.drop(features_to_remove_for_scaler).to_numpy().reshape(1, -1)
            standardized_data = input_scaler.transform(data)
            
            # Fill the deque
            data_deque.append(standardized_data)
            
            # Fill the history
            d = {'date_id': date_id, 'data_deque': data_deque, 'row_id': row_id}
            data_history.update({stock_id:d})
        else:
            # Read the dict from the history to update
            d = data_history[stock_id]
            
            if data_history[stock_id]['date_id'] != date_id:                
                # Init the deque
                data_deque = deque(maxlen=step)
            else:                                
                # Get th data deque
                data_deque = d['data_deque']

            # Get row_id
            row_id = row['row_id']

            # Standardize the data
            data = row.drop(features_to_remove_for_scaler).to_numpy().reshape(1, -1)
            standardized_data = input_scaler.transform(data)

            # Fill the deque
            data_deque.append(standardized_data)

            # Update the history               
            d.update({'date_id': date_id})
            d.update({'data_deque': data_deque})
            d.update({'row_id': row_id})
            data_history.update({stock_id:d})
            
    return data_history, n_features


def search_dict_in_history(data_history, row_id):
    for key, value in data_history.items():
        if value['row_id'] == row_id:
            return key, value
    return None, None

### Constants

In [7]:
if submission and load_submission_model: 
    # Mask value
    mask_value = np.float32(999999999999.0)

    # Step value
    step = 4
    
    # Experiments number
    experiment_number = 1
    
    # Path
    save_model_path = f'/kaggle/working/Model_{experiment_number}'
    
    # Load the model
    model = load_model(save_model_path)

    # Load the scaler for standardization
    input_scaler = load_input_scaler()

### Predict

In [8]:
if submission:
    # Init
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    
    # Init an empty history
    data_history = {}

    # Loop over data
    for (test, revealed_targets, sample_prediction) in iter_test:    

        # Update the history
        data_history, n_features = prepare_submission_data(test, data_history, input_scaler, step)  

        # Get stock ids to consider for prediction
        row_ids_to_consider = sample_prediction['row_id']

        # List to fill
        input_data = []
        output_index_to_select = []

        # Loop over prediction data
        for row_id in row_ids_to_consider:               
            # Get the dict on the basi of row_id
            stock_id, d = search_dict_in_history(data_history, row_id)

            # Get the data deque from the dict
            data_deque = d['data_deque']
            
            # Add the output index to select
            output_index_to_select.append(len(data_deque) - 1)

            # Fill the data_deque to step size
            data_deque = fill_sequence(data_deque, step, n_features)        

            # Remove the empty dimension
            data_deque = np.squeeze(data_deque, axis=1)
            
            # Fill the list
            input_data.append(data_deque)
        
        # Data pipeline
        input_data = tf.data.Dataset.from_tensor_slices(input_data)
        input_data = input_data.batch(256, drop_remainder=False)
        
        # Run the model inference
        results = model.predict(input_data, verbose=0)
                
        selected_results = []
        for res, i in zip(results, output_index_to_select):
            selected_results.append(res[i])
        
        # Concert to numpy array
        results = np.array(selected_results)
        results = np.squeeze(results, axis=-1)
        
        # Assign the predictions (.copy)
        sample_prediction['target'] = zero_sum(results, test['bid_size'] + test['ask_size'])
                    
        # Predict
        env.predict(sample_prediction)
        
        # Clear
        K.clear_session()
        gc.collect()

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


### Check

In [9]:
if submission and load_submission_model:
    file_path = '/kaggle/working/submission.csv'

    if os.path.exists(file_path):
        timestamp = os.path.getctime(file_path)
        formatted_time = time.ctime(timestamp)

    current_timestamp = time.time()
    formatted_current_time = time.ctime(current_timestamp)

    print("Check submission")
    print(formatted_time)
    print(formatted_current_time)