Credit: this kernel runs some experiments that build on the excellent notebook https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn

In [None]:
import os, gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr as p

import warnings
# Ignore all Python warnings for the rest of the session.
warnings.simplefilter('ignore')

# Set TensorFlow's C++ log-level environment variable (presumably to quiet
# its console output).
# NOTE(review): this runs after `import tensorflow` above — confirm the
# variable is still honoured when set this late.
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'
# Fix TensorFlow's global random seed.
tf.random.set_seed(2022)

## Parameters

In [None]:
# Number of trailing rows of the training frame to keep (used as a negative
# slice bound when the pickle is loaded).
nsamples = 2500000
# Number of feature columns; drives the `features` name list below.
nfeatures = 300
# Number of cross-validation folds handed to StratifiedKFold.
nfolds = 5
# Learning rate handed to the Adam optimizer.
lr = 0.001
# Maximum number of epochs per fold (early stopping may end a fold sooner).
epochs = 20
# Mini-batch size passed to `model.fit`.
batch_size = 1024

# Feature column names: 'f_0' ... 'f_{nfeatures - 1}'.
features = [f'f_{i}' for i in range(nfeatures)]

## Data

In [None]:
# Load the pickled training frame and keep only its last `nsamples` rows.
# NOTE(review): unpickling can execute arbitrary code — only load files from
# a trusted source.
train = pd.read_pickle('../input/ump195gb/train.pkl')[-nsamples:]
# Notebook preview of the first three rows.
train.head(3)

## Model

In [None]:
# Vocabulary capacity: one slot per distinct investment_id in `train`, plus
# one extra (presumably for the lookup layer's out-of-vocabulary index —
# confirm against the IntegerLookup defaults).
max_tokens = len(train.investment_id.unique()) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=max_tokens)
# Learn the vocabulary from the ids present in the training frame.
investment_id_lookup_layer.adapt(train.investment_id)

In [None]:
# Vocabulary capacity: one slot per distinct time_id in `train`, plus one
# extra (presumably for the lookup layer's out-of-vocabulary index — confirm
# against the IntegerLookup defaults).
max_tokens2 = len(train.time_id.unique()) + 1
time_id_lookup_layer = layers.IntegerLookup(max_tokens=max_tokens2)
# Learn the vocabulary from the time ids present in the training frame.
# NOTE(review): time_ids that do not occur in `train` are outside this
# vocabulary — confirm how unseen ids at inference time are meant to behave.
time_id_lookup_layer.adapt(train.time_id)

In [None]:
def _id_branch(id_inputs, lookup_layer, vocab_size):
    """Build the lookup -> embedding -> flatten -> 3x Dense(64) branch for one id input."""
    x = lookup_layer(id_inputs)
    x = layers.Embedding(vocab_size, 32, input_length=1)(x)
    # Collapse the (1, 32) embedding output to a flat 32-vector per sample.
    x = layers.Reshape((-1, ))(x)
    for _ in range(3):
        x = layers.Dense(64, activation='swish')(x)
    return x


def get_model():
    """Build and compile the three-input regression network.

    Inputs, in order: investment_id, time_id, and the feature vector.
    Each id goes through its pre-adapted lookup layer, a 32-dim embedding
    and three Dense(64) layers; the features go through three Dense(256)
    layers. The three branches are concatenated and passed through an
    L2-regularised 512 -> 128 -> 32 head ending in a single linear unit.

    The feature input width follows ``len(features)`` so the model stays
    in sync with the columns selected for training instead of a
    hard-coded 300.

    Relies on the module-level ``features``, ``lr``, ``max_tokens``,
    ``max_tokens2`` and the two lookup layers.

    Returns:
        A ``tf.keras.Model`` compiled with Adam(``lr``), MSE loss and an
        RMSE metric named ``'rmse'``.
    """
    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    time_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((len(features), ), dtype=tf.float16)

    # Branch creation order is kept (investment, time, features) so layer
    # naming and seeded weight initialisation match the previous layout.
    investment_id_x = _id_branch(investment_id_inputs, investment_id_lookup_layer, max_tokens)
    time_id_x = _id_branch(time_id_inputs, time_id_lookup_layer, max_tokens2)

    feature_x = features_inputs
    for _ in range(3):
        feature_x = layers.Dense(256, activation='swish')(feature_x)

    x = layers.Concatenate(axis=1)([investment_id_x, time_id_x, feature_x])
    for units in (512, 128, 32):
        x = layers.Dense(units, activation='swish', kernel_regularizer='l2')(x)
    output = layers.Dense(1)(x)

    rmse = keras.metrics.RootMeanSquaredError(name='rmse')
    model = tf.keras.Model(inputs=[investment_id_inputs, time_id_inputs, features_inputs], outputs=[output])
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=lr), loss='mse', metrics=[rmse])

    gc.collect()
    return model

#keras.utils.plot_model(get_model(), show_shapes=True)

## Training

In [None]:
def run(train):
    """Train one model per stratified fold and collect the fitted models.

    Folds are stratified on ``investment_id``. For every fold a fresh model
    from ``get_model()`` is fitted with checkpointing of the best weights
    and early stopping, then the best checkpoint is loaded back so that
    the model that is evaluated and stored is the best one for the fold,
    not the one from the final epoch.

    Args:
        train: DataFrame providing the ``features`` columns plus
            ``investment_id``, ``time_id`` and ``target``.

    Side effects:
        Appends each fitted model to the module-level ``models`` list,
        writes ``model_<fold>.tf`` weight checkpoints to the working
        directory, and prints per-fold Pearson correlation and RMSE on the
        validation split.
    """
    kfold = StratifiedKFold(nfolds, shuffle=True, random_state=42)
    for index, (train_indices, valid_indices) in enumerate(kfold.split(train, train.investment_id)):
        print(f'Fold {index}')
        print('_'*50)
        X_train, X_val = train[features].iloc[train_indices].astype('float16'), train[features].iloc[valid_indices].astype('float16')
        y_train, y_val = train.target.iloc[train_indices], train.target.iloc[valid_indices]
        investment_id_train, investment_id_val = train.investment_id.iloc[train_indices], train.investment_id.iloc[valid_indices]
        time_id_train, time_id_val = train.time_id.iloc[train_indices], train.time_id.iloc[valid_indices]

        checkpoint_path = f"model_{index}.tf"
        model = get_model()
        checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True, save_weights_only=True)
        early_stop = keras.callbacks.EarlyStopping(patience=5)
        model.fit([investment_id_train, time_id_train, X_train], y_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  validation_data=([investment_id_val, time_id_val, X_val], y_val),
                  callbacks=[checkpoint, early_stop], verbose=1)
        # fit() leaves the last-epoch weights in place (up to `patience`
        # epochs past the best one); restore the best checkpoint so the
        # evaluation below and later inference use the best weights.
        model.load_weights(checkpoint_path)
        models.append(model)

        # Evaluation
        y_pred = model.predict([investment_id_val, time_id_val, X_val])
        print(f'Pearson correlation: {p(y_pred.reshape(-1), y_val)[0]}')
        print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_pred.reshape(-1), y_val))}')

        # Free every per-fold split (including the time_id ones) before the next fold.
        del investment_id_train, investment_id_val, time_id_train, time_id_val, X_train, X_val, y_train, y_val
        gc.collect()

In [None]:
# Module-level list that `run` appends each fold's fitted model to.
models = []
run(train)

# Drop the reference to the training frame and trigger garbage collection.
del train
gc.collect()

## Inference/submission

In [None]:
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Derive time_id from the part of row_id before the first underscore.
    # This does not depend on the model, so compute it once per batch.
    test_df['time_id'] = test_df['row_id'].apply(lambda x: int(x.split('_')[0]))
    model_inputs = [test_df.investment_id, test_df.time_id, test_df[features]]

    # One prediction array per fold model.
    preds = [model.predict(model_inputs) for model in models]

    # Average across models and flatten the (n, 1) result to a 1-D column.
    sample_prediction_df['target'] = np.mean(preds, axis=0).reshape(-1)
    env.predict(sample_prediction_df)
    display(sample_prediction_df)