#  Ubiquant Market Prediction with DNN and Keras Tuner

Based on the great notebook [Ubiquant Market Prediction with DNN](https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn/notebook); please upvote it.

[Keras Tuner](https://www.tensorflow.org/tutorials/keras/keras_tuner) is added to find the best hyperparameters for the DNN model.

In [None]:
# Debug switch: when True the notebook uses fewer epochs, a lower patience and
# fewer tuner trials (see CONFIG), trains on a smaller subset of investment
# ids, and leaves Python warnings enabled.
DEBUG = False

In [None]:
import warnings
# Silence all Python warnings unless running in debug mode.
if not DEBUG:
    warnings.filterwarnings('ignore')
import os
import gc
import json
import time
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from scipy import stats
import kerastuner as kt
from tqdm.auto import tqdm
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Report the runtime: TensorFlow version and every visible GPU.
print('tensorflow version:', tf.__version__)
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
# Iterating an empty list is a no-op, so no explicit emptiness check is needed.
for gpu_device in gpu_devices:
    print('device available:', gpu_device)

## Config

In [None]:
# Version tag; it names the model directory and the tuner project.
VER = 'v1'
# Run configuration. A JSON copy is written next to the saved models below.
CONFIG = {
    'version': VER,
    'folds': 5,                        # number of cross-validation splits
    'epochs': 4 if DEBUG else 20,      # max epochs per fit
    'patience': 2 if DEBUG else 4,     # early-stopping patience (epochs)
    'decay': False,                    # not read by the cells visible here
    'batch_size': 1024,
    'seed': 2021,                      # RNG seed and CV split random_state
    'lr': .001,                        # default learning rate for the tuner
    'max_trials': 3 if DEBUG else 8,   # tuner trials; falsy skips the search
    'skf': True,                       # stratify folds by investment_id
    'comments': ''
}
DATA_PATH = '../input/ubiquant-market-prediction-half-precision-pickle'
MDLS_PATH = f'./models_{VER}'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
os.makedirs(MDLS_PATH, exist_ok=True)
with open(f'{MDLS_PATH}/config.json', 'w') as file:
    json.dump(CONFIG, file)
    
def seed_all(seed):
    """Seed the random number generators used by this notebook.

    Sets ``os.environ['PYTHONHASHSEED']`` and seeds Python's ``random``,
    NumPy and TensorFlow with the same value.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

# Seed every RNG with the configured seed before any data or model work.
seed_all(CONFIG['seed'])
# Wall-clock reference taken right after setup.
start_time = time.time()

## Load data

In [None]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
train = pd.read_pickle(f'{DATA_PATH}/train.pkl')
# Keep only rows at or above an investment_id threshold: a small slice in
# DEBUG mode, a larger one otherwise
# (limit the train dataset due to Kaggle memory issue).
min_investment_id = 3000 if DEBUG else 2000
train = train.loc[train.investment_id >= min_investment_id, :]
# Renumber the surviving rows from 0 and drop the old index column.
train.reset_index(inplace=True)
del train['index']
print(train.shape)
display(train.head())

# Split the frame: ids and target are pulled out, time_id is discarded,
# and what remains in `train` is the feature matrix.
investment_id = train.pop('investment_id')
investment_ids = list(investment_id.unique())
_ = train.pop('time_id')
y = train.pop('target')

## Utils

In [None]:
def preprocess(X, y):
    """Identity hook mapped over the dataset: return ``(X, y)`` unchanged."""
    pair = (X, y)
    return pair

def make_dataset(feature, investment_id, y,
                 batch_size, mode='train'):
    """Build a batched ``tf.data.Dataset`` of ``((investment_id, feature), y)``.

    Args:
        feature: feature rows, sliced along the first axis.
        investment_id: investment ids aligned row-by-row with ``feature``.
        y: targets aligned row-by-row with ``feature``.
        batch_size: number of rows per batch.
        mode: ``'train'`` shuffles rows with a buffer of 4096; any other
            value keeps the original row order.

    Returns:
        The batched, prefetched dataset.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        ((investment_id, feature), y)
    )
    # Cache BEFORE shuffling: a cache placed after shuffle() stores the first
    # epoch's batches and replays that same order on every later epoch, so
    # the training data would effectively be shuffled only once.
    dataset = dataset.map(preprocess).cache()
    if mode == 'train':
        dataset = dataset.shuffle(4096)
    return dataset.batch(batch_size).prefetch(
        tf.data.experimental.AUTOTUNE
    )

def metric_plot(history, cols, title):
    """Plot selected metric curves of a training history on the current axes.

    Args:
        history: object whose ``.history`` attribute can be turned into a
            DataFrame with one column per metric.
        cols: names of the columns to draw.
        title: title for the plot.
    """
    curves = pd.DataFrame(history.history)
    plt.plot(curves.loc[:, cols])
    plt.title(title)

def datasets(trn_ind_folds, val_ind_folds, fold, train, investment_id):
    """Build the train and validation datasets for one cross-validation fold.

    Targets are taken from the module-level ``y`` and the batch size from
    ``CONFIG['batch_size']``.

    Args:
        trn_ind_folds: per-fold arrays of positional training row indices.
        val_ind_folds: per-fold arrays of positional validation row indices.
        fold: index of the fold to build.
        train: feature DataFrame.
        investment_id: investment ids aligned row-by-row with ``train``.

    Returns:
        ``(train_ds, val_ds, y_val)`` where ``y_val`` holds the targets of
        the validation rows.
    """
    def _take(data, idx):
        # Positional row selection. Plain ``data[idx]`` on a pandas Series is
        # label-based and only agrees with train.iloc / y.iloc while the
        # index is the default 0..n-1 range; objects without ``.iloc``
        # (e.g. NumPy arrays) keep plain indexing.
        return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]

    trn_ind = trn_ind_folds[fold]
    val_ind = val_ind_folds[fold]
    y_val = y.iloc[val_ind]
    train_ds = make_dataset(
        train.iloc[trn_ind],
        _take(investment_id, trn_ind),
        y.iloc[trn_ind],
        batch_size=CONFIG['batch_size'],
        mode='train'
    )
    val_ds = make_dataset(
        train.iloc[val_ind],
        _take(investment_id, val_ind),
        y_val,
        batch_size=CONFIG['batch_size'],
        mode='val'
    )
    return train_ds, val_ds, y_val
    
def show_results(model, history, val_ds, y_val):
    """Print the validation Pearson correlation and plot the training curves.

    Args:
        model: fitted model; its predictions on ``val_ds`` are flattened
            and correlated with ``y_val``.
        history: training history passed on to ``metric_plot``.
        val_ds: validation dataset to predict on.
        y_val: true targets for ``val_ds`` (must expose ``.values``).
    """
    val_pred = model.predict(val_ds).ravel()
    pearson_score = stats.pearsonr(val_pred, y_val.values)[0]
    print('pearson:', pearson_score)

    # One panel per metric, each showing the train and validation curves.
    panels = (('mse', 'MSE'), ('mae', 'MAE'), ('rmse', 'RMSE'))
    plt.figure(figsize=(16, 3))
    for position, (metric, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        metric_plot(history, [metric, f'val_{metric}'], title)
    plt.show()

## Train with Keras Tuner

In [None]:
# Vocabulary size: one slot per known investment id plus one extra
# (presumably for unseen ids -- confirm against the IntegerLookup defaults).
investment_id_size = len(investment_ids) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
# Build the lookup vocabulary from the ids present in the training data.
known_ids = pd.DataFrame({'investment_ids': investment_ids})
investment_id_lookup_layer.adapt(known_ids)

def tune_model(hp, features_num=300, lr=.001):
    """Build and compile the two-branch DNN for one set of hyperparameters.

    One branch embeds the investment id (via the module-level
    ``investment_id_lookup_layer``), the other processes the numeric
    features; both are concatenated and fed through a final stack that
    ends in a single linear output.

    Args:
        hp: Keras Tuner hyperparameter container used to sample layer
            counts, widths, dropout rates, embedding size and learning rate.
        features_num: width of the feature input.
        lr: default value of the tuned learning rate.

    Returns:
        A compiled ``tf.keras.Model`` with MSE loss and MSE / MAE / MAPE /
        RMSE metrics.
    """
    def hidden_stack(x, prefix, **dense_kwargs):
        # Repeated Dense(swish) -> BatchNorm -> Dropout block. Depth, widths
        # and dropout rates are tuned under names starting with `prefix`.
        depth = hp.Int(f'{prefix}num_layers', 1, 4)
        for level in range(depth):
            units = hp.Int(f'{prefix}num_units_{level}', 16, 512)
            x = layers.Dense(units, activation='swish', **dense_kwargs)(x)
            x = layers.BatchNormalization()(x)
            rate = hp.Float(f'{prefix}dropout_{level}', .0, .5)
            x = layers.Dropout(rate)(x)
        return x

    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((features_num, ), dtype=tf.float16)

    # Investment-id branch: lookup -> embedding -> flatten -> hidden stack.
    id_index = investment_id_lookup_layer(investment_id_inputs)
    embedding_size = hp.Int('investment_embedding_size', 16, 64)
    id_embedded = layers.Embedding(
        investment_id_size,
        embedding_size,
        input_length=1)(id_index)
    id_flat = layers.Reshape((-1, ))(id_embedded)
    investment_id_x = hidden_stack(id_flat, 'emb_')

    # Feature branch: entry Dense layer followed by its own hidden stack.
    enter_size = hp.Int('features_enter_size', 16, 512)
    features_entry = layers.Dense(
        enter_size, activation='swish')(features_inputs)
    features_x = hidden_stack(features_entry, 'feats_')

    # Joint head: L2-regularised hidden stack, then one linear unit.
    merged = layers.Concatenate(axis=1)([investment_id_x, features_x])
    head = hidden_stack(merged, '', kernel_regularizer='l2')
    output = layers.Dense(1)(head)

    rmse = tf.keras.metrics.RootMeanSquaredError(name='rmse')
    model = tf.keras.Model(
        inputs=[investment_id_inputs, features_inputs],
        outputs=[output]
    )
    learning_rate = hp.Float('lr', .00001, .05, default=lr)
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate),
        loss='mse',
        metrics=['mse', 'mae', 'mape', rmse]
    )
    return model

In [None]:
# Stop a fit once val_loss has not improved for CONFIG['patience'] epochs and
# restore the weights of the best epoch.
early_stopper = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=CONFIG['patience'],
    verbose=1,
    mode='min',
    restore_best_weights=True
)
# Multiply the learning rate by 0.1 when val_loss stalls for half the
# early-stopping patience. `patience` is a number of epochs, so pass an int
# (half, rounded up) instead of the float produced by `/ 2`; rounding up
# should keep the trigger epoch the same as the fractional value did.
plateau_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=.1,
    patience=(CONFIG['patience'] + 1) // 2,
    verbose=1,
    mode='min'
)

In [None]:
class CVTuner(kt.engine.tuner.Tuner):
    """Keras Tuner that scores every trial by K-fold cross-validation."""

    def run_trial(self, trial, train, n_folds, batch_size,
                  epochs, obj, mode, callbacks):
        """Fit one model per fold and report the mean objective to the oracle.

        Args:
            trial: the Keras Tuner trial providing the hyperparameters.
            train: feature DataFrame to split into folds.
            n_folds: number of cross-validation folds.
            batch_size: accepted for interface compatibility; it is not read
                in this method.
            epochs: maximum number of epochs per fold.
            obj: key in ``history.history`` used as the objective.
            mode: ``'min'`` takes the per-fold minimum of ``obj``, anything
                else the maximum.
            callbacks: Keras callbacks passed to every ``fit`` call.
        """
        if CONFIG['skf']:
            print('stratified kfold split')
            kfold = StratifiedKFold(
                n_splits=n_folds,
                shuffle=True,
                random_state=CONFIG['seed']
            ).split(train, investment_id)
        else:
            # Without this branch no folds were built when 'skf' is off and
            # the fit below failed with a NameError on `train_ds`.
            from sklearn.model_selection import KFold
            print('kfold split')
            kfold = KFold(
                n_splits=n_folds,
                shuffle=True,
                random_state=CONFIG['seed']
            ).split(train)
        trn_ind_folds, val_ind_folds = [], []
        for trn_ind, val_ind in kfold:
            trn_ind_folds.append(trn_ind)
            val_ind_folds.append(val_ind)

        best_of = min if mode == 'min' else max
        val_losses = []
        for fold in range(n_folds):
            print('CV {}/{}'.format(fold + 1, n_folds))
            train_ds, val_ds, y_val = datasets(
                trn_ind_folds,
                val_ind_folds,
                fold, train,
                investment_id
            )
            model = self.hypermodel.build(trial.hyperparameters)
            history = model.fit(
                train_ds,
                epochs=epochs,
                validation_data=val_ds,
                callbacks=callbacks,
                verbose=1
            )
            val_losses.append(best_of(history.history[obj]))
            show_results(model, history, val_ds, y_val)
            # Drop the model and its history as well, so a finished fold
            # does not stay referenced while the next one is built.
            del model, history, train_ds, val_ds, y_val; gc.collect()
        self.oracle.update_trial(
            trial.trial_id,
            {obj: np.mean(val_losses, axis=0)}
        )

def model_fn(hp):
    """Build the DNN for `hp` with this notebook's fixed feature count and lr."""
    return tune_model(hp, features_num=300, lr=CONFIG['lr'])


# Run the hyperparameter search only when trials are configured.
if CONFIG['max_trials']:
    tuner = CVTuner(
        hypermodel=model_fn,
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective('val_loss', direction='min'),
            num_initial_points=1,
            max_trials=CONFIG['max_trials']
        ),
        project_name=f'tuner_{VER}'
    )
    print('=' * 10, f'TUNER max trials={CONFIG["max_trials"]}', '=' * 10)
    # Keyword arguments forwarded by the tuner to CVTuner.run_trial.
    search_args = dict(
        n_folds=CONFIG['folds'],
        batch_size=CONFIG['batch_size'],
        epochs=CONFIG['epochs'],
        obj='val_loss',
        mode='min',
        callbacks=[early_stopper, plateau_reducer],
    )
    tuner.search(train, **search_args)
    # Persist the winning hyperparameters so the next cell can reload them.
    hp = tuner.get_best_hyperparameters(1)[0]
    pd.to_pickle(hp, f'{MDLS_PATH}/best_hp.pkl', protocol=4)
    del tuner; gc.collect()

In [None]:
# Reload the pickled hyperparameters from the models directory.
best_hp_file = f'{MDLS_PATH}/best_hp.pkl'
hp = pd.read_pickle(best_hp_file)
print('hp params loaded:', hp.values)

In [None]:
# Build the cross-validation folds.
if CONFIG['skf']:
    print('stratified kfold split')
    kfold = StratifiedKFold(
        n_splits=CONFIG['folds'],
        shuffle=True,
        random_state=CONFIG['seed']
    ).split(train, investment_id)
else:
    # Without this branch no folds were built when 'skf' is off and the
    # training loop failed with a NameError on `train_ds`.
    from sklearn.model_selection import KFold
    print('kfold split')
    kfold = KFold(
        n_splits=CONFIG['folds'],
        shuffle=True,
        random_state=CONFIG['seed']
    ).split(train)
trn_ind_folds, val_ind_folds = [], []
for trn_ind, val_ind in kfold:
    trn_ind_folds.append(trn_ind)
    val_ind_folds.append(val_ind)

# Train one model per fold with the loaded hyperparameters; the best epoch of
# each fold (lowest val_loss) is checkpointed to its own path.
for fold in range(CONFIG['folds']):
    print('========== CV {}/{} =========='.format(
        fold + 1,
        CONFIG['folds']
    ))
    ch_path = f'{MDLS_PATH}/model_f{fold}'
    train_ds, val_ds, y_val = datasets(
        trn_ind_folds,
        val_ind_folds,
        fold, train,
        investment_id
    )
    model = model_fn(hp)
    history = model.fit(
        train_ds,
        epochs=CONFIG['epochs'],
        validation_data=val_ds,
        callbacks=[
            early_stopper,
            plateau_reducer,
            tf.keras.callbacks.ModelCheckpoint(
                ch_path,
                monitor='val_loss',
                verbose=1,
                save_best_only=True,
                mode='min'
            )
        ],
        verbose=1
    )
    show_results(model, history, val_ds, y_val)
    del model, train_ds, val_ds, y_val; gc.collect()

## Submit to competition

In [None]:
%%time
# Load the checkpoint written for each fold.
models = []
for ch_path in (f'{MDLS_PATH}/model_f{fold}' for fold in range(CONFIG['folds'])):
    models.append(tf.keras.models.load_model(ch_path))
    print('model loaded:', ch_path)

In [None]:
def preprocess_test(investment_id, feature):
    """Pair the model inputs with a constant ``0`` in the label position."""
    inputs = (investment_id, feature)
    return inputs, 0

def make_test_dataset(feature, investment_id, batch_size=1024):
    """Build the batched inference dataset of ``((investment_id, feature), 0)``.

    Args:
        feature: feature rows, sliced along the first axis.
        investment_id: investment ids aligned row-by-row with ``feature``.
        batch_size: number of rows per batch.

    Returns:
        A batched, cached and prefetched ``tf.data.Dataset``.
    """
    slices = tf.data.Dataset.from_tensor_slices((investment_id, feature))
    labelled = slices.map(preprocess_test)
    batched = labelled.batch(batch_size).cache()
    return batched.prefetch(tf.data.experimental.AUTOTUNE)

def inference(models, dataset):
    """Average the predictions of several models on the same dataset.

    Args:
        models: iterable of objects exposing ``predict(dataset)``.
        dataset: input passed unchanged to every model.

    Returns:
        Element-wise mean of the per-model predictions.
    """
    per_model = [member.predict(dataset) for member in models]
    return np.mean(per_model, axis=0)

In [None]:
import ubiquant

# The competition environment yields (test_df, sample_pred_df) pairs; each
# prediction frame is filled in and handed back through env.predict.
env = ubiquant.make_env()
iter_test = env.iter_test()

n_features = 300
features = [f'f_{i}' for i in range(n_features)]
for test_df, sample_pred_df in iter_test:
    test_ds = make_test_dataset(test_df[features], test_df["investment_id"])
    # Ensemble: mean prediction over all fold models.
    sample_pred_df['target'] = inference(models, test_ds)
    env.predict(sample_pred_df)

In [None]:
# Preview the prediction frame left over from the last iteration of the loop.
sample_pred_df.head()