#  Ubiquant Market Prediction with DNN

### References:

[Ubiquant Market Prediction with DNN](https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn)

## Import Packages

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from sklearn.linear_model import SGDClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from scipy import stats
from pathlib import Path
import ubiquant
import pickle
import math
import time
import umap
import random
from collections import Counter, defaultdict
from tqdm import tqdm
# !pip install "../input/faisscpu/faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"
# import faiss

## Import dataset

In [None]:
%%time
# Feature column names: f_0 ... f_{n_features - 1}.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# Load the pickled training frame (the dataset name indicates half-precision storage).
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Print column dtypes / memory usage, then display the first rows as the cell output.
train.info()
train.head()

## Create Integer Lookup for Investment IDs

#### Note that since we are likely to see new investment ids in the future, it might be better to group similar investment ids rather than feed them directly to the model. I attempted this, but so far it has not produced good results.

In [None]:
# Since we will likely see new investment ids, it's best to reserve out-of-vocabulary
# (OOV) indices so that unseen ids still map to a valid lookup index.
num_oov_indices = 150
investment_ids = list(train.investment_id.unique())
# Total lookup size = known ids + OOV buckets; get_model also uses this as the
# input dimension of the investment-id embedding, so both must stay in sync.
investment_id_size = len(investment_ids) + num_oov_indices
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size, num_oov_indices=num_oov_indices)
investment_id_lookup_layer.adapt(pd.DataFrame({"investment_ids":investment_ids}))

## Make Tensorflow dataset

In [None]:
def preprocess(X, y):
    """Identity map step: hand the (inputs, target) pair back unchanged."""
    sample = (X, y)
    return sample
def make_dataset(feature, investment_id, y, batch_size=1024, mode="train"):
    """Build a batched tf.data pipeline yielding ((investment_id, feature), y).

    Args:
        feature: per-row feature values (sliced along the first axis).
        investment_id: per-row investment ids, aligned with `feature`.
        y: per-row targets, aligned with `feature`.
        batch_size: number of rows per batch.
        mode: "train" enables shuffling; any other value keeps the input order.

    Returns:
        A `tf.data.Dataset` of batches, cached and prefetched.
    """
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    # Cache BEFORE shuffling: a cache placed after shuffle() records the first
    # epoch's order and replays exactly that order on every later epoch, which
    # silently disables per-epoch reshuffling.
    ds = ds.cache()
    if mode == "train":
        ds = ds.shuffle(4096)
    ds = ds.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    return ds

## Modeling

#### We implement 2 models for 2 types of cross validation. (1) The first is Stratified Group KFold. Here the objective is for the model to be trained on all investment IDs, in order to minimize the number of unknown investment IDs in the test set. On the other hand, we want to maximize the number of unknown time ids in the validation folds, since we know that time ids will be new in the test set. (2) The second is EmbargoCV, adapted from the Numerai tournament, which is a form of time series cross validation where time ids close to each validation fold are excluded from training (the "embargo") to minimize time leakage. We combine the models at the end by averaging.

In [None]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """ https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation

    Greedy stratified group k-fold splitter.

    Whole groups are assigned to folds one at a time (groups with the most
    uneven label counts first). Each group goes to the fold that minimises the
    mean, over labels, of the standard deviation across folds of each fold's
    share of that label.

    Args:
        X: unused; kept for signature compatibility with the referenced code.
        y: non-negative integer label per sample.
        groups: group id per sample, aligned with `y`; a group never spans folds.
        k: number of folds.
        seed: seed for the initial shuffle of groups (affects tie ordering).

    Yields:
        (train_indices, test_indices) lists of positional sample indices, one
        pair per fold.
    """
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Labels in range(labels_num) that never occur have a total count of 0.
    # Dividing by it yields NaN, which makes every fold score NaN; since NaN
    # comparisons are always False, every group would then land in fold 0.
    # Score only the labels that are actually present.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively add the group to `fold`, score the balance, then undo.
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in present_labels:
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # sorted() is stable, so the seeded shuffle above decides the order of
    # groups whose label-count spread is equal.
    for g, y_counts in tqdm(sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])), total=len(groups_and_y_counts)):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        test_groups = groups_per_fold[fold]
        train_groups = all_groups - test_groups

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices
        
# Embargo CV adapted from Numerai tournament...
def get_time_series_cross_val_splits(data, cv = 5, embargo = 50):
    """Split the unique time_ids of `data` into `cv` contiguous test folds with embargoed train sets.

    Each test fold is a consecutive slice of `data.time_id.unique()`; any
    leftover time_ids (when the count is not divisible by `cv`) are appended to
    the last fold. The matching train set keeps every time_id that lies outside
    the test fold's [min, max] range and is more than `embargo` away from both
    ends of that range.

    Returns:
        A zip iterator of (train_time_ids, test_time_ids) pairs; it can be
        consumed only once.
    """
    time_ids = data.time_id.unique()
    fold_len = len(time_ids) // cv

    test_splits = []
    for fold in range(cv):
        test_splits.append(time_ids[fold * fold_len:(fold + 1) * fold_len])

    # Hand the trailing time_ids that did not fit evenly to the final fold.
    leftover = len(time_ids) - fold_len * cv
    if leftover > 0:
        test_splits[-1] = np.append(test_splits[-1], time_ids[-leftover:])

    def _usable_for_training(time_id, lo, hi):
        # Compare as Python ints so narrow integer dtypes cannot wrap around.
        t = int(time_id)
        if lo <= t <= hi:
            return False
        return abs(t - hi) > embargo and abs(t - lo) > embargo

    train_splits = []
    for test_ids in test_splits:
        hi = int(np.max(test_ids))
        lo = int(np.min(test_ids))
        train_splits.append([t for t in time_ids if _usable_for_training(t, lo, hi)])

    return zip(train_splits, test_splits)

def load_model(name):
    """Load the Keras model saved in directory `name`.

    Returns the loaded model, or False when `<name>/saved_model.pb` does not
    exist (callers use the falsy result to decide whether to train).
    """
    if not Path(f"{name}/saved_model.pb").is_file():
        return False
    # corr_eval is a custom metric, so it has to be registered for deserialisation.
    return keras.models.load_model(name, custom_objects={'corr_eval': corr_eval})

# Shared training callbacks. Both watch the validation value of corr_eval, which is
# the negated correlation, so lower is better (mode='min').
# Early stopping: give up after 7 epochs without improvement and restore the best weights.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_corr_eval',
    mode='min',
    patience=7,
    restore_best_weights=True,
    verbose=1,
)
# Learning-rate reduction on plateau: multiply the rate by 0.1 after 3 stagnant epochs.
plateau = keras.callbacks.ReduceLROnPlateau(
    monitor='val_corr_eval',
    mode='min',
    factor=0.1,
    patience=3,
    verbose=1,
)

## Learning rate scheduler taken from the book "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow" by A. Géron
class OneCycleScheduler(keras.callbacks.Callback):
    """Per-batch "1cycle" learning-rate schedule.

    The rate rises linearly from `start_rate` to `max_rate` over the first
    half of the main phase, falls linearly back to `start_rate` over the second
    half, then decays linearly to `last_rate` over the remaining iterations.
    Once `iterations` batches have been seen the rate is held at `last_rate`.

    Args:
        iterations: total number of training batches the schedule spans.
        max_rate: peak learning rate.
        start_rate: initial rate; defaults to `max_rate / 10` when falsy.
        last_iterations: length of the final decay phase; defaults to
            `iterations // 10 + 1` when falsy.
        last_rate: final rate; defaults to `start_rate / 1000` when falsy.
    """
    def __init__(self, iterations, max_rate, start_rate=None,
                 last_iterations=None, last_rate=None):
        # Let the Keras base class set up its own attributes.
        super().__init__()
        self.iterations = iterations
        self.max_rate = max_rate
        self.start_rate = start_rate or max_rate / 10
        self.last_iterations = last_iterations or iterations // 10 + 1
        self.half_iteration = (iterations - self.last_iterations) // 2
        self.last_rate = last_rate or self.start_rate / 1000
        self.iteration = 0
    def _interpolate(self, iter1, iter2, rate1, rate2):
        """Linearly interpolate the rate at the current iteration between two schedule points."""
        return ((rate2 - rate1) * (self.iteration - iter1)
                / (iter2 - iter1) + rate1)
    def on_batch_begin(self, batch, logs=None):
        """Set the optimizer's learning rate for the batch about to run."""
        if self.iteration < self.half_iteration:
            rate = self._interpolate(0, self.half_iteration, self.start_rate, self.max_rate)
        elif self.iteration < 2 * self.half_iteration:
            rate = self._interpolate(self.half_iteration, 2 * self.half_iteration,
                                     self.max_rate, self.start_rate)
        elif self.iteration < self.iterations:
            rate = self._interpolate(2 * self.half_iteration, self.iterations,
                                     self.start_rate, self.last_rate)
        else:
            # Training ran past the planned length: hold the final rate instead of
            # extrapolating the last segment (which can go negative or divide by zero).
            rate = self.last_rate
        self.iteration += 1
        K.set_value(self.model.optimizer.learning_rate, rate)
        

# using correlation as metric
def corr_eval(y_true, y_pred):
    """Negated Pearson correlation between targets and predictions.

    Computed over all elements of the two tensors passed in. The sign is
    flipped so that lower is better, matching callbacks that monitor this
    metric with mode='min'.

    Returns:
        A scalar float32 tensor in [-1, 1]; 0 when either input has zero
        variance (the correlation is undefined there).
    """
    x = tf.cast(y_true, tf.float32)
    y = tf.cast(y_pred, tf.float32)
    mx = K.mean(x)
    my = K.mean(y)
    xm, ym = x-mx, y-my
    r_num = K.sum(tf.multiply(xm,ym))
    r_den = K.sqrt(tf.multiply(K.sum(K.square(xm)), K.sum(K.square(ym))))
    # A constant target or prediction makes the denominator 0. Plain division
    # would give NaN and poison the epoch-level metric average, so return 0 instead.
    r = tf.math.divide_no_nan(r_num, r_den)
    # Guard against tiny floating-point overshoot outside [-1, 1].
    r = K.maximum(K.minimum(r, 1.0), -1.0)
    return - r

#### The model below is a Deep and Wide neural network. We split the lower layers into two paths: the deep path, which learns higher-level feature interactions, and the wide path, which feeds a shallow transformation of the inputs directly to the upper layers. Why do we do this? Ubiquant has already given us engineered features which may have some predictive power for the target on their own, so we include them through the wide path as well.

In [None]:
def get_model(num_inputs):
    """Build and compile the deep-and-wide regression network.

    Inputs (in order): an investment id of shape (1,) with dtype uint16, and a
    feature vector of shape (num_inputs,) with dtype float16.

    The investment id goes through the module-level `investment_id_lookup_layer`
    and a 32-dimensional embedding sized by the module-level `investment_id_size`.
    The deep path combines a Dense projection of the embedding with a Dense(64)
    projection of the features and runs them through Dense 64/64/32 blocks. Its
    output is concatenated with the raw embedding and a Dense(128) "wide"
    projection of the features, followed by the Dense 128/128/32/16/8 head and a
    single linear output unit.

    Args:
        num_inputs: number of feature columns fed to the feature input.

    Returns:
        A `keras.Model` compiled with Nadam (learning rate 1e-4), MSE loss and
        the `corr_eval` metric.
    """
    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((num_inputs, ), dtype=tf.float16)
    
    # Investment-id branch: lookup index -> 32-d embedding -> flattened vector.
    investment_id_x = investment_id_lookup_layer(investment_id_inputs)
    investment_id_x = layers.Embedding(investment_id_size, 32, input_length=1)(investment_id_x)
    investment_id_x = layers.Reshape((-1, ))(investment_id_x)
    investment_id_x2 = layers.Dense(32, activation='relu', kernel_regularizer="l2")(investment_id_x)
    
    # Two separate projections of the raw features: one for the deep path, one for the wide path.
    feature_x = layers.Dense(64, activation='relu', kernel_regularizer="l2")(features_inputs)
    feature_wide = layers.Dense(128, activation='relu', kernel_regularizer="l2")(features_inputs)
    
    # Deep path.
    x = layers.Concatenate(axis=1)([investment_id_x2, feature_x])
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(64, activation='relu', kernel_regularizer="l2")(x)
    x = layers.BatchNormalization()(x)
#     x = layers.GaussianNoise(0.1)(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(64, activation='relu', kernel_regularizer="l2")(x)
    x = layers.BatchNormalization()(x)
#     x = layers.GaussianNoise(0.1)(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(32, activation='relu', kernel_regularizer="l2")(x)
    # Merge the wide path (raw embedding + wide feature projection) with the deep output.
    x = layers.Concatenate(axis=1)([investment_id_x, feature_wide, x])
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    # Head: progressively narrower Dense blocks with batch norm, Gaussian noise and dropout.
    x = layers.Dense(128, activation='relu', kernel_regularizer="l2")(x)
    x = layers.BatchNormalization()(x)
    x = layers.GaussianNoise(0.1)(x)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(128, activation='relu', kernel_regularizer="l2")(x)
    x = layers.BatchNormalization()(x)
    x = layers.GaussianNoise(0.1)(x)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(32, activation='relu', kernel_regularizer="l2")(x)
    x = layers.BatchNormalization()(x)
    x = layers.GaussianNoise(0.1)(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(16, activation='relu', kernel_regularizer="l2")(x)
    x = layers.BatchNormalization()(x)
    x = layers.GaussianNoise(0.1)(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(8, activation='relu', kernel_regularizer="l2")(x)
    # Single linear unit: unbounded regression output.
    x = layers.Dense(1, activation='linear')(x)
    model = keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[x])
    model.compile(optimizer=tf.optimizers.Nadam(0.0001), loss='mse', metrics=[corr_eval])
    return model

In [None]:
# Build one model just to inspect the architecture. The input width comes from the
# feature list (as in the training loop) rather than a hard-coded 300.
model = get_model(len(features))
model.summary()
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Drop the model and reset Keras / TensorFlow global state, then force a
# garbage-collection pass to release memory before the next cell.
del model
K.clear_session()
tf.compat.v1.reset_default_graph()
gc.collect()

In [None]:
%%time

# Seed every RNG in use so fold assignment and weight initialisation are repeatable.
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

epoch_stop = 0
n_epochs = 40
cv = 5
# Skip training when the last fold's model is already on disk.
if not load_model(f'model_{cv - 1}_skf'):
    # Stratify on investment_id, group on time_id: no time_id is shared between the
    # training and validation parts of a fold.
    skf = stratified_group_k_fold(X=train, y=train['investment_id'].astype('category').cat.codes.values, 
                                  groups=np.array(train['time_id'].astype('category').cat.codes.values), k=cv, seed=42)
    for split, (train_split_index, test_split_index) in enumerate(skf):

        # The splitter yields POSITIONAL indices, so select rows with .iloc;
        # .loc would only be correct for a default 0..n-1 index. Selecting with
        # a list already returns a copy, so no extra .copy() is needed.
        X_train = train.iloc[train_split_index]
        X_val = train.iloc[test_split_index]
        # Keep the validation targets for the Pearson score after the frames are freed.
        y_val = X_val['target'].values

        train_ds = make_dataset(X_train[features], X_train['investment_id'], X_train['target'])
        valid_ds = make_dataset(X_val[features], X_val['investment_id'], X_val['target'], mode="valid")
        del X_train
        del X_val

        gc.collect()

        model = get_model(len(features))
#         onecycle = OneCycleScheduler(math.ceil(len(X_train) / batch_size) * n_epochs, max_rate=0.0025, start_rate=0.00025, last_rate=0.00001)
#         history = model.fit(train_ds, epochs=n_epochs, validation_data=valid_ds, callbacks=[onecycle])
        history = model.fit(train_ds, epochs=n_epochs, validation_data=valid_ds, callbacks=[early_stop, plateau])
        # Running average (over folds) of the 1-based epoch with the best validation metric.
        epoch_stop += (np.argmin(history.history['val_corr_eval']) + 1) / cv
        model.save(f"model_{split}_skf")
        pearson_score = stats.pearsonr(model.predict(valid_ds).ravel(), y_val)[0]
        print('Pearson:', pearson_score)
        pd.DataFrame(history.history, columns=["loss", "val_loss"]).plot()
        plt.title("MSE")
        plt.show()
        pd.DataFrame(history.history, columns=["corr_eval", "val_corr_eval"]).plot()
        plt.title("CORR")
        plt.show()

        # Free everything tied to this fold before building the next model.
        del model
        del history
        del train_ds
        del valid_ds
        del y_val
        K.clear_session()
        tf.compat.v1.reset_default_graph()
        plt.close()
        gc.collect()
        # 0.143 0.8347

In [None]:
# %%time

# tf.random.set_seed(42)
# np.random.seed(42)
# random.seed(42)

# epoch_stop = 0
# n_epochs = 40
# cv = 5
# if not load_model('model_4_ts'):
#     train_test_zip = get_time_series_cross_val_splits(train, cv = cv)
#     for split, (train_split, test_split) in enumerate(train_test_zip):
#         train_split_index = train.time_id.isin(train_split)
#         test_split_index = train.time_id.isin(test_split)
        
#         X_train = train.loc[train_split_index].copy()
#         X_val = train.loc[test_split_index].copy()
        
#         train_ds = make_dataset(X_train[features], train.loc[train_split_index, 'investment_id'], train.loc[train_split_index, 'target'])
#         valid_ds = make_dataset(X_val[features], train.loc[test_split_index, 'investment_id'], train.loc[test_split_index, 'target'], mode="valid")
        
#         del X_train
#         del X_val
#         gc.collect()
        
#         model = get_model(len(features))
# #         onecycle = OneCycleScheduler(math.ceil(len(X_train) / batch_size) * n_epochs, max_rate=0.0025, start_rate=0.00025, last_rate=0.00001)
# #         history = model.fit(train_ds, epochs=n_epochs, validation_data=valid_ds, callbacks=[onecycle])
#         history = model.fit(train_ds, epochs=n_epochs, validation_data=valid_ds, callbacks=[early_stop, plateau])
#         epoch_stop += (np.argmin(history.history['val_corr_eval']) + 1) / cv
#         model.save(f"model_{split}_ts")
#         pearson_score = stats.pearsonr(model.predict(valid_ds).ravel(), train.loc[test_split_index, 'target'].values)[0]
#         print('Pearson:', pearson_score)
#         pd.DataFrame(history.history, columns=["loss", "val_loss"]).plot()
#         plt.title("MSE")
#         plt.show()
#         pd.DataFrame(history.history, columns=["corr_eval", "val_corr_eval"]).plot()
#         plt.title("CORR")
#         plt.show()
        
#         K.clear_session()
#         tf.compat.v1.reset_default_graph()
#         del model
#         del history
#         del train_ds
#         del valid_ds
#         plt.close()
#         gc.collect()

## Submission

In [None]:
# Load every fold's saved model, keyed by its directory name.
splits = 5
models = {}
for split in range(splits):
    skf_name = f"model_{split}_skf"
    models[skf_name] = load_model(skf_name)
#     models[f"model_{split}_ts"] = load_model(f"model_{split}_ts")

def preprocess_test(investment_id, feature):
    """Pair the two model inputs with a constant dummy target of 0."""
    inputs = (investment_id, feature)
    dummy_target = 0
    return inputs, dummy_target
def make_test_dataset(feature, investment_id, batch_size=1024):
    """Build the inference pipeline: ((investment_id, feature), 0) elements in batches.

    Rows keep their input order (no shuffling); the result is cached and prefetched.
    """
    samples = tf.data.Dataset.from_tensor_slices((investment_id, feature))
    batched = samples.map(preprocess_test).batch(batch_size)
    return batched.cache().prefetch(tf.data.AUTOTUNE)

In [None]:
env = ubiquant.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    # The inputs are identical for every fold model, so build the dataset once per
    # test batch instead of once per model.
    ds = make_test_dataset(test_df[features], test_df['investment_id'])
    # Accumulate the equal-weight average of the fold models in an explicit
    # float64 buffer rather than adding floats onto an integer-initialised column.
    target = np.zeros(len(sample_prediction_df), dtype=np.float64)
    for split in range(splits):
#         sample_prediction_df['target'] += (0.8*models[f"model_{split}_skf"].predict(ds).ravel()+0.2*models[f"model_{split}_ts"].predict(ds).ravel()) / splits
        target += models[f"model_{split}_skf"].predict(ds).ravel() / splits
    sample_prediction_df['target'] = target

    env.predict(sample_prediction_df)