### As stated in the [Competition Q&A](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/301693):<br>

```The mapping relationship between investment_id and a certain investment is fixed, but the investment_ids that appear in the train data, the public leaderboard, and the private leaderboard are not the same, some only appear in the train data, some only in public leaderboard and some only in the private leaderboard.```

### Therefore, we need a way to handle investment_ids that appear only in the test data.


## Import Packages

In [None]:
import os
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import StratifiedKFold

## Get the investment_id data and the train/test indices generated by StratifiedKFold (computed once to save memory)

In [None]:
def gen_ids_and_skf_idxs():
    """Load the training pickle and precompute the 5-fold split indices.

    Only the ``investment_id`` column (cast to int64) and the fold indices
    are returned; the loaded frame is deleted and garbage-collected before
    returning.

    Returns:
        A ``(investment_id, idxs)`` pair. ``investment_id`` is a one-column
        DataFrame; ``idxs`` is a list of ``(fold, (train_idx, test_idx))``
        entries produced by ``StratifiedKFold``.
    """
    frame = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
    investment_id = frame[["investment_id"]].astype('int64')

    # Remove the non-feature columns in place (no copy of the frame is made).
    for column in ("investment_id", "time_id", "target"):
        frame.pop(column)

    # Folds are stratified on investment_id, with a fixed seed.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    idxs = [
        (fold, split)
        for fold, split in enumerate(splitter.split(frame, investment_id))
    ]

    del frame
    gc.collect()
    return investment_id, idxs

In [None]:
# Compute the id column and the fold indices once; `investment_id` is reused
# by later cells (lookup-layer adaptation, dataset construction, model setup).
investment_id, idxs = gen_ids_and_skf_idxs()

## Tensorflow model modified from model3 of https://www.kaggle.com/librauee/infer-dnn-model-ensemble/

In [None]:
class MyModel(keras.Model):
    """Two-branch network combining an investment-id embedding with dense features.

    One branch looks up the investment id, embeds it and passes it through a
    small MLP; the other branch passes the feature vector through an MLP. The
    two branch outputs are concatenated and fed to a final MLP ending in a
    single-unit Dense layer.
    """
    
    def __init__(self, investment_id, device='gpu'):
        """Create all layers and adapt the id lookup layer.

        Args:
            investment_id: Investment ids used to adapt the ``IntegerLookup``
                vocabulary; ``.values`` is read, so a pandas object is
                expected.
            device: Device string passed to ``tf.device`` while the layers
                are created.
        """
        super().__init__()
    
        investment_ids = list(np.unique(investment_id.values))
        # One slot more than the number of distinct ids
        # (presumably for the out-of-vocabulary index -- TODO confirm).
        investment_id_size = len(investment_ids) + 1
        
        with tf.device(device):
            self.id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)

            # Build the lookup vocabulary from the provided ids.
            self.id_lookup_layer.adapt(investment_id)

            # Id branch: 32-dimensional embedding followed by a small MLP.
            self.inv_embedding = layers.Embedding(investment_id_size, 32)
            self.inv_fc = keras.Sequential([
                layers.Dense(64, activation='swish', kernel_initializer='he_normal', bias_initializer='zeros'),
                layers.Dropout(0.5),
                layers.Dense(32, activation='swish', kernel_initializer='he_normal', bias_initializer='zeros'),
                layers.Dropout(0.5),
            ])

            # Feature branch: 256 -> 128 -> 64 with batch norm and dropout.
            self.fea_fc = keras.Sequential([
                layers.Dense(256, activation='swish', kernel_initializer='he_normal', bias_initializer='zeros'),
                keras.layers.BatchNormalization(axis=1),
                layers.Dropout(0.5),
                layers.Dense(128, activation='swish', kernel_initializer='he_normal', bias_initializer='zeros'),
                keras.layers.BatchNormalization(axis=1),
                layers.Dropout(0.5),
                layers.Dense(64, activation='swish', kernel_initializer='he_normal', bias_initializer='zeros')
            ])
            
            # Head applied to the concatenated branches; L2-regularised
            # hidden layers and a single-unit output.
            self.fc = keras.Sequential([
                layers.Dropout(0.5),
                layers.Dense(128, activation='swish', kernel_initializer='he_normal', bias_initializer='zeros', kernel_regularizer="l2"),
                layers.Dropout(0.5),
                layers.Dense(32, activation='swish', kernel_initializer='he_normal', bias_initializer='zeros',  kernel_regularizer="l2"),
                layers.Dropout(0.5),
                layers.Dense(16, activation='swish', kernel_initializer='he_normal', bias_initializer='zeros', kernel_regularizer="l2"),
                layers.Dense(1)
            ])
    
    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: A pair ``(inv_id, fea)`` of investment ids and features.

        Returns:
            The output of the final single-unit Dense layer.
        """
        inv_id, fea = inputs
        
        inv = self.id_lookup_layer(inv_id)
        inv = self.inv_embedding(inv)
        inv = self.inv_fc(inv)
        # Drop the singleton axis 1 so the id branch can be concatenated with
        # the feature branch (assumes inv_id is (batch, 1) -- TODO confirm).
        inv = tf.squeeze(inv, axis=1)
        
        fea = self.fea_fc(fea)
        
        concat = tf.concat([inv, fea], axis=1)
        output = self.fc(concat)
        
        return output

In [None]:
def correlation(x, y, axis=-2):
    """Return ``1 - correlation`` between ``x`` and ``y`` along ``axis``.

    Both inputs are mean-centred along ``axis`` and then compared with
    ``keras.losses.cosine_similarity``, which returns the *negated* cosine
    similarity. Adding 1 therefore yields a quantity that is 0 for perfectly
    correlated inputs and 2 for perfectly anti-correlated ones, so lower is
    better (monitor it with ``mode='min'``).

    Args:
        x: First tensor (e.g. the targets).
        y: Second tensor (e.g. the predictions), same shape as ``x``.
        axis: Axis along which the correlation is computed.

    Returns:
        A tensor of ``1 - correlation`` values with ``axis`` reduced.
    """
    # keepdims=True keeps the reduced axis as size 1 so the means broadcast
    # back onto the inputs for any `axis` and rank; without it the
    # subtraction only lines up when the reduced axis is the leading one of
    # a 2-D input.
    xmean = tf.reduce_mean(x, axis=axis, keepdims=True)
    ymean = tf.reduce_mean(y, axis=axis, keepdims=True)
    cossim = keras.losses.cosine_similarity(x - xmean, y - ymean, axis=axis)
    return 1 + cossim

## Make custom tf_dataset

In [None]:
# Demo: build a standalone IntegerLookup layer to inspect how ids are encoded.
unique_ids = np.unique(investment_id.values)
investment_ids = list(unique_ids)
investment_id_size = 1 + len(investment_ids)
id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
id_lookup_layer.adapt(investment_id)

In [None]:
# Out-of-vocabulary (OOV) investment_ids are all mapped to the constant index 0:
for i in range(6):
    is_known = i in investment_ids
    print(f"id {i} is in investment_id:", is_known)
probe_ids = [0, 1, 2, 3, 4, 5, -1]
print(id_lookup_layer(probe_ids))

# Output recorded from a previous run:
# id 0 is in investment_id: True
# id 1 is in investment_id: True
# id 2 is in investment_id: True
# id 3 is in investment_id: True
# id 4 is in investment_id: True
# id 5 is in investment_id: False
# tf.Tensor([2994 1090 1823 1344 3292    0    0], shape=(7,), dtype=int64)

In [None]:
# Random masking:
# Replace a random fraction of the training investment_ids with -1, an id that
# the lookup layer treats as out-of-vocabulary (index 0). This forces the
# embedding layer to see the OOV index during training, which may help the
# model learn some "common knowledge" about investment_ids, so that the
# embedding used for unseen ids at test time is not essentially random.

def random_mask(feas, target, ratio=0.1):
    """Randomly replace investment ids with the OOV placeholder ``-1``.

    Args:
        feas: Pair ``(inv_id, fea)`` of investment ids and features.
        target: Target value, passed through unchanged.
        ratio: Probability with which each id is replaced.

    Returns:
        ``((masked_inv_id, fea), target)``.
    """
    inv_id, fea = feas
    # One uniform draw per id; draws below `ratio` mark ids to replace.
    drop = tf.random.uniform(tf.shape(inv_id)) < ratio
    oov_id = tf.constant(-1, dtype=tf.int64)
    masked_id = tf.where(drop, oov_id, inv_id)
    return (masked_id, fea), target

In [None]:
def make_dataset(feature, investment_id, y, batch_size=512, mode="train"):
    """Build a batched, prefetched ``tf.data.Dataset`` of ``((id, feature), y)``.

    Args:
        feature: Feature rows.
        investment_id: Investment ids aligned with ``feature``.
        y: Targets aligned with ``feature``.
        batch_size: Number of rows per batch.
        mode: ``"train"`` applies ``random_mask`` and shuffling; any other
            value yields the rows in order without masking.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    if mode == "train":
        # Do NOT cache here: a cache placed after the random map/shuffle
        # stores the first epoch's output and replays it verbatim, so every
        # epoch would see the same masked ids in the same batch order.
        ds = ds.map(random_mask).shuffle(batch_size * 4).batch(batch_size)
    else:
        # This pipeline is deterministic, so caching the batches is safe.
        ds = ds.batch(batch_size).cache()
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
def get_tf_dataset(train_idx, test_idx):
    """Reload the training pickle and build the datasets for one fold.

    Args:
        train_idx: Positional row indices of the training rows.
        test_idx: Positional row indices of the validation rows.

    Returns:
        ``(train_dataset, val_dataset)`` as built by ``make_dataset``; the
        validation dataset is built with ``mode="valid"``.
    """
    feature_cols = [f'f_{k}' for k in range(300)]
    frame = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
    con_feas = frame[feature_cols]
    y = frame['target']

    # Ids come from the module-level `investment_id` frame.
    train_dataset = make_dataset(
        con_feas.iloc[train_idx, :],
        investment_id.iloc[train_idx],
        y.iloc[train_idx],
    )
    val_dataset = make_dataset(
        con_feas.iloc[test_idx, :],
        investment_id.iloc[test_idx],
        y.iloc[test_idx],
        mode="valid",
    )

    # Release the frame and its views before returning.
    del frame, con_feas, y
    gc.collect()

    return train_dataset, val_dataset

## A scheduler that can be used for learning-rate decay, weight decay, temperature decay, etc.

In [None]:
def scheduler(epoch, para, bound=20):
    """Keep ``para`` constant for ``bound`` epochs, then decay it each epoch.

    Intended as the ``schedule`` callable of
    ``keras.callbacks.LearningRateScheduler``, which passes in the current
    value on every epoch, so the decay compounds.

    Args:
        epoch: Zero-based epoch index.
        para: Current value of the scheduled parameter.
        bound: Number of initial epochs during which ``para`` is unchanged.

    Returns:
        ``para`` when ``epoch < bound``; otherwise ``para / e**0.02``.
    """
    # Local import keeps the block self-contained; math.exp yields a plain
    # Python float instead of a tf.Tensor.
    # NOTE(review): a float should be accepted by LearningRateScheduler
    # across Keras versions -- confirm against the version in use.
    import math

    if epoch < bound:
        return para
    return para / math.exp(0.02)

## Training

In [None]:
# %%time
# for idx, (train_idx, test_idx) in idxs:
#     print(f"the {idx}th fold:")
    
#     print("train_idx and test_idx:")
#     print(train_idx, test_idx)
    
#     print("get tf_dataset...")
#     train_dataset, val_dataset = get_tf_dataset(train_idx, test_idx)

#     print("get model...")
#     model = MyModel(investment_id=investment_id)
#     rmse = keras.metrics.RootMeanSquaredError(name="rmse")
#     optimizer = tf.optimizers.Adam(0.001)
#     model.compile(
#         optimizer=optimizer,
#         loss='mse',
#         metrics=[rmse, correlation]
#     )
#     model.build([(None, 1), (None, 300)])
    
#     lr_scheduler = keras.callbacks.LearningRateScheduler(scheduler)
#     checkpoint = keras.callbacks.ModelCheckpoint(f'model_{idx}/model', monitor="val_correlation", save_best_only=True, save_weights_only=True)
#     early_stop = keras.callbacks.EarlyStopping(monitor="val_correlation", patience=10, mode='min')
    
#     print("start training...")
#     history = model.fit(train_dataset, epochs=50, validation_data=val_dataset, callbacks=[lr_scheduler, checkpoint, early_stop])
    
#     model.load_weights(f"model_{idx}/model")
#     for metric in ["rmse", "correlation"]:
#         pd.DataFrame(history.history, columns=[metric, f"val_{metric}"]).plot()
#         plt.title(metric.upper())
#         plt.show()
    
#     del train_dataset, val_dataset, model, rmse, optimizer, checkpoint, early_stop, history
#     gc.collect()

## Submission

In [None]:
# Build the model on the CPU device for inference; the trained weights are
# loaded into this instance later, one fold at a time.
model = MyModel(investment_id=investment_id, device='cpu')

In [None]:
def preprocess_test(investment_id, feature):
    """Pack one test row into the ``(inputs, target)`` layout used in training.

    Args:
        investment_id: Investment id of the row.
        feature: Feature vector of the row.

    Returns:
        ``((investment_id, feature), 0)``; the ``0`` is a placeholder target.
    """
    model_inputs = (investment_id, feature)
    placeholder_target = 0
    return model_inputs, placeholder_target

def make_test_dataset(feature, investment_id, batch_size=1024):
    """Build a batched ``tf.data.Dataset`` for inference.

    Args:
        feature: Feature rows.
        investment_id: Investment ids aligned with ``feature``.
        batch_size: Number of rows per batch.

    Returns:
        A dataset of ``((investment_id, feature), 0)`` batches, in input
        order.
    """
    rows = tf.data.Dataset.from_tensor_slices((investment_id, feature))
    # Reshape each row into the (inputs, placeholder target) layout.
    packed = rows.map(preprocess_test)
    return packed.batch(batch_size)

In [None]:
def inference(model, ds, n_folds=5, weights_dir="../input/ubi-dnn-test1"):
    """Average the predictions of the per-fold checkpoints on ``ds``.

    Args:
        model: Object exposing ``load_weights(path)`` and ``predict(ds)``.
        ds: Dataset passed through to ``model.predict``.
        n_folds: Number of fold checkpoints to ensemble; checkpoints
            ``model_0`` .. ``model_{n_folds - 1}`` are used.
        weights_dir: Directory containing the ``model_{i}/model``
            checkpoints written by the training cell.

    Returns:
        Element-wise mean of the fold predictions as a NumPy array.
    """
    y_preds = []
    for fold in range(n_folds):
        # The same model instance is reused; only its weights are swapped.
        model.load_weights(f"{weights_dir}/model_{fold}/model")
        y_preds.append(model.predict(ds))
    return np.mean(y_preds, axis=0)

In [None]:
import ubiquant
# Create the competition environment and the iterator that is consumed by
# the submission loop.
env = ubiquant.make_env()
iter_test = env.iter_test()

In [None]:
n_features = 300
features = [f'f_{i}' for i in range(n_features)]

# Predict each batch served by the environment and submit it before moving on.
for (test_df, sample_prediction_df) in iter_test:
    feature_frame = test_df[features].astype('float16')
    id_frame = test_df[["investment_id"]].astype('int64')
    ds = make_test_dataset(feature_frame, id_frame)
    sample_prediction_df['target'] = inference(model, ds)
    env.predict(sample_prediction_df)
    display(sample_prediction_df)

### Modified from
https://www.kaggle.com/librauee/infer-dnn-model-ensemble <br>
https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn