#  Ubiquant Market Prediction with DNN

Please upvote the original kernel that this notebook is adapted from:  
https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn

I just added some minor changes:

- PCA (80 components)   
- Many Dropout layers with a dropout rate of 0.4   

I think that's it.   

In [None]:
import os
import gc
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from scipy import stats

## Import dataset

In [None]:
%%time
# Feature columns are named f_0 ... f_299.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# Identifier columns followed by all feature columns.
feature_columns = ['investment_id', 'time_id'] + features
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Last expression of the cell: shows the first rows in the notebook output.
train.head()

## Adding some interactions

In [None]:
# from itertools import combinations

# cols_interaction = sorted(
#     ['f_271', 'f_298', 'f_184', 'f_253', 'f_250', 'f_231', 'f_269', 'f_78', 'f_209', 'f_19', 'f_241', 'f_43', 'f_182'],
#     key=lambda x: int(x[2:])
# )
# for x1, x2 in list(combinations(cols_interaction, r=2)):
#     train[f'{x1}_{x2}'] = train[x1] * train[x2]
    
# train

In [None]:
# Print a summary of the loaded training data (columns, dtypes, memory usage).
train.info()

In [None]:
# Detach investment_id from `train`; it is passed to the model as its own input
# (see make_dataset / get_model), separate from the feature columns.
investment_id = train.pop("investment_id")
investment_id.head()

In [None]:
# Remove time_id from `train`; the popped column is discarded.
_ = train.pop("time_id")

In [None]:
# Detach the regression target so that `train` no longer contains it.
y = train.pop("target")
y.head()

## Create an IntegerLookup layer for the investment_id input

In [None]:
%%time
# Distinct investment ids present in the training data.
investment_ids = list(investment_id.unique())
# One slot more than the number of distinct ids; per the Keras docs, max_tokens
# also counts the out-of-vocabulary index. The same size is reused as the
# Embedding input dimension in get_model().
investment_id_size = len(investment_ids) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
# adapt() builds the layer's vocabulary from the id column; run on the CPU device.
with tf.device("cpu"):
    investment_id_lookup_layer.adapt(investment_id)

## Make TensorFlow dataset

In [None]:
def preprocess(X, y):
    """Pass a dataset element through unchanged, printing both parts.

    Used as the ``map`` function in ``make_dataset``; the prints are a
    debugging aid for inspecting what the pipeline hands over.

    Args:
        X: Input part of the element.
        y: Target part of the element.

    Returns:
        The pair ``(X, y)`` exactly as received.
    """
    for part in (X, y):
        print(part)
    return X, y

def make_dataset(feature, investment_id, y, batch_size=1024, mode="train", shuffle_buffer=256):
    """Build a batched ``tf.data`` pipeline of ``((investment_id, feature), y)``.

    Args:
        feature: Feature matrix, sliced along its first axis.
        investment_id: Investment ids, aligned row-for-row with ``feature``.
        y: Targets, aligned row-for-row with ``feature``.
        batch_size: Number of rows per batch.
        mode: ``"train"`` shuffles the rows; any other value keeps input order.
        shuffle_buffer: Shuffle buffer size used in ``"train"`` mode.

    Returns:
        A batched, prefetching ``tf.data.Dataset``.
    """
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    if mode == "train":
        # No cache() after the shuffle: caching downstream of shuffle() stores
        # the first epoch's shuffled batches and replays them verbatim in every
        # later epoch, which silently disables per-epoch reshuffling. The
        # source tensors are already in memory, so nothing is lost.
        ds = ds.shuffle(shuffle_buffer).batch(batch_size)
    else:
        # Evaluation order is fixed, so caching the finished batches is safe
        # and saves re-slicing them on every validation pass.
        ds = ds.batch(batch_size).cache()
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

## Modeling

In [None]:
# Selects the width of the model's feature input.
# NOTE(review): the training cell fits a PCA unconditionally; this flag only
# changes how many components that PCA keeps.
USE_PCA = True

# 80 principal components when PCA is enabled, otherwise one per column left in `train`.
n_components = 80 if USE_PCA else train.shape[1]

    
def get_model():
    """Build and compile the two-input regression network.

    The investment id goes through the adapted lookup layer and a 32-wide
    embedding, then three Dense(64)+Dropout(0.4) stages. The feature vector
    (``n_components`` wide) goes through three Dense(256)+Dropout(0.4) stages.
    Both branches are concatenated and passed through L2-regularised Dense
    layers of width 512, 128 and 32 before a single linear output unit.

    Returns:
        A compiled ``tf.keras.Model`` (Adam, lr 0.001, MSE loss, RMSE metric).
    """
    def dense_dropout_stack(tensor, units, depth=3, rate=0.4):
        # `depth` rounds of a swish Dense layer followed by Dropout.
        for _ in range(depth):
            tensor = layers.Dense(units, activation='swish')(tensor)
            tensor = layers.Dropout(rate)(tensor)
        return tensor

    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((n_components, ), dtype=tf.float16)

    # Investment-id branch: raw id -> vocabulary index -> embedding -> flat vector.
    id_branch = investment_id_lookup_layer(investment_id_inputs)
    id_branch = layers.Embedding(investment_id_size, 32, input_length=1)(id_branch)
    id_branch = layers.Reshape((-1, ))(id_branch)
    id_branch = dense_dropout_stack(id_branch, 64)

    # Feature branch.
    feature_branch = dense_dropout_stack(features_inputs, 256)

    # Joint head on the concatenated branches.
    merged = layers.Concatenate(axis=1)([id_branch, feature_branch])
    for width in (512, 128, 32):
        merged = layers.Dense(width, activation='swish', kernel_regularizer="l2")(merged)
    output = layers.Dense(1)(merged)

    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[output])
    model.compile(optimizer=tf.optimizers.Adam(0.001), loss='mse', metrics=[rmse])
    return model

In [None]:
# Build one instance purely to inspect the architecture; it is deleted again below.
model = get_model()
model.summary()

In [None]:
# Draw the layer graph, annotated with tensor shapes.
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Drop the inspection model and collect garbage before training builds a fresh one.
del model
gc.collect()

In [None]:
%%time


# Seed every RNG in play so runs are repeatable.
random.seed(10)
np.random.seed(20)
tf.random.set_seed(30)

# Fraction of each fold's training rows actually used for fitting.
SAMPLE_FRAC = 0.5

# Folds are stratified on investment_id.
kfold = StratifiedKFold(5, shuffle=True, random_state=42)
models = []
for index, (train_indices, valid_indices) in enumerate(kfold.split(train, investment_id)):
    # split() yields positional indices, so every selection goes through .iloc
    # (plain [] on a Series is label-based and only agrees with positions for
    # a default index). The three sample() calls share frac and random_state
    # and operate on equally long inputs, so they draw the same row positions
    # and the subsampled frames stay aligned.
    X_train = train.iloc[train_indices].sample(frac=SAMPLE_FRAC, random_state=2)
    investment_id_train = investment_id.iloc[train_indices].sample(frac=SAMPLE_FRAC, random_state=2)
    y_train = y.iloc[train_indices].sample(frac=SAMPLE_FRAC, random_state=2)

    X_val = train.iloc[valid_indices]
    investment_id_val = investment_id.iloc[valid_indices]
    y_val = y.iloc[valid_indices]

    # PCA is fitted on the training rows only and then applied to the
    # validation rows. `pca` deliberately stays a global: the submission cell
    # reuses it to transform the test features.
    pca = PCA(n_components=n_components, random_state=2)
    X_train = pca.fit_transform(X_train)
    X_val = pca.transform(X_val)
    print('Explained variance ratio:', np.cumsum(pca.explained_variance_ratio_))

    train_ds = make_dataset(X_train, investment_id_train, y_train)
    valid_ds = make_dataset(X_val, investment_id_val, y_val, mode="valid")

    model = get_model()
    checkpoint_path = f"model_{index}.tf"
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True, save_weights_only=True)
    early_stop = keras.callbacks.EarlyStopping(patience=5)
    history = model.fit(train_ds, epochs=20, validation_data=valid_ds, callbacks=[checkpoint, early_stop])
    # fit() leaves the model on its last-epoch weights, which can be several
    # epochs past the best validation loss. Reload the best checkpoint so the
    # model that is scored and kept for inference is the one that was saved.
    model.load_weights(checkpoint_path)
    models.append(model)

    # Validation score: Pearson correlation between targets and predictions.
    score = stats.pearsonr(y_val.values, model.predict(valid_ds).ravel())[0]
    print('Pearson:', score)
    print()

    pd.DataFrame(history.history, columns=["rmse", "val_rmse"]).plot()
    plt.title("RMSE")
    plt.show()

    # Free the per-fold data before moving on.
    del investment_id_train, investment_id_val
    del X_train, X_val
    del y_train, y_val
    del train_ds, valid_ds

    gc.collect()

    # Only the first fold is trained.
    break

In [None]:
# The training frame, ids and targets are not referenced by the submission cells
# below; release them before inference.
del train
del investment_id
del y
gc.collect()

## Submission

In [None]:
def preprocess_test(investment_id, feature):
    """Pair the model inputs of a test element with a constant target of 0.

    Args:
        investment_id: Investment-id part of the element.
        feature: Feature part of the element.

    Returns:
        ``((investment_id, feature), 0)``.
    """
    model_inputs = (investment_id, feature)
    placeholder_target = 0
    return model_inputs, placeholder_target

def make_test_dataset(feature, investment_id, batch_size=1024):
    """Build the batched inference pipeline for one chunk of test data.

    Args:
        feature: Feature matrix, sliced along its first axis.
        investment_id: Investment ids, aligned row-for-row with ``feature``.
        batch_size: Number of rows per batch.

    Returns:
        A ``tf.data.Dataset`` of ``((investment_id, feature), 0)`` batches.
    """
    rows = tf.data.Dataset.from_tensor_slices((investment_id, feature))
    # Attach the placeholder target so elements have the (inputs, target) layout.
    rows_with_target = rows.map(preprocess_test)
    batches = rows_with_target.batch(batch_size)
    return batches.cache().prefetch(tf.data.experimental.AUTOTUNE)

def inference(models, ds):
    """Average the predictions of several models over the same dataset.

    Args:
        models: Iterable of objects exposing ``predict(ds)``.
        ds: Dataset handed unchanged to every model's ``predict``.

    Returns:
        Element-wise mean of the per-model predictions (``np.mean`` over the
        model axis).
    """
    per_model_predictions = [member.predict(ds) for member in models]
    return np.mean(per_model_predictions, axis=0)

In [None]:
import ubiquant

env = ubiquant.make_env()
iter_test = env.iter_test()
# Each iteration yields a chunk of test rows plus the frame to fill in and hand back.
for test_df, sample_prediction_df in iter_test:
    # Project the raw features with the PCA fitted in the training cell.
    reduced_features = pca.transform(test_df[features])
    ds = make_test_dataset(reduced_features, test_df['investment_id'])
    sample_prediction_df['target'] = inference(models, ds)
    env.predict(sample_prediction_df)