# About the data from the competition

In this competition, we are given a large number of cross-sectional alphas and a training label which represents a transformed forward return for a given investment_id. Cross-sectional means that, for a given time_id, an alpha is meant to sort the universe of investments effectively for that time_id. "Sort effectively" means to create a vector which is, as nearly as possible, directly proportional to the target. There is quite a bit of information in the public domain which describes this style of investing in detail. The paper 101 Formulaic Alphas describes in detail how, generally, the features f_0 to f_299 are generated. The book Finding Alphas is also specifically about the style of investing in this competition. That paper and book will give you a very good sense of how the features were designed. It is our task to combine the alphas into a master alpha, also known as designing a meta model.

#### train.csv

1. row_id - A unique identifier for the row.
2. time_id - The ID code for the time the data was gathered. The time IDs are in order, but the real time between the time IDs is not constant and will likely be shorter for the final private test set than in the training set.

3. investment_id - The ID code for an investment. Not all investments have data in all time IDs.
4. target - The target.
5. [f_0:f_299] - Anonymized features generated from market data.
   
#### example_test.csv - Random data provided to demonstrate what shape and format of data the API will deliver to your notebook when you submit.

#### example_sample_submission.csv - An example submission file provided so the publicly accessible copy of the API provides the correct data shape and format.

#### ubiquant/ - The image delivery API that will serve the test set. You may need Python 3.7 and a Linux environment to run the example test set through the API offline without errors.

   

# Importing Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from scipy import stats
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import warnings

import pickle
import lightgbm as lgb
from sklearn.model_selection import train_test_split



In [None]:
# #Importing pickle files
# with open('/kaggle/input/ubiquanttrainpicklefile/train.pickle', 'rb') as f:
#     train = pickle.load(f)



In [None]:
%%time
# Number of anonymized feature columns and their names (f_0 ... f_299).
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# Load the pre-pickled training frame.
# NOTE(review): the dataset name suggests half-precision storage — confirm dtypes.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Preview the first rows.
train.head()

In [None]:
# Display the dimensions of the training frame.
train.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# sns.histplot(train.target, kde=True)
# plt.title("Target Distribution")
# plt.show()

In [None]:
# warnings.simplefilter("ignore")
# sns.boxplot(train.target)
# plt.title("Target Box Plot")
# plt.show()

In [None]:
# np.random.seed(0)

# for train_investment_id in np.random.choice(train.investment_id.unique(), 9):
#     train[train.investment_id == train_investment_id].target.hist(bins=90, alpha=0.5)

# plt.title("Target Distributions of Sample Investments")
# plt.show()

In [None]:
# sns.histplot(train.groupby("investment_id").target.count(), kde=True)
# plt.title("Distribution of Targets by Investment Identifier")
# plt.show()

# Model DNN

In [None]:
# Detach the investment_id column from the frame; it is passed to the model
# as a separate input rather than as one of the feature columns.
investment_id = train.pop("investment_id")
investment_id.head()

In [None]:
# Remove time_id from the frame; the popped column is discarded.
_ = train.pop("time_id")

In [None]:
# Detach the regression target from the frame.
y = train.pop("target")
y.head()

IntegerLookup layer for investment_id input



In [None]:
%%time
# Distinct investment ids present in the training data.
investment_ids = list(investment_id.unique())
# One more than the number of distinct ids — presumably leaves room for the
# lookup layer's out-of-vocabulary index (TODO confirm against the
# IntegerLookup defaults).
investment_id_size = len(investment_ids) + 1
# Lookup layer mapping raw ids to integer indices; get_model() reuses it.
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
# Fit the layer's vocabulary on the distinct ids.
investment_id_lookup_layer.adapt(pd.DataFrame({"investment_ids":investment_ids}))

In [None]:
# Making the TensorFlow dataset
import tensorflow as tf
def preprocess(X, y):
    """Return the (inputs, target) pair unchanged.

    Mapped over every element of the training/validation dataset; it is
    currently a no-op.
    """
    pair = (X, y)
    return pair
def make_dataset(feature, investment_id, y, batch_size=1024, mode="train"):
    """Build a batched tf.data pipeline yielding ((investment_id, feature), y).

    Args:
        feature: Feature rows, sliced along the first axis.
        investment_id: Investment ids aligned row-for-row with ``feature``.
        y: Targets aligned row-for-row with ``feature``.
        batch_size: Number of rows per batch.
        mode: ``"train"`` shuffles the rows on every pass; any other value
            keeps the original order and caches the batches.

    Returns:
        A ``tf.data.Dataset`` of batches with prefetching enabled.
    """
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    if mode == "train":
        # No cache() on this path: a cache placed downstream of shuffle()
        # stores the first epoch's order and replays it, so later epochs
        # would never be reshuffled. The source tensors are already held in
        # memory, so skipping the cache costs no extra I/O.
        ds = ds.shuffle(4096).batch(batch_size)
    else:
        # Order is fixed here, so the batched result can be cached safely.
        ds = ds.batch(batch_size).cache()
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
def get_model():
    """Build and compile the two-input regression network.

    One branch passes the investment id through the module-level lookup
    layer, a 32-dimensional embedding and three 64-unit swish layers. The
    other branch passes the 300 features through three 256-unit swish layers.
    The branches are concatenated and reduced through L2-regularized dense
    layers (512, 128, 32) to a single linear output.

    Returns:
        A Keras model compiled with Adam (learning rate 0.001), MSE loss and
        the metrics mse, mae, mape and rmse.
    """
    id_input = tf.keras.Input((1, ), dtype=tf.uint16)
    feat_input = tf.keras.Input((300, ), dtype=tf.float16)

    # Investment-id branch: lookup -> embedding -> flatten -> dense stack.
    id_branch = investment_id_lookup_layer(id_input)
    id_branch = layers.Embedding(investment_id_size, 32, input_length=1)(id_branch)
    id_branch = layers.Reshape((-1, ))(id_branch)
    for _ in range(3):
        id_branch = layers.Dense(64, activation='swish')(id_branch)

    # Feature branch: plain dense stack.
    feat_branch = feat_input
    for _ in range(3):
        feat_branch = layers.Dense(256, activation='swish')(feat_branch)

    # Joint head over both branches.
    merged = layers.Concatenate(axis=1)([id_branch, feat_branch])
    for units in (512, 128, 32):
        merged = layers.Dense(units, activation='swish', kernel_regularizer="l2")(merged)
    prediction = layers.Dense(1)(merged)

    network = tf.keras.Model(inputs=[id_input, feat_input], outputs=[prediction])
    network.compile(
        optimizer=tf.optimizers.Adam(0.001),
        loss='mse',
        metrics=['mse', "mae", "mape", keras.metrics.RootMeanSquaredError(name="rmse")],
    )
    return network

In [None]:
# Build a model instance and display its layer summary and architecture graph.
model = get_model()
model.summary()
keras.utils.plot_model(model, show_shapes=True)

In [None]:
%%time
from sklearn.model_selection import StratifiedKFold

# 5-fold cross-validation, stratified on investment_id.
# One model is trained per fold and collected in `models`.
kfold = StratifiedKFold(5, shuffle=True, random_state=42)
models = []
for index, (train_indices, valid_indices) in enumerate(kfold.split(train, investment_id)):
    # split() yields positional indices, so every frame/series is sliced with
    # .iloc. Plain [] on a Series is label-based and only matches when the
    # index happens to be a default RangeIndex.
    X_train, X_val = train.iloc[train_indices], train.iloc[valid_indices]
    investment_id_train = investment_id.iloc[train_indices]
    investment_id_val = investment_id.iloc[valid_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[valid_indices]

    train_ds = make_dataset(X_train, investment_id_train, y_train)
    valid_ds = make_dataset(X_val, investment_id_val, y_val, mode="valid")

    model = get_model()
    # Keep only the best checkpoint of this fold and stop once validation
    # has not improved for 10 epochs.
    checkpoint = keras.callbacks.ModelCheckpoint(f"model_{index}", save_best_only=True)
    early_stop = keras.callbacks.EarlyStopping(patience=10)
    history = model.fit(train_ds, epochs=40, validation_data=valid_ds, callbacks=[checkpoint, early_stop])

    # Reload the best checkpoint rather than keeping the last-epoch weights.
    model = keras.models.load_model(f"model_{index}")
    models.append(model)

    pearson_score = stats.pearsonr(model.predict(valid_ds).ravel(), y_val.values)[0]
    print('Pearson:', pearson_score)

    # Training vs. validation curves for each tracked metric.
    for metric, title in (("mse", "MSE"), ("mae", "MAE"), ("rmse", "RMSE")):
        pd.DataFrame(history.history, columns=[metric, f"val_{metric}"]).plot()
        plt.title(title)
        plt.show()

    # Release the per-fold data before the next fold allocates its own.
    del investment_id_train
    del investment_id_val
    del X_train
    del X_val
    del y_train
    del y_val
    del train_ds
    del valid_ds
    gc.collect()
    

In [None]:
def preprocess_test(investment_id, feature):
    """Pair the two model inputs with a constant placeholder target of 0.

    Returns:
        ``((investment_id, feature), 0)``.
    """
    inputs = (investment_id, feature)
    placeholder_target = 0
    return inputs, placeholder_target
def make_test_dataset(feature, investment_id, batch_size=1024):
    """Build the batched inference pipeline for one chunk of test rows.

    Args:
        feature: Feature rows, sliced along the first axis.
        investment_id: Investment ids aligned row-for-row with ``feature``.
        batch_size: Number of rows per batch.

    Returns:
        A cached, prefetched ``tf.data.Dataset`` whose elements are
        ``((investment_id, feature), 0)`` batches in the original row order.
    """
    slices = tf.data.Dataset.from_tensor_slices((investment_id, feature))
    batched = slices.map(preprocess_test).batch(batch_size)
    return batched.cache().prefetch(tf.data.experimental.AUTOTUNE)
def inference(models, ds):
    """Average the predictions of several models on the same dataset.

    Args:
        models: Iterable of objects exposing ``predict(ds)``.
        ds: Dataset passed unchanged to every model's ``predict``.

    Returns:
        Element-wise mean of the per-model predictions (mean over the model
        axis).
    """
    per_model = [member.predict(ds) for member in models]
    return np.mean(per_model, axis=0)

In [None]:
# Submission loop driven by the competition's `ubiquant` API.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test() 
# Each iteration yields a test frame and a prediction frame to fill in.
for (test_df, sample_prediction_df) in iter_test:
    # Build the inference dataset from the feature columns and the ids.
    ds = make_test_dataset(test_df[features], test_df["investment_id"])
    # Use the fold-averaged prediction as the target.
    sample_prediction_df['target'] = inference(models, ds)
    # Hand the filled frame back to the API.
    # NOTE(review): presumably required before the next batch is served — confirm.
    env.predict(sample_prediction_df) 