This notebook checks whether it makes sense to use investment_id in our models when the predictions are for a period that is very far away in time.

The method is:
    
1.  Train two models on the first part of the dataset. One makes use of all the features including investment_id; the other does not use investment_id.

2.  Validate those models on several datasets from the future.

3.  See whether the difference in score, which is nothing more than the advantage of using investment_id, decreases over time.

I have a lot of respect for coding and of course I don't want to miss any opportunity to learn so if you see something that doesn't seem right or something I can improve on, please don't hesitate to comment on it.

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import gc

from scipy.stats import pearsonr

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
### CONSTANTS

# Number of consecutive epochs without a new best validation score
# after which training stops.
EARLY_STOPPING = 5

# Hard upper bound on the number of training epochs.
MAX_EPOCHS = 50

# Learning rate handed to the Adam optimizer of both models.
LEARNING_RATE = 10 ** -3

# Number of contiguous chunks the dataset rows are split into.
FOLD = 5

In [None]:
# Adapted from:
#     https://www.kaggle.com/code/lonnieqin/ubiquant-market-prediction-with-dnn

def get_model_investment_id():
    """Build and compile the network that sees features and investment_id.

    The investment id goes through an ``IntegerLookup`` adapted on the
    module-level ``INVESTMENT_IDS``, a 32-dimensional embedding and a
    64-unit dense layer. The 300 features go through a 256-unit dense
    layer. Both branches are concatenated and fed to a 128 -> 32 -> 1
    dense head.

    Returns:
        A ``tf.keras.Model`` taking ``[features, investment_id]`` as inputs,
        compiled with Adam (``LEARNING_RATE``) and an MSE loss.
    """
    # One extra slot on top of the distinct ids seen in the data.
    vocabulary_size = np.unique(INVESTMENT_IDS).size + 1
    id_lookup = layers.IntegerLookup(max_tokens=vocabulary_size)
    id_lookup.adapt(INVESTMENT_IDS)

    id_input = tf.keras.Input(shape=(1,), dtype=tf.uint16)
    feature_input = tf.keras.Input(shape=(300,))

    # Investment-id branch: lookup -> embedding -> flatten -> dense.
    id_branch = id_lookup(id_input)
    id_branch = layers.Embedding(
        input_dim=vocabulary_size,
        output_dim=32,
        input_length=1,
    )(id_branch)
    id_branch = layers.Reshape(target_shape=(-1,))(id_branch)
    id_branch = layers.Dense(units=64, activation='relu')(id_branch)

    # Feature branch.
    feature_branch = layers.Dense(units=256, activation='relu')(feature_input)

    # Shared head on top of the concatenated branches.
    hidden = layers.Concatenate(axis=1)([id_branch, feature_branch])
    hidden = layers.Dense(units=128, activation='relu')(hidden)
    hidden = layers.Dense(units=32, activation='relu')(hidden)
    prediction = layers.Dense(units=1)(hidden)

    # Input order matters: callers pass [features, investment_ids].
    model = tf.keras.Model(
        inputs=[feature_input, id_input],
        outputs=[prediction],
    )
    model.compile(
        optimizer=tf.optimizers.Adam(LEARNING_RATE),
        loss='mse',
    )
    return model

In [None]:
def get_model_without_investment_id():
    """Build and compile the baseline network that only sees the features.

    The 300 features are passed through a 256 -> 128 -> 32 -> 1 stack of
    dense layers (ReLU on all but the output layer).

    Returns:
        A ``tf.keras.Model`` with a single feature input, compiled with
        Adam (``LEARNING_RATE``) and an MSE loss.
    """
    feature_input = tf.keras.Input(shape=(300,))

    hidden = layers.Dense(units=256, activation='relu')(feature_input)
    hidden = layers.Dense(units=128, activation='relu')(hidden)
    hidden = layers.Dense(units=32, activation='relu')(hidden)
    prediction = layers.Dense(units=1)(hidden)

    model = tf.keras.Model(inputs=[feature_input], outputs=[prediction])
    model.compile(
        optimizer=tf.optimizers.Adam(LEARNING_RATE),
        loss='mse',
    )
    return model

In [None]:
# Data from:
#     https://www.kaggle.com/code/columbia2131/speed-up-reading-csv-to-pickle/notebook
#
# Dataset:
#     https://www.kaggle.com/datasets/columbia2131/ump-train-picklefile
def prepare_data():
    """Load the pickled training frame and split it into plain arrays.

    Returns:
        A tuple ``(x, y, time_ids, investment_ids)`` where ``x`` holds every
        column except ``row_id``, ``target``, ``time_id`` and
        ``investment_id``, and the other three are the ``target``,
        ``time_id`` and ``investment_id`` columns respectively.
    """
    frame = pd.read_pickle("../input/ump-train-picklefile/train.pkl")

    # Everything that is neither an identifier nor the label is a feature.
    non_feature_columns = ["row_id", "target", "time_id", "investment_id"]
    features = frame.drop(columns=non_feature_columns).values

    targets = frame["target"].values
    time_ids = frame["time_id"].values
    investment_ids = frame["investment_id"].values

    return features, targets, time_ids, investment_ids

In [None]:
def calculate_score(actual,
                    predicted,
                    time_ids):
    """Return the mean per-time_id Pearson correlation.

    Args:
        actual: array of true target values.
        predicted: array of predictions, aligned with ``actual``.
        time_ids: array giving the time_id of each row.

    Returns:
        ``-1`` when ``actual`` or ``predicted`` holds a single distinct
        value overall; otherwise the mean, over the distinct time_ids, of
        the Pearson correlation between ``actual`` and ``predicted``
        restricted to that time_id.
    """
    # A globally constant vector has no defined correlation: use a sentinel.
    if any(np.unique(values).size == 1 for values in (actual, predicted)):
        return -1

    per_time_id = [
        pearsonr(actual[time_ids == current_id],
                 predicted[time_ids == current_id])[0]
        for current_id in np.unique(time_ids)
    ]

    return np.mean(per_time_id)

In [None]:
def calculate_scores(model):
    """Train ``model`` on the first fold and score the best epoch on every fold.

    The rows of the module-level arrays ``X``, ``Y``, ``TIME_IDS`` and
    ``INVESTMENT_IDS`` are split by position into ``FOLD`` contiguous chunks
    of ``X.shape[0] // FOLD`` rows (trailing remainder rows are not used).
    Fold 0 is the training set and fold 1 drives early stopping: the model
    is trained one epoch at a time, and every time the fold-1 score improves
    the model is saved to ``'best_model'``. Training stops after
    ``MAX_EPOCHS`` epochs or ``EARLY_STOPPING`` epochs without improvement.
    The saved checkpoint is then reloaded and scored on each fold.

    Args:
        model: compiled Keras model. A model with one input receives the
            feature matrix; any other model receives
            ``[features, investment_ids]``.

    Returns:
        Array of shape ``(FOLD,)`` holding ``calculate_score`` of the best
        checkpoint on each fold.

    Raises:
        RuntimeError: if no epoch produced a comparable validation score
            (for example every score was NaN), so no checkpoint exists.
    """
    fold_size = X.shape[0] // FOLD
    row_positions = np.arange(X.shape[0])

    # Inclusive lower bound so the first row of each fold is not dropped.
    train_index = [np.logical_and(row_positions >= fold_size * fold,
                                  row_positions < fold_size * (fold + 1))
                   for fold in range(FOLD)]

    # Decided once on the incoming model; the reloaded checkpoint has the
    # same inputs.
    uses_investment_id = len(model.inputs) != 1

    def select_inputs(mask):
        """Return the model inputs for the rows selected by ``mask``."""
        if uses_investment_id:
            return [X[mask], INVESTMENT_IDS[mask]]
        return X[mask]

    x_train = select_inputs(train_index[0])
    y_train = Y[train_index[0]]
    x_val = select_inputs(train_index[1])
    y_val = Y[train_index[1]]
    time_ids_val = TIME_IDS[train_index[1]]

    # The score is a mean Pearson correlation, so higher is better.
    best_score = -np.inf
    best_epoch = 0
    epoch = 0
    end = False

    while not end:

        model.fit(x_train,
                  y_train,
                  epochs=1,
                  verbose=0)

        epoch += 1

        current_score = calculate_score(actual=y_val,
                                        predicted=model.predict(x_val).ravel(),
                                        time_ids=time_ids_val)

        if current_score > best_score:

            best_score = current_score
            best_epoch = epoch

            model.save('best_model')

        end = (epoch == MAX_EPOCHS) or (epoch - best_epoch) >= EARLY_STOPPING

        _ = gc.collect()

    if best_epoch == 0:
        # Avoid silently loading a stale 'best_model' left by an earlier run.
        raise RuntimeError("no epoch produced a valid validation score; "
                           "no checkpoint was saved")

    model = tf.keras.models.load_model('best_model',
                                       compile=False)

    output = np.zeros((FOLD,))

    for i, mask in enumerate(train_index):

        output[i] = calculate_score(actual=Y[mask],
                                    predicted=model.predict(select_inputs(mask)).ravel(),
                                    time_ids=TIME_IDS[mask])

        _ = gc.collect()

    return output

In [None]:
# Load the dataset once into the module-level arrays that
# get_model_investment_id() and calculate_scores() read.
X , Y , TIME_IDS, INVESTMENT_IDS = prepare_data()

In [None]:
# Run a garbage-collection pass before building the first model.
_ = gc.collect()

In [None]:
# Train and score the model that receives investment_id as an extra input.
scores_model_investment_id = calculate_scores(get_model_investment_id())

In [None]:
# Display the per-fold scores of the model with investment_id.
scores_model_investment_id

In [None]:
# Run a garbage-collection pass before building the second model.
_ = gc.collect()

In [None]:
# Train and score the baseline model that only receives the features.
scores_no_investment_id = calculate_scores(get_model_without_investment_id())

In [None]:
# Display the per-fold scores of the model without investment_id.
scores_no_investment_id

In [None]:
# Per-fold score difference; positive values mean the investment_id model
# scored higher on that fold.
scores_model_investment_id-scores_no_investment_id

In [None]:
# Scatter plot with a fitted regression line: score advantage of the
# investment_id model against the fold index, for every fold.
sns.lmplot(
    data=pd.DataFrame({
        'fold': np.arange(FOLD),
        'diff_score': scores_model_investment_id - scores_no_investment_id,
    }),
    x='fold',
    y='diff_score',
)
plt.show()

As we can see, the difference in scores decreases over time, so, at least according to this test, there is no point in adding this feature to our models, because the private leaderboard data is going to be very far away in the future.

In [None]:
# Same plot restricted to folds 2 and later, i.e. leaving out the fold used
# for fitting (0) and the fold used for early stopping (1).
sns.lmplot(
    data=pd.DataFrame({
        'fold': np.arange(2, FOLD),
        'diff_score': scores_model_investment_id[2:] - scores_no_investment_id[2:],
    }),
    x='fold',
    y='diff_score',
)
plt.show()

We also see the same pattern if we plot only the scores of the folds that are not used for training or for model selection.