# A straightforward DNN implemented with PyTorch Lightning on GPU. Current features include: early stopping based on the mean daily correlation of the output with the target (the competition metric), and 10-fold cross-validation

In [None]:
import pandas as pd
import numpy as np
import gc

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from pytorch_lightning.callbacks import Callback
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset, Subset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

In [None]:
# Read the training data from a low-memory parquet file.
# Creation of the parquet file follows https://www.kaggle.com/robikscube/fast-data-loading-and-low-mem-with-parquet-files
# it can be found here https://www.kaggle.com/leonweninger/train-small
train = pd.read_parquet("../input/train-small/train_small.parquet")

# Every column that is neither the target nor one of the id columns is a float feature.
float_feature_names = train.columns.drop(['target', 'row_id', 'time_id', 'investment_id'])

# Pull the raw arrays out of the frame; the double brackets keep the id/target
# arrays two-dimensional (one column each).
targets = train[['target']].to_numpy()
time_id = train[['time_id']].to_numpy().astype(int)
investment_id = train[['investment_id']].to_numpy().astype(int)
float_input = train[float_feature_names].to_numpy()

# The frame itself is no longer needed once the arrays are extracted.
del train

# Convert everything to torch tensors and bundle them in a single dataset whose
# items are (time_id, investment_id, float_input, target).
time_id = torch.LongTensor(time_id)
investment_id = torch.LongTensor(investment_id)
float_input = torch.FloatTensor(float_input)
target = torch.FloatTensor(targets)
dataset = TensorDataset(time_id, investment_id, float_input, target)

gc.collect()

In [None]:
class UbiquantRegressor(pl.LightningModule):
    """Feed-forward regressor on the float features plus an investment_id embedding.

    The investment_id is embedded into 11 floats, concatenated with the float
    features and pushed through a stack of Linear -> BatchNorm -> SiLU -> Dropout
    blocks ending in a single regression output. Training minimises the MSE;
    validation additionally logs ``mean_corr``, the mean over time_ids of the
    correlation between prediction and target.
    """

    def __init__(self):
        super().__init__()

        # Embedding of investment_id to 11 float features.
        # As the number of unseen investment_ids is unknown, a large margin is selected (10000).
        self.id_embedding = nn.Embedding(10000, 11)

        # credits to sahil112: https://www.kaggle.com/sahil112/whyonlykeras-easy-pytorch-competitive-dnn for this architecture
        # The first width (311) must equal the 11 embedding dims plus the number
        # of float features fed to forward().
        widths = [311, 64, 128, 256, 512, 256, 128, 8]
        modules = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            # nn.SiLU takes no slope argument; its only parameter is `inplace`,
            # so a stray numeric argument would silently enable in-place mode.
            modules += [nn.Linear(n_in, n_out),
                        nn.BatchNorm1d(n_out),
                        nn.SiLU(),
                        nn.Dropout(0.4)]
        modules.append(nn.Linear(widths[-1], 1))
        # Kept as one flat Sequential so parameter names stay `layers.<index>.*`.
        self.layers = nn.Sequential(*modules)

    def forward(self, time_id, investment_id, f_features):
        """Return the prediction for a batch.

        ``time_id`` is accepted but not used by the network (see the note below).
        """
        # Embedding of the investment_id
        invest_embedding = self.id_embedding(investment_id).squeeze(dim=1)
        # Concat embedding and features.
        # Open question: should the network have access to the time_id?
        # The final test set will consist of time_id never seen in the train set
        # Nevertheless, it can be easily added here...
        #dnn_input = torch.cat((invest_embedding, time_id, f_features), axis=-1)
        dnn_input = torch.cat((invest_embedding, f_features), dim=-1)
        return self.layers(dnn_input)

    def training_step(self, batch, batch_nb):
        """Compute and log the MSE loss for one training batch."""
        time_id, investment_id, float_input, target = batch

        out = self(time_id, investment_id, float_input)
        loss = F.mse_loss(out, target)

        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_nb):
        """Return the batch loss together with everything needed for the epoch metric."""
        time_id, investment_id, float_input, target = batch

        result = self(time_id, investment_id, float_input)
        loss = F.mse_loss(result, target)

        return {'val_loss': loss,
                'result': result,
                'target': target,
                'time_id': time_id,
                'investment_id': investment_id,
                }

    def validation_epoch_end(self, outputs):
        """Aggregate the validation batches and log ``val_loss`` and ``mean_corr``.

        ``mean_corr`` is the NaN-ignoring mean of the per-time_id correlation
        between prediction and target.
        """
        val_losses = [x['val_loss'] for x in outputs]

        # Flatten to 1-D numpy arrays once, so the per-time_id loop below does
        # not trigger a device-to-host copy on every iteration.
        result = torch.cat([x['result'] for x in outputs]).detach().cpu().numpy().reshape(-1)
        target = torch.cat([x['target'] for x in outputs]).detach().cpu().numpy().reshape(-1)
        time_ids = torch.cat([x['time_id'] for x in outputs]).detach().cpu().numpy().reshape(-1)

        corrs = []
        for t in np.unique(time_ids):
            mask = time_ids == t
            # corrcoef yields NaN for degenerate groups (e.g. a single row);
            # those are skipped by nanmean below.
            corrs.append(np.corrcoef(np.stack((result[mask], target[mask])))[0, 1])

        mean_corr = float(np.nanmean(corrs))
        epoch_loss = torch.stack(val_losses).mean()  # Combine losses

        self.log('val_loss', epoch_loss, prog_bar=True)
        self.log('mean_corr', mean_corr, prog_bar=True)

        return {'val_loss': epoch_loss,
                'corrs': mean_corr}

    def epoch_end(self, epoch, result):
        """Intentionally a no-op."""
        pass

    def test_step(self, batch, batch_nb):
        """Intentionally a no-op."""
        pass

    def configure_optimizers(self):
        """Use Adam with a learning rate of 0.001 on all parameters."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

class MetricTracker(Callback):
    """Callback that records ``val_loss`` and ``mean_corr`` after every validation epoch.

    The histories are kept as plain Python floats in ``val_losses`` and
    ``corrs``, one entry per validation epoch, in the order they were run.
    """

    def __init__(self):
        super().__init__()
        self.val_losses = []
        self.corrs = []

    def on_validation_epoch_end(self, trainer, module):
        """Append the metrics logged for the validation epoch that just finished."""
        # Read the values through the public callback_metrics mapping rather than
        # the trainer's private result collection and its internal key format.
        metrics = trainer.callback_metrics
        self.val_losses.append(float(metrics['val_loss']))
        self.corrs.append(float(metrics['mean_corr']))

In [None]:
# K-fold cross-validation: train one model per fold, keep the checkpoint with the
# best mean daily correlation, and plot the validation curves of each run.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True)
val_losses = []
mean_corrs = []
models = []  # A list of all final models
for index, (train_index, test_index) in enumerate(kf.split(dataset), start=1):
    print("CV run {}...".format(index))
    train_ds, val_ds = Subset(dataset, train_index), Subset(dataset, test_index)

    # Shuffle the training rows every epoch; otherwise each epoch would walk
    # through the training subset in exactly the same stored order.
    train_loader = DataLoader(train_ds, 32768, shuffle=True)
    val_loader = DataLoader(val_ds, 32768)

    uq_regressor = UbiquantRegressor()

    metricTracker = MetricTracker()
    trainer = pl.Trainer(gpus=1,
                         callbacks=[metricTracker,
                                    EarlyStopping(monitor="mean_corr", mode="max", patience=3),
                                    ModelCheckpoint(save_top_k=1, monitor="mean_corr", mode="max", save_on_train_epoch_end=False)],
                         max_epochs=21,
                         num_sanity_val_steps=0,)

    trainer.fit(uq_regressor, train_loader, val_loader)

    # Snapshot the training-time curves now: the validate() call below triggers
    # the tracker callback once more and would add an extra point to the plot.
    loss_history = list(metricTracker.val_losses)
    corr_history = list(metricTracker.corrs)

    # Load best model based on mean daily correlation with target.
    # load_from_checkpoint is a classmethod, so no throwaway instance is needed.
    uq_regressor = UbiquantRegressor.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    models.append(uq_regressor)

    # Show val results
    val_result = trainer.validate(model=uq_regressor, dataloaders=val_loader)
    val_losses.append(val_result[0]['val_loss'])
    mean_corrs.append(val_result[0]['mean_corr'])
    fig, ax = plt.subplots()
    ax.plot(loss_history, color="orange")
    ax.set_ylabel("Val loss", color="orange", fontsize=14)
    ax2 = ax.twinx()
    ax2.plot(corr_history, color="blue")
    ax2.set_ylabel("Mean daily corr 2 target", color="blue", fontsize=14)
    plt.show()

In [None]:
# The submission part
# `ubiquant` is imported here rather than with the other imports at the top —
# presumably because the module only exists inside the competition
# environment (TODO confirm).
import ubiquant
# Create the competition environment object; predictions are handed back to it
# through env.predict() in the inference loop.
env = ubiquant.make_env()
# Iterable consumed by the inference loop; it yields
# (test_df, sample_prediction_df) pairs.
iter_test = env.iter_test() 

In [None]:
# Ensemble inference: every batch from the competition iterator is scored by all
# cross-validation models and the predictions are averaged.
if not models:
    raise RuntimeError("No trained models available for inference")

# Switch every model to evaluation mode once, before the first batch
# (disables dropout and uses the BatchNorm running statistics).
for uq_regressor in models:
    uq_regressor.eval()

for (test_df, sample_prediction_df) in iter_test:
    # The time_id is the part of row_id before the first underscore.
    time_id = test_df.row_id.str.split("_", expand=True)[0].values.astype(int)
    investment_id = test_df[['investment_id']].values.astype(int)
    float_input = test_df[float_feature_names].values

    float_input = torch.FloatTensor(float_input)
    investment_id = torch.LongTensor(investment_id)
    time_id = torch.LongTensor(time_id).unsqueeze(-1)

    ensemble_sum = torch.zeros(len(test_df))
    with torch.no_grad():
        for uq_regressor in models:
            predictions = uq_regressor(time_id, investment_id, float_input)
            # reshape(-1) keeps a 1-D vector even when the batch has one row.
            ensemble_sum += predictions.reshape(-1).cpu()

    # Average over the models actually present in the ensemble.
    sample_prediction_df['target'] = (ensemble_sum / len(models)).numpy()
    env.predict(sample_prediction_df)
    display(sample_prediction_df)