In [None]:
import warnings
warnings.simplefilter('ignore')

import gc

import numpy as np
import pandas as pd

# Show every column when displaying a DataFrame. The fully qualified key is
# required: the short form 'max_columns' also matches
# 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option('display.max_columns', None)
from tqdm.notebook import tqdm

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold

import joblib

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
%%time

# Location of the competition training table.
TRAIN_CSV_PATH = '../input/ubiquant-market-prediction/train.csv'

# Read the full table, report its (rows, columns) and preview the first rows.
train = pd.read_csv(TRAIN_CSV_PATH)
print(train.shape)
train.head()

In [None]:
%%time

# Names of the 300 feature columns f_0 ... f_299.
feature_cols = [f'f_{i}' for i in range(300)]

# Rescale every feature column into [-1, 1]; the fitted scaler is kept in
# `scaler` so it can be persisted and reused later.
# NOTE(review): the scaler is fitted on the full frame before any CV split.
scaler = MinMaxScaler(feature_range=(-1, 1))
train[feature_cols] = scaler.fit_transform(train[feature_cols])

train.head()

In [None]:
class UBIQUANT_DATASET(Dataset):
    """Map-style dataset yielding (investment_id, features[, target]) per row.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain an ``investment_id`` column and, unless ``mode == 'test'``,
        a ``target`` column.
    mode : str
        ``'test'`` omits targets; any other value includes them.
    feature_cols : sequence of str, optional
        Columns to use as features. When omitted, every column whose name
        starts with ``'f_'`` is used; if there is none, the positional slice
        ``iloc[:, 4:]`` is used as before.

    Items
    -----
    ``(id, vals, target)`` in non-test mode, ``(id, vals)`` in test mode, where
    ``id`` is an int64 scalar, ``vals`` a float64 vector and ``target`` a
    float64 scalar.
    """

    def __init__(self, df_data, mode='train', feature_cols=None):
        self.mode = mode

        if feature_cols is None:
            # Selecting by name keeps the feature set identical whether or not
            # a 'target' column precedes the features (a purely positional
            # slice drops the first feature when 'target' is absent).
            detected = [c for c in df_data.columns if str(c).startswith('f_')]
            features = df_data[detected] if detected else df_data.iloc[:, 4:]
        else:
            features = df_data[list(feature_cols)]

        self.ids = df_data['investment_id'].to_numpy(dtype=np.int64)
        # Convert directly to a NumPy matrix (no intermediate Python lists) and
        # force row-major layout so fetching one sample reads contiguous memory.
        self.vals = np.ascontiguousarray(features.to_numpy(dtype=np.float64))
        if self.mode != 'test':
            self.targets = df_data['target'].to_numpy(dtype=np.float64)
        self.len = df_data.shape[0]

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.len

    def __getitem__(self, index):
        """Return the sample at ``index``; targets are included unless in test mode."""
        ids_out = self.ids[index]
        vals_out = self.vals[index]
        if self.mode != 'test':
            return ids_out, vals_out, self.targets[index]
        return ids_out, vals_out

In [None]:
# Copied from: https://www.kaggle.com/elcaiseri/pytorch-optiver-realized-volatility-baseline

def swish(x):
    """Swish activation: each element of ``x`` multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate


class SimpleMLP(nn.Module):
    """MLP regressor over an id embedding concatenated with continuous features.

    Parameters
    ----------
    num_ids : int
        Number of rows in the id embedding table; every id passed to
        ``forward`` must be in ``[0, num_ids)``.
    emb_dim : int
        Width of the id embedding.
    num_features : int
        Number of continuous input features.

    The defaults (3774, 64, 300) reproduce the original fixed architecture,
    so ``SimpleMLP()`` builds the same layers in the same order.
    """

    def __init__(self, num_ids=3774, emb_dim=64, num_features=300):
        super().__init__()
        self.emb = nn.Embedding(num_ids, emb_dim)
        self.emb_drop = nn.Dropout(0.1)

        self.bn1 = nn.BatchNorm1d(num_features)
        self.lin1 = nn.Linear(emb_dim + num_features, 32)
        self.lin2 = nn.Linear(32, 128)
        self.lin3 = nn.Linear(128, 64)
        self.lin4 = nn.Linear(64, 32)
        # NOTE(review): this dropout is registered but never applied in
        # forward(); it is kept so the module's attributes are unchanged.
        self.lin_drop = nn.Dropout(0.25)
        self.lin5 = nn.Linear(32, 1)

    def forward(self, x_cat, x_cont):
        """Predict one value per row.

        ``x_cat`` holds integer ids (one per row) and ``x_cont`` the continuous
        features with ``num_features`` columns; the result has one column.
        """
        # Embed the ids and regularise the embedding.
        x1 = self.emb(x_cat)
        x1 = self.emb_drop(x1)

        # Normalise the continuous features per batch.
        x2 = self.bn1(x_cont)

        # Concatenate along the feature axis and run the MLP stack.
        x = torch.cat([x1, x2], 1)
        x = swish(self.lin1(x))
        x = swish(self.lin2(x))
        x = swish(self.lin3(x))
        x = swish(self.lin4(x))
        x = self.lin5(x)

        return x
    
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
def _train_one_epoch(model, loader, loss_fn, optimizer):
    """Run one optimisation pass over ``loader``; return the per-sample mean loss."""
    model.train()
    loss_sum, n_seen = 0.0, 0
    for ids, vals, targets in loader:
        ids = ids.to(device)
        vals = vals.to(device=device, dtype=torch.float)
        targets = targets.unsqueeze(1).to(device, dtype=torch.float)

        yhat = model(ids, vals)
        loss = loss_fn(yhat, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss_fn averages over the batch, so re-weight by the batch size to
        # keep a short final batch from being over-counted.
        batch_size = targets.size(0)
        loss_sum += loss.item() * batch_size
        n_seen += batch_size
    return loss_sum / max(n_seen, 1)


def _run_validation(model, loader, loss_fn):
    """Evaluate ``model`` on ``loader``; return (mean loss, predictions, targets)."""
    model.eval()
    loss_sum, n_seen = 0.0, 0
    preds, actuals = [], []
    with torch.no_grad():
        for ids, vals, targets in loader:
            # Keep the untouched targets so they stay aligned with the
            # predictions whatever order the loader yields rows in.
            actuals.append(targets.detach().cpu().numpy().ravel())

            ids = ids.to(device)
            vals = vals.to(device=device, dtype=torch.float)
            targets = targets.unsqueeze(1).to(device, dtype=torch.float)

            yhat = model(ids, vals)
            batch_size = targets.size(0)
            loss_sum += loss_fn(yhat, targets).item() * batch_size
            n_seen += batch_size
            preds.append(yhat.detach().cpu().numpy().ravel())

    preds = np.concatenate(preds) if preds else np.empty(0, dtype=np.float32)
    actuals = np.concatenate(actuals) if actuals else np.empty(0, dtype=np.float64)
    return loss_sum / max(n_seen, 1), preds, actuals


def train_fn(dataloaders, fold_id, epochs=8, lr=1e-3, valid_index=None):
    """Train a ``SimpleMLP`` for one fold and return its loss history and OOF frame.

    Uses the module-level ``device`` and ``SimpleMLP``. The model is optimised
    with Adam on an MSE loss; the learning rate is cut by 10x by
    ``ReduceLROnPlateau`` (patience 1) on the validation loss. Whenever the
    validation loss improves, the weights are written to
    ``simple_mlp_model_{fold_id}.pth``.

    Parameters
    ----------
    dataloaders : dict
        ``{'train': DataLoader, 'valid': DataLoader}``; each batch must be
        ``(ids, vals, targets)``.
    fold_id : int
        Used only in the checkpoint file name.
    epochs : int
        Number of passes over the training loader.
    lr : float
        Initial Adam learning rate.
    valid_index : optional
        Index to attach to the returned OOF frame (e.g. the validation frame's
        index). When omitted the frame gets a default RangeIndex.

    Returns
    -------
    losses : list of (train_loss, valid_loss) per epoch (per-sample means)
    oof : pandas.DataFrame with columns ``target`` and ``pred`` taken from the
        epoch with the lowest validation loss, i.e. the epoch whose weights
        were saved. If no epoch was ever saved (for example the validation
        loss was always NaN), the last epoch's frame is returned.
    """
    model = SimpleMLP().to(device)
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.1,
                                                     patience=1,
                                                     mode='min')

    losses = []
    best_loss = np.inf
    oof, best_oof = None, None

    for e in range(epochs):
        train_epoch_loss = _train_one_epoch(model, dataloaders['train'], loss_fn, optimizer)
        valid_epoch_loss, valid_preds, valid_targets = _run_validation(
            model, dataloaders['valid'], loss_fn)

        # change lr
        scheduler.step(valid_epoch_loss)

        # oof: built from the targets the loader actually produced, so no
        # global validation frame is needed.
        oof = pd.DataFrame({'target': valid_targets, 'pred': valid_preds},
                           index=valid_index)
        score = oof['pred'].corr(oof['target'])

        # print score
        print(f"Epoch {e}, LR: {optimizer.param_groups[0]['lr']}")
        print(f"train loss: {train_epoch_loss:.8f}, valid loss {valid_epoch_loss:.8f}, pearson score: {score:.6f}")
        losses.append((train_epoch_loss, valid_epoch_loss))

        # save model, and remember the predictions belonging to that checkpoint
        if best_loss > valid_epoch_loss:
            torch.save(model.state_dict(), f'simple_mlp_model_{fold_id}.pth')
            print(f'-- loss from {best_loss:.8f} to {valid_epoch_loss:.8f}, model saved')
            best_loss = valid_epoch_loss
            best_oof = oof
        print()

    return losses, (best_oof if best_oof is not None else oof)

In [None]:
# One out-of-fold prediction frame is collected per fold.
oof_list = []

# Split by time_id so every row of a given time step lands in the same fold.
kfold = GroupKFold(n_splits=5)
fold_splits = kfold.split(train, train['target'], train['time_id'])

for fold_id, (trn_idx, val_idx) in enumerate(fold_splits):

    print(f'Training Fold: {fold_id}\n')

    df_train, df_valid = train.iloc[trn_idx], train.iloc[val_idx]

    train_set = UBIQUANT_DATASET(df_train, mode='train')
    valid_set = UBIQUANT_DATASET(df_valid, mode='valid')

    # Settings shared by both loaders; only the training loader is shuffled.
    loader_kwargs = dict(batch_size=1024, num_workers=4, pin_memory=True)
    dataloaders = {
        'train': DataLoader(train_set, shuffle=True, **loader_kwargs),
        'valid': DataLoader(valid_set, shuffle=False, **loader_kwargs),
    }

    _, oof = train_fn(dataloaders, fold_id)
    oof_list.append(oof)

In [None]:
# Stack the per-fold frames and report the Pearson correlation between the
# out-of-fold predictions and the targets.
oof = pd.concat(oof_list)
oof_pearson = oof['pred'].corr(oof['target'])
print('oof pearson score:', oof_pearson)

In [None]:
# Persist the fitted scaler so the same feature scaling can be re-applied later.
joblib.dump(value=scaler, filename='minmaxscaler.pkl')