<center><h3 style='color:red'>Optiver Realized Volatility PyTorch Baseline</h3><br>KASSEM@ELCAISERI<HR></center>

In this notebook:
* A simple PyTorch NN starter using stock Embedding.


Credits to:
* https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data
* https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/250324
* https://www.kaggle.com/lucasmorin/tf-keras-nn-with-stock-embedding

**I hope it will be useful for beginners. By creating new variables you can easily improve this model.**


## updates:
**V4**
* Use RMSELoss as the loss function
* add scheduler
* test the results.

**V6**
* Change model hyperparameters {emb_size=29, emb_drop_out=0.25}
* Train for 5 folds && Test and submit for the 5 folds

**V8**
* Implement 'SWISH', a self-gated activation function
* Join the main DataFrame with the normalized data

In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from tqdm import tqdm
import random

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
import seaborn as sns

# Input locations. Both names currently point at the same competition directory.
path_root = '../input/optiver-realized-volatility-prediction'
path_data = '../input/optiver-realized-volatility-prediction'
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
path_submissions = '/'

# Name of the label column in train.csv.
target_name = 'target'
# NOTE(review): initialised empty and never filled in the visible cells.
scores_folds = {}

# When True, get_dataSet only processes the first 10 training stocks.
DEBUG = False

In [None]:
def log_return(list_stock_prices):
    """Return consecutive differences of the natural log of a price series.

    The input must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series). The first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Each error is divided by the matching ``y_true`` value before squaring,
    so zeros in ``y_true`` produce inf/NaN.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)


def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build realized-volatility features for a single stock.

    Reads the book and trade parquet partitions of ``stock_id`` under
    ``path_data`` and returns one row per ``(stock_id, time_id)`` with:

    * ``log_return1`` / ``log_return2`` - realized volatility of the log
      returns of the level-1 / level-2 weighted average price (WAP);
    * ``trade_log_return1`` - realized volatility of trade-price log returns.

    Parameters
    ----------
    stock_id : int
        Stock whose partition directories are read.
    dataType : str
        Suffix of the parquet folders (``book_<dataType>.parquet`` and
        ``trade_<dataType>.parquet``).

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id``, ``time_id``, ``log_return1``, ``log_return2``,
        ``trade_log_return1``. Book buckets with no matching trade rows get
        ``-999`` in the trade column.
    """
    key = ['stock_id', 'time_id', 'seconds_in_bucket']
    order = ['time_id', 'seconds_in_bucket']

    # Book features
    df_book = pd.read_parquet(os.path.join(path_data, 'book_{}.parquet/stock_id={}/'.format(dataType, stock_id)))
    # Log returns are differences of consecutive rows, so enforce time order
    # explicitly (the trade frame below is handled the same way).
    df_book = df_book.sort_values(by=order).reset_index(drop=True)
    df_book['stock_id'] = stock_id
    cols = key + [col for col in df_book.columns if col not in key]
    df_book = df_book[cols]

    for level in (1, 2):
        bid_price = df_book['bid_price{}'.format(level)]
        ask_price = df_book['ask_price{}'.format(level)]
        bid_size = df_book['bid_size{}'.format(level)]
        ask_size = df_book['ask_size{}'.format(level)]
        # WAP: each side's price is weighted by the opposite side's size.
        df_book['wap{}'.format(level)] = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)
        # transform() returns a result aligned to df_book's own index, which
        # keeps the column assignment valid regardless of how the installed
        # pandas version indexes the output of groupby().apply().
        df_book['log_return{}'.format(level)] = (
            df_book.groupby(by=['time_id'])['wap{}'.format(level)].transform(log_return).fillna(0)
        )

    features_to_apply_realized_volatility = ['log_return' + str(i + 1) for i in range(2)]
    stock_stat = df_book.groupby(by=['stock_id', 'time_id'])[features_to_apply_realized_volatility]\
                        .agg(realized_volatility).reset_index()

    # Trade features
    trade_stat = pd.read_parquet(os.path.join(path_data, 'trade_{}.parquet/stock_id={}'.format(dataType, stock_id)))
    trade_stat = trade_stat.sort_values(by=order).reset_index(drop=True)
    trade_stat['stock_id'] = stock_id
    cols = key + [col for col in trade_stat.columns if col not in key]
    trade_stat = trade_stat[cols]
    trade_stat['trade_log_return1'] = trade_stat.groupby(by=['time_id'])['price'].transform(log_return).fillna(0)
    trade_stat = trade_stat.groupby(by=['stock_id', 'time_id'])[['trade_log_return1']]\
                           .agg(realized_volatility).reset_index()

    # Joining book and trade features; -999 marks buckets without trades.
    stock_stat = stock_stat.merge(trade_stat, on=['stock_id', 'time_id'], how='left').fillna(-999)

    return stock_stat

def get_dataSet(stock_ids : list, dataType = 'train'):
    """Compute per-stock features in parallel and stack them into one frame.

    When the module-level ``DEBUG`` flag is set and ``dataType`` is
    ``'train'``, only the first 10 entries of ``stock_ids`` are processed.
    Returns the row-wise concatenation of every ``get_stock_stat`` result
    with a fresh index.
    """
    debug_subset = DEBUG and dataType == 'train'
    selected_ids = stock_ids[:10] if debug_subset else stock_ids

    # One job per stock, spread over all available cores.
    jobs = (delayed(get_stock_stat)(sid, dataType) for sid in selected_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)

    return pd.concat(per_stock_frames, ignore_index = True)

def feval_RMSPE(preds, train_data):
    """Evaluation callback returning ``('RMSPE', score, False)``.

    ``train_data`` must expose ``get_label()``; the score is the RMSPE of
    ``preds`` against those labels, rounded to 5 decimals.
    NOTE(review): the 3-tuple looks like a LightGBM custom-metric result
    (name, value, is_higher_better) - confirm; nothing in the visible
    notebook calls this function.
    """
    y_true = train_data.get_label()
    score = round(rmspe(y_true = y_true, y_pred = preds), 5)
    return 'RMSPE', score, False


## Train and test datasets

In [None]:
# Load the training rows and attach the per-(stock_id, time_id) features.
train = pd.read_csv(os.path.join(path_data, 'train.csv'))
# %time reports how long feature extraction over all training stocks takes.
%time train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), dataType = 'train')
# Left join keeps every row of train.csv; rows without features are filled with 0.
train = pd.merge(train, train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left').fillna(0)
print('Train shape: {}'.format(train.shape))
display(train.head(2))

# Same pipeline for the test rows.
test = pd.read_csv(os.path.join(path_data, 'test.csv'))
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')
test = pd.merge(test, test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left').fillna(0)
print('Test shape: {}'.format(test.shape))
display(test.head())

## Data Normalize

In [None]:
# Scale every column from position 3 onward into [-1, 1]; the scaler is
# fitted on train only and then reused for test.
# NOTE(review): presumably columns 0-2 are stock_id/time_id/target in train and
# stock_id/time_id/row_id in test, so the slice selects the same feature
# columns in both frames - confirm the column order.
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

train_scaled = scaler.fit_transform(train.iloc[:, 3:])
# The scaled values are appended (integer column names 0, 1, ...) next to the
# raw features, so downstream code sees both raw and scaled columns.
train_ = train.join(pd.DataFrame(train_scaled), how='left')

test_scaled = scaler.transform(test.iloc[:, 3:])
test_ = test.join(pd.DataFrame(test_scaled), how='left')


# NOTE(review): rebinding train/test makes this cell non-idempotent - a second
# run would see the already-appended columns in the iloc slice. Re-run the
# loading cell first.
train, test = train_, test_
train.shape, test.shape

## Apply Torch

In [None]:
## import libraries

#PyTorch 

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    and switches cuDNN to deterministic behaviour.

    Parameters
    ----------
    seed : int
        Seed shared by all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # affects processes spawned after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU: CUDA seeding is deferred until CUDA is used.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms from
    # run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

## Torch DATASET

In [None]:
class OptiveDataset(Dataset):
    """Torch dataset yielding ``((categorical, numerical), target)`` samples.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; ``emb_cols`` are the categorical columns, every other
        column is treated as numerical.
    Y : array-like
        Targets, one per row of ``X`` and in the same row order.
    emb_cols : list of str
        Names of the categorical columns that feed the embedding layer.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` have different lengths.
    """
    def __init__(self, X, Y, emb_cols=['stock_id', 'time_id']):
        emb_cols = list(emb_cols)
        # astype() already returns fresh arrays, so the caller's frame is never aliased.
        self.X1 = X.loc[:, emb_cols].values.astype(np.int64) #categorical columns
        self.X2 = X.drop(columns=emb_cols).values.astype(np.float32) #numerical columns
        # Stored as a plain array so __getitem__ is positional; indexing a
        # pandas Series with an int is label-based and breaks (or silently
        # mis-aligns) whenever the index is not 0..n-1.
        self.y = np.asarray(Y)
        if len(self.y) != len(self.X1):
            raise ValueError(
                'X and Y must have the same length, got {} and {}'.format(len(self.X1), len(self.y))
            )

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return (self.X1[idx], self.X2[idx]), self.y[idx]
    
class OptiveDatasetTest(Dataset):
    """Label-free dataset yielding ``(categorical, numerical)`` pairs.

    ``emb_cols`` names the categorical columns of ``X``; all remaining
    columns are treated as numerical features.
    """
    def __init__(self, X, emb_cols=['stock_id', 'time_id']):
        frame = X.copy()
        categorical = frame.loc[:, emb_cols]
        numerical = frame.drop(columns=emb_cols)
        self.X1 = categorical.copy().values.astype(np.int64)    # categorical columns
        self.X2 = numerical.copy().values.astype(np.float32)    # numerical columns

    def __len__(self):
        return self.X1.shape[0]

    def __getitem__(self, idx):
        cat_row = self.X1[idx]
        num_row = self.X2[idx]
        return (cat_row, num_row)

In [None]:
# Smoke test: build a tiny loader over the full training frame and inspect the
# shapes of a single batch. 'target' and 'time_id' are dropped from the
# features; 'stock_id' is the only embedded column.
train_dataset = OptiveDataset(train.drop(['target', 'time_id'], axis=1), train['target'], emb_cols=['stock_id'])
train_dl = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Pull exactly one batch and print its shapes. The loop variables
# (emb, count, target) stay bound afterwards and are reused by the model
# smoke-test cell further down.
for (emb, count), target in train_dl:
    print((emb.shape, count.shape), target.shape)
    break;

## Model

In [None]:
# from https://discuss.pytorch.org/t/implementation-of-swish-a-self-gated-activation-function/8813/2
def swish(x):
    """Swish activation: the input multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate

In [None]:
class OptiverModel(nn.Module):
    """MLP regressor combining a stock embedding with continuous features.

    Parameters
    ----------
    embedding_sizes : int
        Dimension of the stock embedding vector.
    num_embeddings : int or None
        Number of rows in the embedding table. ``None`` (the default) resolves
        to ``max(train['stock_id']) + 1`` from the notebook-level ``train``
        frame when the model is instantiated, so the class can be defined
        before ``train`` exists.
    num_cont : int
        Number of continuous input features (width of ``x_cont``). Defaults
        to 6, the value that was previously hard-coded.

    Layer attribute names are unchanged, so checkpoints saved with
    ``state_dict()`` remain loadable.
    """
    def __init__(self, embedding_sizes=16, num_embeddings=None, num_cont=6):
        super().__init__()
        if num_embeddings is None:
            # Resolved lazily instead of at class-definition time.
            num_embeddings = int(max(train['stock_id'])) + 1

        self.emb = nn.Embedding(num_embeddings, embedding_sizes)
        # Kept for parity with earlier versions; currently not applied in forward().
        self.emb_drop = nn.Dropout(0.25)

        self.bn1 = nn.BatchNorm1d(num_cont)
        self.lin1 = nn.Linear(embedding_sizes + num_cont, 32)
        self.lin2 = nn.Linear(32, 128)
        self.lin3 = nn.Linear(128, 64)
        self.lin4 = nn.Linear(64, 32)
        self.lin5 = nn.Linear(32, 1)

    def forward(self, x_cat, x_cont):
        """Return predictions of shape ``(batch, 1)``.

        ``x_cat`` holds the integer stock ids, shape ``(batch, 1)`` or
        ``(batch,)``; ``x_cont`` holds the continuous features, shape
        ``(batch, num_cont)``.
        """
        x1 = self.emb(x_cat)
        # Flatten everything except the batch axis: (batch, 1, emb) -> (batch, emb).
        # Unlike flatten(end_dim=1), this never merges the batch axis into
        # another dimension.
        x1 = x1.flatten(start_dim=1)
        #x1 = self.emb_drop(x1)

        x2 = self.bn1(x_cont)

        x = torch.cat([x1, x2], 1)
        x = swish(self.lin1(x))
        x = swish(self.lin2(x))
        x = swish(self.lin3(x))
        x = swish(self.lin4(x))
        # Linear output: the regression target is left unbounded.
        x = self.lin5(x)

        return x

In [None]:
# Smoke test: run one forward pass on the batch left over from the dataset
# smoke-test cell (emb, count, target are that loop's variables, so that cell
# must have been executed first).
model = OptiverModel(embedding_sizes=24,)
#emb.shape, count.shape
out = model(emb, count)

print(out, target)
#model
#model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def RMSELoss(yhat,y):
    """Root mean squared error between predictions ``yhat`` and targets ``y``."""
    residual = yhat - y
    mean_squared = torch.mean(residual ** 2)
    return torch.sqrt(mean_squared)

def RMSPELoss(y_pred, y_true):
    """Root mean squared percentage error; each error is divided by ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = torch.mean(relative_error ** 2)
    return torch.sqrt(mean_squared)

def train_epoch(train_dl, valid_dl, model, loss_fn, opt, sch, epoch, fold, device=device):
    """Run one training epoch followed by one validation pass.

    Parameters
    ----------
    train_dl, valid_dl : DataLoader
        Loaders yielding ``((categorical, continuous), target)`` batches.
    model : nn.Module
        Model called as ``model(categorical, continuous)``.
    loss_fn : callable
        ``loss_fn(predictions, targets)`` returning a scalar tensor.
    opt : torch.optim.Optimizer
        Optimizer stepped once per training batch.
    sch : scheduler
        Metric-driven scheduler (``sch.step(metric)``); stepped exactly once
        per epoch with the validation loss.
    epoch, fold : int
        Only used in the progress message.
    device : torch.device
        Device the batches are moved to.

    Returns
    -------
    tuple
        ``(model, epoch_loss, valid_epoch_loss)`` where ``epoch_loss`` is the
        mean training batch loss and ``valid_epoch_loss`` is the
        sample-weighted mean validation loss.
    """
    # Training loop
    model.train()
    running_loss_ = 0

    pbar = tqdm(enumerate(train_dl), total=len(train_dl))
    for i, ((cats, counts), targets) in pbar:
        cats, counts, targets = cats.to(device), counts.to(device), targets.unsqueeze(1).to(device)

        opt.zero_grad()
        y_pred = model(cats, counts)
        loss = loss_fn(y_pred.float(), targets.float())

        loss.backward()
        opt.step()

        running_loss_ += loss.item()
        if (i+1) % 100 == 0:
            pbar.set_description(f"running loss:{running_loss_ / (i+1): 0.6f}")

    epoch_loss = running_loss_ / max(len(train_dl), 1)

    # Validation loop
    model.eval()
    valid_loss = 0
    n_valid = 0

    with torch.no_grad():
        for (cats, counts), targets in valid_dl:
            cats, counts, targets = cats.to(device), counts.to(device), targets.unsqueeze(1).to(device)

            y_pred = model(cats, counts)
            val_loss = loss_fn(y_pred.float(), targets.float())

            batch_n = targets.shape[0]
            valid_loss += val_loss.item() * batch_n
            n_valid += batch_n

    # Batch losses are weighted by sample count, so normalise by the number of
    # samples; dividing by the number of batches would inflate the value by
    # roughly the batch size.
    valid_epoch_loss = valid_loss / max(n_valid, 1)

    # Step the scheduler once per epoch on a single, consistent metric.
    # Stepping it on both the last training batch loss and the validation
    # loss made its patience counter advance twice per epoch on unrelated
    # quantities.
    sch.step(valid_epoch_loss)

    print(f'==>FOLD:{fold}, Epoch {epoch} VALID loss: {valid_epoch_loss:.8f}')

    # Leave the model in training mode for the caller.
    model.train()
    return model, epoch_loss, valid_epoch_loss

In [None]:
def perpare_dataset(train, valid, test=None, batch_size=128, drop_cols=['target', 'time_id'], emb_cols=['stock_id']):
    """Wrap the train and validation frames into shuffled DataLoaders.

    ``drop_cols`` are removed from the features, ``emb_cols`` are the
    categorical columns and the ``'target'`` column supplies the labels.
    ``test`` is accepted but not used. Returns ``(train_dl, valid_dl)``.
    """
    def _as_loader(frame):
        # Build the dataset for one split and wrap it in a shuffling loader.
        features = frame.drop(drop_cols, axis=1)
        dataset = OptiveDataset(features, frame['target'], emb_cols=emb_cols)
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    train_dl = _as_loader(train)
    valid_dl = _as_loader(valid)
    return train_dl, valid_dl

In [None]:
# K-fold training: one freshly initialised model per fold, at most `epochs`
# epochs each, keeping the checkpoint with the lowest validation loss.
n_folds = 5
epochs = 25

kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=2021)
# Seeded once before the loop, so later folds continue the same RNG stream.
seed_everything(42)

for fold_idx, (dev_index, val_index) in enumerate(kf.split(range(len(train)))):
        
    # Indices are reset so each fold frame has a 0..n-1 index.
    # NOTE(review): `train_` rebinds the name used in the normalisation cell.
    train_ = train.loc[dev_index,].reset_index(drop=True)
    valid_ = train.loc[val_index, ].reset_index(drop=True)
    
    train_dl, valid_dl = perpare_dataset(train_, valid_)
    
    model = OptiverModel(embedding_sizes=24,).to(device)
    #loss_fn = nn.MSELoss().to(device)
    loss_fn = RMSELoss
    
    opt = optim.Adam(model.parameters(), lr=0.001)
    # Multiply the learning rate by 0.2 when the monitored metric plateaus.
    sch = optim.lr_scheduler.ReduceLROnPlateau(opt, factor=0.2, patience=4)
    
    bst_loss = np.inf
    counter = 0
    for epoch in range(epochs):
        model, epoch_loss, valid_epoch_loss = train_epoch(train_dl, valid_dl, 
                                                                   model, loss_fn, opt, 
                                                                   sch, epoch, fold_idx, device=device)
        
        # simple early stop: count consecutive epochs that are worse than the
        # best so far; any other epoch resets the counter and saves a checkpoint
        if bst_loss < valid_epoch_loss:
            counter += 1
        else:
            bst_loss = valid_epoch_loss
            bst_epoch = epoch
            counter = 0
            torch.save(model.state_dict(), f'FOLD{fold_idx}_optive_model.pth')

        
        # stop once more than 7 consecutive epochs failed to beat the best loss
        if counter > 7:
            break
        
    print(f'FOLD: {fold_idx}, BEST EPOCH: {bst_epoch}, BEST LOSS: {bst_loss}')

# Test and prediction

In [None]:
# Ensemble inference: average the predictions of every saved fold checkpoint.
test_dataset = OptiveDatasetTest(test.drop(['row_id', 'time_id'], axis=1), emb_cols=['stock_id'])
# The model runs in eval mode (BatchNorm uses its running statistics), so each
# row's prediction does not depend on its batch; batching only speeds up inference.
test_dl = DataLoader(test_dataset, batch_size=256, shuffle=False)

test_preds = []
# Sorted so the ensembling order is reproducible across runs.
model_paths = sorted(glob.glob('./*.pth'))
if not model_paths:
    # Averaging an empty list would silently produce a NaN submission.
    raise FileNotFoundError("No '*.pth' checkpoints found in the working directory; run the training cell first.")

cpu_device = torch.device('cpu')
for model_path in model_paths:
    # map_location lets checkpoints saved from a GPU run load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=cpu_device))
    model.to(cpu_device)
    model.eval()

    fold_preds = []
    with torch.no_grad():
        for x_cat, x_cont in test_dl:
            # Model output is (batch, 1); keep the single prediction column.
            fold_preds.append(model(x_cat, x_cont).cpu().numpy()[:, 0])
    test_preds.append(np.concatenate(fold_preds))

# Mean over checkpoints -> one prediction per test row.
y_preds = np.mean(test_preds, axis=0)

In [None]:
y_preds

In [None]:
# Attach the averaged predictions to a copy of the test frame and write the
# two submission columns to submission.csv in the working directory.
test__ = test.copy()
test__['target'] = y_preds
test__[['row_id', 'target']].to_csv('submission.csv',index = False)

In [None]:
test__[['row_id', 'target']]

<center><h3><span style='color:red'>UPVOTE</span> if you find it interesting</h3><hr>
Notebook still under modification </center>