## Skeleton for training and inference with pytorch-lightning

This notebook shows how PyTorch Lightning can be used to implement the whole processing pipeline, from training through inference to submission, in just a few lines of code.

Various code snippets are taken from <https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data> and <https://www.kaggle.com/gunesevitan/optiver-realized-volatility-prediction-1d-cnn>. Many thanks!


In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from pathlib import Path
import pyarrow.parquet as pq
from sklearn.metrics import r2_score

# Location of the input data set; the same directory is handed to
# OptiverDataModule further down in this notebook.
data_path=Path('../input/optiver-realized-volatility-prediction')

In [None]:
def ffill(data_df):
    """Expand a book snapshot frame to one row per second, forward-filling gaps.

    The frame is re-indexed on ``seconds_in_bucket`` over the integer range
    0..599; seconds that have no row of their own take the values of the most
    recent earlier row. ``seconds_in_bucket`` is returned as a regular column
    again.
    """
    every_second = np.arange(0, 600)
    by_second = data_df.set_index(['seconds_in_bucket'])
    filled = by_second.reindex(every_second, method='ffill')
    return filled.reset_index()

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The argument must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    Each error is divided by the corresponding true value, so a zero in
    ``y_true`` yields a non-finite result.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)


In [None]:
from torch.utils.data import Dataset
import pyarrow.parquet as pq
 
class OptiverDataset(Dataset):
    """Map-style dataset with one sample per (stock_id, time_id) group of the order book.

    ``__init__`` reads ``{mode}.csv`` and ``book_{mode}.parquet`` from
    ``data_path`` and groups both by ``stock_id`` and ``time_id``. The sample
    order follows the group keys found in the book data.

    Each item is a pair ``(x, y)`` in "train" mode or ``(x, row_id)`` in
    "test" mode, where ``x`` is either ``transform(book_group)`` or, without a
    transform, the float32 array of the four level-1/level-2 bid/ask price
    columns.
    """

    def __init__(self, data_path, mode = "train", transform = None, ffill = False):
        """Load and group the label and book tables.

        Args:
            data_path: directory (``pathlib.Path``) holding the input files.
            mode: "train" or "test"; selects which files are read and what
                the second element of each item is.
            transform: optional callable applied to the book group DataFrame.
            ffill: when true, each book group is passed through the
                module-level ``ffill`` helper before feature extraction.
        """
        super().__init__()
        self.mode, self.transform, self.ffill = mode, transform, ffill

        group_keys = ['stock_id', 'time_id']

        labels_df = pd.read_csv(data_path / f"{mode}.csv")
        self.train_grouped = labels_df.groupby(group_keys)

        book_df = pq.read_table(data_path / f"book_{mode}.parquet").to_pandas()
        self.book_grouped = book_df.groupby(group_keys)

        # Only groups that have book data are addressable by index.
        self.indices = list(self.book_grouped.indices.keys())

    def __getitem__(self, idx):
        """Return the features and the target (or row id) of the idx-th book group."""
        key = self.indices[idx]

        book = self.book_grouped.get_group(key)
        if self.ffill:
            book = ffill(book)

        if self.transform:
            features = self.transform(book)
        else:
            price_columns = ['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2']
            features = book[price_columns].to_numpy(np.float32)

        # The same (stock_id, time_id) key selects the matching label rows.
        label_rows = self.train_grouped.get_group(key)
        if self.mode == "test":
            return features, label_rows['row_id'].values[0]
        return features, label_rows['target'].to_numpy(np.float32)

    def __len__(self):
        """Number of (stock_id, time_id) groups present in the book data."""
        return len(self.indices)
    

In [None]:
from torch.utils.data import DataLoader, random_split

class OptiverDataModule(pl.LightningDataModule):
    """LightningDataModule providing train/validation/test loaders over ``OptiverDataset``.

    The "train" dataset is split once into a training and a validation subset
    using a seeded generator, so the split is reproducible. The test loader
    builds a fresh "test" dataset and yields batches of size 1.

    Args:
        data_path: directory handed to ``OptiverDataset``.
        train_batch_size: batch size of the (shuffled) training loader.
        val_batch_size: batch size of the validation loader.
        transform: optional callable forwarded to ``OptiverDataset``.
        num_workers: worker processes used by every loader.
        train_fraction: share of the samples assigned to the training subset;
            the remainder becomes the validation subset.
        split_seed: seed of the generator that drives the random split.

    Raises:
        ValueError: if ``train_fraction`` is not strictly between 0 and 1.
    """

    def __init__(self, data_path, train_batch_size = 32, val_batch_size = 32, transform=None,
                 num_workers = 4, train_fraction = 0.8, split_seed = 42):
        super().__init__()
        if not 0.0 < train_fraction < 1.0:
            raise ValueError(f"train_fraction must be in (0, 1), got {train_fraction}")
        self.data_path = data_path
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.transform = transform
        self.num_workers = num_workers
        self.train_fraction = train_fraction
        self.split_seed = split_seed
        # Both subsets are created lazily by create_datasets().
        self.train_dataset = None
        self.val_dataset = None

    def create_datasets(self):
        """Load the training data and split it into train and validation subsets."""
        dataset = OptiverDataset(self.data_path, mode="train", transform=self.transform)
        n_total = len(dataset)
        n_train = int(n_total * self.train_fraction)
        n_val = n_total - n_train
        generator = torch.Generator().manual_seed(self.split_seed)
        self.train_dataset, self.val_dataset = random_split(
            dataset, [n_train, n_val], generator=generator)

    def _make_loader(self, dataset, batch_size, shuffle=False):
        """Build a DataLoader with the worker count shared by all loaders."""
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers)

    def train_dataloader(self):
        """Return the shuffled loader over the training subset."""
        # `is None` rather than truthiness: `not subset` would call __len__ and
        # trigger a needless reload whenever a subset happens to be empty.
        if self.train_dataset is None:
            self.create_datasets()
        return self._make_loader(self.train_dataset, self.train_batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled loader over the validation subset."""
        if self.val_dataset is None:
            self.create_datasets()
        return self._make_loader(self.val_dataset, self.val_batch_size)

    def test_dataloader(self):
        """Return a batch-size-1 loader over the "test" dataset."""
        test_dataset = OptiverDataset(self.data_path, mode="test", transform=self.transform)
        return self._make_loader(test_dataset, 1)

In [None]:
import torch.optim
import torch.nn
import torch.nn.functional as F

class SimplestLinearModule(pl.LightningModule):
    """One-input, one-output linear layer followed by a leaky ReLU.

    Training and validation minimise the root of the mean squared error
    between prediction and target. After every validation epoch the R2 score
    and the RMSPE over all validation samples are logged. After the test
    epoch the predictions are written to ``submission.csv``.

    NOTE(review): the class relies on the ``validation_epoch_end`` and
    ``test_epoch_end`` hooks; these belong to the Lightning 1.x API — confirm
    the pinned Lightning version before upgrading.

    Args:
        learning_rate: initial learning rate of the RMSprop optimizer.
    """

    # Constant added under the square root of the loss (value unchanged from
    # the original code); presumably keeps the gradient finite at zero error.
    _LOSS_EPS = 1e-24

    def __init__(self, learning_rate = 0.01):
        super().__init__()
        self.learning_rate = learning_rate
        self.linear = torch.nn.Linear(1, 1)  # One in and one out

    def forward(self, input):
        """Apply the linear layer and a leaky ReLU to ``input``."""
        return F.leaky_relu(self.linear(input))

    def _rmse_loss(self, prediction, target):
        """Root mean squared error shared by the training and validation steps."""
        return torch.sqrt(F.mse_loss(prediction, target) + self._LOSS_EPS)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        x, target = batch
        loss = self._rmse_loss(self.forward(x), target)
        self.log("loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and return (prediction, target) for the epoch-end metrics."""
        x, target = batch
        prediction = self.forward(x)
        loss = self._rmse_loss(prediction, target)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return prediction, target

    def validation_epoch_end(self, validation_step_outputs):
        """Log R2 and RMSPE (rounded to 3 decimals) over the whole validation epoch."""
        y_pred = torch.cat([p for p, _ in validation_step_outputs], dim=0).view(-1).cpu().numpy()
        y_true = torch.cat([t for _, t in validation_step_outputs], dim=0).view(-1).cpu().numpy()
        R2 = round(r2_score(y_true, y_pred),3)
        RMSPE = round(rmspe(y_true, y_pred),3)
        self.log("R2", R2, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log("RMSPE", RMSPE, on_step=False, on_epoch=True, prog_bar=True, logger=True)

    def test_step(self, batch, batch_idx):
        """Return the row ids of the batch together with the predictions as a numpy array."""
        x, row_id = batch
        prediction = self.forward(x)
        # detach() keeps the conversion valid even if gradients are enabled.
        return row_id, prediction.detach().cpu().numpy()

    def test_epoch_end(self, test_step_outputs):
        """Write ``submission.csv`` with one ``row_id``/``target`` line per test sample.

        Works for any test batch size: every row id in a batch is paired with
        the first output value of the corresponding prediction row.
        """
        row_ids, targets = [], []
        for batch_row_ids, batch_predictions in test_step_outputs:
            # One row per sample; column 0 is the (only) model output.
            per_sample = np.asarray(batch_predictions).reshape(len(batch_row_ids), -1)[:, 0]
            row_ids.extend(batch_row_ids)
            targets.extend(per_sample)
        submission = pd.DataFrame({'row_id': row_ids, 'target': targets})
        submission.to_csv('submission.csv', index=False)

    def configure_optimizers(self):
        """Return RMSprop plus a ReduceLROnPlateau scheduler that monitors ``val_loss``."""
        optimizer = torch.optim.RMSprop(self.linear.parameters(), lr = self.learning_rate )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', patience=5)
        return {
            "optimizer" : optimizer,
            "lr_scheduler" : {
                "scheduler": scheduler,
                "monitor": "val_loss",
            }}

def realized_volatility_feature(df):
    """Compute the realized volatility of the level-1 weighted average price.

    The weighted average price weights the bid price with the ask size and
    the ask price with the bid size. Its log returns are taken, NaN entries
    are dropped, and the realized volatility of the remainder is returned as
    a float32 array with a single element.
    """
    bid_price, ask_price = df['bid_price1'], df['ask_price1']
    bid_size, ask_size = df['bid_size1'], df['ask_size1']

    wap = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)

    returns = log_return(wap).to_numpy(dtype=np.float32)
    valid_returns = returns[~np.isnan(returns)]
    return np.array([realized_volatility(valid_returns)], dtype=np.float32)
    
# Data: the realized-volatility feature is the only model input.
# Reuses the `data_path` defined in the imports cell instead of repeating the literal.
datamodule = OptiverDataModule(
    data_path=data_path,
    train_batch_size = 256, val_batch_size = 256,
    transform = realized_volatility_feature)

module = SimplestLinearModule(learning_rate = 1e-4)

# Checkpoints are ranked by the validation loss logged in validation_step;
# the learning rate is recorded at every step.
checkpoint_callback = ModelCheckpoint(monitor='val_loss')
lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = pl.Trainer(
    gpus=0,
    callbacks=[checkpoint_callback, lr_monitor],
    # Uncomment to run on a quarter of the batches:
    #limit_train_batches=0.25,
    #limit_val_batches=0.25,
    max_epochs = 20
)



In [None]:
# Train the model on the loaders provided by the data module.
trainer.fit(model=module, datamodule=datamodule)

## Performance of the Naive Prediction
According to <https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data>,
the performance of the naive prediction is:

R2 score: 0.628 

RMSPE: 0.341

In [None]:
# Run the test loop with ckpt_path="best" (the checkpoint callback above ranks
# checkpoints by `val_loss`); the module's test_epoch_end writes submission.csv.
trainer.test(ckpt_path="best")

!cat submission.csv