In [None]:
import os
import random
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
import torch.optim as optim
from fastai.layers import SigmoidRange

In [None]:
class PreprocessingPipeline:

    """
    Prepare training and test sets for modelling

    The pipeline label encodes stock_id (for embedding lookups) and assigns
    every training row to a cross-validation fold stratified on stock_id.

    Parameters
    ----------
    df_train (pandas.DataFrame): Training set (must contain a stock_id column)
    df_test (pandas.DataFrame): Test set (must contain a stock_id column)
    n_splits (int): Number of cross-validation folds
    shuffle (bool): Whether to shuffle samples before splitting or not
    random_state (int or None): Random seed used by the splitter
    """

    def __init__(self, df_train, df_test, n_splits, shuffle, random_state):

        # Deep copies keep the DataFrames passed by the caller untouched
        self.df_train = df_train.copy(deep=True)
        self.df_test = df_test.copy(deep=True)
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def _label_encode(self):

        """
        Add stock_id_encoded column to training and test sets

        The encoder is fit on the training set only, so a stock_id that
        appears only in the test set makes LabelEncoder.transform raise.
        """

        # Encoding stock_id for embeddings
        le = LabelEncoder()
        self.df_train['stock_id_encoded'] = le.fit_transform(self.df_train['stock_id'].values)
        self.df_test['stock_id_encoded'] = le.transform(self.df_test['stock_id'].values)

    def _get_folds(self):

        """
        Add fold column (1, ..., n_splits as np.uint8) to the training set
        """

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state)
        # split() yields positional indices, so folds are collected in a positional
        # array instead of being written through label-based .loc indexing; this
        # stays correct even when the DataFrame does not have a default RangeIndex
        folds = np.zeros(len(self.df_train), dtype=np.uint8)
        for fold, (_, val_idx) in enumerate(skf.split(X=self.df_train, y=self.df_train['stock_id']), 1):
            folds[val_idx] = fold
        self.df_train['fold'] = folds

    def transform(self):

        """
        Run all preprocessing steps

        Returns
        -------
        df_train (pandas.DataFrame): Training set with stock_id_encoded and fold columns
        df_test (pandas.DataFrame): Test set with stock_id_encoded column
        """

        self._label_encode()
        self._get_folds()

        return self.df_train, self.df_test


In [None]:
# Explicit dtypes passed to read_csv for the columns that are loaded
train_test_dtypes = {
    'stock_id': np.uint8,
    'time_id': np.uint16,
    'target': np.float64
}

# Only stock_id and time_id are read from the test set
df_train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv', dtype=train_test_dtypes)
df_test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv', usecols=['stock_id', 'time_id'], dtype=train_test_dtypes)

preprocessing_parameters = {
    'df_train': df_train,
    'df_test': df_test,
    'n_splits': 5,
    'shuffle': True,
    'random_state': 42
}

# df_train and df_test are rebound to the processed copies returned by the pipeline
# (stock_id_encoded added to both, fold added to the training set)
preprocessing_pipeline = PreprocessingPipeline(**preprocessing_parameters)
df_train, df_test = preprocessing_pipeline.transform()

print(f'Training Set Shape: {df_train.shape} - Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Test Set Shape: {df_test.shape} - Memory Usage: {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')

In [None]:
def visualize_learning_curve(training_losses, validation_losses, title, path=None):

    """
    Visualize learning curves of the models

    Parameters
    ----------
    training_losses [array-like of shape (n_epochs)]: Array of training losses computed after every epoch
    validation_losses [array-like of shape (n_epochs)]: Array of validation losses computed after every epoch
    title (str): Title of the plot
    path (str or None): Path of the output file (if path is None, plot is displayed with selected backend)
    """

    fig, ax = plt.subplots(figsize=(32, 8), dpi=100)

    # Epochs are numbered from 1 on the x-axis
    sns.lineplot(
        x=np.arange(1, len(training_losses) + 1),
        y=training_losses,
        ax=ax,
        label='train_loss'
    )
    sns.lineplot(
        x=np.arange(1, len(validation_losses) + 1),
        y=validation_losses,
        ax=ax,
        label='val_loss'
    )

    ax.set_xlabel('Epochs/Steps', size=15, labelpad=12.5)
    ax.set_ylabel('Loss', size=15, labelpad=12.5)
    ax.tick_params(axis='x', labelsize=12.5, pad=10)
    ax.tick_params(axis='y', labelsize=12.5, pad=10)
    ax.legend(prop={'size': 18})
    ax.set_title(title, size=20, pad=15)

    if path is None:
        plt.show()
    else:
        # Save through the figure object and release it afterwards; otherwise every
        # saved figure stays registered in pyplot (one leaked figure per call)
        fig.savefig(path)
        plt.close(fig)


In [None]:
def set_seed(seed, deterministic_cudnn=False):

    """
    Seed every random number generator used in the notebook

    Parameters
    ----------
    seed (int): Random seed
    deterministic_cudnn (bool): Whether to force deterministic cuDNN (and disable its benchmark mode) or not
    """

    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy and PyTorch (CPU, current GPU and all GPUs) generators
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    if not deterministic_cudnn:
        return

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def rmspe_metric(y_true, y_pred):

    """
    Root mean squared percentage error (RMSPE) of predictions computed with NumPy

    Parameters
    ----------
    y_true [array-like of shape (n_samples)]: Ground-truth
    y_pred [array-like of shape (n_samples)]: Predictions

    Returns
    -------
    rmspe (float): Root mean squared percentage error
    """

    # Errors are relative to the ground-truth
    percentage_errors = (y_true - y_pred) / y_true
    mean_squared_percentage_error = np.mean(np.square(percentage_errors))
    return np.sqrt(mean_squared_percentage_error)


def rmspe_loss(y_true, y_pred):

    """
    Root mean squared percentage error (RMSPE) of predictions computed with
    torch operations so that it can be used as a training loss

    Parameters
    ----------
    y_true [torch.tensor of shape (n_samples)]: Ground-truth
    y_pred [torch.tensor of shape (n_samples)]: Predictions

    Returns
    -------
    rmspe (torch.FloatTensor): Root mean squared percentage error
    """

    # Errors are relative to the ground-truth
    percentage_errors = (y_true - y_pred) / y_true
    mean_squared_percentage_error = torch.mean(torch.square(percentage_errors))
    return torch.sqrt(mean_squared_percentage_error)


In [None]:
class Optiver2DDataset(Dataset):

    """
    Dataset of per (stock_id, time_id) book and trade sequences stored as .npy files

    Parameters
    ----------
    df (pandas.DataFrame): Samples with stock_id, time_id, stock_id_encoded and target columns
    flip_probability (float): Probability of reversing a sequence along the time dimension
    book_directory (str): Directory that contains stock_{stock_id}/time_{time_id}.npy book files
    trade_directory (str): Directory that contains stock_{stock_id}/time_{time_id}.npy trade files
    """

    def __init__(self, df, flip_probability=0.,
                 book_directory='../input/optiver-realized-volatility-npy-files/book_train/book_train',
                 trade_directory='../input/optiver-realized-volatility-npy-files/trade_train/trade_train'):

        self.df = df
        self.book_directory = book_directory
        self.trade_directory = trade_directory
        # Normalizing sequences with global means and stds across stocks
        # (12 values: 8 raw book columns + wap1, wap2 and their log returns)
        book_means = np.array([
            0.99969482421875, 1.000321388244629, 0.9995064735412598, 1.0005191564559937,
            769.990177708821, 766.7345672818379, 959.3416027831918, 928.2202512713748,
            1.0000068043192514, 1.0000055320253616, 5.129816581143487e-08, 9.831598141593519e-08
        ])
        book_stds = np.array([
            0.0036880988627672195, 0.003687119111418724, 0.0037009266670793295, 0.0036990800872445107,
            5354.051690318169, 4954.947103063445, 6683.816183660414, 5735.299917793827,
            0.003689893218043926, 0.00370745215558702, 6.618708642293018e-07, 1.2508970015188411e-06
        ])
        # Not normalizing trade price and trade price log returns because of the sparsity
        # (mean 0 and std 1 leave the first and last trade columns unchanged)
        trade_means = np.array([0, 352.9736760331942, 4.1732040971227145, 0])
        trade_stds = np.array([1, 1041.9441951057488, 7.79955795393431, 1])

        self.transforms = {
            'flip': flip_probability,
            'normalize': {
                'book_means': book_means,
                'book_stds': book_stds,
                'trade_means': trade_means,
                'trade_stds': trade_stds
            }
        }

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        """
        Get the idxth element in the dataset

        Parameters
        ----------
        idx (int): Index of the sample (0 <= idx < len(self.df))

        Returns
        -------
        stock_id_encoded (torch.LongTensor scalar): Encoded stock_id for stock embeddings
        sequences [torch.FloatTensor of shape (n_timesteps, 16)]: Concatenated normalized sequences from book (12 columns) and trade (4 columns) data
        target (torch.FloatTensor scalar): Target
        """

        sample = self.df.iloc[idx]
        stock_id = int(sample['stock_id'])
        time_id = int(sample['time_id'])
        normalize = self.transforms['normalize']

        # Sequences from book data
        # NOTE(review): indexing below presumes the column order bid_price1, ask_price1,
        # bid_price2, ask_price2, bid_size1, ask_size1, bid_size2, ask_size2 - confirm
        # against the script that generated the .npy files
        book_sequences = np.load(f'{self.book_directory}/stock_{stock_id}/time_{time_id}.npy')
        book_wap1 = (book_sequences[:, 0] * book_sequences[:, 5] + book_sequences[:, 1] * book_sequences[:, 4]) /\
                    (book_sequences[:, 4] + book_sequences[:, 5])
        book_wap2 = (book_sequences[:, 2] * book_sequences[:, 7] + book_sequences[:, 3] * book_sequences[:, 6]) /\
                    (book_sequences[:, 6] + book_sequences[:, 7])
        # Prepending the first value makes the first log return 0 and keeps the sequence length
        book_wap1_log = np.log(book_wap1)
        book_wap1_log_returns = np.diff(book_wap1_log, prepend=[book_wap1_log[0]])
        book_wap2_log = np.log(book_wap2)
        book_wap2_log_returns = np.diff(book_wap2_log, prepend=[book_wap2_log[0]])
        book_sequences = np.hstack([
            book_sequences,
            book_wap1.reshape(-1, 1),
            book_wap2.reshape(-1, 1),
            book_wap1_log_returns.reshape(-1, 1),
            book_wap2_log_returns.reshape(-1, 1),
        ])
        book_sequences = (book_sequences - normalize['book_means']) / normalize['book_stds']

        # Sequences from trade data
        trade_sequences = np.load(f'{self.trade_directory}/stock_{stock_id}/time_{time_id}.npy')
        trade_price_log1p = np.log1p(trade_sequences[:, 0])
        trade_price_log_returns = np.diff(trade_price_log1p, prepend=trade_price_log1p[0])
        trade_sequences = np.hstack([trade_sequences, trade_price_log_returns.reshape(-1, 1)])
        # Apply the trade statistics defined in __init__; previously they were stored but
        # never used, so the two middle trade columns reached the model unscaled
        trade_sequences = (trade_sequences - normalize['trade_means']) / normalize['trade_stds']

        # Concatenate book and trade sequences
        sequences = np.hstack([book_sequences, trade_sequences])
        sequences = torch.as_tensor(sequences, dtype=torch.float)

        # Flip sequences on zeroth dimension
        if np.random.rand() < self.transforms['flip']:
            sequences = torch.flip(sequences, dims=[0])

        stock_id_encoded = torch.as_tensor(sample['stock_id_encoded'], dtype=torch.long)
        target = sample['target']
        target = torch.as_tensor(target, dtype=torch.float)
        return stock_id_encoded, sequences, target


In [None]:
class Conv1dBlock(nn.Module):

    """
    Two Conv1d + BatchNorm1d layers (ReLU in between) with an optional residual connection

    Parameters
    ----------
    in_channels (int): Number of input channels
    out_channels (int): Number of output channels
    kernel_size (tuple): Kernel size of both convolutions
    stride (tuple): Stride of both convolutions
    padding (tuple): Replicate padding of both convolutions
    skip_connection (bool): Whether to add the 1x1 convolved input to the output or not
    """

    def __init__(self, in_channels, out_channels, kernel_size=(5,), stride=(1,), padding=(2,), skip_connection=False):

        super().__init__()

        # Both convolutions of the main branch share the same configuration
        conv_kwargs = {
            'kernel_size': kernel_size,
            'stride': stride,
            'padding': padding,
            'padding_mode': 'replicate',
            'bias': True
        }

        self.skip_connection = skip_connection
        self.conv_block = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, **conv_kwargs),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv1d(out_channels, out_channels, **conv_kwargs),
            nn.BatchNorm1d(out_channels),
        )
        # 1x1 convolution that matches the channel count of the input to the main branch
        # (it is created even if skip_connection is False)
        self.downsample = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=(1,), stride=(1,), bias=False),
            nn.BatchNorm1d(out_channels)
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):

        """
        Apply the main branch, add the projected input if enabled and apply the final ReLU
        """

        features = self.conv_block(x)
        if self.skip_connection:
            features += self.downsample(x)

        return self.relu(features)


class CNN1DModel(nn.Module):

    """
    1D CNN over book/trade sequences combined with a stock embedding

    Eight residual Conv1dBlocks (each followed by average pooling that keeps the
    sequence length) reduce the sequences to a single channel, which is flattened,
    concatenated with a 16 dimensional stock embedding and regressed into (0, 0.1).
    The linear layer expects 616 features, i.e. sequences of 600 timesteps.

    Parameters
    ----------
    in_channels (int): Number of channels (features per timestep) of the input sequences
    """

    def __init__(self, in_channels):

        super().__init__()

        self.stock_embeddings = nn.Embedding(num_embeddings=113, embedding_dim=16)

        # Channel counts at the boundaries of conv_block1, ..., conv_block8
        channels = (in_channels, 32, 64, 128, 64, 32, 16, 8, 1)
        for block_number, (block_in, block_out) in enumerate(zip(channels[:-1], channels[1:]), 1):
            setattr(
                self,
                f'conv_block{block_number}',
                Conv1dBlock(in_channels=block_in, out_channels=block_out, skip_connection=True)
            )

        self.pooling = nn.AvgPool1d(kernel_size=(3,), stride=(1,), padding=(1,))
        self.linear = nn.Linear(616, 256, bias=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.25)
        self.head = nn.Sequential(
            nn.Linear(256, 1, bias=True),
            SigmoidRange(0, 0.1)
        )

    def forward(self, stock_ids, sequences):

        """
        Parameters
        ----------
        stock_ids [torch.LongTensor of shape (batch_size)]: Encoded stock_ids
        sequences [torch.FloatTensor of shape (batch_size, n_timesteps, in_channels)]: Sequences

        Returns
        -------
        output [torch.FloatTensor of shape (batch_size)]: Predictions
        """

        # Conv1d expects channels on the first dimension after batch
        x = sequences.transpose(1, 2)
        for block_number in range(1, 9):
            conv_block = getattr(self, f'conv_block{block_number}')
            x = self.pooling(conv_block(x))

        x = x.view(x.shape[0], -1)
        # Dropout is applied to the stock embeddings only
        embedded_stock_ids = self.dropout(self.stock_embeddings(stock_ids))
        x = torch.cat([x, embedded_stock_ids], dim=1)
        x = self.relu(self.linear(x))

        return self.head(x).view(-1)


In [None]:
class Trainer:

    """
    Train, validate and evaluate a model with the folds stored in the training set

    Parameters
    ----------
    model_name (str): Name of the model (only 'cnn1d' is supported)
    model_path (str): Directory in which model weights and learning curves are saved
    model_parameters (dict): Keyword arguments passed to the model class
    training_parameters (dict): Training configuration (amp, learning_rate, betas, weight_decay,
    epochs, batch_size, reduce_lr_patience, reduce_lr_factor, reduce_lr_min, early_stopping_patience,
    num_workers, random_state, deterministic_cudnn)
    """

    def __init__(self, model_name, model_path, model_parameters, training_parameters):

        self.model_name = model_name
        self.model_path = model_path
        self.model_parameters = model_parameters
        self.training_parameters = training_parameters

    def get_model(self):

        """
        Create a new model instance selected by model_name

        Returns
        -------
        model (torch.nn.Module): Model with freshly initialized weights

        Raises
        ------
        ValueError: If model_name is not supported
        """

        if self.model_name == 'cnn1d':
            return CNN1DModel(**self.model_parameters)

        # Failing here is clearer than returning None and crashing later on model.to(device)
        raise ValueError(f'Unknown model name: {self.model_name!r}')

    def _get_data_loader(self, df, shuffle):

        """
        Create a DataLoader over the given samples (random order if shuffle is True, sequential otherwise)
        """

        dataset = Optiver2DDataset(df=df, flip_probability=0.)
        sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
        return DataLoader(
            dataset,
            batch_size=self.training_parameters['batch_size'],
            sampler=sampler,
            pin_memory=True,
            drop_last=False,
            num_workers=self.training_parameters['num_workers'],
        )

    @staticmethod
    def _predict(model, data_loader, device):

        """
        Collect predictions of the model for every sample in data_loader as a flat list of floats
        """

        predictions = []
        with torch.no_grad():
            for stock_id_encoded, sequences, _ in data_loader:
                output = model(stock_id_encoded.to(device), sequences.to(device))
                # reshape(-1) keeps a batch of a single sample one dimensional; squeeze() would
                # turn it into a scalar whose tolist() is a float that cannot extend the list
                predictions.extend(output.detach().cpu().numpy().reshape(-1).tolist())

        return predictions

    def train_fn(self, train_loader, model, criterion, optimizer, device, scaler=None):

        """
        Train the model for one epoch

        Parameters
        ----------
        train_loader (torch.utils.data.DataLoader): Training set data loader
        model (torch.nn.Module): Model to train
        criterion (callable): Loss function called as criterion(target, output)
        optimizer (torch.optim.Optimizer): Optimizer
        device (torch.device): Device of the model
        scaler (torch.cuda.amp.GradScaler or None): Gradient scaler reused across epochs
        (if None and amp is enabled, a new one is created for this epoch)

        Returns
        -------
        train_loss (float): Mean of the batch losses
        """

        print('\n')
        model.train()
        progress_bar = tqdm(train_loader)
        losses = []

        if scaler is None and self.training_parameters['amp']:
            scaler = torch.cuda.amp.GradScaler()

        for stock_id_encoded, sequences, target in progress_bar:

            stock_id_encoded, sequences, target = stock_id_encoded.to(device), sequences.to(device), target.to(device)
            optimizer.zero_grad()

            if scaler is not None:
                with torch.cuda.amp.autocast():
                    output = model(stock_id_encoded, sequences)
                    loss = criterion(target, output)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                output = model(stock_id_encoded, sequences)
                loss = criterion(target, output)
                loss.backward()
                optimizer.step()

            losses.append(loss.item())
            average_loss = np.mean(losses)
            progress_bar.set_description(f'train_rmspe: {average_loss:.6f}')

        train_loss = np.mean(losses)
        return train_loss

    def val_fn(self, val_loader, model, criterion, device):

        """
        Evaluate the model on the validation set without computing gradients

        Parameters
        ----------
        val_loader (torch.utils.data.DataLoader): Validation set data loader
        model (torch.nn.Module): Model to evaluate
        criterion (callable): Loss function called as criterion(target, output)
        device (torch.device): Device of the model

        Returns
        -------
        val_loss (float): Mean of the batch losses
        """

        model.eval()
        progress_bar = tqdm(val_loader)
        losses = []

        with torch.no_grad():

            for stock_id_encoded, sequences, target in progress_bar:

                stock_id_encoded, sequences, target = stock_id_encoded.to(device), sequences.to(device), target.to(device)
                output = model(stock_id_encoded, sequences)
                loss = criterion(target, output)
                losses.append(loss.item())
                average_loss = np.mean(losses)
                progress_bar.set_description(f'val_rmspe: {average_loss:.6f}')

        val_loss = np.mean(losses)
        return val_loss

    def train_and_validate(self, df_train):

        """
        Train one model per fold, save the weights of its best epoch and its learning curve

        Parameters
        ----------
        df_train (pandas.DataFrame): Training set with fold column and the columns read by Optiver2DDataset
        """

        print(f'\n{"-" * 26}\nRunning Model for Training\n{"-" * 26}\n')

        for fold in sorted(df_train['fold'].unique()):

            print(f'\nFold {fold}\n{"-" * 6}')

            trn_idx, val_idx = df_train.loc[df_train['fold'] != fold].index, df_train.loc[df_train['fold'] == fold].index
            train_loader = self._get_data_loader(df_train.loc[trn_idx, :], shuffle=True)
            val_loader = self._get_data_loader(df_train.loc[val_idx, :], shuffle=False)

            set_seed(self.training_parameters['random_state'], deterministic_cudnn=self.training_parameters['deterministic_cudnn'])
            device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
            model = self.get_model()
            model = model.to(device)

            criterion = rmspe_loss
            optimizer = optim.Adam(
                model.parameters(),
                lr=self.training_parameters['learning_rate'],
                betas=self.training_parameters['betas'],
                weight_decay=self.training_parameters['weight_decay']
            )
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                mode='min',
                patience=self.training_parameters['reduce_lr_patience'],
                factor=self.training_parameters['reduce_lr_factor'],
                min_lr=self.training_parameters['reduce_lr_min'],
                verbose=True
            )
            # One scaler per fold so that its state is kept between epochs
            scaler = torch.cuda.amp.GradScaler() if self.training_parameters['amp'] else None

            summary = {
                'train_loss': [],
                'val_loss': []
            }

            for epoch in range(1, self.training_parameters['epochs'] + 1):

                train_loss = self.train_fn(train_loader, model, criterion, optimizer, device, scaler=scaler)
                val_loss = self.val_fn(val_loader, model, criterion, device)
                print(f'Epoch {epoch} - Training Loss: {train_loss:.6f} - Validation Loss: {val_loss:.6f}')
                scheduler.step(val_loss)

                # Weights are saved only when the validation loss improves
                best_val_loss = np.min(summary['val_loss']) if len(summary['val_loss']) > 0 else np.inf
                if val_loss < best_val_loss:
                    model_path = f'{self.model_path}/{self.model_name}_fold{fold}.pt'
                    torch.save(model.state_dict(), model_path)
                    print(f'Saving model to {model_path} (validation loss decreased from {best_val_loss:.6f} to {val_loss:.6f})')

                summary['train_loss'].append(train_loss)
                summary['val_loss'].append(val_loss)

                # Stop when the best epoch is at least early_stopping_patience epochs old
                best_iteration = np.argmin(summary['val_loss']) + 1
                if len(summary['val_loss']) - best_iteration >= self.training_parameters['early_stopping_patience']:
                    print(f"Early stopping (validation loss didn't decrease for {self.training_parameters['early_stopping_patience']} epochs/steps)")
                    break

            # The learning curve is drawn for every fold, whether training stopped early or not
            if len(summary['val_loss']) > 0:
                print(f'Best validation loss is {np.min(summary["val_loss"]):.6f}')
                visualize_learning_curve(
                    training_losses=summary['train_loss'],
                    validation_losses=summary['val_loss'],
                    title=f'{self.model_name} - Fold {fold} Learning Curve',
                    path=f'{self.model_path}/{self.model_name}_fold{fold}_learning_curve.png'
                )

    def inference(self, df_train):

        """
        Predict every fold with the model that was validated on it and print fold, stock and out-of-fold scores

        Predictions are written in place to the {model_name}_predictions column of df_train.

        Parameters
        ----------
        df_train (pandas.DataFrame): Training set with fold, stock_id and target columns and the columns read by Optiver2DDataset
        """

        print(f'\n{"-" * 27}\nRunning Model for Inference\n{"-" * 27}')
        predictions_column = f'{self.model_name}_predictions'
        # Float column from the start so that assigning predictions doesn't change its dtype
        df_train[predictions_column] = 0.

        for fold in sorted(df_train['fold'].unique()):

            val_idx = df_train.loc[df_train['fold'] == fold].index
            val_loader = self._get_data_loader(df_train.loc[val_idx, :], shuffle=False)

            set_seed(self.training_parameters['random_state'], deterministic_cudnn=self.training_parameters['deterministic_cudnn'])
            device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
            model = self.get_model()
            # map_location allows weights saved from a GPU to be loaded on a CPU-only machine
            model.load_state_dict(torch.load(f'{self.model_path}/{self.model_name}_fold{fold}.pt', map_location=device))
            model.to(device)
            model.eval()

            val_predictions = self._predict(model, val_loader, device)

            df_train.loc[val_idx, predictions_column] = val_predictions
            fold_score = rmspe_metric(df_train.loc[val_idx, 'target'], val_predictions)
            print(f'Fold {fold} - RMSPE: {fold_score:.6}')

            del val_idx, val_loader, val_predictions, model

        print(f'{"-" * 30}')
        for stock_id in df_train['stock_id'].unique():
            df_stock = df_train.loc[df_train['stock_id'] == stock_id, :]
            stock_oof_score = rmspe_metric(df_stock['target'], df_stock[predictions_column])
            print(f'Stock {stock_id} - RMSPE: {stock_oof_score:.6}')

        oof_score = rmspe_metric(df_train['target'], df_train[predictions_column])
        print(f'{"-" * 30}\nOOF RMSPE: {oof_score:.6}\n{"-" * 30}')


In [None]:
# Configuration of the 1D CNN: keys of the outer dictionary are the arguments of Trainer
cnn1d_parameters = {
    'model_name': 'cnn1d',
    'model_path': '.',
    'model_parameters': {
        # 12 book channels + 4 trade channels produced by Optiver2DDataset
        'in_channels': 16,
    },
    'training_parameters': {
        'amp': False,
        'learning_rate': 0.0005,
        'betas': (0.9, 0.999),
        'weight_decay': 0,
        'epochs': 3,
        'batch_size': 256,
        'reduce_lr_patience': 5,
        'reduce_lr_factor': 0.25,
        'reduce_lr_min': 0.000001,
        'early_stopping_patience': 20,
        'num_workers': 8,
        # 'random_state' was listed twice in this dictionary; the duplicate key is removed
        'random_state': 42,
        'deterministic_cudnn': False
    }
}

trainer = Trainer(**cnn1d_parameters)
trainer.train_and_validate(df_train)

In [None]:
# Predict every fold with its saved model; adds the cnn1d_predictions column
# to df_train in place and prints fold, stock and out-of-fold RMSPE scores
trainer.inference(df_train)