In [None]:
import numpy as np
import pandas as pd
from glob import glob
from numba import njit
from multiprocessing import Pool
from sklearn.metrics import r2_score

import torch
import wandb
import torch.nn as nn
import torch.optim as optim
# from torchsummary import summary
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer

In [None]:
# Targets are re-keyed by "<stock_id>-<time_id>" so they can be joined onto the
# engineered features through a single `row_id` key; only `target` is kept.
train_targets = (
    pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
    .assign(row_id=lambda frame: frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str))
    .set_index("row_id")[['target']]
)

# One entry per stock partition of the training order book.
train_files = glob("../input/optiver-realized-volatility-prediction/book_train.parquet/*")

In [None]:
# Book columns in array order: the position of a name in this list is the
# column index used when a book is handled as a plain 2-D array.
column_names = [
    # 0-1: bucket identification
    "time_id", "seconds_in_bucket",
    # 2-5: level-1 then level-2 prices
    "bid_price1", "ask_price1",
    "bid_price2", "ask_price2",
    # 6-9: level-1 then level-2 sizes
    "bid_size1", "ask_size1",
    "bid_size2", "ask_size2",
]

In [None]:
@njit
def fill_array(book_data, filled_data):
    """Forward-fill sparse book rows onto a dense one-row-per-second grid.

    Parameters
    ----------
    book_data : 2-D array
        Book rows of a single bucket. Column 1 is compared against the output
        row index, and rows are expected in ascending order of that column.
    filled_data : 2-D array
        Preallocated output, overwritten in place. Row ``r`` receives the most
        recent book row whose column 1 is ``<= r``; rows before the first
        observation reuse book row 0. Column 1 of every output row is set to
        the row index itself.

    Returns nothing; an empty ``book_data`` leaves ``filled_data`` untouched.
    """
    n_rows = book_data.shape[0]
    if n_rows == 0:
        return

    filled_data[0] = book_data[0]
    filled_data[0, 1] = 0
    last_read_idx = 0
    for row_idx in range(1, filled_data.shape[0]):
        # The bound check is essential: numba does not bounds-check indexing
        # by default, so peeking past the final book row would silently read
        # unrelated memory instead of raising.
        while last_read_idx + 1 < n_rows and book_data[last_read_idx + 1, 1] <= row_idx:
            last_read_idx += 1
        filled_data[row_idx] = book_data[last_read_idx]
        filled_data[row_idx, 1] = row_idx

In [None]:
@njit
def calculate_features(filled_data):
    """Summarise one densely filled bucket into a flat feature list.

    Parameters
    ----------
    filled_data : 2-D array
        One row per second, columns laid out as: 0 time_id, 1 seconds,
        2/3 bid/ask price level 1, 4/5 bid/ask price level 2,
        6/7 bid/ask size level 1, 8/9 bid/ask size level 2.

    Returns
    -------
    list
        30 aggregate values followed by the bucket's time_id (taken from the
        first row), in the order: WAP means, WAP window means, WAP stds,
        log-return means, log-return window means, summed-size means,
        size-imbalance means, and the two realised volatilities.
    """
    cols = filled_data.transpose()
    bid_price1 = cols[2]
    ask_price1 = cols[3]
    bid_price2 = cols[4]
    ask_price2 = cols[5]
    bid_size1 = cols[6]
    ask_size1 = cols[7]
    bid_size2 = cols[8]
    ask_size2 = cols[9]

    # Size-based series; the first second is skipped in every aggregate.
    trade_vols1 = (bid_size1 + ask_size1)[1:]
    trade_vols2 = (bid_size2 + ask_size2)[1:]
    trade_diffs1 = (ask_size1 - bid_size1)[1:]
    trade_diffs2 = (ask_size2 - bid_size2)[1:]

    # Weighted average price: each side's price is weighted by the opposite
    # side's size.
    waps1 = (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)
    waps2 = (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)

    # NOTE(review): np.diff already shortens the series by one, so the extra
    # [1:] also discards the first log return, leaving these one element
    # shorter than the other series. Kept as-is to preserve the feature
    # values; confirm whether that is intended.
    logs1 = np.diff(np.log(waps1))[1:]
    logs2 = np.diff(np.log(waps2))[1:]

    waps1 = waps1[1:]
    waps2 = waps2[1:]

    return [
        waps1.mean(),
        waps2.mean(),
        waps1[450:500].mean(),
        waps1[500:550].mean(),
        waps1[550:].mean(),
        waps2[450:500].mean(),
        waps2[500:550].mean(),
        waps2[550:].mean(),
        waps1.std(),
        waps2.std(),
        waps1[550:].std(),
        waps2[550:].std(),
        logs1.mean(),
        logs2.mean(),
        logs1[450:500].mean(),
        logs1[500:550].mean(),
        logs1[550:].mean(),
        logs2[450:500].mean(),
        logs2[500:550].mean(),
        logs2[550:].mean(),
        trade_vols1.mean(),
        trade_vols2.mean(),
        trade_vols1[550:].mean(),
        trade_vols2[550:].mean(),
        trade_diffs1.mean(),
        trade_diffs2.mean(),
        trade_diffs1[550:].mean(),
        trade_diffs2[550:].mean(),
        np.sqrt(np.sum(logs1 ** 2)),  # realised volatility of the level-1 WAP
        np.sqrt(np.sum(logs2 ** 2)),  # realised volatility of the level-2 WAP
        int(cols[0][0])
    ]

In [None]:
@njit
def process_groups(dataset, stock_id):
    """Compute one feature row per run of equal values in column 0.

    Parameters
    ----------
    dataset : 2-D array
        Book rows of one stock, sorted so that rows sharing the same column-0
        value (time_id) are contiguous.
    stock_id : int
        Appended as the last element of every feature row.

    Returns
    -------
    list
        One list per group: the output of ``calculate_features`` followed by
        ``stock_id``.
    """
    ret_lis = []
    filled_data = np.zeros((600, 10), dtype=np.float32)

    # np.diff is non-zero at index i when row i+1 starts a new group, i.e. i is
    # the LAST row of the current group. Shift by one so each boundary is the
    # first row of the next group; otherwise every slice loses its final row
    # and the following group begins with the previous group's last row (and
    # therefore reports the previous time_id).
    group_starts = np.nonzero(np.diff(dataset[:, 0]))[0] + 1

    last_split_pos = 0
    for split_pos in group_starts:
        data_split = dataset[last_split_pos:split_pos]
        fill_array(data_split, filled_data)
        features = calculate_features(filled_data)
        ret_lis.append(features + [stock_id])
        last_split_pos = split_pos

    # Trailing group: from the last boundary to the end of the array.
    data_split = dataset[last_split_pos:]
    fill_array(data_split, filled_data)
    features = calculate_features(filled_data)
    ret_lis.append(features + [stock_id])
    return ret_lis

In [None]:
# Column names for the rows produced by the feature extraction, in order:
# 30 aggregates, then the two identifier columns.
feature_columns = [
    # WAP means: whole bucket, then three late windows per level
    "wap1", "wap2",
    "wap1_1", "wap1_2", "wap1_3",
    "wap2_1", "wap2_2", "wap2_3",
    # WAP standard deviations: whole bucket, then last window
    "wap1_std", "wap2_std", "wap1l_std", "wap2l_std",
    # log-return means: whole bucket, then three late windows per level
    "log1", "log2",
    "log1_1", "log1_2", "log1_3",
    "log2_1", "log2_2", "log2_3",
    # summed sizes and size imbalance: whole bucket, then last window
    "volume1", "volume2", "volume1l", "volume2l",
    "diff1", "diff2", "diff1l", "diff2l",
    # realised volatilities
    "vol1", "vol2",
    # identifiers
    "time_id", "stock_id",
]

In [None]:
def process_single_stock(file_path):
    """Load one stock's book partition and return its per-bucket feature rows.

    The book is sorted by (time_id, seconds_in_bucket) and converted to a
    float32 array before being handed to ``process_groups``. The stock id is
    the integer that follows the first ``=`` in ``file_path``.
    """
    raw_book = pd.read_parquet(file_path, engine="pyarrow")
    ordered_book = raw_book.sort_values(["time_id", "seconds_in_bucket"])
    book = ordered_book.to_numpy(dtype=np.float32)

    stock_id = int(file_path.split('=')[1])
    return process_groups(book, stock_id)

In [None]:
def preprocess_data(file_list):
    """Extract features for every stock file in parallel.

    Parameters
    ----------
    file_list : list of str
        Paths accepted by ``process_single_stock``.

    Returns
    -------
    pandas.DataFrame
        One row per (stock, bucket) with columns ``feature_columns``; rows
        follow the order of ``file_list``.
    """
    # The context manager releases the worker processes even when a worker
    # raises; `map` blocks until every result is in, so nothing is lost when
    # the pool is torn down on exit.
    with Pool(processes=None) as worker_pool:
        per_stock_rows = worker_pool.map(process_single_stock, file_list)

    # Flatten the list of per-stock row lists into a single list of rows.
    all_rows = [row for stock_rows in per_stock_rows for row in stock_rows]
    return pd.DataFrame(all_rows, columns=feature_columns)

In [None]:
dataset = preprocess_data(train_files)

# Build the "<stock_id>-<time_id>" join key column-wise rather than with a
# Python-level row-by-row apply; both ids are truncated to int first so the
# key has no decimal point.
dataset["row_id"] = (
    dataset["stock_id"].astype(int).astype(str)
    + "-"
    + dataset["time_id"].astype(int).astype(str)
)

dataset_cleaned = dataset.drop(columns=["time_id"])
# Left join keeps every feature row; rows without a matching target get NaN.
dataset_merged = dataset_cleaned.merge(train_targets, how="left", on="row_id")
dataset_merge_cleaned = dataset_merged.drop(columns=["row_id"])

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between two tensors.

    The error of each element is taken relative to ``y_true``, squared,
    averaged over all elements and square-rooted. Returns a 0-dim tensor.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = torch.mean(relative_error ** 2)
    return torch.sqrt(mean_squared)

In [None]:
# --- Runtime ---
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Data ---
split_size = 0.15   # fraction of samples held out for validation
batch_size = 1024

# --- Model ---
# Network input width: the 30 aggregate features plus the per-stock target
# mean that FeatureDataset adds (time_id and stock_id are not model inputs).
feature_size = len(feature_columns) - 1

# --- Optimisation ---
epochs = 20
lr = 3e-3
lr_gamma = 0.9      # multiplicative learning-rate decay applied once per epoch
betas = (0.5, 0.5)  # presumably meant for an Adam optimizer; the SGD setup does not read it

In [None]:
class FeatureDataset(Dataset):
    """Standardised features and normalised targets as a torch ``Dataset``.

    The input frame must end with the three columns ``stock_id``, ``row_id``
    and ``target`` (in that order); everything before them is a model feature.
    A ``stock_target_mean`` column (mean target of the row's stock) is added
    as the last feature.

    Attributes set by ``__init__``:
      feature_standardizer    StandardScaler fitted on the feature matrix.
      target_boxcox           Box-Cox PowerTransformer fitted on the target.
      target_scaler           MinMaxScaler fitted on the Box-Cox output.
      stocks_target_mean_val  Series of mean target indexed by stock_id.
      X_train, y_train        float32 tensors of features and targets.
    """

    def __init__(self, in_dataframe):
        self.feature_standardizer = StandardScaler()
        self.target_boxcox = PowerTransformer(method='box-cox', standardize=False)
        self.target_scaler = MinMaxScaler()
        
        dataframe = in_dataframe.copy()
        self.stocks_target_mean_val = dataframe.groupby("stock_id")["target"].mean()
        # Insert just before the trailing (stock_id, row_id, target) columns so
        # the new column becomes the last feature.
        dataframe.insert(loc=dataframe.shape[1] - 3, column="stock_target_mean", value=dataframe['stock_id'].map(self.stocks_target_mean_val).values)
        X = self.feature_standardizer.fit_transform(dataframe.iloc[:, :-3].to_numpy())
        # Target pipeline: Box-Cox first, then min-max scaling.
        y = self.target_scaler.fit_transform(self.target_boxcox.fit_transform(dataframe.iloc[:, -1].to_numpy().reshape(-1, 1)))
        
        self.X_train = torch.tensor(X, dtype=torch.float32)
        self.y_train = torch.tensor(y, dtype=torch.float32)
        
    def inverse_scale_transform(self, x):
        """Map normalised targets back to the original target scale.

        Undoes the min-max scaling and then the Box-Cox transform. Accepts a
        tensor (on any device) or anything ``torch.tensor`` can convert; the
        operation is differentiable, so it can sit inside a loss.
        """
        if not torch.is_tensor(x):
            x = torch.tensor(x, dtype=torch.float32)
        const_dtype = x.dtype if x.is_floating_point() else torch.float32

        # Invert the 0-1 scaler. Constants are created on x's device so the
        # caller is not forced to move tensors to the CPU first.
        data_min = torch.as_tensor(self.target_scaler.data_min_, dtype=const_dtype, device=x.device)
        data_range = torch.as_tensor(
            self.target_scaler.data_max_ - self.target_scaler.data_min_,
            dtype=const_dtype,
            device=x.device,
        )
        x_scaled = x * data_range + data_min

        # Invert the Box-Cox transform. For lambda == 0 Box-Cox is a plain
        # log, and the general formula would divide by zero.
        lda = float(self.target_boxcox.lambdas_[0])
        if lda == 0.0:
            return torch.exp(x_scaled)
        return torch.exp(torch.log(1 + lda * x_scaled) / lda)
    
    def __len__(self):
        return len(self.y_train)
    
    def __getitem__(self, idx):
        return self.X_train[idx], self.y_train[idx]

# Build the full dataset once, then carve a random validation subset out of it.
feature_set = FeatureDataset(dataset_merged)
total_samples = len(feature_set)
val_size = int(split_size * total_samples)
train_size = total_samples - val_size
train_set, val_set = random_split(feature_set, [train_size, val_size])

print(f"{len(train_set)} training samples")
print(f"{len(val_set)} validation samples")

# Both loaders shuffle and share the same batch size.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True)

In [None]:
class Model(nn.Module):
    """Single-hidden-layer perceptron with a sigmoid output.

    Maps ``feature_size`` inputs through a 64-unit ReLU layer to one value in
    (0, 1).
    """

    def __init__(self):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(feature_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run the stacked layers on a batch of feature vectors."""
        network = self.model
        return network(x)

In [None]:
model = Model().to(device)

# The loss is RMSPE; the training loop evaluates it on targets and predictions
# after they have been mapped back to the original scale.
criterion = rmspe

# SGD with momentum, with the learning rate multiplied by `lr_gamma` each time
# the scheduler is stepped.
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, lr_gamma)

In [None]:
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_r2 = 0.0
    for features, values in train_loader:
        features = features.to(device)
        values = values.to(device)
        output = model(features)
        
        # Map targets and predictions back to the original scale so the loss
        # is measured there. The tensors are moved to the CPU first;
        # gradients still flow back through `.cpu()`.
        values_scaled = feature_set.inverse_scale_transform(values.cpu())
        output_scaled = feature_set.inverse_scale_transform(output.cpu())
        
        loss = criterion(values_scaled, output_scaled)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        train_r2 += r2_score(values_scaled.cpu(), output_scaled.detach().cpu())
    
    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    val_r2 = 0.0
    # No parameter update happens here, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        for features, values in val_loader:
            features = features.to(device)
            values = values.to(device)
            output = model(features)
            
            values_scaled = feature_set.inverse_scale_transform(values.cpu())
            output_scaled = feature_set.inverse_scale_transform(output.cpu())
            
            loss = criterion(values_scaled, output_scaled)
            val_loss += loss.item()
            val_r2 += r2_score(values_scaled.cpu(), output_scaled.cpu())
    scheduler.step()
    
    # Reported numbers are means of the per-batch metrics.
    print(f"Iteration {epoch}, Train RMSPE: {train_loss / len(train_loader)}, Val RMSPE: {val_loss / len(val_loader)}, Train R2: {train_r2 / len(train_loader)}, Val R2: {val_r2 / len(val_loader)}")

## Submission

In [None]:
def process_stock(test_queries):
    """Predict targets for all test rows of one stock.

    Parameters
    ----------
    test_queries : pandas.DataFrame
        Rows of the test file for a single stock, with columns ``stock_id``,
        ``time_id`` and ``row_id`` (in that order).

    Returns
    -------
    pandas.DataFrame
        ``test_queries`` without ``time_id``/``stock_id`` and with a new
        ``target`` column holding the de-normalised predictions.
    """
    # Positional access: inside groupby.apply the group keeps its original
    # index, so the label 0 only exists for the very first group.
    stock_id = test_queries["stock_id"].iloc[0]
    features_set = process_single_stock("/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/stock_id=" + str(stock_id))
    features_dset = pd.DataFrame(features_set, columns=feature_columns)

    # feature_columns ends with (time_id, stock_id), so the per-stock mean goes
    # two from the end: directly after the last aggregate feature, which is
    # where the training data has it.
    features_dset.insert(
        loc=features_dset.shape[1] - 2,
        column="stock_target_mean",
        value=features_dset['stock_id'].map(feature_set.stocks_target_mean_val).values,
    )
    testing_data = test_queries.merge(features_dset, how="left", on=["time_id", "stock_id"]).ffill()
    testing_data_cleaned = testing_data.drop(columns=["time_id", "stock_id"])

    # Column 0 is row_id; the rest are the model inputs. They must go through
    # the same standardisation that was fitted on the training features.
    raw_features = testing_data_cleaned.iloc[:, 1:].to_numpy(dtype=np.float32)
    scaled_features = feature_set.feature_standardizer.transform(raw_features)

    model.eval()
    with torch.no_grad():
        X = torch.tensor(scaled_features, dtype=torch.float32).to(device)
        # Back to the CPU before inverting the target transform and converting
        # to numpy.
        model_out = model(X).cpu()
        model_out_scaled = feature_set.inverse_scale_transform(model_out).numpy().reshape(-1, )

    # assign() returns a new frame instead of writing into the groupby slice.
    test_queries_cleaned = test_queries.assign(target=model_out_scaled).drop(columns=["time_id", "stock_id"])
    return test_queries_cleaned

In [None]:
# Predict stock by stock, fall back to a constant 0.001 wherever a value is
# still missing, and write the submission file without the index.
testing_file = (
    pd.read_csv("../input/optiver-realized-volatility-prediction/test.csv")
    .groupby("stock_id")
    .apply(process_stock)
    .fillna(0.001)
)
testing_file.to_csv("submission.csv", index=False)