In [None]:
!pip install torchsummary

In [None]:
import numpy as np
import pandas as pd
from glob import glob
from numba import njit
from multiprocessing import Pool
from sklearn.metrics import r2_score

import torch
import wandb
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
import matplotlib.pyplot as plt
import random

In [None]:
# Target table keyed by "<stock_id>-<time_id>" so a single .loc lookup
# returns the target of one (stock, time bucket) pair.
train_targets = (
    pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
    .assign(row_id=lambda frame: frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str))
    [['row_id', 'target']]
    .set_index("row_id")
)

# One entry per item found under the book_train.parquet directory.
train_files = glob("../input/optiver-realized-volatility-prediction/book_train.parquet/*")

In [None]:
@njit
def fill_array(book_data, filled_data):
    """Forward-fill the sparse rows of one group onto a dense row grid.

    Column 1 of ``book_data`` is compared against the dense row index
    (presumably ``seconds_in_bucket``, since the caller sorts by it). For each
    dense index the most recent source row whose column-1 value has been
    reached is copied, and column 1 of the copy is overwritten with the dense
    index. Row 0 of the output is always the first source row, copied as is.

    Parameters
    ----------
    book_data : 2-D array
        Rows of a single group, ordered by column 1. Must hold at least one row.
    filled_data : 2-D array
        Pre-allocated output buffer, overwritten in place. Its number of rows
        defines the size of the dense grid (600 in this notebook).

    Raises
    ------
    ValueError
        If ``book_data`` has no rows.
    """
    n_source_rows = book_data.shape[0]
    if n_source_rows == 0:
        raise ValueError("book_data must contain at least one row")

    filled_data[0] = book_data[0]
    last_read_idx = 0
    for row_idx in range(1, filled_data.shape[0]):
        # Only peek at the next source row if it exists: numba does not
        # bounds-check by default, so reading past the end returns garbage
        # (or data belonging to the next group when book_data is a slice).
        if last_read_idx + 1 < n_source_rows and int(book_data[last_read_idx + 1][1]) == row_idx:
            last_read_idx += 1
        filled_data[row_idx] = book_data[last_read_idx]
        filled_data[row_idx, 1] = row_idx

In [None]:
@njit
def process_groups(dataset, stock_id):
    """Split a sorted book array into groups on column 0 and densify each one.

    Consecutive rows that share the same column-0 value (presumably
    ``time_id``, since the caller sorts by it first) form one group. Each
    group is expanded to 600 rows by ``fill_array`` and the first two columns
    are dropped from the result.

    Parameters
    ----------
    dataset : 2-D float array
        All rows of one stock, sorted by column 0 and then column 1. Must hold
        at least one row.
    stock_id : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    list of (group_key, array)
        ``group_key`` is the column-0 value of the group and ``array`` is a
        ``(600, n_columns - 2)`` float32 copy of the densified group.
    """
    ret_lis = []
    filled_data = np.zeros((600, dataset.shape[1]), dtype=np.float32)
    group_start = 0
    # np.diff(...)[i] != 0 means row i + 1 differs from row i, i.e. row i is
    # the LAST row of its group. The slice must therefore include row i and
    # the next group must start at i + 1.
    for last_row_of_group in np.nonzero(np.diff(dataset[:, 0]))[0]:
        group_end = last_row_of_group + 1
        fill_array(dataset[group_start:group_end], filled_data)
        ret_lis.append((dataset[group_start][0], filled_data[:, 2:].copy()))
        group_start = group_end
    # Trailing group: everything after the last boundary.
    fill_array(dataset[group_start:], filled_data)
    ret_lis.append((dataset[group_start][0], filled_data[:, 2:].copy()))
    return ret_lis

In [None]:
def process_single_stock(file_path):
    """Load one stock's book file and pair each densified group with its target.

    The stock id is taken from the text after the first '=' in ``file_path``.
    The file is read, sorted by ``time_id`` then ``seconds_in_bucket``,
    converted to a float32 array and handed to ``process_groups``. For every
    returned group the target is looked up in ``train_targets`` under the key
    ``"<stock_id>-<time_id>"``.

    Returns a list of ``(data, target)`` tuples, one per group.
    """
    book_frame = pd.read_parquet(file_path, engine="pyarrow")
    book = book_frame.sort_values(["time_id", "seconds_in_bucket"]).to_numpy(dtype=np.float32)

    stock_id = file_path.split('=')[1]
    grouped_data_list = []
    for time_id, data in process_groups(book, int(stock_id)):
        target = train_targets.loc[f"{stock_id}-{int(time_id)}", "target"]
        grouped_data_list.append((data, target))
    return grouped_data_list

In [None]:
# --- Optimisation settings ---
epochs = 40          # number of passes of the training loop
lr = 3e-3            # initial learning rate handed to the optimizer
lr_gamma = 0.9       # decay factor handed to the LR scheduler
betas = (0.5, 0.5)   # Adam beta coefficients

# --- Data settings ---
split_size = 0.1     # fraction of the selected stock files held out for validation

# --- Hardware ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Cut the list of stock files into four contiguous partitions
# (boundaries at 1/4, 1/2 and 3/4 of the list).
n_files = len(train_files)
partition_bounds = [0, n_files // 4, n_files // 2, 3 * n_files // 4, n_files]
train_files_partitioned = [
    train_files[start:stop]
    for start, stop in zip(partition_bounds[:-1], partition_bounds[1:])
]

# `train_files_reduced` was used below without ever being defined, which made
# this cell fail on a fresh kernel.
# NOTE(review): assuming the intent was to work on a single partition;
# the first one is used here - confirm which partition was meant.
train_files_reduced = train_files_partitioned[0]

# The first `split_size` fraction of the selected files is held out for validation.
val_size = int(split_size * len(train_files_reduced))
train_set = train_files_reduced[val_size:]
valid_set = train_files_reduced[:val_size]
print("Number of stocks used for training: ", len(train_set))
print("Number of stocks used for validation: ", len(valid_set))

In [None]:
# Gather the (data, target) samples of every training stock into one flat list.
stored_list = []
for stock_file in train_set:
    stored_list.extend(process_single_stock(stock_file))

In [None]:
# Shuffle the samples in place (unseeded), then split the (data, target)
# pairs into two parallel float32 arrays.
random.shuffle(stored_list)
trainX, trainy = zip(*stored_list)
trainX = np.asarray(trainX, dtype=np.float32)
trainy = np.asarray(trainy, dtype=np.float32).reshape(-1, 1)  # column vector, one target per sample

In [None]:
def rmspe(y_true, y_pred):
    """Root-mean-square relative error between ``y_pred`` and ``y_true``.

    Computes ``sqrt(100 * mean(((y_true - y_pred) / y_true) ** 2))``. Because
    the factor 100 is applied inside the square root, the returned value is
    10 times the conventional (fractional) RMSPE.

    ``y_true`` is used as the divisor, so it must not contain zeros.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = torch.mean(relative_error ** 2)
    return torch.sqrt(100 * mean_squared_error)

In [None]:
class VolatilityCNN(nn.Module):
    """Small 1-D convolutional regressor with a single scalar output.

    Layout: two strided ``Conv1d`` + ``LeakyReLU`` stages (8 -> 11 -> 13
    channels, kernel 3, stride 5), a ``MaxPool1d(5)``, then flatten and a
    linear layer mapping 52 features to 1 value. The 52 input features of the
    linear layer correspond to 13 channels x 4 positions, which is what an
    input of shape ``(batch, 8, 600)`` produces.
    """

    def __init__(self):
        super().__init__()
        conv_stack = [
            nn.Conv1d(in_channels=8, out_channels=11, kernel_size=3, stride=5),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Conv1d(in_channels=11, out_channels=13, kernel_size=3, stride=5),
            nn.LeakyReLU(negative_slope=0.1),
        ]
        self.block1 = nn.Sequential(*conv_stack)
        self.pooling = nn.MaxPool1d(kernel_size=5)
        self.ff = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=52, out_features=1),
        )

    def forward(self, x):
        """Run convolutions, pooling and the linear head; returns ``(batch, 1)``."""
        conv_features = self.block1(x)
        pooled_features = self.pooling(conv_features)
        return self.ff(pooled_features)

In [None]:
# Build the network on the selected device and print a layer-by-layer summary
# for an input of 8 channels x 600 steps.
model = VolatilityCNN()
model = model.to(device)
summary(model, (8, 600))

# Loss, optimizer and learning-rate schedule.
# (nn.MSELoss, optim.SGD and ReduceLROnPlateau were tried as alternatives.)
criterion = rmspe
optimizer = optim.Adam(params=model.parameters(), lr=lr, betas=betas)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=lr_gamma)

In [None]:
# Independent scalers: one for the input features, one for the target.
input_normalizer, output_normalizer = StandardScaler(), StandardScaler()

In [None]:
# Standardize the inputs per feature: flatten to rows of 8 features, fit and
# transform, then restore the original array shape.
original_shape = trainX.shape
flat_features = trainX.reshape(-1, 8)
trainX = input_normalizer.fit_transform(flat_features).reshape(*original_shape)

# Standardize the targets with their own scaler.
trainy = output_normalizer.fit_transform(trainy)

# Move both to the device; the last two input axes are swapped so that the
# feature axis comes second, as Conv1d expects channels there.
trainX = torch.tensor(trainX).permute(0, 2, 1).to(device)
trainy = torch.tensor(trainy).to(device)

In [None]:
# Constants that map standardized target values back to the raw target scale.
# They do not change during training, so they are built once, as float32 to
# match the model output.
scaler_c1 = torch.tensor(output_normalizer.scale_, dtype=torch.float32, device=device)
scaler_c2 = torch.tensor(output_normalizer.mean_, dtype=torch.float32, device=device)

# The criterion divides by the true value. On standardized targets (zero mean)
# that divisor is close to zero or negative, which makes a percentage error
# meaningless and numerically unstable. The loss is therefore evaluated on
# de-normalized values, while the network still predicts in standardized space.
trainy_raw = trainy * scaler_c1 + scaler_c2

for epoch in range(epochs):
    model.train()

    # Full-batch step: the whole training tensor goes through the model at once.
    optimizer.zero_grad()
    output = model(trainX)
    output_raw = output * scaler_c1 + scaler_c2
    train_loss = criterion(trainy_raw, output_raw)
    train_loss.backward()
    optimizer.step()

    # Decay the learning rate once per epoch.
    scheduler.step()

    print(f"Epoch {epoch + 1}/{epochs} - train loss: {train_loss.item():.6f}")

# TODO: no validation metric is computed; `valid_set` is currently unused.

## Submission

In [None]:
def process_stock(test_queries):
    """Predict the target for every test row of a single stock.

    Intended to be used with ``DataFrame.groupby("stock_id").apply``. The
    stock's test book file is loaded and densified the same way
    ``process_single_stock`` does for training data, normalized with the
    already-fitted ``input_normalizer``, passed through ``model``, and the
    output is mapped back to the raw target scale with ``output_normalizer``.

    Parameters
    ----------
    test_queries : pandas.DataFrame
        Rows of the test file for one stock. Must contain ``row_id`` and
        ``time_id``; the stock id is read from the ``stock_id`` column or,
        when the grouping column is not passed through, from the group key.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` and ``target``. ``target`` is NaN for time ids with
        no book data, so the caller can fill in a default.
    """
    if "stock_id" in test_queries.columns:
        stock_id = int(test_queries["stock_id"].iloc[0])
    else:
        # groupby.apply exposes the group key as `.name` on the group frame.
        stock_id = int(test_queries.name)

    book_path = f"../input/optiver-realized-volatility-prediction/book_test.parquet/stock_id={stock_id}"
    book = (
        pd.read_parquet(book_path, engine="pyarrow")
        .sort_values(["time_id", "seconds_in_bucket"])
        .to_numpy(dtype=np.float32)
    )

    predictions = {}
    if len(book) > 0:
        time_ids, windows = zip(*process_groups(book, stock_id))
        features = np.array(windows, dtype=np.float32)
        # Apply the training-time normalization per feature, then restore the shape.
        features = input_normalizer.transform(
            features.reshape(-1, features.shape[-1])
        ).reshape(features.shape)
        # Same axis order as the training tensor: feature axis second.
        batch = torch.tensor(features, dtype=torch.float32).permute(0, 2, 1).to(device)

        model.eval()
        with torch.no_grad():
            model_out = model(batch).cpu().numpy()
        # The network predicts standardized targets; map back to the raw scale.
        model_out_scaled = output_normalizer.inverse_transform(model_out).reshape(-1)
        predictions = {int(t): float(p) for t, p in zip(time_ids, model_out_scaled)}

    result = test_queries[["row_id"]].copy()
    result["target"] = test_queries["time_id"].map(predictions).to_numpy()
    return result

In [None]:
# Predict stock by stock, replace any missing prediction with 0.001,
# and write the result as the submission file.
testing_file = (
    pd.read_csv("../input/optiver-realized-volatility-prediction/test.csv")
    .groupby("stock_id")
    .apply(process_stock)
    .fillna(0.001)
)
testing_file.to_csv("submission.csv", index=False)