In [None]:
# !pip install torchsummary

In [None]:
import pandas as pd
import numpy as np
import random
import plotly.express as px
from multiprocessing import Pool
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import glob
from sklearn.metrics import r2_score

import torch
import wandb
import torch.nn as nn
import torch.optim as optim
# from torchsummary import summary
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer

In [None]:
def log_return(list_stock_prices):
    """Return the log returns of a price series, same length as the input.

    The log-price series is prepended with its own first value before
    differencing, so the first returned element is that value minus itself.
    """
    log_prices = np.log(list_stock_prices)
    first_log_price = log_prices[0]
    return np.diff(log_prices, prepend=first_log_price)

def realized_volatility(series_log_return):
    """Return the square root of the sum of the squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

In [None]:
def get_params(book_example):
    """Summarise one order-book bucket into a flat feature list.

    The snapshot is re-indexed onto seconds 0..599 so every bucket has 600
    rows. Missing seconds are forward-filled, and any gap before the first
    observed second is back-filled so that no NaN reaches the numpy
    statistics below (a single NaN would turn every mean/std into NaN).

    The first second is then dropped from every series: ``log_return`` pads
    its output so the first return is the first log-price minus itself, and
    the other series are trimmed to stay aligned. Each series therefore has
    599 samples, which means the ``550:600`` window holds 49 values.

    Parameters
    ----------
    book_example : pandas.DataFrame
        Rows of a single ``time_id`` with the columns ``seconds_in_bucket``,
        ``time_id``, ``bid_price1/2``, ``ask_price1/2``, ``bid_size1/2`` and
        ``ask_size1/2``.

    Returns
    -------
    list
        30 numeric features (WAP, log-return, volume and imbalance statistics
        for both book levels plus the two realized volatilities) followed by
        the bucket's ``time_id`` as an ``int``.
    """
    seconds = pd.Index(np.arange(600))
    book = book_example.set_index('seconds_in_bucket').reindex(seconds)
    # `fillna(method=...)` is deprecated; ffill/bfill are the supported spelling.
    book = book.ffill().bfill()

    bid_prices1 = book['bid_price1'].to_numpy()
    bid_prices2 = book['bid_price2'].to_numpy()
    bid_sizes1 = book['bid_size1'].to_numpy()
    bid_sizes2 = book['bid_size2'].to_numpy()
    ask_prices1 = book['ask_price1'].to_numpy()
    ask_prices2 = book['ask_price2'].to_numpy()
    ask_sizes1 = book['ask_size1'].to_numpy()
    ask_sizes2 = book['ask_size2'].to_numpy()

    # Total resting size and ask-minus-bid imbalance per level.
    trade_vols1 = (ask_sizes1 + bid_sizes1)[1:]
    trade_vols2 = (ask_sizes2 + bid_sizes2)[1:]
    trade_diffs1 = (ask_sizes1 - bid_sizes1)[1:]
    trade_diffs2 = (ask_sizes2 - bid_sizes2)[1:]

    # Weighted average price: each side's price weighted by the opposite size.
    waps1 = (bid_prices1 * ask_sizes1 + ask_prices1 * bid_sizes1) / (bid_sizes1 + ask_sizes1)
    waps2 = (bid_prices2 * ask_sizes2 + ask_prices2 * bid_sizes2) / (bid_sizes2 + ask_sizes2)

    logs1 = log_return(waps1)[1:]
    logs2 = log_return(waps2)[1:]

    waps1 = waps1[1:]
    waps2 = waps2[1:]

    # The three trailing windows used for the "recent" statistics.
    early, mid, late = slice(450, 500), slice(500, 550), slice(550, 600)

    return [
        waps1.mean(),
        waps2.mean(),
        waps1[early].mean(),
        waps1[mid].mean(),
        waps1[late].mean(),
        waps2[early].mean(),
        waps2[mid].mean(),
        waps2[late].mean(),
        waps1.std(),
        waps2.std(),
        waps1[late].std(),
        waps2[late].std(),
        logs1.mean(),
        logs2.mean(),
        logs1[early].mean(),
        logs1[mid].mean(),
        logs1[late].mean(),
        logs2[early].mean(),
        logs2[mid].mean(),
        logs2[late].mean(),
        trade_vols1.mean(),
        trade_vols2.mean(),
        trade_vols1[late].mean(),
        trade_vols2[late].mean(),
        trade_diffs1.mean(),
        trade_diffs2.mean(),
        trade_diffs1[late].mean(),
        trade_diffs2[late].mean(),
        realized_volatility(logs1),
        realized_volatility(logs2),
        int(book["time_id"].mean())
    ]

In [None]:
# Training targets, indexed by "<stock_id>-<time_id>" so they can be looked up per bucket.
train = (
    pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    .assign(row_id=lambda frame: frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str))
    .loc[:, ['row_id', 'target']]
    .set_index("row_id")
)

# One entry per item found under the training order-book directory.
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')

# Column names of the feature table, in the order produced by generate_feature_set.
columns = (
    # WAP means: whole bucket, then three trailing windows per level
    ["wap1", "wap2", "wap1_1", "wap1_2", "wap1_3", "wap2_1", "wap2_2", "wap2_3"]
    # WAP standard deviations: whole bucket, then last window
    + ["wap1_std", "wap2_std", "wap1l_std", "wap2l_std"]
    # log-return means: whole bucket, then three trailing windows per level
    + ["log1", "log2", "log1_1", "log1_2", "log1_3", "log2_1", "log2_2", "log2_3"]
    # summed sizes and ask-minus-bid size differences
    + ["volume1", "volume2", "volume1l", "volume2l"]
    + ["diff1", "diff2", "diff1l", "diff2l"]
    # realized volatilities, then identifier and label
    + ["vol1", "vol2", "stock_id", "target"]
)
def generate_feature_set(list_file):
    """Build the training feature table from per-stock order-book files.

    Every file is split into its ``time_id`` buckets, ``get_params`` is run on
    each bucket in a worker pool, and the resulting feature rows are extended
    with the stock id and the matching target from the module-level ``train``
    frame.

    Parameters
    ----------
    list_file : list of str
        Paths of the order-book parquet files. The text after the first ``=``
        in each path is used as the stock id (presumably the paths end in
        ``stock_id=<n>`` -- verify against the dataset layout).

    Returns
    -------
    pandas.DataFrame
        One row per (stock, time_id) bucket with the module-level ``columns``.

    Raises
    ------
    KeyError
        If a bucket has no ``"<stock_id>-<time_id>"`` entry in ``train``.
    """
    data_list = []
    # A single pool serves every file; the context manager shuts the workers
    # down even when reading a file or looking up a target raises.
    with Pool(processes=None) as pool:
        for file in list_file:
            stock_id = file.split('=')[1]
            book_example = pd.read_parquet(file)
            # Grouping by a scalar key and iterating avoids the deprecated
            # list-key + scalar get_group combination.
            buckets = [bucket for _, bucket in book_example.groupby('time_id')]
            for entry in pool.map(get_params, buckets):
                # get_params puts the bucket's time_id in the last slot.
                time_id = entry[-1]
                target = train.loc[f"{stock_id}-{time_id}", "target"]
                data_list.append(entry[:-1] + [int(stock_id), target])
    return pd.DataFrame(data_list, columns=columns)
# Persist the engineered training features; FeatureDataset reads this file back.
generate_feature_set(list_order_book_file_train).to_csv(
    "feature_set.csv",
    index=False,
    encoding='utf-8',
)

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between two tensors.

    Each residual is divided by its ``y_true`` entry, then the result is
    squared, averaged over all elements and square-rooted.
    """
    relative_error = (y_true - y_pred) / y_true
    return relative_error.pow(2).mean().sqrt()

In [None]:
# Training configuration.
seed = 42            # fixes the train/val split, batch shuffling and weight init
epochs = 100
batch_size = 512
device = "cuda" if torch.cuda.is_available() else "cpu"
split_size = 0.15    # fraction of samples held out for validation
feature_size = 31    # width of the model's input layer
lr = 2e-3
lr_gamma = 0.9       # per-epoch multiplicative decay passed to ExponentialLR
betas = (0.5, 0.5)   # only referenced by the commented-out Adam optimizer

# Seed every RNG the notebook draws from so a Restart & Run All reproduces
# the same split, initial weights and batch order.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
class FeatureDataset(Dataset):
    """In-memory dataset of standardised features and scaled targets.

    The CSV is expected to end with the columns ``stock_id`` and ``target``.
    A ``stock_target_mean`` column (mean target of the row's stock) is
    inserted just before ``stock_id``; everything up to and including it is
    standardised and used as the feature matrix. The target goes through a
    Box-Cox transform followed by min-max scaling.

    Note that all scalers and the per-stock means are fitted on the whole
    file, i.e. before any train/validation split is made.
    """

    def __init__(self, file_path):
        """Load ``file_path`` and fit the feature/target transforms on it."""
        self.feature_standardizer = StandardScaler()
        self.target_boxcox = PowerTransformer(method='box-cox', standardize=False)
        self.target_scaler = MinMaxScaler()

        dataframe = pd.read_csv(file_path)
        # Kept on the instance so inference code can look up a stock's mean.
        self.stocks_target_mean_val = dataframe.groupby("stock_id")["target"].mean()
        dataframe.insert(
            loc=dataframe.shape[1] - 2,
            column="stock_target_mean",
            value=dataframe['stock_id'].map(self.stocks_target_mean_val).values,
        )
        # Drop the trailing stock_id and target columns from the features.
        X = self.feature_standardizer.fit_transform(dataframe.iloc[:, :-2].to_numpy())
        y = self.target_scaler.fit_transform(
            self.target_boxcox.fit_transform(dataframe.iloc[:, -1].to_numpy().reshape(-1, 1))
        )

        self.X_train = torch.tensor(X, dtype=torch.float32)
        self.y_train = torch.tensor(y, dtype=torch.float32)

    def inverse_scale_transform(self, x):
        """Map scaled targets/predictions back to the original target units.

        Undoes the min-max scaling and then the Box-Cox transform. The
        computation is differentiable with respect to ``x`` and runs on
        whatever device ``x`` lives on.

        Parameters
        ----------
        x : torch.Tensor or array-like
            Values in the scaled target space; non-tensors are converted to a
            float32 tensor.

        Returns
        -------
        torch.Tensor
            Values in the original target space, broadcast against ``x``.
        """
        if not torch.is_tensor(x):
            x = torch.tensor(x, dtype=torch.float32)

        def _const(value):
            # Build constants next to x so CUDA inputs need no round trip.
            return torch.as_tensor(value, dtype=torch.float32, device=x.device)

        # Invert the 0-1 scaler.
        data_min = _const(self.target_scaler.data_min_)
        data_range = _const(self.target_scaler.data_max_ - self.target_scaler.data_min_)
        x_scaled = x * data_range + data_min

        # Invert the Box-Cox transform; lambda == 0 is the plain-log case,
        # where the general formula would divide by zero.
        lda = float(self.target_boxcox.lambdas_[0])
        if lda == 0.0:
            return torch.exp(x_scaled)
        return torch.exp(torch.log(1 + lda * x_scaled) / lda)

    def __len__(self):
        """Number of samples."""
        return len(self.y_train)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at ``idx``."""
        return self.X_train[idx], self.y_train[idx]

# pre_path = "../input/feature-set/"
pre_path = ""
feature_set = FeatureDataset(pre_path + "feature_set.csv")

# Hold out `split_size` of the samples for validation.
val_size = int(split_size * len(feature_set))
train_set, val_set = torch.utils.data.random_split(feature_set, [len(feature_set) - val_size, val_size])

print(f"{len(train_set)} training samples")
print(f"{len(val_set)} validation samples")

train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size)
# No shuffling for validation: the metrics are averaged per batch, so a fixed
# batch composition keeps them comparable from one epoch to the next.
val_loader = DataLoader(val_set, shuffle=False, batch_size=batch_size)

In [None]:
class Model(nn.Module):
    """Single-hidden-layer regressor with a sigmoid output.

    The sigmoid keeps predictions in (0, 1), the same range as the min-max
    scaled targets produced by ``FeatureDataset``.

    Parameters
    ----------
    in_features : int, optional
        Width of the input layer. Defaults to the module-level
        ``feature_size`` when omitted, so ``Model()`` behaves as before.
    hidden_size : int, optional
        Width of the hidden layer (default 42).
    """

    def __init__(self, in_features=None, hidden_size=42):
        super().__init__()
        if in_features is None:
            # Resolved at construction time to honour the notebook's config cell.
            in_features = feature_size
        self.model = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return one prediction in (0, 1) per input row."""
        return self.model(x)

In [None]:
model = Model().to(device)
# summary(model, (batch_size, feature_size))

# RMSPE is the training objective. The previous `criterion = nn.MSELoss()`
# was overwritten on the very next line and has been removed as dead code.
criterion = rmspe
# optimizer = optim.Adam(model.parameters(), lr=lr, betas=betas)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
# The learning rate is multiplied by lr_gamma on every scheduler.step().
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, lr_gamma)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, min_lr=3e-6, factor=0.1)

In [None]:
# wandb.init(project="Optiver-Kaggle-Contest", entity="ag8011")

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_r2 = 0.0
    for features, values in train_loader:
        features = features.to(device)
        output = model(features)

        # The loss is measured in original target units. The inverse transform
        # is applied on CPU; autograd still tracks the device-to-CPU copy of
        # the model output, so gradients flow back to the parameters.
        values_scaled = feature_set.inverse_scale_transform(values.cpu())
        output_scaled = feature_set.inverse_scale_transform(output.cpu())

        loss = criterion(values_scaled, output_scaled)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_r2 += r2_score(values_scaled, output_scaled.detach())

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    val_r2 = 0.0
    # No gradients are needed here; skipping graph construction saves memory.
    with torch.no_grad():
        for features, values in val_loader:
            features = features.to(device)
            output = model(features)

            values_scaled = feature_set.inverse_scale_transform(values.cpu())
            output_scaled = feature_set.inverse_scale_transform(output.cpu())

            loss = criterion(values_scaled, output_scaled)
            val_loss += loss.item()
            val_r2 += r2_score(values_scaled, output_scaled)

    # scheduler.step(train_loss / len(train_loader))
    scheduler.step()
#     wandb.log({"Learning Rate": optimizer.param_groups[0]['lr']})

    # Metrics are means over batches, not over individual samples.
    print(f"Iteration {epoch}, Train RMSPE: {train_loss / len(train_loader)}, Val RMSPE: {val_loss / len(val_loader)}, Train R2: {train_r2 / len(train_loader)}, Val R2: {val_r2 / len(val_loader)}")
#     wandb.log({
#         "Train RMSPE": train_loss / len(train_loader),
#         "Val RMSPE": val_loss / len(val_loader),
#         "Train R2": train_r2 / len(train_loader),
#         "Val R2": val_r2 / len(val_loader)
#     })

In [None]:
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
outputs = []
# Used when a test stock has no per-stock mean from training.
fallback_target_mean = float(feature_set.stocks_target_mean_val.mean())

model.eval()
with torch.no_grad():
    for file in list_order_book_file_test:
        # The text after '=' in the path is the stock id (same parsing as in
        # generate_feature_set).
        stock_id = int(file.split('=')[1])
        # The per-stock mean is keyed by stock_id, not by time_id.
        stock_target_mean = feature_set.stocks_target_mean_val.get(stock_id, fallback_target_mean)

        book_example = pd.read_parquet(file)
        for time_id, bucket in book_example.groupby('time_id'):
            test_features = get_params(bucket)
            # get_params ends with the time_id; replace it with the per-stock
            # mean so the layout matches the training features
            # (30 book statistics followed by stock_target_mean).
            test_features[-1] = stock_target_mean

            X = np.asarray(test_features, dtype=np.float64).reshape(1, -1)
            X_scaled = feature_set.feature_standardizer.transform(X)

            # Feed the standardised features, on the same device as the model.
            model_in = torch.tensor(X_scaled, dtype=torch.float32, device=device)
            model_out = model(model_in).cpu()
            model_scaled = feature_set.inverse_scale_transform(model_out).item()
            outputs.append([f"{stock_id}-{int(time_id)}", model_scaled])

pd.DataFrame(outputs, columns=["row_id", "target"]).to_csv('submission.csv',index = False)