In [None]:
import numpy as np 
import pandas as pd 
import plotly.express as px
import glob

# Root folder of the competition dataset; every input path below is built from it.
DATA_DIR = '/kaggle/input/optiver-realized-volatility-prediction'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Per-stock partition paths (one entry per `stock_id=<id>` folder).
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the order reproducible and keeps the book and trade lists aligned stock-for-stock
# whenever both folders contain the same partitions.
TRAIN_BOOK_PATHS = sorted(glob.glob(f"{DATA_DIR}/book_train.parquet/*"))
TEST_BOOK_PATHS  = sorted(glob.glob(f"{DATA_DIR}/book_test.parquet/*"))
TRAIN_TRADE_PATHS = sorted(glob.glob(f"{DATA_DIR}/trade_train.parquet/*"))
TEST_TRADE_PATHS  = sorted(glob.glob(f"{DATA_DIR}/trade_test.parquet/*"))

def submit(prediction):
    """Write ``/kaggle/working/submission.csv`` from the given predictions.

    The ``row_id`` column is taken from the global ``test_data`` frame and paired
    with ``prediction`` by position, so the i-th prediction belongs to the i-th row
    of ``test_data``. A fresh frame is built on every call; the global ``sub``
    template is left untouched, which keeps the function safe to re-run.

    Args:
        prediction: one predicted target per row of ``test_data`` (Series, array
            or list).

    Raises:
        ValueError: if the number of predictions differs from the number of
            ``test_data`` rows.
    """
    row_ids = test_data['row_id'].to_numpy()
    targets = np.asarray(prediction).reshape(-1)

    # Pair by position, not by pandas index: index alignment would silently turn
    # mismatched labels into NaN targets.
    if len(row_ids) != len(targets):
        raise ValueError(
            f"Got {len(targets)} predictions for {len(row_ids)} test rows"
        )

    submission = pd.DataFrame({'row_id': row_ids, 'target': targets})
    submission.to_csv('/kaggle/working/submission.csv', index=False)

- The data file for each stock_id will be imported from the book_train folder; the ids go from 0 to 126.

## Preprocessing

Data will be categorized by the time-id, meaning the modeling will happen for each time-id individually. Hence, statistical measures for a given time-id should be extracted and fed into a model.

- Get the wap for each row
- Drop seconds_in_bucket

In [None]:
class DataManager:
    """Turn raw order-book and trade files into one feature row per (stock, time_id).

    Each row holds the mean and standard deviation of the level-1 book columns, of
    the weighted average price (WAP) and of its log returns for one ``time_id`` of
    one stock, followed by trade statistics for that stock. In training mode the
    matching ``target`` from the global ``train`` frame is appended as last column.

    Args:
        train: ``True`` reads the ``TRAIN_*`` path lists and attaches targets;
            ``False`` reads the ``TEST_*`` path lists and omits the target.
    """

    def __init__(self, train=True):
        self._train = train
        self._book_file_list = TRAIN_BOOK_PATHS if train else TEST_BOOK_PATHS
        self._trade_file_list = TRAIN_TRADE_PATHS if train else TEST_TRADE_PATHS
        # One dict per (stock, time_id); rebuilt from scratch by _traverse_book().
        self.measures_list = []

    @staticmethod
    def _stock_id_of(path):
        """Return the stock id (as a string) from a ``.../stock_id=<id>`` path."""
        return path.split("=")[1]

    def _log_return(self, stock_prices):
        """Return the first difference of the natural log of ``stock_prices``."""
        return np.log(stock_prices).diff()

    def _traverse_book(self):
        """Rebuild ``self.measures_list`` from every book file.

        Raises:
            FileNotFoundError: if a book file has no trade file for the same stock.
            KeyError: in training mode, if ``train`` has no row for a
                (stock, time_id) pair present in the book data.
        """
        # Start clean so that repeated calls do not append duplicate rows.
        self.measures_list = []

        # Pair book and trade files by stock id rather than by list position: the
        # two path lists come from separate glob calls and need not share an order.
        trade_path_by_stock = {
            self._stock_id_of(path): path for path in self._trade_file_list
        }

        for book_file_path in self._book_file_list:
            stock_id = self._stock_id_of(book_file_path)
            if stock_id not in trade_path_by_stock:
                raise FileNotFoundError(
                    f"No trade file found for stock_id={stock_id}"
                )

            book = pd.read_parquet(book_file_path)
            book['wap'] = (
                book['bid_price1'] * book['ask_size1']
                + book['ask_price1'] * book['bid_size1']
            ) / (book['bid_size1'] + book['ask_size1'])
            # transform() keeps the original row index, so the result can be
            # assigned back as a column regardless of the pandas version.
            book['log_return'] = book.groupby('time_id')['wap'].transform(self._log_return)
            # The first row of every time_id has no previous price to diff against.
            book = book[~book['log_return'].isnull()]

            trade = pd.read_parquet(trade_path_by_stock[stock_id])
            # NOTE(review): these statistics cover ALL trades of the stock, so they
            # are identical for every time_id of that stock. Per-time_id slicing
            # looks like the intent, but switching would change the feature values
            # (and can yield NaN for buckets without trades), so any cached feature
            # table would have to be regenerated together with that change.
            trade_stats = {
                'actual_price_mean': trade['price'].mean(),
                'actual_price_std': trade['price'].std(),
                'size_mean': trade['size'].mean(),
                'size_std': trade['size'].std(),
                'order_count_mean': trade['order_count'].mean(),
                'order_count_std': trade['order_count'].std(),
            }

            if self._train:
                stock_rows = train[train['stock_id'] == int(stock_id)]
                # First target per time_id, indexed for direct lookup.
                target_by_time = (
                    stock_rows.drop_duplicates('time_id').set_index('time_id')['target']
                )

            # sort=False keeps time_ids in order of first appearance.
            for time_id, book_slice in book.groupby('time_id', sort=False):
                dic = {
                    'row_id': f"{stock_id}-{time_id}", # Fixing row-id from here
                    'wap_mean': book_slice['wap'].mean(),
                    'wap_std': book_slice['wap'].std(),
                    'log_return_mean': book_slice['log_return'].mean(),
                    'log_return_std': book_slice['log_return'].std(),
                    'ask_size_mean': book_slice['ask_size1'].mean(),
                    'ask_size_std': book_slice['ask_size1'].std(),
                    'ask_price_mean': book_slice['ask_price1'].mean(),
                    'ask_price_std': book_slice['ask_price1'].std(),
                    'bid_size_mean': book_slice['bid_size1'].mean(),
                    'bid_size_std': book_slice['bid_size1'].std(),
                    'bid_price_mean': book_slice['bid_price1'].mean(),
                    'bid_price_std': book_slice['bid_price1'].std(),
                }
                # Trade columns follow the book columns; downstream code slices
                # features by position, so the column order must stay fixed.
                dic.update(trade_stats)

                if self._train:
                    dic['target'] = target_by_time.loc[time_id]

                self.measures_list.append(dic)

    def get_processed(self):
        """Process all files and return the feature rows as a DataFrame."""
        self._traverse_book()

        return pd.DataFrame(self.measures_list)

### Testing different pre-processing techniques

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae

# Pre-computed training features, loaded from a cached CSV.
data = pd.read_csv('/kaggle/input/processedbooktrade/train_v1.csv')

# Features are picked by position: everything between the first two columns and
# the last one. NOTE(review): presumably the first two columns are a saved index
# and row_id, and the last one is the target -- confirm against the CSV.
X = data.iloc[:, 2:-1]
y = data['target']

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Test features are computed on the fly; the first column (row_id) is not a feature.
test_data = DataManager(train=False).get_processed()
X_test = test_data.iloc[:, 1:]

In [None]:
# Show the container types of the training split.
# Author's note: X_train has 6128 rows x 18 columns, y_train has 6128 entries.
# This is a regression task.
for split in (X_train, y_train):
    print(type(split))

### Modeling

In [None]:
import torch
import torch.nn as nn

# Convert pandas data to a tensor
def pandas_to_tensor(x):
    """Return a ``torch.Tensor`` holding a copy of the values in ``x``.

    ``x`` is first materialised as a NumPy array, so the tensor's dtype follows
    NumPy's dtype for the data.
    """
    return torch.tensor(np.array(x))

# Rebind the training split to tensors; later cells expect tensors under these names.
X_train, y_train = pandas_to_tensor(X_train), pandas_to_tensor(y_train)

In [None]:
class Volatility_prediction(nn.Module):
    """Fully connected regressor: ``n_features -> 200 -> 50 -> 1`` with ReLU.

    The head is a plain linear unit. A softmax must not follow it: over a single
    output unit it carries no information, and taken over ``dim=0`` it normalises
    across the batch (or returns exactly 1.0 for a single unbatched sample), which
    makes every prediction depend on the other rows instead of on its own features.

    Args:
        n_features: number of input features per sample (defaults to 18).
    """

    def __init__(self, n_features=18):
        super().__init__()
        self.predict = nn.Sequential(
            nn.Linear(n_features, 200),
            nn.ReLU(),
            nn.Linear(200, 50),
            nn.ReLU(),
            nn.Linear(50, 1),
        )

    def forward(self, x):
        """Map ``x`` of shape ``(..., n_features)`` to predictions of shape ``(..., 1)``."""
        return self.predict(x)


net = Volatility_prediction()
optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
loss_func = nn.MSELoss()

# Cast once up front; the data does not change between epochs.
train_features = X_train.to(torch.float32)
train_targets = y_train.to(torch.float32)

# Full-batch gradient descent: every epoch is one step on the whole training set.
for epoch in range(50000):
    out = net(train_features)
    # The network returns (N, 1); drop the trailing axis to match the (N,) targets.
    loss = loss_func(out.squeeze(1), train_targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Sparse progress report so a stalled or diverging run is visible.
    if epoch % 5000 == 0:
        print(f"epoch {epoch}: loss={loss.item():.6g}")

# The file name is a backslash followed by "model.pkl" -- the same name the
# original '\model.pkl' literal produced -- spelled with a valid escape sequence,
# because '\m' is an invalid escape that newer Python versions warn about.
torch.save(net, '\\model.pkl')


In [None]:
def model_test(X):
    """Load the saved model and predict one value per row of ``X``.

    Rows are fed to the model one at a time and the per-row outputs are
    concatenated, so the result has one entry per row of ``X``.

    Args:
        X: 2-D feature table (DataFrame or array-like) with one sample per row.

    Returns:
        ``pd.Series`` of float64 predictions; empty when ``X`` has no rows.
    """
    # Backslash + "model.pkl": the name the '\model.pkl' literal evaluates to,
    # written with a valid escape sequence.
    model_path = '\\model.pkl'

    # The file holds a fully pickled module, which newer torch versions refuse to
    # load unless weights_only=False is passed; older versions do not know the
    # argument and raise TypeError. Unpickling executes code, so only load files
    # this notebook wrote itself.
    try:
        model = torch.load(model_path, weights_only=False)
    except TypeError:
        model = torch.load(model_path)
    model.eval()

    inputs = torch.tensor(np.asarray(X)).to(torch.float32)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        per_row = [model(row) for row in inputs]

    # Concatenate once at the end (repeated cat in the loop is quadratic) and fall
    # back to an empty tensor when there were no rows at all.
    if per_row:
        outputs = torch.cat(per_row, 0)
    else:
        outputs = torch.empty(0)

    return pd.Series(outputs.to(torch.float64).detach().numpy())

# Predictions for the held-out validation rows.
outputs = model_test(X_val)

In [None]:
# Validation error, with arguments in sklearn's (y_true, y_pred) order; the mean
# absolute error is symmetric, so the order does not affect the value.
mae(y_val, outputs)

# from sklearn import tree
# import matplotlib.pyplot as plt

# fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=800)
# tree.plot_tree(rfg.estimators_[10],
#                feature_names = X.columns, 
#                filled = True);

# fig.savefig('rf_individualtree.png')

### Submission process

In [None]:
# Predict on the test features, then write the submission file.
test_predictions = model_test(X_test)
submit(test_predictions)

In [None]:
from joblib import dump, load
# Persist the trained network with joblib.
# NOTE(review): the "rfg_..." file name looks like a leftover from a random-forest
# experiment; the object written here is the torch network `net` -- confirm the
# intended name before relying on this artifact.
dump(net, 'rfg_1000_10_train_v1.joblib') 
# Example of loading a previously dumped model:
# clf = load('rfg_1000_10.joblib')

In [None]:
# pd.read_csv('/kaggle/working/submission.csv')

In [None]:
# clf.get_params()