In [1]:
import glob 

BASE_DIR = '/kaggle/input/optiver-realized-volatility-prediction/'

# Paths to book and trade data.
# glob.glob() returns matches in arbitrary order, while DataManager pairs the
# book and trade lists positionally with zip(); sorting keeps the two lists in
# the same per-stock order so a book file is paired with its own trade file.
TRAIN_BOOK_PATHS  = sorted(glob.glob(f'{BASE_DIR}book_train.parquet/*'))
TEST_BOOK_PATHS   = sorted(glob.glob(f'{BASE_DIR}book_test.parquet/*'))
TRAIN_TRADE_PATHS = sorted(glob.glob(f'{BASE_DIR}trade_train.parquet/*'))
TEST_TRADE_PATHS  = sorted(glob.glob(f'{BASE_DIR}trade_test.parquet/*'))

# Plotting
import matplotlib.pyplot as plt 

# Working with dataframes and sequences
import numpy as np
import pandas as pd 
import xgboost as xgb
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error,make_scorer


In [2]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must expose ``.diff()`` after ``np.log`` is applied (e.g. a
    pandas Series); the first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` against ``y_true``.

    Each error is expressed relative to the true value, so ``y_true`` must not
    contain zeros.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error_pct = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_error_pct)
   


In [3]:
class DataManager:
    """Aggregates raw order-book and trade data into one feature row per (stock_id, time_id).

    The parquet files listed in the module-level ``*_BOOK_PATHS`` /
    ``*_TRADE_PATHS`` constants are read one stock at a time. Per-snapshot
    quantities (weighted average prices, log returns, spreads) are derived
    from the book and summarised per ``time_id`` together with a few
    trade-based ratios.

    In training mode (``train=True``) the target of every row is looked up in
    a module-level DataFrame named ``train`` that has ``stock_id``,
    ``time_id`` and ``target`` columns. This class does not create that
    frame; it must exist before ``get_processed`` is called. Test mode does
    not touch it.
    """

    def __init__(self, train=True):
        self._train = train
        # glob returns paths in arbitrary order; sort both lists so that the
        # positional zip() in _traverse_book pairs files of the same stock.
        self._book_file_list = sorted(TRAIN_BOOK_PATHS if train else TEST_BOOK_PATHS)
        self._trade_file_list = sorted(TRAIN_TRADE_PATHS if train else TEST_TRADE_PATHS)
        self.measures_list = []

    @staticmethod
    def _prepare_book(book):
        """Return a sorted copy of ``book`` with the derived per-snapshot columns added."""
        # sort_values returns a new frame; the result has to be kept for the
        # per-time_id differences below to be taken in time order.
        book = book.sort_values(by=['time_id', 'seconds_in_bucket'])

        book['wap1'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])
        # transform() keeps the original row index, so the result can be
        # assigned back as a column regardless of the groupby group_keys default.
        book['log_return1'] = book.groupby('time_id')['wap1'].transform(log_return)
        # .copy() so that the column assignments below act on an independent frame.
        book = book[book['log_return1'].notnull()].copy()

        book['wap2'] = (book['bid_price2'] * book['ask_size2'] + book['ask_price2'] * book['bid_size2']) / (book['bid_size2'] + book['ask_size2'])
        book['log_return2'] = book.groupby('time_id')['wap2'].transform(log_return)
        # NOTE(review): together with the filter above this drops the first
        # two snapshots of every time_id. Kept as-is so the features stay
        # comparable with previously generated feature files.
        book = book[book['log_return2'].notnull()].copy()

        # Different spreads: the per-time_id summary statistics are taken later.
        book['h_spread_l1'] = book['ask_price1'] - book['bid_price1']
        book['h_spread_l2'] = book['ask_price2'] - book['bid_price2']
        book['v_spread_b'] = book['bid_price1'] - book['bid_price2']
        # NOTE(review): by analogy with v_spread_b this was probably meant to
        # be ask_price1 - ask_price2. Left unchanged because changing it would
        # alter feature values relative to already-cached training features.
        book['v_spread_a'] = book['ask_price1'] - book['bid_price2']

        # Lowest ask over highest bid across both levels, as a relative gap.
        best_ask = book[['ask_price1', 'ask_price2']].min(axis=1)
        best_bid = book[['bid_price1', 'bid_price2']].max(axis=1)
        book['bas'] = best_ask / best_bid - 1
        return book

    @staticmethod
    def _trade_ratios(book_slice, trade_slice):
        """Return the five trade-based ratios (p1..p5) for one time_id.

        All five default to 1 when ``trade_slice`` is empty, which is the
        value the formulas produce from their own initial counters.
        """
        if trade_slice.empty:
            return 1, 1, 1, 1, 1

        book_seconds = book_slice['seconds_in_bucket']
        trade_seconds = trade_slice['seconds_in_bucket']
        first_trade_second = trade_seconds.iloc[0]
        last_trade_second = trade_seconds.iloc[-1]

        # Every counter starts at 1 so the ratios below never divide by zero.
        # 300 is presumably the midpoint of the bucket — TODO confirm.
        snapshots_until_first_trade = 1 + int((book_seconds <= first_trade_second).sum())
        snapshots_until_300 = 1 + int((book_seconds <= 300).sum())
        snapshots_until_last_trade = 1 + int((book_seconds <= last_trade_second).sum())

        weighted_size = trade_slice['order_count'] * trade_slice['size']
        weighted_size_total = weighted_size.sum()
        weighted_size_until_300 = 1 + weighted_size[trade_seconds <= 300].sum()

        total_order_count = np.sum(trade_slice['order_count'])

        p1 = snapshots_until_300 / snapshots_until_first_trade
        p2 = snapshots_until_last_trade / snapshots_until_first_trade
        p3 = np.sum(book_slice['bid_size1']) / total_order_count
        p4 = np.sum(book_slice['ask_size1']) / total_order_count
        p5 = weighted_size_total / weighted_size_until_300
        return p1, p2, p3, p4, p5

    def _traverse_book(self):
        """Goes through every book/trade file pair and appends one feature dict per time_id."""
        for book_file_path, trade_file_path in zip(self._book_file_list, self._trade_file_list):
            # The stock id is the path segment that follows the first '='.
            stock_id = book_file_path.split("=")[1]
            print(stock_id)

            book = self._prepare_book(pd.read_parquet(book_file_path))
            trade = pd.read_parquet(trade_file_path)

            # Targets are only needed (and only available) for training data,
            # so the module-level `train` frame is not touched in test mode.
            book_stock_slice = train[train['stock_id'] == int(stock_id)] if self._train else None

            for time_id in book['time_id'].unique():
                book_slice = book[book['time_id'] == time_id]
                trade_slice = trade[trade['time_id'] == time_id]

                # Computed fresh for every time_id so that a time_id without
                # trades never reuses the ratios of the previous one.
                p1, p2, p3, p4, p5 = self._trade_ratios(book_slice, trade_slice)

                # Key order matters: downstream cells slice columns by position.
                dic = {
                    'row_id': f"{stock_id}-{time_id}",

                    'wap1_mean': book_slice['wap1'].mean(),
                    'wap1_std': book_slice['wap1'].std(),
                    'wap1_max': book_slice['wap1'].max(),

                    'wap2_mean': book_slice['wap2'].mean(),
                    'wap2_std': book_slice['wap2'].std(),
                    'wap2_max': book_slice['wap2'].max(),

                    'h_spread_l1_mean': book_slice['h_spread_l1'].mean(),
                    # NOTE(review): this key holds the MAX, not the std. The
                    # original dict listed 'h_spread_l1_std' twice and the
                    # second entry (the max) won. The name and value are kept
                    # so the column set stays identical to already-cached
                    # training features; rename to 'h_spread_l1_max' (and add
                    # the real std) when those features are regenerated.
                    'h_spread_l1_std': book_slice['h_spread_l1'].max(),

                    'h_spread_l2_mean': book_slice['h_spread_l2'].mean(),
                    'h_spread_l2_std': book_slice['h_spread_l2'].std(),
                    'h_spread_l2_max': book_slice['h_spread_l2'].max(),

                    'v_spread_b_mean': book_slice['v_spread_b'].mean(),
                    'v_spread_b_std': book_slice['v_spread_b'].std(),
                    'v_spread_b_max': book_slice['v_spread_b'].max(),

                    'v_spread_a_mean': book_slice['v_spread_a'].mean(),
                    'v_spread_a_std': book_slice['v_spread_a'].std(),
                    'v_spread_a_max': book_slice['v_spread_a'].max(),

                    'log_return1_mean': book_slice['log_return1'].mean(),
                    'log_return1_std': book_slice['log_return1'].std(),
                    'log_return1_max': book_slice['log_return1'].max(),

                    'log_return2_mean': book_slice['log_return2'].mean(),
                    'log_return2_std': book_slice['log_return2'].std(),
                    'log_return2_max': book_slice['log_return2'].max(),

                    'bas_mean': book_slice['bas'].mean(),
                    'bas_std': book_slice['bas'].std(),
                    'bas_max': book_slice['bas'].max(),

                    'ask_size_mean': book_slice['ask_size1'].mean(),
                    'ask_size_std': book_slice['ask_size1'].std(),

                    'ask_price_mean': book_slice['ask_price1'].mean(),
                    'ask_price_std': book_slice['ask_price1'].std(),

                    'bid_size_mean': book_slice['bid_size1'].mean(),
                    'bid_size_std': book_slice['bid_size1'].std(),

                    'bid_price_mean': book_slice['bid_price1'].mean(),
                    'bid_price_std': book_slice['bid_price1'].std(),

                    'order_count_mean': trade_slice['order_count'].mean(),
                    'order_count_std': trade_slice['order_count'].std(),

                    'orders_time_book5': p1,
                    'orders_time_book10': p2,

                    'ratio_bid_count': p3,
                    'ratio_ask_count': p4,

                    'order_size_time_trade': p5
                }

                # Note: When getting the test data ready, there is no target column.
                if self._train:
                    dic['target'] = book_stock_slice[book_stock_slice['time_id'] == time_id]['target'].values[0]

                self.measures_list.append(dic)

    def get_processed(self):
        """Returns the processed data as a DataFrame with one row per (stock_id, time_id)."""
        # Start from an empty list so that calling this method more than once
        # does not append the same rows again.
        self.measures_list = []
        self._traverse_book()

        return pd.DataFrame(self.measures_list)

In [4]:
# Feature preprocessing: run once to build and cache the training features
#bookma = DataManager().get_processed()
#bookma.to_csv('/kaggle/working/train_sp.csv', index=False)

In [5]:
# Training features come from the cached CSV; test features are built on the fly.
data = pd.read_csv('../input/features-preposs-data/train_sp.csv')
test_data = DataManager(train=False).get_processed()

# Min-max scale every column between the first and the last one, using the
# TRAINING minimum and range for both frames so they share one scale.
for col_name in data.columns[1:-1]:
    col_min = data[col_name].min()
    col_range = data[col_name].max() - col_min
    test_data[col_name] = (test_data[col_name] - col_min) / col_range
    data[col_name] = (data[col_name] - col_min) / col_range


0


In [6]:
# Column count of the cached training frame.
len(data.columns)

43

In [7]:

# Training data: every column between the first and the last one is a feature;
# the label is the 'target' column.
X, y = data.iloc[:, 1:-1], data['target']

# Test data: pick the training feature columns BY NAME rather than by position,
# so a different column order or an extra column in test_data cannot silently
# misalign the features (a missing column raises a KeyError instead).
X_test = test_data[X.columns]

# Training and validation splits, used to check for overfitting.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [8]:
# Number of feature columns the model is trained on.
len(X_train.columns)

41

In [9]:
# Number of feature columns in the test matrix; should equal the training count.
len(X_test.columns)

41

<a href="./train_sp.csv"> Download File </a>

In [10]:
# Gradient-boosted regressor with a squared-error objective and default
# hyperparameters.
reg_xgb = xgb.XGBRegressor(objective='reg:squarederror')

# Fit on the training split and report RMSE on the validation split each round.
# NOTE(review): the fitted model printed below reports n_estimators=100, so a
# patience of 100 rounds cannot trigger early stopping within that budget.
reg_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    early_stopping_rounds=100,
    verbose=True,
)

[0]	validation_0-rmse:0.34729
[1]	validation_0-rmse:0.24311
[2]	validation_0-rmse:0.17018
[3]	validation_0-rmse:0.11914
[4]	validation_0-rmse:0.08340
[5]	validation_0-rmse:0.05839
[6]	validation_0-rmse:0.04089
[7]	validation_0-rmse:0.02865
[8]	validation_0-rmse:0.02008
[9]	validation_0-rmse:0.01410
[10]	validation_0-rmse:0.00992
[11]	validation_0-rmse:0.00703
[12]	validation_0-rmse:0.00503
[13]	validation_0-rmse:0.00367
[14]	validation_0-rmse:0.00277
[15]	validation_0-rmse:0.00219
[16]	validation_0-rmse:0.00185
[17]	validation_0-rmse:0.00165
[18]	validation_0-rmse:0.00154
[19]	validation_0-rmse:0.00149
[20]	validation_0-rmse:0.00146
[21]	validation_0-rmse:0.00144
[22]	validation_0-rmse:0.00144
[23]	validation_0-rmse:0.00144
[24]	validation_0-rmse:0.00143
[25]	validation_0-rmse:0.00143
[26]	validation_0-rmse:0.00143
[27]	validation_0-rmse:0.00143
[28]	validation_0-rmse:0.00142
[29]	validation_0-rmse:0.00142
[30]	validation_0-rmse:0.00142
[31]	validation_0-rmse:0.00142
[32]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
# NOTE(review): `validate` is not defined in any visible cell of this notebook,
# so this cell fails on a fresh kernel. The definition below is reconstructed
# from the recorded output (" RMSPE: ...") — confirm it matches the original.
def validate(model):
    """Print the RMSPE of ``model``'s predictions on the validation split (X_val, y_val)."""
    print(f" RMSPE: {rmspe(y_val, model.predict(X_val))}")


validate(reg_xgb)

 RMSPE: 0.3118883972940022


In [12]:
# NOTE(review): `submit` is not defined in any visible cell of this notebook,
# so this cell fails on a fresh kernel. The definition below is a
# reconstruction: the output path is taken from the commented-out read in the
# next cell, and the 'row_id'/'target' column names are an assumption to be
# confirmed against the competition's sample submission.
def submit(predictions):
    """Write ``predictions`` next to the test row ids to the submission CSV."""
    submission = pd.DataFrame({'row_id': test_data['row_id'], 'target': predictions})
    submission.to_csv('/kaggle/working/submission.csv', index=False)


submit(reg_xgb.predict(X_test))


In [13]:

#pd.read_csv('/kaggle/working/submission.csv')

**------------------------------------------------------------------------------------------------------------------------------------------------------------------**
# Hyperparameter optimization (optional experiments, safe to ignore)

In [14]:
#reg_xgb = xgb.XGBRegressor(objective='reg:squarederror',
                           #max_depth = 8
                          #)

In [15]:
#reg_xgb.fit(X_train,
          # y_train,
          # verbose=True,
          # early_stopping_rounds = 100,
          # eval_metric ='rmse',
          # eval_set = [(X_val,y_val)])

In [16]:
#param_grid = {
   # 'max_depth' :[3,4,5],
    #'learning_rate': [0.1,0.01,0.05],
   # 'gamma':[0,0.25,1.0],
    #'reg_lambda':[0,1.0,10.0],
    #'scale_pos_weight': [1,3,5]
#}

In [17]:
#optimal_params = GridSearchCV(estimator = xgb.XGBRegressor(objective='reg:squarederror',
                                                           #subsample = 0.9,
                                                          # colsample_bytree = 0.5),
                              #param_grid = param_grid,
                             # scoring = 'neg_root_mean_squared_error',
                             # verbose = 2,
                             # n_jobs = -1,
                             # cv = 2
                            # )

In [18]:
#optimal_params.fit(X_train,
                  # y_train,
                  # verbose=False,
                   #early_stopping_rounds = 10,
                  # eval_metric ='rmse',
                 #  eval_set = [(X_val,y_val)]
                 # )
