In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### About this Notebook

This is a starter preprocessing notebook that creates statistical features for building a model. If you run into a resource-exhausted (out-of-memory) error during preprocessing, as I did — there are a few million records to process — you can use this [dataset](https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/249647#1369316), where I have linked the preprocessed train data in pickle and CSV formats, created using the code below.

Note: I have not run and saved this notebook, because I would most likely run into the memory issue again; I am only sharing the code that was used.

In [None]:
import pandas as pd
import numpy as np
import glob
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from tqdm import tqdm

import gc

In [None]:
# Execution-environment switch: 'Kaggle' selects the competition's input
# directory; any other value selects files located in the parent directory.
platform = 'Kaggle'

# Path settings used by the loading cells. The two '*_path' prefixes for the
# book and trade data are completed by read_trade_and_book_data with
# '<data_type>.parquet/stock_id=<stock_id>/*'.
config = (
    {
        'input_trade_path': "../input/optiver-realized-volatility-prediction/trade_",
        'input_book_path': "../input/optiver-realized-volatility-prediction/book_",
        'train_path': '../input/optiver-realized-volatility-prediction/train.csv',
        'test_path' : '../input/optiver-realized-volatility-prediction/test.csv',
    }
    if platform == 'Kaggle'
    else {
        'input_trade_path': "../trade_",
        'input_book_path': "../book_",
        'train_path': '../train.csv',
        'test_path' : '../test.csv',
    }
)

In [None]:
# Load the train and test CSV files from the paths selected in `config`.
# Both frames are later used for their 'stock_id' values and as the left side
# of the final merges on ['stock_id', 'time_id'].
train_df = pd.read_csv(config['train_path'])
test_df = pd.read_csv(config['test_path'])

In [None]:
def read_trade_and_book_data(stock_id, inp_type, data_type):
    """Load one stock's parquet partition from either the book or the trade data.

    Parameters
    ----------
    stock_id
        Identifier used to build the ``stock_id=<stock_id>`` partition folder name.
    inp_type : str
        Key into the module-level ``config`` dict that holds the path prefix
        (the notebook uses ``'input_book_path'`` and ``'input_trade_path'``).
    data_type : str
        Split suffix appended to the prefix (the notebook uses ``'train'``
        and ``'test'``).

    Returns
    -------
    pandas.DataFrame
        Contents of the first matching file, in sorted path order.

    Raises
    ------
    FileNotFoundError
        If nothing matches the partition pattern. Previously this surfaced as
        a bare ``IndexError`` that did not mention the path.
    KeyError
        If ``inp_type`` is not a key of ``config``.
    """
    pattern = config[inp_type] + f'{data_type}.parquet/stock_id={stock_id}/*'

    # glob returns matches in arbitrary order; sorting makes the choice of
    # file deterministic across runs and file systems.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f'No parquet file found matching {pattern!r}')

    # Only the first match is read.
    # NOTE(review): presumably each partition folder holds a single file -
    # confirm, otherwise the remaining files are silently ignored.
    return pd.read_parquet(matches[0])

In [None]:
def get_consolidated_final_trade_book_df(df, data_type):
    """Stack the per-stock book and trade tables for every stock listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'stock_id'`` column; its unique values, in order of
        first appearance, decide which stocks are loaded.
    data_type : str
        Split suffix forwarded to ``read_trade_and_book_data`` (the notebook
        uses ``'train'`` and ``'test'``).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(book_final_df, trade_final_df)``. Each is the row-wise
        concatenation of the per-stock tables with an added ``'stock_id'``
        column and a fresh 0..n-1 index. Both are empty frames when ``df``
        lists no stocks.
    """
    unique_id = df['stock_id'].unique().tolist()

    # Collect the per-stock frames and concatenate once at the end. Calling
    # pd.concat inside the loop re-copies everything accumulated so far on
    # every iteration (quadratic time and extra peak memory). Seeding that
    # chain with an empty DataFrame also relies on pandas ignoring empty
    # entries when picking result dtypes, which is deprecated.
    book_parts = []
    trade_parts = []
    for stock_id in tqdm(unique_id):
        # Get book data
        book_part = read_trade_and_book_data(stock_id=stock_id,
                                             inp_type='input_book_path',
                                             data_type=data_type)
        book_part['stock_id'] = stock_id
        book_parts.append(book_part)

        # Get trade data
        trade_part = read_trade_and_book_data(stock_id=stock_id,
                                              inp_type='input_trade_path',
                                              data_type=data_type)
        trade_part['stock_id'] = stock_id
        trade_parts.append(trade_part)

    # pd.concat raises on an empty list, so keep the original "empty frame"
    # result for an input without stocks. ignore_index=True replaces the
    # former reset_index(drop=True).
    book_final_df = pd.concat(book_parts, ignore_index=True) if book_parts else pd.DataFrame()
    # Release the book pieces before building the trade table to lower peak memory.
    book_parts.clear()

    trade_final_df = pd.concat(trade_parts, ignore_index=True) if trade_parts else pd.DataFrame()
    trade_parts.clear()

    return book_final_df, trade_final_df

In [None]:
# Run a garbage-collection pass first, then build the stacked book and trade
# tables for the train and test splits.
gc.collect()
train_book_final_df, train_trade_final_df = get_consolidated_final_trade_book_df(train_df, 'train')
test_book_final_df, test_trade_final_df = get_consolidated_final_trade_book_df(test_df, 'test')

# Cell output: the shape of each of the four tables.
tuple(
    table.shape
    for table in (train_book_final_df, train_trade_final_df,
                  test_book_final_df, test_trade_final_df)
)

In [None]:
def get_trade_agg_info(df):
    """Aggregate trade rows into one row per (stock_id, time_id) pair.

    For each of the columns ``seconds_in_bucket``, ``price``, ``size`` and
    ``order_count`` the mean, max, min and median are computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``time_id`` and the four columns above.

    Returns
    -------
    pandas.DataFrame
        ``stock_id`` and ``time_id`` followed by 16 feature columns named
        ``<stat>_<suffix>``, grouped by statistic (all means first, then max,
        min, median).
    """
    # Source column -> suffix used in the generated feature names.
    suffix_by_column = {
        'seconds_in_bucket': 'sec_in_bucket_trade',
        'price': 'price',
        'size': 'size',
        'order_count': 'order',
    }

    # The outer loop runs over statistics so the output column order is
    # mean_*, max_*, min_*, median_*.
    named_aggs = {
        f'{stat}_{suffix}': (column, stat)
        for stat in ('mean', 'max', 'min', 'median')
        for column, suffix in suffix_by_column.items()
    }

    per_bucket = df.groupby(['stock_id', 'time_id'])
    return per_bucket.agg(**named_aggs).reset_index()

def get_book_agg_info(df):
    """Aggregate order-book rows into one row per (stock_id, time_id) pair.

    For ``seconds_in_bucket`` and the eight bid/ask price and size columns
    (levels 1 and 2) the mean, max, min and median are computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``time_id``, ``seconds_in_bucket`` and the
        ``bid_/ask_`` ``price1/price2/size1/size2`` columns.

    Returns
    -------
    pandas.DataFrame
        ``stock_id`` and ``time_id`` followed by 36 feature columns named
        ``<stat>_<column>`` (``seconds_in_bucket`` is abbreviated to
        ``sec_in_bucket_book``), grouped by statistic (all means first, then
        max, min, median).
    """
    # Aggregated columns, in the order they appear within each statistic group.
    book_columns = (
        'seconds_in_bucket',
        'bid_price1', 'ask_price1',
        'bid_price2', 'ask_price2',
        'bid_size1', 'ask_size1',
        'bid_size2', 'ask_size2',
    )
    # Only the time column gets a dedicated feature-name suffix; the others
    # reuse their own column name.
    renamed = {'seconds_in_bucket': 'sec_in_bucket_book'}

    named_aggs = {
        f'{stat}_{renamed.get(column, column)}': (column, stat)
        for stat in ('mean', 'max', 'min', 'median')
        for column in book_columns
    }

    per_bucket = df.groupby(['stock_id', 'time_id'])
    return per_bucket.agg(**named_aggs).reset_index()

In [None]:
# Per-(stock_id, time_id) trade statistics for each split.
train_trade_agg = get_trade_agg_info(train_trade_final_df)
test_trade_agg = get_trade_agg_info(test_trade_final_df)

# Cell output: shapes of the two aggregated tables.
tuple(agg.shape for agg in (train_trade_agg, test_trade_agg))

In [None]:
# Per-(stock_id, time_id) order-book statistics for each split.
train_book_agg = get_book_agg_info(train_book_final_df)
test_book_agg = get_book_agg_info(test_book_final_df)

# Cell output: shapes of the two aggregated tables.
tuple(agg.shape for agg in (train_book_agg, test_book_agg))

In [None]:
# Left-join the trade features onto the book features: every
# (stock_id, time_id) row of the book aggregates is kept, whether or not a
# matching trade row exists.
train_agg = train_book_agg.merge(train_trade_agg, how='left', on=['stock_id', 'time_id'])
test_agg = test_book_agg.merge(test_trade_agg, how='left', on=['stock_id', 'time_id'])

# Cell output: shapes of the two merged feature tables.
tuple(agg.shape for agg in (train_agg, test_agg))

In [None]:
# Attach the aggregated features to the rows of train_df, which supplies the labels.
train_final_df = train_df.merge(train_agg, how='left', on=['stock_id', 'time_id'])

# Same for test_df, which supplies the row-id needed for submission.
test_final_df = test_df.merge(test_agg, how='left', on=['stock_id', 'time_id'])

print(train_final_df.shape, test_final_df.shape)

In [None]:
# Persist the final feature tables as pickle files in the parent directory.
# NOTE(review): on Kaggle only the current directory (/kaggle/working/) is
# described as preserved output, so files written to '../' may not be saved
# with the notebook version - confirm the intended location.
pd.to_pickle(train_final_df, '../train_agg_final_df.pickle')
pd.to_pickle(test_final_df, '../test_agg_final_df.pickle')