In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### This is a starter notebook on how you can create stats features and build models. 

Note: I have used only the <B>trade</B> features and have not used the <B>book</B> features yet. You can include them and improve your score. Take a look at my notebook on [data aggregated](https://www.kaggle.com/thanish/data-aggregated) to consolidate the book and trade features together. You can use them to build models on top of it.

In [None]:
import pandas as pd
import numpy as np
import glob
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from tqdm import tqdm

import gc

In [None]:
# Input locations. The two *_path entries are prefixes: read_trade_book_data
# appends '<data_type>.parquet/stock_id=<stock_id>/*' to them.
config = dict(
    input_trade_path="../input/optiver-realized-volatility-prediction/trade_",
    input_book_path="../input/optiver-realized-volatility-prediction/book_",
    train_path='../input/optiver-realized-volatility-prediction/train.csv',
    test_path='../input/optiver-realized-volatility-prediction/test.csv',
)

In [None]:
# Peek at a single parquet file of the test book data for stock_id=0.
# The path (including the hashed file name) is hard-coded.
temp = pd.read_parquet("../input/optiver-realized-volatility-prediction/book_test.parquet/stock_id=0/7832c05caae3489cbcbbb9b02cf61711.parquet")
temp

In [None]:
# Load the train and test CSV files from the paths configured in `config`,
# then display the train frame.
train_df = pd.read_csv(config['train_path'])
test_df = pd.read_csv(config['test_path'])
train_df

In [None]:
# Display the test frame read from config['test_path'].
test_df

In [None]:
def rmspe(y_true, y_pred):
    """Root Mean Square Percentage Error of ``y_pred`` relative to ``y_true``.

    The error of each element is divided by the matching ``y_true`` value,
    squared, averaged along axis 0 and square-rooted. Inputs must support
    element-wise arithmetic (e.g. numpy arrays).
    """
    pct_error = (y_true - y_pred) / y_true
    mean_squared_pct = np.mean(np.square(pct_error), axis=0)
    return np.sqrt(mean_squared_pct)

In [None]:
def read_trade_book_data(stock_id, inp_type, data_type):
    """Read the first parquet file found for one stock.

    ``inp_type`` is the ``config`` key that holds the path prefix, and
    ``data_type`` is inserted before ``.parquet`` in the directory name.
    Only the first path returned by ``glob`` is read.
    """
    pattern = config[inp_type] + f'{data_type}.parquet/stock_id={stock_id}/*'
    matching_files = glob.glob(pattern)
    return pd.read_parquet(matching_files[0])

In [None]:
def get_final_df(df, data_type):
    """Load the book and trade data of every stock in ``df`` into two frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``stock_id`` column; each unique id is loaded once.
    data_type : str
        Forwarded to ``read_trade_book_data`` to select which data set to read.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(book_final_df, trade_final_df)``. Each frame has a ``stock_id``
        column added and a fresh 0..n-1 index. Both are empty frames when
        ``df`` contains no stock ids.
    """
    unique_id = df['stock_id'].unique().tolist()

    # Collect the per-stock frames and concatenate once at the end. Calling
    # pd.concat inside the loop re-copies everything accumulated so far on
    # every iteration.
    book_parts = []
    trade_parts = []
    for stock_id in tqdm(unique_id):
        # Get book data
        temp_book_stock_df = read_trade_book_data(stock_id=stock_id,
                                                  inp_type='input_book_path',
                                                  data_type=data_type)
        temp_book_stock_df['stock_id'] = stock_id
        book_parts.append(temp_book_stock_df)

        # Get trade data
        temp_trade_stock_df = read_trade_book_data(stock_id=stock_id,
                                                   inp_type='input_trade_path',
                                                   data_type=data_type)
        temp_trade_stock_df['stock_id'] = stock_id
        trade_parts.append(temp_trade_stock_df)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    book_final_df = pd.concat(book_parts, ignore_index=True) if book_parts else pd.DataFrame()
    del book_parts  # release the per-stock pieces before building the next frame
    trade_final_df = pd.concat(trade_parts, ignore_index=True) if trade_parts else pd.DataFrame()
    del trade_parts

    gc.collect()

    return book_final_df, trade_final_df

In [None]:
gc.collect()
train_book_final_df, train_trade_final_df = get_final_df(df=train_df, data_type='train')
# The stock ids for the test data must come from test_df: get_final_df reads
# one file per stock id, so ids taken from train_df may not exist in the test data.
test_book_final_df, test_trade_final_df = get_final_df(df=test_df, data_type='test')

train_book_final_df.shape, train_trade_final_df.shape, test_book_final_df.shape, test_trade_final_df.shape

In [None]:
def get_trade_agg_info(df):
    """Aggregate trade rows to one row per (stock_id, time_id).

    For each of ``seconds_in_bucket``, ``price``, ``size`` and ``order_count``
    the mean, max, min and median are computed. Output columns are named
    ``<stat>_<suffix>`` with suffixes ``sec_in_bucket``, ``price``, ``size``
    and ``order``, ordered stat by stat. The group keys are returned as
    regular columns.
    """
    stats = ('mean', 'max', 'min', 'median')
    # (output suffix, source column)
    sources = (
        ('sec_in_bucket', 'seconds_in_bucket'),
        ('price', 'price'),
        ('size', 'size'),
        ('order', 'order_count'),
    )
    named_aggs = {
        f'{stat}_{suffix}': (column, stat)
        for stat in stats
        for suffix, column in sources
    }

    grouped = df.groupby(['stock_id', 'time_id'])
    return grouped.agg(**named_aggs).reset_index()

def get_book_agg_info(df):
    """Aggregate rows to one row per (stock_id, time_id).

    Computes the mean, max, min and median of ``seconds_in_bucket``,
    ``price``, ``size`` and ``order_count`` and returns them as
    ``<stat>_<suffix>`` columns next to the group keys.

    NOTE(review): this aggregates exactly the same columns, under the same
    output names, as ``get_trade_agg_info``. Presumably the book data has a
    different schema -- confirm the column names before using this on book
    frames.
    """
    suffix_to_column = {
        'sec_in_bucket': 'seconds_in_bucket',
        'price': 'price',
        'size': 'size',
        'order': 'order_count',
    }

    named_aggs = {}
    for stat in ('mean', 'max', 'min', 'median'):
        for suffix, column in suffix_to_column.items():
            named_aggs[f'{stat}_{suffix}'] = (column, stat)

    per_bucket = df.groupby(['stock_id', 'time_id']).agg(**named_aggs)
    return per_bucket.reset_index()

In [None]:
# get_final_df returns a (book_df, trade_df) tuple, so calling .shape on its
# result fails. Only the trade features are used in this notebook, so keep
# element [1]; the book frame is dropped right away.
train_final_df = get_final_df(df=train_df, data_type='train')[1]
test_final_df = get_final_df(df=test_df, data_type='test')[1]
train_final_df.shape, test_final_df.shape

In [None]:
# The notebook defines get_trade_agg_info and get_book_agg_info but no
# get_agg_info; this notebook uses the trade features, so call the trade aggregator.
train_agg = get_trade_agg_info(df=train_final_df)
test_agg = get_trade_agg_info(df=test_final_df)

train_agg.shape, test_agg.shape

In [None]:
# Join the aggregates with the train/test frames on (stock_id, time_id) and
# replace any missing value with the -999 sentinel.
merge_keys = ['stock_id', 'time_id']
train_final_df = pd.merge(train_agg, train_df, how='left', on=merge_keys).fillna(-999)
test_final_df = pd.merge(test_df, test_agg, how='left', on=merge_keys).fillna(-999)

print(train_final_df.shape, test_final_df.shape)

In [None]:
# Dependent column, identifier columns to exclude, and the remaining columns
# used as model inputs.
dep = 'target'
drop = ['stock_id', 'time_id']
excluded = [dep, *drop]
indep = train_final_df.columns.difference(excluded)
indep

In [None]:
# random_state is fixed so that re-running the notebook rebuilds the same
# forest and produces the same predictions.
RF = RandomForestRegressor(n_jobs=-1, n_estimators=15, random_state=42)
RF.fit(train_final_df[indep], train_final_df[dep])
RF_pred = RF.predict(test_final_df[indep])

In [None]:
# row_id is built as "<stock_id>-<time_id>" for every test row.
sub_id = (
    test_final_df['stock_id'].astype(str)
    + '-'
    + test_final_df['time_id'].astype(str)
)
submission_df = pd.DataFrame({'row_id': sub_id, 'target': RF_pred})
submission_df

In [None]:
# Write the submission frame to submission.csv in the current working
# directory, without the index column.
submission_df.to_csv("submission.csv", index=False)