In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import glob
from IPython.core.debugger import set_trace
from tqdm import tqdm
import lightgbm as lgbm
import lightgbm as lgbm



In [None]:
from pathlib import Path

# Directory layout, all resolved relative to the parent of the current working directory.
COMPETITION_NAME = "optiver-realized-volatility-prediction"
ROOT = Path(".").resolve().parent
INPUT_ROOT = ROOT.joinpath("input")
RAW_DATA = INPUT_ROOT.joinpath(COMPETITION_NAME)
WORK_DIR = ROOT.joinpath("working")
OUTPUT_ROOT = WORK_DIR.joinpath("output")
PROC_DATA = ROOT.joinpath("processed_data")

# Competition tables. Later cells join `train` and `test` on (time_id, stock_id),
# read `target` from `train`, and match `row_id` against `sample_submission`.
train = pd.read_csv(RAW_DATA.joinpath("train.csv"))
test = pd.read_csv(RAW_DATA.joinpath("test.csv"))
sample_submission = pd.read_csv(RAW_DATA.joinpath("sample_submission.csv"))

In [None]:
# Preview only the first rows instead of rendering the whole frame.
test.head()

In [None]:
# Preview only the first rows instead of rendering the whole frame.
sample_submission.head()

In [None]:
# List the per-stock entries under each *.parquet directory. Every entry's path
# contains "=<stock id>", which the feature-building loops parse.
# The paths are built from RAW_DATA (the same root the CSVs are read from)
# rather than a hard-coded absolute location, and each listing is sorted
# because glob.glob returns entries in arbitrary order.
# The results stay plain strings because later cells call str.split on them.
list_order_book_file_train = sorted(glob.glob(str(RAW_DATA / "book_train.parquet" / "*")))
list_order_book_file_test = sorted(glob.glob(str(RAW_DATA / "book_test.parquet" / "*")))

list_trades_train = sorted(glob.glob(str(RAW_DATA / "trade_train.parquet" / "*")))
list_trades_test = sorted(glob.glob(str(RAW_DATA / "trade_test.parquet" / "*")))

In [None]:
def create_features(book: pd.DataFrame, trades: pd.DataFrame) -> pd.DataFrame:
    """Place ``book`` and ``trades`` side by side, aligned by row position.

    Both inputs get a fresh 0..n-1 index (their original indexes are
    discarded) before being concatenated column-wise, so row i of the result
    holds row i of ``book`` followed by row i of ``trades``.

    Args:
        book: Frame whose columns come first in the result.
        trades: Frame whose columns are appended after those of ``book``.

    Returns:
        A new frame with the columns of both inputs; the inputs are not modified.
    """
    positional_frames = [frame.reset_index(drop=True) for frame in (book, trades)]
    return pd.concat(positional_frames, axis=1)

In [None]:
# Build df_train: one row of summary statistics per (stock_id, time_id),
# joined with the training target.
book_columns = ["bid_price1", "ask_price1", "bid_price2", "ask_price2",
                "bid_size1", "ask_size1", "bid_size2", "ask_size2"]
trade_columns = ["price", "size", "order_count"]
agg_funcs = ["min", "max", "std", "mean", "median"]

# Pair book and trade files by the stock id embedded in each path ("...=<id>").
# The two listings come from independent glob calls, so zipping them
# positionally could silently combine different stocks.
book_path_by_stock = {path.split("=")[1]: path for path in list_order_book_file_train}
trade_path_by_stock = {path.split("=")[1]: path for path in list_trades_train}
# Only stocks present in both listings are processed, in ascending id order.
paired_stock_keys = sorted(book_path_by_stock.keys() & trade_path_by_stock.keys(), key=int)

train_frames = []
for stock_key in tqdm(paired_stock_keys):
    stock_id = int(stock_key)
    file_path_book = book_path_by_stock[stock_key]
    file_path_trade = trade_path_by_stock[stock_key]

    df_temp_book = pd.read_parquet(file_path_book)
    df_temp_trade = pd.read_parquet(file_path_trade)

    # Aggregate within each time_id and flatten ("price", "min") -> "price_min".
    grouped_book = df_temp_book.groupby("time_id")[book_columns].agg(agg_funcs)
    grouped_book.columns = ["_".join(column) for column in grouped_book.columns]
    grouped_book = grouped_book.reset_index()

    grouped_trade = df_temp_trade.groupby("time_id")[trade_columns].agg(agg_funcs)
    grouped_trade.columns = ["_".join(column) for column in grouped_trade.columns]
    grouped_trade = grouped_trade.reset_index()

    grouped_trade["stock_id"] = stock_id
    grouped_book["stock_id"] = stock_id

    # how="right" keeps every time_id that has book data even when it has no
    # trades (the trade features are then NaN). An inner join would drop those
    # rows entirely. Keeping the trade frame on the left preserves the column
    # order: time_id, trade features, stock_id, book features.
    stock_df = grouped_trade.merge(grouped_book, on=["time_id", "stock_id"], how="right")
    stock_df = stock_df.merge(train, on=["time_id", "stock_id"])

    train_frames.append(stock_df)

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
df_train = pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()

In [None]:
# Build df_test: one row of summary statistics per (stock_id, time_id),
# joined with the test rows (which supply row_id).
book_columns = ["bid_price1", "ask_price1", "bid_price2", "ask_price2",
                "bid_size1", "ask_size1", "bid_size2", "ask_size2"]
trade_columns = ["price", "size", "order_count"]
agg_funcs = ["min", "max", "std", "mean", "median"]

# Pair book and trade files by the stock id embedded in each path ("...=<id>").
# The two listings come from independent glob calls, so zipping them
# positionally could silently combine different stocks.
book_path_by_stock = {path.split("=")[1]: path for path in list_order_book_file_test}
trade_path_by_stock = {path.split("=")[1]: path for path in list_trades_test}
# Only stocks present in both listings are processed, in ascending id order.
paired_stock_keys = sorted(book_path_by_stock.keys() & trade_path_by_stock.keys(), key=int)

test_frames = []
for stock_key in tqdm(paired_stock_keys):
    stock_id = int(stock_key)
    file_path_book = book_path_by_stock[stock_key]
    file_path_trade = trade_path_by_stock[stock_key]

    df_temp_book = pd.read_parquet(file_path_book)
    df_temp_trade = pd.read_parquet(file_path_trade)

    # Aggregate within each time_id and flatten ("price", "min") -> "price_min".
    grouped_book = df_temp_book.groupby("time_id")[book_columns].agg(agg_funcs)
    grouped_book.columns = ["_".join(column) for column in grouped_book.columns]
    grouped_book = grouped_book.reset_index()

    grouped_trade = df_temp_trade.groupby("time_id")[trade_columns].agg(agg_funcs)
    grouped_trade.columns = ["_".join(column) for column in grouped_trade.columns]
    grouped_trade = grouped_trade.reset_index()

    grouped_trade["stock_id"] = stock_id
    grouped_book["stock_id"] = stock_id

    # how="right" keeps every time_id that has book data even when it has no
    # trades (the trade features are then NaN). An inner join would drop those
    # rows, leaving them without a prediction. Keeping the trade frame on the
    # left preserves the column order: time_id, trade features, stock_id,
    # book features.
    stock_df = grouped_trade.merge(grouped_book, on=["time_id", "stock_id"], how="right")
    stock_df = stock_df.merge(test, on=["time_id", "stock_id"])

    test_frames.append(stock_df)

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
df_test = pd.concat(test_frames, ignore_index=True) if test_frames else pd.DataFrame()

In [None]:
# Replace every missing value in both feature tables with 0.
df_train, df_test = (frame.fillna(0) for frame in (df_train, df_test))

In [None]:
# Hyperparameters for the regressor, collected in one mapping and forwarded
# unchanged as keyword arguments.
lgbm_params = {
    "boosting": 'dart',
    "num_iterations": 68,
    "learning_rate": 0.023291061656432972,
    "num_leaves": 45,
    "tree_learner": 'serial',
    "min_data_in_leaf": 20,
    "min_sum_hessian_in_leaf": 0.0019366526911684936,
    "feature_fraction": 0.7610390335831062,
    "extra_trees": True,
    "lambda_l1": 0.0068097848290300704,
    "lambda_l2": 0.0004129349780203663,
}
model = lgbm.LGBMRegressor(**lgbm_params)

In [None]:
# Use one explicit, ordered feature list for both fit and predict, so the test
# matrix carries exactly the training columns in the training order. Deriving
# the two matrices independently (drop "target" vs. drop "row_id") relies on
# the frames happening to line up; with a shared list, a missing column raises
# a KeyError instead.
# NOTE(review): every non-target column is used as a feature, including the
# time_id and stock_id join keys — confirm that is intended.
feature_columns = [column for column in df_train.columns if column != "target"]
model.fit(df_train[feature_columns], df_train["target"])
preds = model.predict(df_test[feature_columns])

In [None]:
# Range of the training target, used later to clip predictions.
train_target = df_train["target"]
min_train = train_target.min()
max_train = train_target.max()

In [None]:
# Assemble the submission: one prediction per test row_id, clipped to the
# range of the training target.
clipped_preds = (
    pd.Series(preds, index=df_test.index)
    .fillna(0)
    .astype("float64")
    .clip(min_train, max_train)
)
# Rebuild df_test in one step as (row_id, target) rather than mutating it
# column by column, so re-running this cell starts from the same inputs.
df_test = pd.DataFrame({"row_id": df_test["row_id"].astype(str), "target": clipped_preds})

# Rows of the sample submission with no prediction fall back to the mean
# training target. The previous fallback of 0 lies outside the
# [min_train, max_train] range enforced on every predicted row.
fallback_target = df_train["target"].mean()

sample_submission = sample_submission.drop("target", axis=1).merge(df_test, on=["row_id"], how="left")
sample_submission["target"] = sample_submission["target"].fillna(fallback_target)
sample_submission.to_csv('submission.csv', index=False)