# ライブラリのインポート

In [None]:
import numpy as np
import pandas as pd
import os
import glob
import re
import matplotlib.pyplot as plt
from pyarrow import parquet as pq
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score

# 訓練データの整形

In [None]:
def std_ask_bid_price_and_size(df_train=None, mode='train',
                               data_dir='/kaggle/input/optiver-realized-volatility-prediction'):
    """Build per-(stock_id, time_id) standard-deviation features from the order book.

    Every entry under ``<data_dir>/book_train.parquet`` (or ``book_test.parquet``)
    is read with ``pd.read_parquet``. Each entry's path must end in ``=<digits>``;
    those digits are used as the stock id. Within each entry the rows are grouped
    by ``time_id`` and the standard deviation (pandas' default ``std``) of the two
    best ask/bid prices and sizes is computed.

    Parameters
    ----------
    df_train : pandas.DataFrame, optional
        Frame with a ``row_id`` column formatted as ``"<stock_id>-<time_id>"``.
        Required when ``mode == 'train'``; unused otherwise.
    mode : str
        ``'train'`` reads ``book_train.parquet`` and left-joins the features onto
        ``df_train``. Any other value reads ``book_test.parquet`` and returns the
        features alone.
    data_dir : str
        Directory containing the ``book_*.parquet`` folders. The default is the
        Kaggle competition input directory.

    Returns
    -------
    pandas.DataFrame
        In train mode, ``df_train`` left-merged with the feature columns on
        ``row_id``. Otherwise the feature frame itself, with the eight ``*_std``
        columns followed by ``row_id`` and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``mode == 'train'`` and ``df_train`` is None, or if a matched path
        does not end in ``=<digits>``.
    FileNotFoundError
        If no order book entries are found under ``data_dir``.
    """
    is_train = mode == 'train'
    # Fail before any file is read instead of after processing every stock.
    if is_train and df_train is None:
        raise ValueError("df_train is required when mode='train'")

    # Listed in output-column order: sizes first, then prices.
    feature_list = ["ask_size1", "ask_size2", "bid_size1", "bid_size2",
                    "ask_price1", "ask_price2", "bid_price1", "bid_price2"]
    std_spec = {col + "_std": (col, "std") for col in feature_list}

    book_dir = 'book_train.parquet' if is_train else 'book_test.parquet'
    pattern = os.path.join(data_dir, book_dir, '*')
    # Sorted so the row order does not depend on the file system's listing order.
    list_order_book_file = sorted(glob.glob(pattern))
    if not list_order_book_file:
        raise FileNotFoundError("no order book files match " + repr(pattern))

    stock_id_pattern = re.compile(r'=(\d+)$')
    per_stock_features = []
    for path in list_order_book_file:
        match = stock_id_pattern.search(path)
        if match is None:
            raise ValueError("cannot extract a stock id from path " + repr(path))

        # One entry per stock_id.
        df_order = pd.read_parquet(path)
        df_order_feature = (
            df_order.groupby("time_id")[feature_list].agg(**std_spec).reset_index()
        )
        df_order_feature['stock_id'] = int(match.group(1))
        per_stock_features.append(df_order_feature)

    # A single concat avoids re-copying the accumulated frame on every iteration;
    # ignore_index gives unique row labels across stocks.
    df_order_feature_all = pd.concat(per_stock_features, ignore_index=True)
    df_order_feature_all['row_id'] = (
        df_order_feature_all['stock_id'].astype(str)
        + '-'
        + df_order_feature_all['time_id'].astype(str)
    )
    df_order_feature_all = df_order_feature_all.drop(columns=["time_id", "stock_id"])

    if is_train:
        return df_train.merge(df_order_feature_all, on=['row_id'], how='left')
    return df_order_feature_all

In [None]:
# Load the training table; the code below uses its stock_id, time_id and target columns.
train = pd.read_csv('/kaggle/input/optiver-realized-volatility-prediction/train.csv')

# Per-stock summary statistics of the target, printed for a quick look.
stock = (
    train.groupby("stock_id")["target"]
    .agg(["mean", "median", "std", "count", "sum"])
    .reset_index()
)
print(stock)

# Keep only the "<stock_id>-<time_id>" key and the target.
train = train.assign(
    row_id=train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
)[['row_id', 'target']]
train

# 訓練データの生成

In [None]:
# Left-join the order-book std features onto the training targets by row_id.
train = std_ask_bid_price_and_size(mode='train', df_train=train)
train.head(5)

# LightGBMの実行

In [None]:
# LightGBM regressor used for the log-target fit below.
model = lgb.LGBMRegressor(
    objective='regression',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1.0,
    colsample_bytree=0.8,
    subsample=0.8,
    # LightGBM only applies `subsample` (bagging_fraction) when the bagging
    # frequency is positive; with the default of 0 the 0.8 above is ignored.
    subsample_freq=1,
    reg_alpha=0.0,
    reg_lambda=1.0,
)

In [None]:
# Only the four price-std features are fed to the model; the size-std columns are unused.
feature_columns = [
    '{}_price{}_std'.format(side, level)
    for side in ('ask', 'bid')
    for level in (1, 2)
]

train_x = train.loc[:, feature_columns]
train_y = train.loc[:, 'target']

In [None]:
# Fit in log space: both the features and the target are log-transformed.
# NOTE(review): a zero std gives log(0) = -inf and a missing feature gives NaN;
# confirm the model treats those rows as intended.
model.fit(
    X=np.log(train_x.to_numpy()),
    y=np.log(train_y.to_numpy()),
)

In [None]:
# Build the same features from the test order book; no targets are merged in this mode.
test = std_ask_bid_price_and_size(df_train=None, mode='test')
test.head(5)

In [None]:
# Predict in log space using the same feature columns as in training.
test_x = test.loc[:, feature_columns]
test_pred = model.predict(np.log(test_x.to_numpy()))
test_pred

In [None]:
# Undo the log transform and write the submission file (row_id, target).
df_submit = test[['row_id']].assign(target=np.exp(test_pred))
df_submit.to_csv("submission.csv", index=False)
df_submit

# book_[train/test].parquetの様子

In [None]:
# book_train_parquet = pd.read_parquet("../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0")
# book_train_parquet.head(5)

In [None]:
# book_test_parquet = pd.read_parquet("../input/optiver-realized-volatility-prediction/book_test.parquet/stock_id=0")
# book_test_parquet.head(5)

# trade_[train/test].parquetの様子

In [None]:
# trade_train_parquet = pd.read_parquet("../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0")
# trade_train_parquet.head(5)

In [None]:
# trade_test_parquet = pd.read_parquet("../input/optiver-realized-volatility-prediction/trade_test.parquet/stock_id=0")
# trade_test_parquet.head(5)

# train.csvの様子

In [None]:
# train = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
# train

# test.csvの様子

In [None]:
# test = pd.read_csv("../input/optiver-realized-volatility-prediction/test.csv")
# test

# submission.csvを様子見

In [None]:
# submission = pd.read_csv("../input/optiver-realized-volatility-prediction/sample_submission.csv")
# submission

# 関数の定義

In [None]:
# def log_returns(list_stock_prices):
#     return np.log(list_stock_prices).diff()

# 株ごとの統計量を抽出

In [None]:
# stock = train.groupby("stock_id")["target"].agg(["mean","median","std","count","sum"]).reset_index()
# stock

# ヒストグラムで表示

In [None]:
# print("mean value=" ,stock["mean"].mean())
# plt.hist(stock["mean"])

In [None]:
# print("sum value=" ,stock["sum"].mean())
# plt.hist(stock["sum"])

# 試しにstock_id = 0, time_id = 5のbook，tradeを見てみる

In [None]:
# book_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
# book_test = book_example[book_example["time_id"]==5]
# book_test

In [None]:
# trade_example = pd.read_parquet("../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0")
# trade_test = trade_example[trade_example["time_id"]==5]
# trade_test

# オーダーブックの状況
## 紫が実際の取引．オーダーブックのbidとaskの間をうろうろ．

In [None]:
# samples = ["bid_price1","bid_price2","ask_price1","ask_price2"]
# plt.figure(figsize=(20,5))

# for num,a in enumerate(samples):
#     plt.plot(book_test["seconds_in_bucket"],book_test[a],label=a)
    
# plt.plot(trade_test["seconds_in_bucket"],trade_test["price"],label="trade_parquet",lw=10)
# plt.legend(fontsize=12)

# stock_id = 0で最もボラティリティが低い時のグラフ

In [None]:
# stock0 = train[train["stock_id"]==0]
# min_index = stock0["target"].idxmin()
# min_time_id = stock0.iloc[min_index]["time_id"]
# print("min index is",min_time_id,"min target is",stock0.iloc[min_index]["target"])

# book_test_min = book_example[book_example["time_id"]==min_time_id]
# trade_test_min = trade_example[trade_example["time_id"]==min_time_id]


# plt.figure(figsize=(20,5))

# for num,a in enumerate(samples):
    
   
#     plt.plot(book_test_min["seconds_in_bucket"],book_test_min[a],label=a)
    
# plt.plot(trade_test_min["seconds_in_bucket"],trade_test_min["price"],label="trade_parquet",lw=10)
# plt.legend(fontsize=12)

# stock_id = 0で最もボラティリティが高い時のグラフ

In [None]:
# stock0 = train[train["stock_id"]==0]
# max_index = stock0["target"].idxmax()
# max_time_id = stock0.iloc[max_index]["time_id"]
# print("max index is",max_time_id,"max target is",stock0.iloc[max_index]["target"])

# book_test_max = book_example[book_example["time_id"]==max_time_id]
# trade_test_max = trade_example[trade_example["time_id"]==max_time_id]


# plt.figure(figsize=(20,5))

# for num,a in enumerate(samples):
    
   
#     plt.plot(book_test_max["seconds_in_bucket"],book_test_max[a],label=a)
    
# plt.plot(trade_test_max["seconds_in_bucket"],trade_test_max["price"],label="trade_parquet",lw=10)
# plt.legend(fontsize=12)

# 実際の取引を重ねたグラフ

In [None]:
# plt.figure(figsize=(20,5))
# plt.plot(trade_test_min["seconds_in_bucket"],trade_test_min["price"],lw=10,label="min_vol_time")
# plt.plot(trade_test_max["seconds_in_bucket"],trade_test_max["price"],lw=10,label = "max_vol_time")
# plt.legend(fontsize=15)

### このグラフから，ボラティリティが大きいときは10分間の変動がすごく大きいことを確認できる

# 試しに中央値をsubmit

In [None]:
# stock2 = stock[["stock_id","median"]]
# stock2 = stock2.set_index("stock_id")
# stock2

In [None]:
# stock_dict = stock2.to_dict()
# # example stock id = 0のmedian値
# stock_dict["median"][0]

In [None]:
# sample = pd.read_csv("../input/optiver-realized-volatility-prediction/sample_submission.csv")
# sample["stock_id"] = [s.split("-")[0] for s in sample["row_id"]]
# sample["target"] = [stock_dict["median"][int(s)] for s in sample["stock_id"]]
# sample = sample.drop("stock_id",axis=1)
# sample.to_csv("submission.csv",index=False)
# sample