In [None]:
import numpy as np
import pandas as pd
import os

import time
import gc

%matplotlib inline
import matplotlib.pyplot as plt
import plotly
import plotly.express as px

from IPython import display
import sklearn
import gc
import time

gc.collect()

print('Library imported.')


In [None]:
# 定義整個資料預處理

def preprocess_trade(trade_data):
    """Add per-(stock_id, time_id) aggregate columns to a trade frame.

    The input frame is modified in place and the same object is returned.
    Every row receives the aggregate of the (stock_id, time_id) group it
    belongs to.

    Columns added:
        mean_price, max_price          -- mean / max of ``price``
        total_trader                   -- sum of ``order_count``
        mean_totalsold, max_totalsold  -- mean / max of ``price * size``

    Parameters
    ----------
    trade_data : pandas.DataFrame
        Must contain ``stock_id``, ``time_id``, ``price``, ``size`` and
        ``order_count`` columns.

    Returns
    -------
    pandas.DataFrame
        ``trade_data`` itself, with the columns above appended.
    """
    bucket_keys = ['stock_id', 'time_id']

    # Price statistics within each bucket.
    bucket_price = trade_data.groupby(bucket_keys)['price']
    trade_data['mean_price'] = bucket_price.transform('mean')
    trade_data['max_price'] = bucket_price.transform('max')

    # Sum of order_count within each bucket.
    bucket_orders = trade_data.groupby(bucket_keys)['order_count']
    trade_data['total_trader'] = bucket_orders.transform('sum')

    # Per-row traded value (price * size); only needed as a scratch column
    # for the two aggregates below.
    trade_data['total_sold'] = trade_data['price'] * trade_data['size']
    bucket_sold = trade_data.groupby(bucket_keys)['total_sold']
    trade_data['mean_totalsold'] = bucket_sold.transform('mean')
    trade_data['max_totalsold'] = bucket_sold.transform('max')

    # Remove the scratch column so only the aggregates remain.
    del trade_data['total_sold']

    return trade_data
    
def log_return(list_stock_prices):
    """Return the log return between each price and the one before it.

    Parameters
    ----------
    list_stock_prices : pandas.Series
        Price series; must support ``.diff()`` after ``np.log``.

    Returns
    -------
    pandas.Series
        ``log(p[t]) - log(p[t-1])``; the first element is NaN because it
        has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the realized volatility of a series of log returns.

    Computed as the square root of the sum of squared log returns.
    """
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))


# Compute the realized volatility of every stock_id-time_id bucket in one order book.
def compute_realized_vol(file_path, realized_vol_colname):
    """Compute per-time_id realized volatility from one order-book parquet.

    Parameters
    ----------
    file_path : str
        Path to the order-book parquet data of a single stock. The stock id
        is parsed from a ``stock_id=<id>`` path component.
    realized_vol_colname : str
        Name given to the realized-volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``row_id`` (``"<stock_id>-<time_id>"``) and
        ``realized_vol_colname``. A time_id with only one book row yields no
        log return and is therefore absent from the result.

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``=`` from which a stock id can be read.
    """
    import re  # local import so the function does not rely on a notebook-level import

    df_book_data = pd.read_parquet(file_path)

    # Weighted average price (WAP) from the best bid/ask level.
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])

    # Log return of the WAP inside each time_id. `transform` always returns a
    # result aligned with the original index; `groupby.apply` prepends the
    # group key to the index on newer pandas, which breaks the assignment.
    df_book_data['log_return'] = (
        df_book_data.groupby(['time_id'])['wap'].transform(log_return)
    )

    # The first row of every time_id has no previous price, so drop its NaN.
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]

    # Aggregate the log returns of each time_id into one realized volatility.
    df_realized_vol_per_stock = (
        df_book_data.groupby(['time_id'])['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': realized_vol_colname})
    )

    # Read only the value of the `stock_id=<id>` component, so a trailing
    # file name or an earlier '=' in the path cannot leak into row_id.
    # Fall back to the plain split when the path has no such component.
    id_match = re.search(r'stock_id=([^/\\]+)', file_path)
    stock_id = id_match.group(1) if id_match else file_path.split('=')[1]

    # row_id = "<stock_id>-<time_id>", the key identifying each sample.
    df_realized_vol_per_stock['row_id'] = (
        stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', realized_vol_colname]]


def generate_realized_vol(list_file, realized_vol_colname):
    """Compute realized volatility for several order-book files and stack the results.

    Parameters
    ----------
    list_file : list of str
        Paths to the per-stock order-book data, one entry per stock_id.
    realized_vol_colname : str
        Column name used for the computed realized volatility.

    Returns
    -------
    pandas.DataFrame
        The per-file results of ``compute_realized_vol`` concatenated in
        input order (original per-file indices are kept). An empty frame is
        returned when ``list_file`` is empty.
    """
    # Collect first and concatenate once: concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    per_stock_frames = [
        compute_realized_vol(file, realized_vol_colname) for file in list_file
    ]
    if not per_stock_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(per_stock_frames)

import glob

# One entry per item found directly under the training order-book directory.
order_book_list = glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*')

# Use the preprocessed training table when it is available; otherwise
# rebuild the realized-volatility feature from the raw order books.
# NOTE(review): the fallback branch only yields 'row_id' and 'realized_vol';
# the training cell also reads 'target' and the trade aggregate columns from
# df_train -- confirm they are merged in somewhere before training.
if not os.path.exists('../input/train-processed/train_processed.csv'):
    df_train = generate_realized_vol(list_file=order_book_list, realized_vol_colname='realized_vol')
else:
    df_train = pd.read_csv('../input/train-processed/train_processed.csv')

print('必要function已經設定完成')

gc.collect()

In [None]:
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Computed as ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    pct_error = (y_true - y_pred) / y_true
    mean_sq_pct_error = np.mean(np.square(pct_error))
    return np.sqrt(mean_sq_pct_error)

print('RMSPE 定義完畢。')

In [None]:
import pickle
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer

# Small constant added to the target so the division below never hits zero.
mu = 1e-9

def gradient(y_true, y_pred):
    """First derivative of the squared percentage error w.r.t. the prediction.

    Per-sample loss:  L = ((y_true - y_pred) / (y_true + mu)) ** 2
    dL/dy_pred      = -2 * (y_true - y_pred) / (y_true + mu) ** 2

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples,)

    Returns
    -------
    Array of shape (n_samples,) with the per-sample gradient.
    """
    # The denominator is squared: it comes from differentiating the square of
    # the relative error, not the relative error itself.
    return -2 * (y_true - y_pred) / (y_true + mu) ** 2

def hessian(y_true, y_pred):
    """Second derivative of the squared percentage error w.r.t. the prediction.

    d2L/dy_pred2 = 2 / (y_true + mu) ** 2, which does not depend on
    ``y_pred``; the argument is kept so the signature mirrors ``gradient``.
    """
    return 2 / (y_true + mu) ** 2

def rmspe_loss(y_true, y_pred):
    """Custom objective: per-sample gradient and Hessian of the squared percentage error.

    Returns
    -------
    (grad, hess) : tuple of arrays
        grad: first-order derivative for each sample point.
        hess: second-order derivative for each sample point.
    """
    grad = gradient(y_true, y_pred)
    hess = hessian(y_true, y_pred)

    return grad, hess

# LightGBM hyperparameter candidates explored by the grid search.
lgb_params_grid = {
    "num_leaves":[128, 256, 512],
    "min_data_in_leaf":np.arange(100, 300, 100),  # candidates: 100 and 200
    'learning_rate':[1e-3],
    'n_estimators':[2000, 3000, 5000],
    'reg_alpha':[0.1, 0.01, 1],
    'reg_lambda':[0.1, 0.01, 1],
    'random_state':[99],
    # `silent` was deprecated in LightGBM 3.3 and removed in 4.0;
    # `verbose=-1` is the supported way to keep training quiet.
    'verbose':[-1],
    # Train with the custom squared-percentage-error objective defined above.
    'objective':[rmspe_loss],
    'n_jobs':[-1]
}

# Custom scorer (RMSPE). RMSPE is an error, so lower is better;
# greater_is_better=False makes scikit-learn negate it for model selection.
custom_score = make_scorer(rmspe, greater_is_better = False)


# Base estimator; its hyperparameters are filled in from the grid above.
lgbm = lgb.LGBMRegressor()

print('LGBM Model parameter 搜尋池建立完畢.')

#### 每個stock_id在特定的時間區段time_id內都有一個目標realized vol. 值，如stock_id= 0在time_id = 5時的target即為0.004136

#### time_id代表某個時間區段的代號ID；target是用該time_id**之後10分鐘內的feature data計算出來的realized volatility**，也就是我們要預測的目標。

#### 每個time_id時間區段內歷經長度為10分鐘, 即seconds_in_bucket的值會由0~600秒(但不一定每一秒都有紀錄)

#### 我們要用**現在的time_id, stock_id對應的order book, trade資料**建立數學模型，預測未來10分鐘的realized volatility

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV

# Target and the feature columns fed to the model.
y = df_train['target']
x = df_train[['realized_vol', 'mean_price', 'max_price', \
              'total_trader', 'mean_totalsold', 'max_totalsold']]

# NOTE(review): TimeSeriesSplit builds folds from row order, so this assumes
# df_train rows are in chronological order -- confirm.
time_cv = TimeSeriesSplit(n_splits = 3)
val_metric = []

# Hyperparameter tuning: exhaustive search over lgb_params_grid, scored with
# the custom (negated) RMSPE scorer.
grid_search = GridSearchCV(
    estimator = lgbm,
    param_grid = lgb_params_grid,
    cv = time_cv,
    scoring = custom_score,
    verbose = 100
)


# No `verbose` fit argument: it is forwarded to LGBMRegressor.fit, which no
# longer accepts it in LightGBM >= 4.0 (every fit would fail), and without an
# eval_set it produced no evaluation log anyway.
grid_search.fit(x, y)


In [None]:
grid_search.best_params_

In [None]:
import pickle

# Write the best estimator found by the grid search to disk as the notebook output.
with open('best_lgbm.pickle', mode='wb') as file:
    pickle.dump(grid_search.best_estimator_, file)