## 匯入之前notebook的Output file: 需要在add data處選擇Output file，再選擇該Notebook進行匯入。匯入的資料會跑到Input資料夾。Output資料夾每次開啟必定清空。

In [None]:
import numpy as np
import pandas as pd
import os

import time
import gc

%matplotlib inline
import matplotlib.pyplot as plt
import plotly
import plotly.express as px

from IPython import display
import sklearn
import gc
import time

gc.collect()

print('Library imported.')

In [None]:
# Read train.csv and test.csv into DataFrames.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

gc.collect()

# Disabled: load of a previously saved joined table.
#df_join = pd.read_csv('../input/volatility-prediction/df_joined.csv')

print('Data imported.')

gc.collect()

In [None]:
# Show the first rows of the train table, then of the test table.
print('訓練集的樣子: 實際的stock_id, time_id與目標值realized_volatility')
print(train.head())
print('測試集的樣子: ')
print(test.head())

#### 每個stock_id在特定的時間區段time_id內都有一個目標realized vol. 值，如stock_id= 0在time_id = 5時的target即為0.004136

#### time_id代表某個時間區段的代號ID, target代表此time_id**未來10分鐘後的feature data計算出來的值**, 也就是我們要預測的目標。

#### 每個time_id時間區段內歷經長度為10分鐘, 即seconds_in_bucket的值會由0~600秒(但不一定每一秒都有紀錄)

#### 我們要用**現在的time_id, stock_id對應的order book, trade資料**建立數學模型，預測未來10分鐘的realized volatility

In [None]:
# Read the train and test order-book parquet datasets into DataFrames.
book_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet')
book_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_test.parquet')

gc.collect()

print("訓練集的order book: 包含bid price和ask price")
print(book_train.head()) # seconds_in_bucket can be treated as a time feature
print('測試集的order book: ')
print(book_test.head())

#### 

In [None]:
# Read the train and test trade parquet datasets and preview the first rows.
gc.collect()
trade_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet')
trade_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_test.parquet')

gc.collect()

print('訓練集的實際交易:')
print(trade_train.head())

print('測試集的實際交易:')
print(trade_test.head())

In [None]:
# To save RAM, do not keep the test data in memory for now.
# Drops the three test-side objects loaded by the earlier cells.
del book_test, trade_test, test
gc.collect()

In [None]:
#print('book_train 總共包含 {} 種不同股票'.format(book_train['stock_id'].nunique()))
#print('book_train 總共包含 {} 個不同時間區段'.format(book_train['time_id'].nunique()))
#gc.collect()

#### book_train 總共包含 112 種不同股票
#### book_train 總共包含 3830 個不同時間區段

In [None]:
def log_return(list_stock_prices):
    """Return the log returns between consecutive prices.

    The natural log of the prices is taken and then differenced with
    ``.diff()``, so each element is ``log(p_t) - log(p_{t-1})`` and the
    first element is NaN.

    Args:
        list_stock_prices: price series; must expose ``.diff()`` after
            ``np.log`` (e.g. a pandas Series).

    Returns:
        Series of log returns with the same index as the input.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the realized volatility of a series of log returns.

    Computed as the square root of the sum of squared log returns.

    Args:
        series_log_return: log returns (pandas Series or NumPy array).

    Returns:
        A single non-negative float.
    """
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)


# Compute the realized volatility within each stock_id-time_id bucket
def compute_realized_vol(file_path, realized_vol_colname):
    """Compute the realized volatility per time_id for one order-book file.

    Args:
        file_path: path of one stock's order-book parquet data. The stock id
            is parsed from a ``stock_id=<digits>`` component of the path; if
            no such component exists, the text after the first ``=`` is used.
        realized_vol_colname: name of the output volatility column.

    Returns:
        DataFrame with two columns: ``row_id`` (``"<stock_id>-<time_id>"``)
        and ``realized_vol_colname``.
    """
    import re

    df_book_data = pd.read_parquet(file_path)

    # Weighted average price (WAP) from the level-1 quotes: each side's price
    # is weighted by the size on the opposite side.
    bid_price, ask_price = df_book_data['bid_price1'], df_book_data['ask_price1']
    bid_size, ask_size = df_book_data['bid_size1'], df_book_data['ask_size1']
    df_book_data['wap'] = (
        (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)
    )

    # Log return of the WAP inside each time_id. transform() keeps the result
    # aligned with the original row index; groupby.apply() may prepend the
    # group key to the index, which breaks the column assignment.
    df_book_data['log_return'] = (
        df_book_data.groupby('time_id')['wap'].transform(log_return)
    )

    # The first row of every time_id has no previous price, so drop the NaNs.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    # Aggregate the log returns of each time_id into one realized volatility.
    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': realized_vol_colname})
    )

    # Prefer an explicit "stock_id=<n>" match so that paths such as
    # ".../stock_id=0/part.parquet" still yield "0"; otherwise fall back to
    # the original rule (text after the first '=').
    match = re.search(r'stock_id=(\d+)', file_path)
    stock_id = match.group(1) if match else file_path.split('=')[1]

    # New column: "<stock_id>-<time_id>", the identifier of each sample.
    df_realized_vol_per_stock['row_id'] = (
        f'{stock_id}-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', realized_vol_colname]]


def generate_realized_vol(list_file, realized_vol_colname):
    """Compute realized volatility for many order-book files and stack them.

    Args:
        list_file: iterable of order-book paths, one per stock_id; each is
            passed to ``compute_realized_vol``.
        realized_vol_colname: column name for the computed volatility.

    Returns:
        DataFrame with the per-file results stacked vertically; each file's
        own row index is kept, as before. An empty DataFrame is returned when
        ``list_file`` is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # re-copies all accumulated rows on every iteration.
    frames = [compute_realized_vol(file, realized_vol_colname)
              for file in list_file]
    if not frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(frames)

import glob
# Disabled: list the per-stock order-book entries and build df_train from them.
#order_book_list = glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*')

#df_train = generate_realized_vol(list_file=order_book_list, realized_vol_colname='realized_vol')
# Only a status message is printed here; no data is computed in this cell.
print('必要function已經設定完成')

gc.collect()

In [None]:
'''
df_join = pd.read_csv('../input/volatility-predict/df_joined.csv')
# row_id 組成為: stock_id-time_id
df_join['stock_id'] = df_join['row_id'].apply(lambda value: int(value.split('-')[0]))
df_join['time_id'] = df_join['row_id'].apply(lambda value: int(value.split('-')[1]))
'''

### 預測未來的Realized volatility(已實現波動率): 什麼因素會影響**未來10分鐘的Order book**?

In [None]:
# add more features

# 定義整個資料預處理

def preprocess_trade(trade_data):
    """Add per-(stock_id, time_id) aggregate trade features in place.

    New columns: ``mean_price``, ``max_price``, ``total_trader`` (sum of
    ``order_count``), ``mean_totalsold`` and ``max_totalsold`` (mean and max
    of ``price * size``). Every row of a bucket receives the bucket's value.

    Args:
        trade_data: trade DataFrame with ``stock_id``, ``time_id``,
            ``price``, ``size`` and ``order_count`` columns. It is modified
            in place.

    Returns:
        The same ``trade_data`` object, with the new columns added.
    """
    bucket_keys = ['stock_id', 'time_id']

    def per_bucket(column, how):
        # transform() broadcasts the aggregate back onto every row of the
        # bucket, so the result lines up with the original rows.
        return trade_data.groupby(bucket_keys)[column].transform(how)

    trade_data['mean_price'] = per_bucket('price', 'mean')
    trade_data['max_price'] = per_bucket('price', 'max')
    # Total order_count within the same time_id for each stock_id.
    trade_data['total_trader'] = per_bucket('order_count', 'sum')

    # Traded value per row (price * size); only needed as an intermediate.
    trade_data['total_sold'] = trade_data['price'] * trade_data['size']
    trade_data['mean_totalsold'] = per_bucket('total_sold', 'mean')
    trade_data['max_totalsold'] = per_bucket('total_sold', 'max')

    trade_data.drop(columns='total_sold', inplace=True)
    return trade_data
    

# Disabled: build the aggregate trade features on trade_train.
# NOTE(review): with the call below commented out, the message printed here is
# emitted even though no features are created in this cell.
#trade_train = preprocess_trade(trade_train)

print('trade_train 新feature建立完畢。等等加入訓練資料作為新特徵。')
gc.collect()

#trade_train.head()

In [None]:
# add more features: 
'''
add_feature = trade_train[['stock_id', 'time_id', 'mean_price', \
                           'max_price', 'total_trader',\
                           'mean_totalsold', 'max_totalsold'
                          ]]
add_feature.drop_duplicates(inplace = True)
df_join = df_join.merge(add_feature, on = ['stock_id', 'time_id'],\
                       how = 'left')

gc.collect()
#print(df_join.head())
'''

In [None]:
# Data cleaning: Missing value or Infinite values
'''
df_join.replace([np.inf, -np.inf], np.nan, inplace = True)

print('缺失值: ')
print(df_join.isnull().sum())

df_join.fillna(method='ffill', inplace = True)

print('缺失值經過處理後: ')
print(df_join.isnull().sum())
'''

In [None]:
'''
# Save memory: float64 -> float32
df_join['realized_vol'], df_join['total_trader'] = df_join['realized_vol'].astype('float32'), df_join['total_trader'].astype('int32')

y = df_join['target']
x = df_join.drop(['target', 'stock_id', 'time_id'], axis = 1)
gc.collect()
'''

In [None]:
# Build model
# Candidate regression models: generalized linear model, GBM, LightGBM,
# random forest, neural net, linear regression
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression


# Only the linear-regression estimator is instantiated in this cell.
lr = LinearRegression()

print('Model configuration 建立完畢')
gc.collect()

In [None]:
'''
df_join.reset_index(drop = True, inplace = True)
x.reset_index(drop = True, inplace = True)
y.reset_index(drop = True, inplace = True)
print('df join: \n')
print(df_join.head())
print('X: ')
print(x.head())
print('Y:')
print(y.head())
'''

In [None]:
# split model into train/validation set
"""
# split by time scan: first sort data by time_id
df_join.sort_values('time_id', axis = 0, ascending = True, inplace = True)

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score

tscv = TimeSeriesSplit(n_splits = 5)
i = 1

test_size = int(0.2*len(df_join))
x_train, x_valid = x[0:(len(df_join)-test_size)], x[test_size:]
y_train, y_valid = y[0:(len(df_join)-test_size)], y[test_size:]

gc.collect()


model = lgbm
model.fit(x_train, y_train)

model_score = r2_score(y_valid, model.predict(x_valid))
baseline_score = r2_score(y_valid, x_valid['realized_vol'])
print('Split {}'.format(i))
print('Model score: {}, Baseline score: {}'.format(model_score, baseline_score))

"""
"""
for train_id, valid_id in tscv.split(x):
    x_train, x_valid = x[train_id], x[valid_id]
    y_train, y_valid = y[train_id], y[valid_id]
    
    lgbm.fit(x_train, y_train)
    
    model_score = r2_score(y_valid, lgbm.predict(x_valid))
    baseline_score = r2_score(y_valid, x_valid['realized_vol'])
    print('Split {}'.format(i))
    print('Model score: {}, Baseline score: {}'.format(model_score, baseline_score))
    
    del x_train, x_valid, y_train, y_valid
    gc.collect()
    
    i += 1
"""


In [None]:
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error.

    Args:
        y_true: ground-truth values; used as the denominator of the
            relative error.
        y_pred: predicted values, same shape as ``y_true``.

    Returns:
        ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_error)

# Status messages only; the metric computation below is disabled.
print('RMSPE 定義完畢。')
print('計算model metrics: ')
#R2 = round(r2_score(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
#RMSPE = round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
#print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
'''
# Linear Regression
lr.fit(x_train, y_train)

model = lr

model_score = r2_score(y_valid, model.predict(x_valid))
baseline_score = r2_score(y_valid, x_valid['realized_vol'])
print('LinearRegression 模型表現: ')
print('Model score: {}, Baseline score: {}'.format(model_score, baseline_score))

RMSPE = round(rmspe(y_valid, model.predict(x_valid)), 3)
print('LR model RMSPE: ', RMSPE)


# Decision Tree
from sklearn.tree import DecisionTreeRegressor

forest_params = {
    'n_estimators':80,
    'criterion':'mse',
    'max_depth':20,
    'random_state':99
}
forest = RandomForestRegressor(**forest_params)

%time
forest.fit(x_train, y_train)
model = forest

model_score = r2_score(y_valid, model.predict(x_valid))
baseline_score = r2_score(y_valid, x_valid['realized_vol'])
print('模型表現: '.format(i))
print('Model score: {}, Baseline score: {}'.format(model_score, baseline_score))

RMSPE = round(rmspe(y_valid, forest.predict(x_valid)), 3)
print('Forest model RMSPE: ', RMSPE)


# LightGBM
lgb_params = {
    "num_leaves":128,
    'max_depth':50,
    'learning_rate':1e-3,
    'n_estimators':1000,
    'reg_lambda':0.01,
    'random_state':99
}
lgbm = lgb.LGBMRegressor(**lgb_params)

%time
lgbm.fit(x_train, y_train)
model = lgbm

model_score = r2_score(y_valid, model.predict(x_valid))
baseline_score = r2_score(y_valid, x_valid['realized_vol'])
print('模型表現: '.format(i))
print('Model score: {}, Baseline score: {}'.format(model_score, baseline_score))

RMSPE = round(rmspe(y_valid, model.predict(x_valid)), 3)
print('LGBM model RMSPE: ', RMSPE)

# GBM
gbm_params = {
    'learning_rate':1e-2,
    'n_estimators':300,
    'max_depth':30,
    'min_samples_leaf':10,
    'verbose':10
}

gbm = GradientBoostingRegressor(**gbm_params)
gbm.fit(x_train, y_train)
model = gbm

model_score = r2_score(y_valid, model.predict(x_valid))
baseline_score = r2_score(y_valid, x_valid['realized_vol'])
print('模型表現: '.format(i))
print('Model score: {}, Baseline score: {}'.format(model_score, baseline_score))

RMSPE = round(rmspe(y_valid, model.predict(x_valid)), 3)
print('GradientBoostingMachine model RMSPE: ', RMSPE)

# GBM + LR
gbm_output = gbm.predict(x_train)

lr = LinearRegression()
lr.fit(gbm_output.reshape(-1, 1), y_train)
prediction = lr.predict(gbm.predict(x_valid).reshape(-1, 1))

RMSPE = round(rmspe(y_valid, prediction), 3)
print('RMSPE for boosting: GBM + LR =  ', RMSPE)

import pickle

lr.fit(x, y)
with open('forest.pickle', 'wb') as file:
    pickle.dump(forest, file)
    
gc.collect()
lgbm.fit(x, y)
with open('lgbm.pickle', 'wb') as file:
    pickle.dump(lgbm, file)
gc.collect()

gbm.fit(x, y)
with open('gbm.pickle', 'wb') as file:
    pickle.dump(gbm, file)
    
bagging_prediction = gbm.predict(x)
lr_booster = LinearRegression()

lr_booster.fit(bagging_prediction.reshape(-1, 1), y)
with open('lr_booster.pickle', 'wb') as file:
    pickle.dump(lr_booster, file)
    
'''

In [None]:
# Start using test data to make predictions
'''
order_book_list = glob.glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*')
df_test = generate_realized_vol(list_file=order_book_list, realized_vol_colname='realized_vol')
gc.collect()
# df_test contains row_id, realized_vol 

trade_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_test.parquet')
gc.collect()
trade_test = preprocess_trade(trade_test)
gc.collect()

# Create row_id column
trade_test['row_id'] = trade_test['stock_id'].astype(str) + '-' + trade_test['time_id'].astype(str)
trade_test.head()

# Aggregate useful features
add_feature = trade_test[['row_id', 'mean_price', \
                           'max_price', 'total_trader',\
                           'mean_totalsold', 'max_totalsold'
                          ]]
add_feature.drop_duplicates(inplace = True)
df_test = df_test.merge(add_feature, on = 'row_id',\
                       how = 'left')

# Data cleaning: Missing value or Infinite values
df_test.replace([np.inf, -np.inf], np.nan, inplace = True)
df_test.fillna(method='ffill', inplace = True)

df_test['realized_vol'], df_test['total_trader'] = df_test['realized_vol'].astype('float32'), df_test['total_trader'].astype('int32')

gc.collect()
print('帶預測的資料準備完畢。')

features = ['realized_vol', 'mean_price', 'max_price', 'total_trader', 'mean_totalsold', 'max_totalsold']
df_test['target'] = forest.predict(df_test[features])
submission = df_test[['row_id', 'target']]
submission.to_csv('submission.csv', index = False)

'''

#### 接續之前的預訓練模型繼續訓練

In [None]:
import numpy as np
import pandas as pd
import os

import time
import gc

%matplotlib inline
import matplotlib.pyplot as plt
import plotly
import plotly.express as px

from IPython import display
import sklearn
import gc
import time

gc.collect()

print('Library imported.')


In [None]:
# 定義整個資料預處理

def preprocess_trade(trade_data):
    """Attach per-(stock_id, time_id) aggregate trade features in place.

    Adds ``mean_price``, ``max_price``, ``total_trader`` (sum of
    ``order_count``), ``mean_totalsold`` and ``max_totalsold`` (mean and max
    of ``price * size``); each row gets the value of its bucket.

    Args:
        trade_data: trade DataFrame with ``stock_id``, ``time_id``,
            ``price``, ``size`` and ``order_count`` columns. Mutated in place.

    Returns:
        The same ``trade_data`` object.
    """
    keys = ['stock_id', 'time_id']

    # (new column, source column, aggregation) for the raw trade columns.
    price_features = (
        ('mean_price', 'price', 'mean'),
        ('max_price', 'price', 'max'),
        # total order_count in the same time_id for each stock_id
        ('total_trader', 'order_count', 'sum'),
    )
    for new_col, source_col, how in price_features:
        trade_data[new_col] = trade_data.groupby(keys)[source_col].transform(how)

    # Traded value per row: price * size (temporary helper column).
    trade_data['total_sold'] = trade_data['price'] * trade_data['size']
    for new_col, how in (('mean_totalsold', 'mean'), ('max_totalsold', 'max')):
        trade_data[new_col] = trade_data.groupby(keys)['total_sold'].transform(how)

    trade_data.drop(columns='total_sold', inplace=True)
    return trade_data
    
def log_return(list_stock_prices):
    """Return the log returns between consecutive prices.

    Each element is ``log(p_t) - log(p_{t-1})``; the first element is NaN
    because it has no predecessor.

    Args:
        list_stock_prices: price series; must expose ``.diff()`` after
            ``np.log`` (e.g. a pandas Series).

    Returns:
        Series of log returns with the same index as the input.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the realized volatility of a series of log returns.

    Computed as the square root of the sum of squared log returns.

    Args:
        series_log_return: log returns (pandas Series or NumPy array).

    Returns:
        A single non-negative float.
    """
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)


# Compute the realized volatility within each stock_id-time_id bucket
def compute_realized_vol(file_path, realized_vol_colname):
    """Compute the realized volatility per time_id for one order-book file.

    Args:
        file_path: path of one stock's order-book parquet data. The stock id
            is parsed from a ``stock_id=<digits>`` component of the path; if
            no such component exists, the text after the first ``=`` is used.
        realized_vol_colname: name of the output volatility column.

    Returns:
        DataFrame with two columns: ``row_id`` (``"<stock_id>-<time_id>"``)
        and ``realized_vol_colname``.
    """
    import re

    df_book_data = pd.read_parquet(file_path)

    # Weighted average price (WAP) from the level-1 quotes: each side's price
    # is weighted by the size on the opposite side.
    bid_price, ask_price = df_book_data['bid_price1'], df_book_data['ask_price1']
    bid_size, ask_size = df_book_data['bid_size1'], df_book_data['ask_size1']
    df_book_data['wap'] = (
        (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)
    )

    # Log return of the WAP inside each time_id. transform() keeps the result
    # aligned with the original row index; groupby.apply() may prepend the
    # group key to the index, which breaks the column assignment.
    df_book_data['log_return'] = (
        df_book_data.groupby('time_id')['wap'].transform(log_return)
    )

    # The first row of every time_id has no previous price, so drop the NaNs.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    # Aggregate the log returns of each time_id into one realized volatility.
    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': realized_vol_colname})
    )

    # Prefer an explicit "stock_id=<n>" match so that paths such as
    # ".../stock_id=0/part.parquet" still yield "0"; otherwise fall back to
    # the original rule (text after the first '=').
    match = re.search(r'stock_id=(\d+)', file_path)
    stock_id = match.group(1) if match else file_path.split('=')[1]

    # New column: "<stock_id>-<time_id>", the identifier of each sample.
    df_realized_vol_per_stock['row_id'] = (
        f'{stock_id}-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', realized_vol_colname]]


def generate_realized_vol(list_file, realized_vol_colname):
    """Compute realized volatility for many order-book files and stack them.

    Args:
        list_file: iterable of order-book paths, one per stock_id; each is
            passed to ``compute_realized_vol``.
        realized_vol_colname: column name for the computed volatility.

    Returns:
        DataFrame with the per-file results stacked vertically; each file's
        own row index is kept, as before. An empty DataFrame is returned when
        ``list_file`` is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # re-copies all accumulated rows on every iteration.
    frames = [compute_realized_vol(file, realized_vol_colname)
              for file in list_file]
    if not frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(frames)

import glob
# List every entry under the train order-book parquet directory.
order_book_list = glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*')

# Reuse the cached, already-processed training table when it exists;
# otherwise recompute realized volatility from the raw order books.
# NOTE(review): the fallback branch yields only row_id and realized_vol, while
# later cells also select 'target' and the trade features — confirm the cached
# CSV is always present, or extend the fallback.
if os.path.exists('../input/train-processed/train_processed.csv'):
    df_train = pd.read_csv('../input/train-processed/train_processed.csv')
else:
    df_train = generate_realized_vol(list_file=order_book_list, realized_vol_colname='realized_vol')

print('必要function已經設定完成')

gc.collect()

In [None]:
df_train.head()

In [None]:
# Start using test data to make predictions


# dataframe contains row_id, realized_vol 
'''
trade_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet')
gc.collect()
trade_train = preprocess_trade(trade_train)
gc.collect()

# Create row_id column
trade_train['row_id'] = trade_train['stock_id'].astype(str) + '-' + trade_train['time_id'].astype(str)

# Aggregate useful features
add_feature = trade_train[['row_id', 'mean_price', \
                           'max_price', 'total_trader',\
                           'mean_totalsold', 'max_totalsold'
                          ]]
add_feature.drop_duplicates(inplace = True)


df_train = df_train.merge(add_feature, on = 'row_id',\
                       how = 'left')

print(df_train.head())
stop

# Data cleaning: Missing value or Infinite values
df_train.replace([np.inf, -np.inf], np.nan, inplace = True)
df_train.fillna(method='ffill', inplace = True)

df_train['realized_vol'], df_train['total_trader'] = \
df_train['realized_vol'].astype('float32'), df_train['total_trader'].astype('int32')

train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
df_train = train.merge(df_train, on = 'row_id',\
                       how = 'left')

df_train.to_csv('train_processed.csv', index = False)
'''

# Print a "data ready" message when the processed CSV exists at this path.
# NOTE(review): this path differs from the one df_train is actually read from
# ('../input/train-processed/...'), and 'volatiltiy' looks like a misspelling
# — confirm which dataset name is intended.
if os.path.exists('../input/volatiltiy-prediction2/train_processed.csv'):

    print('待預測的資料準備完畢。')
    
gc.collect()

In [None]:
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error.

    Args:
        y_true: ground-truth values; used as the denominator of the
            relative error.
        y_pred: predicted values, same shape as ``y_true``.

    Returns:
        ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_error)

print('RMSPE 定義完畢。')

In [None]:
import xgboost as xgb

# Define RMSPE as objective loss function
# set RMSPE**2 as custom loss: L_i = ((y_true_i - y_pred_i) / y_true_i) ** 2
def gradient(y_true, y_pred):
    """Return the per-sample first derivative of the squared percentage error.

    dL_i/dy_pred_i = -2 * (y_true_i - y_pred_i) / y_true_i ** 2

    The result is an array with one entry per sample, not a scalar: a custom
    boosting objective needs a gradient for every data point.

    Args:
        y_true: target values (must be non-zero).
        y_pred: current predictions, same shape as ``y_true``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return -2.0 * (y_true - y_pred) / np.square(y_true)

def hessian(y_true, y_pred):
    """Return the per-sample second derivative of the squared percentage error.

    d2L_i/dy_pred_i**2 = 2 / y_true_i ** 2, which does not depend on
    ``y_pred``; the parameter is kept so both helpers share one signature.
    """
    y_true = np.asarray(y_true, dtype=float)
    return 2.0 / np.square(y_true)

def rmspe_loss(y_true, y_pred):
    """Custom objective: return ``(grad, hess)`` of the squared percentage error.

    grad: first-order derivative for each sample point
    hess: second-order derivative for each sample point
    """
    grad = gradient(y_true, y_pred)
    hess = hessian(y_true, y_pred)

    return grad, hess


# Hyper-parameters passed as keyword arguments to xgb.XGBRegressor below.
xgb_params = {
    "n_estimators":300,
    "max_depth":30,
    # NOTE(review): min_samples_leaf looks like a scikit-learn tree parameter
    # rather than an XGBoost one — confirm it has any effect here.
    "min_samples_leaf":10,
    #"verbosity":3,
    "learning_rate":1e-2,
    "booster":"gbtree",
    "reg_lambda":0.01,
    "random_state":99
   # "objective":rmspe_loss
}
# The custom objective is commented out above, so it is not passed here.
xgbm = xgb.XGBRegressor(**xgb_params)

print('Model 建議完畢。')

In [None]:
import pickle
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer

# Small constant added to y_true so the divisions below never hit exactly zero.
mu = 1e-9

# Custom objective: squared percentage error (RMSPE**2 up to the mean),
#   L_i = ((y_true_i - y_pred_i) / y_true_i) ** 2
def gradient(y_true, y_pred):
    """Return the per-sample first derivative of the squared percentage error.

    dL_i/dy_pred_i = -2 * (y_true_i - y_pred_i) / y_true_i ** 2

    The denominator is squared: dividing by ``y_true`` only once would be the
    derivative of ``(y_true - y_pred) ** 2 / y_true``, a different loss.

    Args:
        y_true: target values, shape (n_samples,).
        y_pred: current predictions, same shape.
    """
    return -2 * (y_true - y_pred) / (y_true + mu) ** 2

def hessian(y_true, y_pred):
    """Return the per-sample second derivative of the squared percentage error.

    d2L_i/dy_pred_i**2 = 2 / y_true_i ** 2, which does not depend on
    ``y_pred``; the parameter is kept so both helpers share one signature.
    """
    return 2 / (y_true + mu) ** 2

def rmspe_loss(y_true, y_pred):
    """Custom objective: return ``(grad, hess)`` of the squared percentage error.

    grad: first-order derivative for each sample point
    hess: second-order derivative for each sample point
    """
    grad = gradient(y_true, y_pred)
    hess = hessian(y_true, y_pred)

    return grad, hess

# lgb parameters for grid search
# Every value is a sequence of candidates, including the fixed settings
# (random_state, silent, objective, n_jobs): a search grid enumerates each
# entry, so a bare scalar such as 3 is not a valid grid value.
lgb_params_grid = {
    "num_leaves":[64, 128, 256, 512],
    "min_data_in_leaf":np.arange(100, 600, 100),
    'learning_rate':[1e-2, 1e-3, 1e-4],
    'n_estimators':[1000, 2000, 3000, 5000],
    'reg_alpha':[0, 0.1, 0.01, 1, 5],
    'reg_lambda':[0, 0.1, 0.01, 1, 2, 7],
    'random_state':[99],
    'silent':[True],
    'objective':[rmspe_loss],
    'n_jobs':[3]
}

# Hand-picked LightGBM settings (only referenced by the disabled lgbm2 cell).
# NOTE(review): max_depth=3 allows at most 2**3 leaves per tree, so
# num_leaves=769 is presumably never reached — confirm the intended values.
lgb_params = {
    "num_leaves":769,
    "min_data_in_leaf":300,
    "n_estimators":5000,
    "learning_rate":1e-2,
    "reg_alpha":1,
    "reg_lambda":1,
    "objective":rmspe_loss,
    "n_jobs":-1,
    "random_state":99,
    "verbosity":-1,
    "max_depth":3
}

# One seed reused for every seed-like LightGBM parameter below.
seed = 99
# Settings passed to lgb.LGBMRegressor(**params) in the training cell; the
# custom rmspe_loss callable is the training objective.
params = {
        'learning_rate': 0.13572437900113307,
        'lambda_l1': 2.154360665259325,
        'lambda_l2': 6.711089761523827,
        'num_leaves': 769,
        'min_sum_hessian_in_leaf': 20.44437160769411,
        'feature_fraction': 0.7921473067441019,
        'feature_fraction_bynode': 0.8083803860191322,
        'bagging_fraction': 0.9726755660563261,
        'bagging_freq': 42,
        'min_data_in_leaf': 690,
        'max_depth': 3,
        'random_state': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'objective': rmspe_loss,
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }

# custom scorer (RMSPE)
# greater_is_better=False marks rmspe as a loss, i.e. lower values are better.
custom_score = make_scorer(rmspe, greater_is_better = False)



print('LGBM Model parameter 搜尋池建立完畢.')

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV


# Estimator configured from the `params` dict.
lgbm = lgb.LGBMRegressor(**params)


# Target column and the six feature columns taken from df_train.
y = df_train['target']
x = df_train[['realized_vol', 'mean_price', 'max_price', \
              'total_trader', 'mean_totalsold', 'max_totalsold']]

# 5-fold time-series split; val_metric collects one RMSPE per fold.
time_cv = TimeSeriesSplit(n_splits = 5)
val_metric = []
'''
# hyperparameter tuning
grid_search = GridSearchCV(
    estimator = lgbm,
    param_grid = lgb_params_grid,
    cv = time_cv,
    scoring = custom_score,
    verbose = 100
)


grid_search.fit(x, y,
               #eval_set = [x, y],
               #eval_metric = rmspe_loss,
               #eval_names = ['rmspe_metric'],
               verbose = 1000)

'''
lr = LinearRegression()

# Fit and score the LightGBM model on every time-series fold.
for train_id, valid_id in time_cv.split(df_train):

    # TimeSeriesSplit yields positional row numbers, so select with .iloc;
    # .loc would treat them as index labels and pick the wrong rows whenever
    # the index is not a clean 0..n-1 range.
    x_train, y_train = x.iloc[train_id], y.iloc[train_id]
    x_valid, y_valid = x.iloc[valid_id], y.iloc[valid_id]

    print(x_train.shape)
    print(x_valid.shape)

    # NOTE(review): `verbose` as a fit() keyword is kept as in the original;
    # newer LightGBM releases may expect a logging callback instead — confirm
    # against the installed version.
    lgbm.fit(x_train, y_train,
             verbose = 500
            )

    #lr.fit(lgbm.predict(x_train).reshape(-1, 1), y_train) # LR model boosting
    # Calculate performance metric

    RMSPE = rmspe(y_valid, lgbm.predict(x_valid))

    val_metric.append(RMSPE)
    print('RMSPE metric: ', RMSPE)


print('Mean validate metric: ', np.mean(val_metric))




In [None]:
'''
from sklearn.linear_model import LinearRegression

lgbm2 = lgb.LGBMRegressor(**lgb_params)

lr = LinearRegression()

for train_id, valid_id in time_cv.split(df_train):

    x_train, y_train = x.loc[train_id], y.loc[train_id]
    x_valid, y_valid = x.loc[valid_id], y.loc[valid_id]
    
    print(x_train.shape)
    print(x_valid.shape)

    lgbm2.fit(x_train, y_train,
             verbose = 500
            )
    
    lr.fit(lgbm2.predict(x_train).reshape(-1, 1), y_train) # LR model boosting
    
    # Calculate performance metric
    lgb_pred = lgbm2.predict(x_valid)
    lr_pred = lr.predict(lgb_pred.reshape(-1, 1))
    RMSPE = rmspe(y_valid, lr_pred)
    val_metric.append(RMSPE)
    print('RMSPE metric: ', RMSPE)
    

print('Mean validate metric: ', np.mean(val_metric))
'''

In [None]:
import pickle
# save best model as output
# The pickled estimator is `lgbm` as left by the last fit of the CV loop.
# NOTE(review): the estimator references the custom rmspe_loss objective, so
# loading the pickle presumably needs that function to be defined — confirm.
with open('lgbm_best_param.pickle', 'wb') as file:
    pickle.dump(lgbm, file)

#### LGBM performance for: num_leaves=128,n_estimators = 1500, l2_loss => val_RMSPE = 0.4447 
#### LGBM performance for: num_leaves=128,n_estimators = 1500, rmspe**2 loss => val_RMSPE = 0.3047

In [None]:
'''
RMSPE = rmspe(y_train, lgbm.predict(x_train))
print('Training performance RMSPE:', RMSPE)

lgb.plot_metric(lgbm)
plt.title('LGBM leaning curve:')
'''