# Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import glob

import gc

from joblib import Parallel, delayed

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Helper Functions

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    ``list_stock_prices`` must be an object for which ``np.log`` returns
    something exposing ``.diff()`` (e.g. a pandas Series). The first element
    of the result has no predecessor and is therefore NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def get_stock_stat(stock_id : int, dataType = 'train'):
    """Compute per-``time_id`` order-book features for a single stock.

    Reads the book parquet partition of ``stock_id`` and derives, for every
    ``time_id``:

    * ``log_return`` -- ``realized_volatility`` of the within-bucket log
      returns of the level-1 weighted price (the column keeps the name
      ``log_return``).
    * ``bas`` -- mean of ``min(ask_price1, ask_price2) /
      max(bid_price1, bid_price2) - 1``.

    Parameters
    ----------
    stock_id : int
        Stock whose ``stock_id=<id>`` parquet partition is read.
    dataType : str, default 'train'
        Selects the ``book_<dataType>.parquet`` dataset.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with columns ``time_id``, ``log_return``,
        ``bas`` and ``stock_id``.
    """
    book = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/book_{dataType}.parquet/stock_id={stock_id}/')

    # sort_values returns a new frame, so the result has to be kept; the
    # within-bucket diff below is only meaningful on time-ordered rows.
    book = (book.sort_values(by=['time_id', 'seconds_in_bucket'])
                .reset_index(drop=True))

    # Relative spread between the lowest ask and the highest bid across the
    # two visible levels.
    best_ask = book[['ask_price1', 'ask_price2']].min(axis=1)
    best_bid = book[['bid_price1', 'bid_price2']].max(axis=1)
    book['bas'] = best_ask / best_bid - 1

    # Level-1 weighted price: each side's price is weighted by the size
    # quoted on the opposite side.
    book['wap'] = (
        book['bid_price1'] * book['ask_size1']
        + book['ask_price1'] * book['bid_size1']
    ) / (book['bid_size1'] + book['ask_size1'])

    # transform keeps the result aligned to the frame's own index, so no
    # positional re-indexing is needed; the first row of each bucket has no
    # predecessor and is set to 0.
    book['log_return'] = (
        book.groupby('time_id')['wap'].transform(log_return).fillna(0)
    )

    stock_stat = book.groupby('time_id', as_index=False).agg(
        log_return=('log_return', realized_volatility),
        bas=('bas', 'mean'),
    )
    stock_stat['stock_id'] = stock_id

    return stock_stat

def get_dataSet(stock_ids : list, dataType = 'train'):
    """Build the per-stock feature table for every id in ``stock_ids``.

    ``get_stock_stat`` is run once per stock through joblib with
    ``n_jobs=-1``; the per-stock frames are then concatenated into a single
    frame with a fresh index.
    """
    jobs = (
        delayed(get_stock_stat)(stock_id, dataType)
        for stock_id in stock_ids
    )
    per_stock_frames = Parallel(n_jobs=-1)(jobs)

    return pd.concat(per_stock_frames, ignore_index = True)

from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred``.

    Each error is expressed relative to the corresponding ``y_true`` value.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Seed passed as random_state to the train/test splits and model constructors.
rs = 69420

In [None]:
# Load train.csv; its stock_id / time_id / target columns are used below.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

In [None]:
%%time
# Compute the book features for every stock that appears in train, then
# attach them to the training rows. The left join keeps every train row, so
# rows without matching book features get NaN feature values.
train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), dataType = 'train')
train_dataSet = pd.merge(train, train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')

In [None]:
# Run a garbage-collection pass; the return value is bound to x
# (presumably only to suppress the notebook cell output).
x = gc.collect()

In [None]:
# Target vector and feature matrix as NumPy arrays; the identifier columns
# and the target itself are excluded from the features.
y = train_dataSet['target'].values
X = train_dataSet.drop(['stock_id', 'time_id', 'target'], axis = 1).values

# Notebook display of the array shapes.
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows without shuffling, so the row order is preserved.
# NOTE(review): with shuffle=False, random_state should have no effect --
# confirm against the scikit-learn documentation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=rs, shuffle=False)

In [None]:
# Notebook display of the split shapes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Models

In [None]:
# Baseline regressors with library-default hyperparameters, both configured
# for GPU training and seeded with rs.
xgb = XGBRegressor(tree_method='gpu_hist', random_state = rs, n_jobs= - 1)

lgbm = LGBMRegressor(device='gpu', random_state=rs)

# XGBoost

In [None]:
%%time
# Fit the baseline XGBoost model on the training split.
xgb.fit(X_train, y_train)

In [None]:
# Score the baseline model on the held-out split (R2 and RMSPE, rounded to
# six decimals).
preds = xgb.predict(X_test)
R2 = round(r2_score(y_true = y_test, y_pred = preds), 6)
RMSPE = round(rmspe(y_true = y_test, y_pred = preds), 6)
print(f'Performance of the naive XGBOOST prediction: R2 score: {R2}, RMSPE: {RMSPE}')

# XGB Optuna Tuning

In [None]:
import optuna
from optuna.samplers import TPESampler

def objective(trial, data=X, target=y):
    """Optuna objective: fit one XGBoost candidate and return its RMSPE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to report intermediate
        validation scores for pruning.
    data, target : array-like
        Features and targets to split. They default to the module-level
        ``X`` and ``y`` as bound when this function is defined.

    Returns
    -------
    float
        RMSPE on the unshuffled 15% validation split (lower is better).

    Raises
    ------
    optuna.TrialPruned
        Raised through the pruning callback when the study's pruner stops
        the trial early.
    """
    # Split the arguments actually passed in, not the globals.
    X_tr, X_val, y_tr, y_val = train_test_split(
        data, target, test_size=0.15, random_state=rs, shuffle=False
    )

    # Parameter names are kept stable because study.best_params is later
    # expanded straight into XGBRegressor(**best_params).
    param = {
        'tree_method': 'gpu_hist',
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    model = XGBRegressor(**param)

    # With the scikit-learn API the eval_set entries are named
    # "validation_<index>", so the observation key must carry the index.
    # The callback has to be handed to fit() for the pruner to see anything.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-rmse")
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
        verbose=False,
    )

    preds = model.predict(X_val)

    return rmspe(y_val, preds)

In [None]:
# Minimize the objective's return value over 1000 trials with a TPE sampler;
# garbage collection runs after every trial.
# NOTE(review): the MedianPruner can only stop trials that report
# intermediate values (e.g. through a pruning callback passed to fit) --
# confirm the objective does so.
study = optuna.create_study(sampler=TPESampler(), direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective, n_trials=1000, gc_after_trial=True)

In [None]:
# Summary of the search: trial count and the best trial's sampled parameters.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Plot the objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each sampled hyperparameter.
optuna.visualization.plot_param_importances(study)

In [None]:
# Keep the best sampled parameters; the bare name displays them in the notebook.
best_xgbparams = study.best_params
best_xgbparams

In [None]:
# Rebind xgb to a model built from the tuned parameters. tree_method was a
# fixed entry of the search dict rather than a sampled one, so it is not part
# of best_params and is supplied again here.
xgb = XGBRegressor(**best_xgbparams, tree_method='gpu_hist')

In [None]:
%%time
# Fit the tuned model on the training split, early-stopping on the held-out split.
# NOTE(review): the same held-out split drives early stopping and the scores
# below, so those scores are likely optimistic -- consider a separate
# validation split.
xgb.fit(X_train ,y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)

# Score the tuned model on the held-out split (R2 and RMSPE, six decimals).
preds = xgb.predict(X_test)
R2 = round(r2_score(y_true = y_test, y_pred = preds), 6)
RMSPE = round(rmspe(y_true = y_test, y_pred = preds), 6)
print(f'Performance of the naive Tuned XGB prediction: R2 score: {R2}, RMSPE: {RMSPE}')

# LightGBM

In [None]:
%%time
# Fit the baseline LightGBM model on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Score the baseline model on the held-out split (R2 and RMSPE, rounded to
# six decimals).
preds = lgbm.predict(X_test)
R2 = round(r2_score(y_true = y_test, y_pred = preds),6)
RMSPE = round(rmspe(y_true = y_test, y_pred = preds),6)
print(f'Performance of the naive LIGHTGBM prediction: R2 score: {R2}, RMSPE: {RMSPE}')

# LGBM Optuna

In [None]:
def objective(trial):
    """Optuna objective: fit one LightGBM candidate and return its RMSPE.

    The module-level ``X`` / ``y`` are split without shuffling (30% kept for
    validation). The validation RMSE is reported to the trial through the
    pruning callback, and the value returned to the study is the RMSPE of the
    fitted model on that validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and report intermediate scores.

    Returns
    -------
    float
        Validation RMSPE (lower is better).

    Raises
    ------
    optuna.TrialPruned
        Raised through the pruning callback when the study's pruner stops
        the trial early.
    """
    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=0.3, random_state=rs, shuffle=False
    )
    valid = [(X_val, y_val)]

    # Sampled parameter names are kept stable because study.best_params is
    # later expanded straight into LGBMRegressor(**best_params).
    param = {
        "device": "gpu",
        "metric": "rmse",
        "verbosity": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.5, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 500),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "n_estimators": trial.suggest_int("n_estimators", 100, 4000),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    # The callback watches the "rmse" metric configured above on the
    # evaluation set passed to fit().
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")
    model = LGBMRegressor(**param)

    model.fit(X_tr, y_tr, eval_set=valid, verbose=False, callbacks=[pruning_callback], early_stopping_rounds=100)

    preds = model.predict(X_val)

    return rmspe(y_val, preds)

In [None]:
# Minimize the objective's return value over 1000 trials with a TPE sampler
# and a median pruner; garbage collection runs after every trial. This
# rebinds `study`, so any earlier study is no longer reachable under that name.
study = optuna.create_study(sampler=TPESampler(), direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective, n_trials=1000, gc_after_trial=True)

In [None]:
# Summary of the search: trial count and the best trial's sampled parameters.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Plot the objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each sampled hyperparameter.
optuna.visualization.plot_param_importances(study)

In [None]:
# Keep the best sampled parameters; the bare name displays them in the notebook.
best_lgbmparams = study.best_params
best_lgbmparams

In [None]:
# Rebind lgbm to a model built from the tuned parameters. best_params only
# holds sampled values, so the fixed device setting is supplied again here.
lgbm = LGBMRegressor(**best_lgbmparams, device='gpu')

In [None]:
%%time
lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False, early_stopping_rounds=100)

preds = xgb.predict(X_test)
R2 = round(r2_score(y_true = y_test, y_pred = preds), 6)
RMSPE = round(rmspe(y_true = y_test, y_pred = preds), 6)
print(f'Performance of the Naive Tuned LIGHTGBM prediction: R2 score: {R2}, RMSPE: {RMSPE}')

# Stacking Regressor

In [None]:
# Fresh default-hyperparameter models to serve as stacking base learners.
def_xgb = XGBRegressor(tree_method='gpu_hist', random_state = rs, n_jobs= - 1)

def_lgbm = LGBMRegressor(device='gpu', random_state=rs)

In [None]:
from sklearn.ensemble import StackingRegressor


# Base learners: the two default models plus whatever `xgb` is bound to at
# this point in the notebook.
estimators = [('def_xgb', def_xgb),
              ('def_lgbm', def_lgbm),
              ('tuned_xgb', xgb)]

# The model currently bound to `lgbm` is used as the final (meta) estimator.
# NOTE(review): if `lgbm` holds hyperparameters tuned on the raw features,
# they may not suit the base learners' predictions -- confirm this is intended.
clf = StackingRegressor(estimators=estimators, final_estimator=lgbm, verbose=1)

In [None]:
%%time
# Fit the stacked ensemble on the training split.
clf.fit(X_train, y_train)

In [None]:
# Score the stacked ensemble on the held-out split (R2 and RMSPE, rounded to
# six decimals).
preds = clf.predict(X_test)
R2 = round(r2_score(y_true = y_test, y_pred = preds),6)
RMSPE = round(rmspe(y_true = y_test, y_pred = preds), 6)
print(f'Performance of the Naive STACK prediction: R2 score: {R2}, RMSPE: {RMSPE}')

# Submission

In [None]:
# Load the test rows and build the per-stock book features from the test
# book data, mirroring the training pipeline.
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')
test_dataSet = pd.merge(test, test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
test_dataSet = test_dataSet.drop(['stock_id', 'time_id'], axis = 1)

# row_id seeds the submission frame; every other remaining column is a feature.
# NOTE: this rebinds X_test (previously the held-out training split) and,
# unlike the training features, fills missing values with 0.
y_pred = test_dataSet[['row_id']]
X_test = test_dataSet.drop(['row_id'], axis = 1).fillna(0)

In [None]:
# Predict with the stacked model and write row_id / target to submission.csv.
y_pred = y_pred.assign(target = clf.predict(X_test))
y_pred.to_csv('submission.csv',index = False)