In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
from joblib import Parallel, delayed
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
import os
from sklearn.linear_model import LinearRegression
import warnings
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgb
import time
import joblib
import datatable as dt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
warnings.filterwarnings(action='ignore', category=UserWarning)
from tqdm import tqdm
import gc

# Inference notebook: https://www.kaggle.com/yus002/realized-volatility-prediction-lgbm-infer

# The training data set comes from:
* https://www.kaggle.com/mayunnan/realized-volatility-prediction-code-template
* https://www.kaggle.com/thanish/randomforest-starter-submission

# Helper methods

In [None]:
def my_metrics(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` vs ``y_true``.

    Both arguments must support element-wise arithmetic (e.g. NumPy arrays).
    Each error is expressed relative to the corresponding ``y_true`` value,
    so a zero in ``y_true`` leads to a division by zero.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)
def rmspe(y_true, y_pred):
    """Custom evaluation metric in ``(name, value, is_higher_better)`` form.

    The value is computed by ``my_metrics``; the trailing ``False`` flags
    that lower values are better. This function is passed as ``eval_metric``
    to ``LGBMRegressor.fit`` in the training cells of this notebook.
    """
    metric_name, higher_is_better = 'rmspe', False
    return metric_name, my_metrics(y_true, y_pred), higher_is_better
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    ``list_stock_prices`` must expose ``.diff()`` after ``np.log`` is applied
    (e.g. a pandas Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

def get_stock_stat(stock_id : int, dataType = 'train'):
    """Compute per-``time_id`` order-book statistics for a single stock.

    Parameters
    ----------
    stock_id : int
        Stock whose ``book_{dataType}.parquet/stock_id={stock_id}/`` partition
        is read.
    dataType : str, default 'train'
        Inserted into the parquet path (``book_train`` / ``book_test``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with the columns:

        * ``time_id``
        * ``log_return`` - realized volatility of the WAP log returns inside
          the bucket (the column keeps the name ``log_return``),
        * ``bas`` - mean bid/ask spread inside the bucket,
        * ``stock_id`` - the ``stock_id`` argument.
    """
    book = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/book_{dataType}.parquet/stock_id={stock_id}/')
    # sort_values returns a new frame, so the result has to be kept; the index
    # is rebuilt so that it matches the sorted row order.
    book = book.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    # Spread between the lowest ask and the highest bid of the two book levels.
    lowest_ask = book[['ask_price1', 'ask_price2']].min(axis = 1)
    highest_bid = book[['bid_price1', 'bid_price2']].max(axis = 1)
    book['bas'] = lowest_ask / highest_bid - 1

    # Level-1 weighted average price: each side's price is weighted by the
    # size resting on the opposite side.
    book['wap'] = (
        (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1'])
        / (book['bid_size1'] + book['ask_size1'])
    )

    # transform keeps the row index of ``book``, so every log return lands on
    # the row it was computed from regardless of group ordering. The first row
    # of each bucket has no predecessor and is set to 0.
    book['log_return'] = (
        book.groupby(by = ['time_id'])['wap']
        .transform(log_return)
        .fillna(0)
    )

    per_bucket = book.groupby(by = ['time_id'])
    stock_stat = pd.merge(
        per_bucket['log_return'].agg(realized_volatility).reset_index(),
        per_bucket['bas'].mean().reset_index(),
        on = ['time_id'],
        how = 'left'
    )
    stock_stat['stock_id'] = stock_id
    return stock_stat
def get_dataSet(stock_ids : list, dataType = 'train'):
    """Run ``get_stock_stat`` for every stock in parallel and stack the results.

    ``stock_ids`` is iterated once; ``dataType`` is forwarded unchanged to
    ``get_stock_stat``. The per-stock frames are concatenated into a single
    DataFrame with a fresh integer index.
    """
    jobs = (delayed(get_stock_stat)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

# Config

In [None]:
# Truthy: load the 131-feature training file and one-hot encode stock_id for
# the test frame. Falsy: load X.csv and drop the stock_id column instead.
keep_stock_id = 1

# Number of KFold splits used for tuning and for the final training run.
folds = 7
# Seeds used for both the KFold shuffling and the LightGBM models.
seed_list = list(range(12, 13))
# Passed to LightGBM as early_stopping_rounds.
early_stopping = 200

# Data

In [None]:
# train -------------------------
# Feature matrix: the 131-feature file is read through datatable and converted
# to pandas; the plain X.csv is used when stock_id is not kept.
if keep_stock_id:
    X = dt.fread('../input/mytrain/X_131_features.csv').to_pandas()
else:
    X = pd.read_csv("../input/mytrain/X.csv")
y = pd.read_csv("../input/mytrain/y.csv")

# to_test ----------------------------------------------------
# Book statistics for every stock in test.csv, joined back onto the test rows.
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')
test_dataSet = test.merge(test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
final_pred1 = test_dataSet[['row_id']]
# Rows without book statistics get 0 for the missing values.
to_test = test_dataSet.drop(columns = ['row_id']).fillna(0)

if keep_stock_id:
    # One indicator column per stock_id present in train.csv.
    train = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
    train_stock_ids = list(set(train.stock_id))
    cols = [f'stock_id_{c}' for c in train_stock_ids]
    stock_id_indicators = np.stack(
        [(to_test.stock_id == c).astype('int') for c in train_stock_ids]
    ).T
    to_test[cols] = pd.DataFrame(stock_id_indicators, columns = cols)
else:
    to_test = to_test.drop("stock_id", axis = 1)
    X = X.drop("stock_id", axis = 1)

In [None]:
# Locations of the competition files; 'input_path' is a prefix that read_data
# completes with '<data_type>.parquet/stock_id=<id>/*'.
config = dict(
    input_path = "../input/optiver-realized-volatility-prediction/trade_",
    train_path = '../input/optiver-realized-volatility-prediction/train.csv',
    test_path = '../input/optiver-realized-volatility-prediction/test.csv',
)
test_df = pd.read_csv(config['test_path'])
def read_data(stock_id, data_type):
    """Read the trade parquet file of one stock into a DataFrame.

    The path pattern is built from ``config['input_path']``, ``data_type`` and
    ``stock_id``. Only the first path returned by ``glob`` is read; an
    ``IndexError`` is raised when nothing matches.
    """
    pattern = config['input_path'] + f'{data_type}.parquet/stock_id={stock_id}/*'
    first_match = glob.glob(pattern)[0]
    return pd.read_parquet(first_match)
def get_final_df(df, data_type):
    """Load and stack the trade data of every stock listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``stock_id`` column; each unique value is loaded once.
    data_type : str
        Forwarded to ``read_data`` to select the parquet data set.

    Returns
    -------
    pandas.DataFrame
        All per-stock frames stacked with a fresh 0..n-1 index and an added
        ``stock_id`` column. An empty DataFrame is returned when ``df``
        contains no stock ids.
    """
    frames = []
    for stock_id in tqdm(df['stock_id'].unique().tolist()):
        stock_df = read_data(stock_id=stock_id, data_type=data_type)
        stock_df['stock_id'] = stock_id
        frames.append(stock_df)
    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame on every
    # iteration; ignore_index performs the index reset whose result was
    # previously thrown away.
    return pd.concat(frames, ignore_index=True)
def get_agg_info(df):
    """Aggregate trade columns per ``(stock_id, time_id)``.

    For each of ``seconds_in_bucket``, ``price``, ``size`` and ``order_count``
    the mean, max, min and median are computed. Output columns are named
    ``<stat>_<suffix>`` with the suffixes ``sec_in_bucket``, ``price``,
    ``size`` and ``order``, grouped by statistic in the order mean, max, min,
    median. ``stock_id`` and ``time_id`` are returned as regular columns.
    """
    # (output suffix, source column) in the order the columns are emitted.
    suffix_to_column = [
        ('sec_in_bucket', 'seconds_in_bucket'),
        ('price', 'price'),
        ('size', 'size'),
        ('order', 'order_count'),
    ]
    named_aggregations = {
        f'{stat}_{suffix}': (column, stat)
        for stat in ('mean', 'max', 'min', 'median')
        for suffix, column in suffix_to_column
    }
    per_bucket = df.groupby(['stock_id', 'time_id'])
    return per_bucket.agg(**named_aggregations).reset_index()
# Load the raw test trades and aggregate them per (stock_id, time_id).
test_final_df = get_final_df(df=test_df, data_type='test')
test_agg = get_agg_info(df=test_final_df)

# Attach the aggregates to the test rows; rows without aggregates get -999.
test_final_df = (
    pd.merge(test_df, test_agg, on=['stock_id', 'time_id'], how='left')
    .fillna(-999)
    .drop("row_id", axis = 1)
)

# Extend the book-based test features with the trade aggregates, then drop
# the raw stock_id column.
to_test = (
    to_test.merge(test_final_df, on=['stock_id', 'time_id'], how='left')
    .fillna(-999)
    .drop("stock_id", axis = 1)
)

In [None]:
# Display the training feature matrix as a quick visual check.
X

In [None]:
# Display the assembled test-set feature frame.
to_test

In [None]:
def objective(trial , X = X , y = y):
    """Optuna objective: cross-validated RMSPE of an LGBMRegressor.

    Hyper-parameters are sampled from ``trial``; the search ranges of some of
    them depend on the module-level ``keep_stock_id`` flag. For every seed in
    ``seed_list`` a shuffled ``folds``-fold KFold is run, each model is fitted
    with early stopping on its validation fold, and the validation RMSPE
    (``my_metrics``) is averaged over all folds and seeds.

    ``X`` and ``y`` default to the frames bound when this function is defined.
    Returns the averaged score (lower is better).
    """
    # Only these ranges differ between the two configurations.
    if keep_stock_id:
        bounds = {
            'reg_lambda': (3, 9),
            'num_leaves': (20, 60),
            'max_depth': (20, 60),
            'n_estimators': (3000, 3600),
            'subsample': (0.7, 1.0),
            'colsample_bytree': (0.5, 1),
        }
    else:
        bounds = {
            'reg_lambda': (1e-5, 1),
            'num_leaves': (10, 60),
            'max_depth': (10, 30),
            'n_estimators': (60, 1000),
            'subsample': (0.4, 1.0),
            'colsample_bytree': (0.4, 1),
        }

    # The suggest_* calls run in the same order for both configurations.
    params = {
        'reg_alpha' : trial.suggest_loguniform('reg_alpha', 1e-5, 1),
        'reg_lambda' : trial.suggest_loguniform('reg_lambda', *bounds['reg_lambda']),
        'num_leaves' : trial.suggest_int('num_leaves', *bounds['num_leaves']),
        'learning_rate' : trial.suggest_uniform('learning_rate', 0.02, 0.08),
        'max_depth' : trial.suggest_int('max_depth', *bounds['max_depth']),
        'n_estimators' : trial.suggest_int('n_estimators', *bounds['n_estimators']),
        'min_child_weight' : trial.suggest_loguniform('min_child_weight', 0.05, 0.15),
        'subsample' : trial.suggest_uniform('subsample', *bounds['subsample']),
        'colsample_bytree' : trial.suggest_loguniform('colsample_bytree', *bounds['colsample_bytree']),
        'min_child_samples' : trial.suggest_int('min_child_samples', 10, 40),
        'metric' : 'rmse',
        'device_type' : 'gpu',
    }

    score = 0
    for seed in seed_list:
        splitter = KFold(n_splits = folds, random_state = seed, shuffle = True)
        for train_idx, valid_idx in splitter.split(X, y):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train = y.iloc[train_idx].values.ravel()
            y_valid = y.iloc[valid_idx].values.ravel()
            model = lgb.LGBMRegressor(**params, random_state = seed, n_jobs = -1)
            model.fit(
                X_train, y_train,
                eval_set = [(X_valid, y_valid)],
                eval_metric = rmspe,
                early_stopping_rounds = early_stopping,
                verbose = 1500,
            )
            fold_score = my_metrics(y_valid, model.predict(X_valid))
            # Each fold contributes an equal share of the final average.
            score += (fold_score / folds) / len(seed_list)
    del model
    return score
import optuna

# Minimise the cross-validated RMSPE returned by objective(); no pruner is
# configured (HyperbandPruner was tried and left disabled).
study = optuna.create_study(study_name = 'lgbm', direction = 'minimize')
study.optimize(objective, n_trials = 4)

finished_trials = len(study.trials)
best_params = study.best_trial.params
print('numbers of the finished trials:' , finished_trials)
print('the best params:' , best_params)
print('the best value:' , study.best_value)
print("done")
# NOTE(review): the reason for this one-minute pause is not evident from the
# code - confirm whether it is still needed.
time.sleep(60)

In [None]:
# Hyper-parameters copied from a tuning run:
# "Best is trial 2 with value: 0.28758503964033905."
params = dict(
    reg_alpha = 0.028887686028843496,
    reg_lambda = 5.609420891429227,
    num_leaves = 56,
    learning_rate = 0.07716013360199103,
    max_depth = 23,
    n_estimators = 3398,
    min_child_weight = 0.11236058184476623,
    subsample = 0.8933699572438508,
    colsample_bytree = 0.9836129968372216,
    min_child_samples = 28,
)

# Train and save models

In [None]:
# Train one LightGBM model per (seed, fold) with the fixed `params`, save each
# model to disk and accumulate the mean validation RMSPE over all folds/seeds.
score = 0
for seed in seed_list:
    kf = KFold(n_splits = folds, random_state = seed, shuffle = True)
    # `count` is the 1-based number of the current fold.
    for count, (idx_train, idx_test) in enumerate(kf.split(X, y), start = 1):
        print("=" * 40)
        print("seed", seed)
        print("fold", count)
        print("=" * 30)
        start_time = time.time()
        X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
        y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]
        model = lgb.LGBMRegressor(**params, random_state = seed, n_jobs = -1, metric = 'rmse', device_type = 'gpu')
        # The target is flattened to 1-D for fitting, as in objective().
        model.fit(X_train, y_train.values.ravel(),
                  eval_set = [(X_test, y_test.values.ravel())], eval_metric = rmspe,
                  early_stopping_rounds = early_stopping, verbose = False)
        cv_score = my_metrics(y_test.values.ravel(), model.predict(X_test))
        score += (cv_score / folds) / len(seed_list)
        joblib.dump(model, f'LGBM seed_{seed}_fold_{count}_cv_score_{round(cv_score, 3)}.pkl') # save model
        run_time = round(time.time() - start_time)
        print ("fold", count, "took", run_time , "seconds to run")
        # Folds still to run for this seed; 0 after the last fold, so the
        # estimate never becomes negative.
        remaining_folds = folds - count
        print ("The estimated remaining training time in the current seed", seed, "are",
               round((remaining_folds * run_time) / 60, 3), "minutes")
        print("Validation score", cv_score)
print("Mean RMSPE validation score of", folds, "folds", score)
