# [AutoML] Optiver Autogluon Notebook

In this notebook, I train an AutoML model with AutoGluon, which automates bagging, stacking, ensembling and deep learning. More information can be found at https://auto.gluon.ai/dev/index.html

I have also uploaded the [autogluon files][1] as a dataset, so that AutoGluon can be installed and used for inference without an internet connection.

**Reference:**
https://www.kaggle.com/mayunnan/realized-volatility-prediction-code-template

[1]: https://www.kaggle.com/gogo827jz/autogluon-files

# Install Autogluon

In [None]:
!mkdir -p /tmp/pip/cache/

import os
from shutil import copyfile
from tqdm.auto import tqdm

src = '../input/autogluon-files/'
dst = '/tmp/pip/cache/'
for filename in tqdm(os.listdir(src)):
    if '.xyz' in filename:
        f = filename.split('.xyz')[0]
        copyfile(src + filename, dst + f + '.tar.gz')
    else:
        copyfile(src + filename, dst + filename)

!pip install --no-index --find-links /tmp/pip/cache/ autogluon

# Libraries

In [None]:
import warnings 
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import glob, os

from joblib import Parallel, delayed
from tqdm.auto import tqdm
from autogluon.tabular import TabularPredictor

# Functions

In [None]:
def calculate_wap(df):
    """Return the mean of the level-1 and level-2 weighted average prices.

    For each book level the bid price is weighted by the ask size and the
    ask price by the bid size, then divided by the total size of that level.

    Parameters
    ----------
    df : order-book frame with ``bid_price{1,2}``, ``ask_price{1,2}``,
        ``bid_size{1,2}`` and ``ask_size{1,2}`` columns.

    Returns
    -------
    Row-wise average of the two per-level weighted prices.
    """
    bid_px1, ask_px1 = df['bid_price1'], df['ask_price1']
    bid_sz1, ask_sz1 = df['bid_size1'], df['ask_size1']
    bid_px2, ask_px2 = df['bid_price2'], df['ask_price2']
    bid_sz2, ask_sz2 = df['bid_size2'], df['ask_size2']

    wap_level1 = (bid_px1 * ask_sz1 + ask_px1 * bid_sz1) / (bid_sz1 + ask_sz1)
    wap_level2 = (bid_px2 * ask_sz2 + ask_px2 * bid_sz2) / (bid_sz2 + ask_sz2)
    return (wap_level1 + wap_level2) / 2

def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

In [None]:
from autogluon.core.metrics import make_scorer

def rmspe(targets, predictions):
    """Root mean squared percentage error of ``predictions`` against ``targets``.

    Each error is divided by its target before squaring, so every sample is
    measured relative to the size of its own target.
    """
    relative_error = (predictions - targets) / targets
    squared_error = relative_error ** 2
    return np.sqrt(squared_error.mean())

# Wrap rmspe as an AutoGluon scorer: it is an error measure, so lower is
# better (greater_is_better = False) and a perfect score is 0.
rmspe_metric = make_scorer('rmspe', rmspe, optimum = 0, greater_is_better = False)

In [None]:
def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build per-``time_id`` order-book features for one stock.

    Parameters
    ----------
    stock_id : int
        Stock whose ``stock_id=<id>`` parquet partition is read.
    dataType : str
        Inserted into the file name ``book_{dataType}.parquet``; the notebook
        passes ``'train'`` or ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with the columns ``time_id``, ``rv``
        (realized volatility of the WAP log returns), ``bas_mean`` and
        ``bas_std`` (mean / standard deviation of the relative bid-ask
        spread) and ``stock_id``.
    """
    book = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/book_{dataType}.parquet/stock_id={stock_id}/')

    # sort_values returns a new frame, so the result has to be assigned (the
    # call was previously discarded). Log returns are differences between
    # consecutive rows, so rows must be in time order inside each time_id.
    # The index is reset so it is a clean 0..n-1 range after sorting.
    book = book.sort_values(by = ['time_id', 'seconds_in_bucket']).reset_index(drop = True)

    # Relative spread between the lowest ask and the highest bid of the two levels.
    best_ask = book[['ask_price1', 'ask_price2']].min(axis = 1)
    best_bid = book[['bid_price1', 'bid_price2']].max(axis = 1)
    book['bas'] = best_ask / best_bid - 1

    book['wap'] = calculate_wap(book)

    # transform keeps the result aligned with the original rows by index, so
    # the assignment does not depend on the row order that groupby/apply
    # happens to produce. The first row of each time_id has no predecessor
    # and therefore gets a log return of 0.
    book['log_return'] = book.groupby('time_id')['wap'].transform(log_return).fillna(0)

    by_time = book.groupby('time_id')
    stock_stat = pd.concat(
        [
            by_time['log_return'].agg(realized_volatility).rename('rv'),
            by_time['bas'].mean().rename('bas_mean'),
            by_time['bas'].std().rename('bas_std'),
        ],
        axis = 1,
    ).reset_index()
    stock_stat['stock_id'] = stock_id
    return stock_stat

In [None]:
def get_dataSet(stock_ids : list, dataType = 'train'):
    """Compute the per-stock statistics for every id and stack them.

    ``get_stock_stat`` is run for each entry of ``stock_ids`` through joblib
    (``n_jobs = -1``) with a tqdm progress bar, and the resulting frames are
    concatenated into a single frame with a fresh index.
    """
    jobs = (
        delayed(get_stock_stat)(one_id, dataType)
        for one_id in tqdm(stock_ids, total = len(stock_ids))
    )
    per_stock_frames = Parallel(n_jobs = -1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

# Train

In [None]:
# Load the training rows; the cells below use its stock_id, time_id and target columns.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()

In [None]:
# Order-book features for every stock that appears in the training rows.
train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), dataType = 'train')

# Left-join the features onto the training rows and mark stock_id as categorical.
train_dataSet = (
    train
    .merge(train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
    .astype({'stock_id': 'category'})
)

In [None]:
# Rebind instead of dropping in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
train_dataSet = train_dataSet.drop(columns = 'time_id', errors = 'ignore')

# Weighting each row by 1 / target^2 turns a weighted squared error into a
# squared percentage error: w * (p - t)^2 == ((p - t) / t)^2.
train_dataSet['sample_weight'] = 1 / np.square(train_dataSet['target'])
train_dataSet.head()

In [None]:
# Configure the AutoGluon regressor:
#   - it predicts the 'target' column and is scored with the custom RMSPE scorer;
#   - 'sample_weight' names the per-row weight column (1 / target^2) of the training frame;
#   - 'autogluon' is the output directory, the same path that is loaded again for inference.
predictor = TabularPredictor(
    label = 'target', 
    problem_type = 'regression', 
    eval_metric = rmspe_metric, 
    sample_weight = 'sample_weight', 
    path = 'autogluon', 
    verbosity = 3, 
)

In [None]:
# Train on the prepared frame.
# NOTE(review): time_limit = 1 * 3600 is presumably one hour expressed in
# seconds - confirm against the AutoGluon documentation.
# The commented-out entries are alternative settings that are not active:
# the 'best_quality' preset, GPU training and a list of excluded model types.
predictor.fit(
    train_data = train_dataSet, 
    time_limit = 1 * 3600, 
    presets = 'medium_quality_faster_train',#'best_quality', 
#     ag_args_fit = {'num_gpus': 1}, 
#     excluded_model_types = ['KNN', 'RF', 'NN', 'FASTAI'], 
    keep_only_best = True, 
    save_space = True, 
    verbosity = 3, 
)

In [None]:
# Display the leaderboard of the trained models.
predictor.leaderboard()

# Inference

In [None]:
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

# Same feature pipeline as for training, fed from the test order books.
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')
test_dataSet = (
    test
    .merge(test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
    .astype({'stock_id': 'category'})
    .drop(columns = ['time_id'])
)

# row_id identifies each submission row and is not a model feature.
y_pred = test_dataSet[['row_id']]
X_test = test_dataSet.drop(columns = ['row_id'])
X_test.head()

In [None]:
# Load through the class rather than through an existing instance, so this
# cell also works in a fresh kernel where the training cells were not run.
predictor = TabularPredictor.load('autogluon')

In [None]:
# Predict on the test features and report how long the call took
# (difference of two time() readings).
from time import time

start = time()
preds = predictor.predict(X_test)
print('Time Cost:', time() - start)

In [None]:
# Attach the predictions to the row ids as the 'target' column and write
# the result to submission.csv without the index.
y_pred = y_pred.assign(target = preds)
y_pred.to_csv('submission.csv',index = False)
y_pred.head()