# Offline LightAutoML installation 

In [None]:
# Each dependency archive sits in the input dataset with a ".txt" suffix appended; copy it into the
# working directory under its real ".tar.gz" name and install it with pip from that local file.
!cp ../input/lightautoml-framework-lama/PyMeeus-0.5.11.tar.gz.txt PyMeeus-0.5.11.tar.gz && pip install PyMeeus-0.5.11.tar.gz 
!cp ../input/lightautoml-framework-lama/efficientnet_pytorch-0.7.1.tar.gz.txt efficientnet_pytorch-0.7.1.tar.gz && pip install efficientnet_pytorch-0.7.1.tar.gz
!cp ../input/lightautoml-framework-lama/json2html-1.3.0.tar.gz.txt json2html-1.3.0.tar.gz && pip install json2html-1.3.0.tar.gz
!cp ../input/lightautoml-framework-lama/log_calls-0.3.2.tar.gz.txt log_calls-0.3.2.tar.gz && pip install log_calls-0.3.2.tar.gz
!cp ../input/lightautoml-framework-lama/pyperclip-1.8.2.tar.gz.txt pyperclip-1.8.2.tar.gz && pip install pyperclip-1.8.2.tar.gz
# Remove the temporary archive copies from the working directory.
!rm -rf *.tar.gz
# Install lightautoml itself using only the local dataset folder as package source (--no-index).
!pip install --no-index --find-links=../input/lightautoml-framework-lama lightautoml

# Libraries imports

In [None]:
import numpy as np
import pandas as pd
# Use the fully-qualified option names: the short 'max_rows' / 'max_columns' patterns match more
# than one registered option in recent pandas (display.* and styler.render.*) and raise OptionError.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)
import warnings
warnings.filterwarnings("ignore")

import os
from collections import ChainMap
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold, train_test_split

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

%matplotlib inline
from matplotlib import pyplot as plt, rcParams
rcParams.update({'font.size': 22})

# Global constants 

In [None]:
# Root folder of the competition data; every parquet/csv path in this notebook is built from it.
INPUT_PATH = '../input/optiver-realized-volatility-prediction/'
# Parallelism level: used as joblib n_jobs for feature creation and as LightAutoML cpu_limit / reader n_jobs.
N_THREADS = 4

# Preprocessing functions

In [None]:
def calc_wap(df):
    """Return the level-1 weighted average price (WAP) for each row of an order-book frame.

    Each side's price is weighted by the size on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap2(df):
    """Return the level-2 weighted average price for each row of an order-book frame.

    Same formula as the level-1 WAP but on the second book level:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_mean_price(df):
    """Return the mid price, i.e. the average of the best bid and best ask price, per row."""
    best_bid = df['bid_price1']
    best_ask = df['ask_price1']
    return (best_bid + best_ask) / 2

def log_return(list_stock_prices):
    """Return consecutive log returns of a price Series.

    The input must support ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series):
    """Return the square root of the sum of squared values of ``series``."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def count_unique(series):
    """Return the number of distinct values in ``series``."""
    distinct = np.unique(series)
    return distinct.size

In [None]:
def calc_array_feats(arr, prefix, q_cnt = 10, diff = True):
    """Summarise an array by evenly spaced percentiles.

    Args:
        arr: sequence of numbers to summarise.
        prefix: name prefix; every key is ``prefix + '__P' + <percentile>``.
        q_cnt: number of intervals; ``q_cnt + 1`` percentiles between 0 and 100
            (truncated to integers) are computed.
        diff: when True the percentiles are taken over the first differences
            of ``arr`` instead of the raw values.

    Returns:
        dict mapping feature name to percentile value; all values are NaN when
        there is nothing to summarise (empty input, or a single element with
        ``diff=True``).
    """
    values = np.diff(np.array(arr)) if diff else arr
    percs = np.linspace(0, 100, q_cnt + 1).astype(int)
    names = [prefix + '__P' + str(p) for p in percs]

    if len(values) == 0:
        stats = [np.nan] * len(names)
    else:
        stats = np.percentile(values, percs)

    return {name: stat for name, stat in zip(names, stats)}

In [None]:
def rmspe(y_true, y_pred, **kwargs):
    """Return the root mean squared percentage error of ``y_pred`` against ``y_true``.

    Errors are taken relative to ``y_true``; extra keyword arguments are accepted and ignored.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

In [None]:
def create_targ_enc_feature(col, targ_col, transform_func, tr_data, te_data, n_folds = 20, n_runs = 1):
    """Build an out-of-fold target encoding of ``col`` for train and a full-train encoding for test.

    Args:
        col: name of the categorical column to encode (e.g. 'stock_id').
        targ_col: name of the numeric column that is aggregated per category.
        transform_func: aggregation applied to ``targ_col`` per category (e.g. np.mean).
        tr_data: train DataFrame containing ``col`` and ``targ_col``.
        te_data: test DataFrame containing ``col``.
        n_folds: number of KFold splits used for the out-of-fold train encoding.
        n_runs: number of differently shuffled KFold repetitions that are averaged.

    Returns:
        (tr_col_transformed, te_col_transformed): a float numpy array aligned positionally
        with ``tr_data`` and a pandas Series aligned with ``te_data``. Test categories that
        never occur in train stay NaN.
    """
    # Test col is transformed by the whole train set aggregation
    full_stats = tr_data.groupby(col)[targ_col].agg(transform_func)
    te_col_transformed = te_data[col].map(full_stats)

    # Train col can be transformed only inside CV not to overfit.
    # Categories missing from a training fold are imputed with the global statistic of the
    # TARGET column (not of the category ids); NaNs are dropped to mirror groupby aggregation.
    glob_val = transform_func(tr_data[targ_col].dropna().to_numpy())
    tr_col_transformed = np.zeros(tr_data.shape[0], dtype = float)
    for run in range(n_runs):
        # A different seed per run, otherwise every run reproduces the same folds
        # and averaging over runs has no effect.
        kf = KFold(n_splits = n_folds, shuffle = True, random_state = 13 + run)
        for idx_train, idx_val in kf.split(tr_data):
            fold_stats = tr_data.iloc[idx_train].groupby(col)[targ_col].agg(transform_func)
            encoded = tr_data[col].iloc[idx_val].map(fold_stats).fillna(glob_val)
            # idx_val is positional, so add plain values rather than an index-carrying Series
            tr_col_transformed[idx_val] += encoded.to_numpy(dtype = float) / n_runs

    return tr_col_transformed, te_col_transformed

# EDA and data visualization 

In [None]:
def create_plot(stock_id, time_id, book_train, trade_train):
    """Plot the two order-book levels, the WAP and the trades of one (stock_id, time_id) bucket.

    Args:
        stock_id: stock identifier, only used in the plot title.
        time_id: bucket to select from both frames.
        book_train: order-book DataFrame of the stock (all time_ids).
        trade_train: trades DataFrame of the stock (all time_ids).

    Trades are aligned to the latest book snapshot at or before (trade second - 1) and are
    coloured green when the price is at least as close to the ask as to the bid ('Buy trades')
    and red otherwise ('Sell trades'). Nothing is returned; the figure is shown.
    """
    # Work on copies: columns are added/overwritten below and must not touch the caller's frames.
    bt = book_train.query('time_id == {}'.format(time_id)).copy()
    bt['wap'] = calc_wap(bt)

    trades = trade_train.query('time_id == {}'.format(time_id)).copy()
    trades['seconds_in_bucket'] = np.maximum(trades['seconds_in_bucket'] - 1, 0)

    # Combine trades prices/timestamps with book prices/timestamps:
    # for each trade, count how many seconds back the nearest book snapshot is.
    diffs = []
    times = set(bt['seconds_in_bucket'].values)
    for t in trades['seconds_in_bucket'].values:
        d = 0
        while t - d >= 0 and (t - d) not in times:
            d += 1
        if t - d < 0:
            # No book snapshot at or before this trade: report it and leave the timestamp
            # unchanged, so that diffs always has exactly one entry per trade.
            print('Negative!')
            d = 0
        diffs.append(d)

    trades['seconds_in_bucket'] -= diffs

    # Merged and calc the color (buy/sell)
    merged = pd.merge(bt, trades[['seconds_in_bucket', 'price', 'size']], on = 'seconds_in_bucket', how = 'left').dropna()
    merged['diff_with_bid'] = merged['price'] - merged['bid_price1']
    merged['diff_with_ask'] = merged['ask_price1'] - merged['price']
    merged['color'] = (merged['diff_with_bid'] < merged['diff_with_ask']).astype(int).map({0: 'green', 1: 'red'})

    seconds = bt['seconds_in_bucket'].values

    fig = plt.figure(figsize = (60, 20))
    # (column, line format, line width, legend label) in drawing order
    for price_col, fmt, width, label in [('ask_price2', 'b--', 1, 'Ask price 2'),
                                         ('ask_price1', 'b', 2, 'Ask price 1'),
                                         ('wap', 'm', 1, 'WAP'),
                                         ('bid_price1', 'g', 2, 'Bid price 1'),
                                         ('bid_price2', 'g--', 1, 'Bid price 2')]:
        plt.plot(seconds, bt[price_col].values, fmt, linewidth = width, label = label)

    # Shade the spread and the gaps between book levels: (lower column, upper column, color, alpha)
    ax = fig.axes[0]
    for lower_col, upper_col, band_color, band_alpha in [('bid_price1', 'ask_price1', 'orange', 0.15),
                                                         ('bid_price2', 'bid_price1', 'green', 0.25),
                                                         ('ask_price1', 'ask_price2', 'blue', 0.15)]:
        ax.fill_between(seconds,
                        bt[lower_col].values,
                        bt[upper_col].values,
                        color = band_color,
                        alpha = band_alpha)

    # One scatter series per trade side
    for side_color, label in [('green', 'Buy trades'), ('red', 'Sell trades')]:
        mask = (merged['color'] == side_color).values
        plt.scatter(merged['seconds_in_bucket'].values[mask],
                    merged['price'].values[mask],
                    marker = '*', color = side_color,
                    s = 500, label = label)

    plt.grid(True)
    plt.legend()
    plt.title('Stock_id = {}, time_id = {}'.format(stock_id, time_id))
    plt.xlabel('seconds_in_bucket')
    plt.ylabel('Price')
    plt.show()

In [None]:
# Draw example buckets for the first three stocks.
for stock_id in [0,1,2]:
    # Read data
    bt = pd.read_parquet(INPUT_PATH + "book_train.parquet/stock_id={}".format(stock_id))
    tt = pd.read_parquet(INPUT_PATH + "trade_train.parquet/stock_id={}".format(stock_id))
    
    # value_counts() sorts by frequency, so positions 0 and -1 are the time_ids with the
    # most and the fewest book rows for this stock.
    time_ids = bt['time_id'].value_counts().index.values[[0, -1]]
    for time_id in time_ids:
        create_plot(stock_id, time_id, bt, tt)

# Feature engineering 

In [None]:
def create_features(stock_id, time_id, bt, trades, last_s):
    """Build one feature dict for a single (stock_id, time_id) bucket, restricted to seconds >= last_s.

    Args:
        stock_id: stock identifier, stored under '#stock_id' and inside '#row_id'.
        time_id: bucket identifier, stored under '#time_id' and inside '#row_id'.
        bt: book rows of this bucket; must already contain the derived columns read below
            (log_return, log_return2, log_return_mean_price, wap balances, spreads, volumes).
        trades: trade rows of this bucket (may be an empty frame with the trade columns).
        last_s: first second of the window; also used as the name prefix of every feature.

    Returns:
        dict of features. Every computed key is prefixed with ``str(last_s) + '_'``; the three
        identifier keys ('#stock_id', '#time_id', '#row_id') are added without prefix.
    """
    # Number of percentile intervals passed to calc_array_feats (-> q_cnt + 1 percentiles per column)
    q_cnt = 5
    
    # Keep only the window [last_s, ...) of the book, and only trades strictly later than
    # (first remaining book second + 1). After the "- 1" shift below every kept trade therefore
    # has at least one book second at or before it, so the walk-back loop always finds a match.
    bt = bt[bt['seconds_in_bucket'] >= last_s]
    trades = trades[trades['seconds_in_bucket'] > bt['seconds_in_bucket'].min() + 1]
    
    # BOOK PART    
    # Realized volatility of the three log-return series over the window
    features_arr = [{
        'rv_1': realized_volatility(bt['log_return']),
        'rv_2': realized_volatility(bt['log_return2']),
        'rv_mp': realized_volatility(bt['log_return_mean_price'])
    }]
    
    # Percentiles of the raw values (diff = False)
    for col in ['abs_wap_balance', 'wap_balance', 'price_spread', 'bid_spread', 'ask_spread',
               'total_volume', 'abs_volume_imbalance', 'volume_imbalance']:
        features_arr.append(calc_array_feats(bt[col].values, 'B_' + col, q_cnt, False))
        
    # Percentiles of the first differences (diff = True)
    for col in ['seconds_in_bucket', 'bid_volume', 'ask_volume']:
        features_arr.append(calc_array_feats(bt[col].values, 'B_' + col, q_cnt, True))
    # ==========================================
    
    # TRADES PART ==========================================
    # Shift trade timestamps one second back, floored at 0
    trades['seconds_in_bucket'] = np.maximum(trades['seconds_in_bucket'] - 1, 0)

    # Combine trades prices/timestamps with book prices/timestamps:
    # for each trade walk back second by second until a second present in the book is found;
    # d is the number of seconds walked back.
    diffs = []
    times = set(bt['seconds_in_bucket'].values)
    for t in trades['seconds_in_bucket'].values:
        d = 0
        while t >= 0:
            if t in times:
                diffs.append(d)
                break
            else:
                t -= 1
                d += 1

    # Snap every trade onto the book second that was found for it
    trades['seconds_in_bucket'] -= diffs
    
    # Features of the walk-back distances themselves and of their frequency table
    features_arr.append(calc_array_feats(np.array(diffs), 'T_diffs', q_cnt, False))
    vc_diffs = pd.Series(diffs).value_counts()
    features_arr.append(calc_array_feats(vc_diffs.values, 'T_vc_diffs_values', q_cnt, False))
    features_arr.append(calc_array_feats(vc_diffs.index.values, 'T_vc_diffs_index', q_cnt, False))
    features_arr.append({'T_len_vc_diffs': len(vc_diffs)})
    
    # Percentiles of the raw values (diff = False)
    for col in ['size', 'order_count']:
        features_arr.append(calc_array_feats(trades[col].values, 'T_' + col, q_cnt, False))
        
    # Percentiles of the first differences (diff = True)
    for col in ['seconds_in_bucket', 'price']:
        features_arr.append(calc_array_feats(trades[col].values, 'T_' + col, q_cnt, True))
    # ==========================================

    # MERGED PART
    # Left merge + dropna keeps only book rows that have a trade on the same second
    # (and no NaN in any other column).
    merged = pd.merge(bt, trades[['seconds_in_bucket', 'price', 'size']], on = 'seconds_in_bucket', how = 'left').dropna()
    merged['diff_with_bid'] = merged['price'] - merged['bid_price1']
    merged['diff_with_ask'] = merged['ask_price1'] - merged['price']
    # side == 1 when the trade price is strictly closer to the best bid than to the best ask
    merged['side'] = (merged['diff_with_bid'] < merged['diff_with_ask']).astype(int)
    side = merged['side'].values
    # side is used as a column index: 0 picks the ask column, 1 picks the bid column
    merged['diff_with_side_volume1'] = np.array([row[s] for row, s in zip(merged[['ask_size1', 'bid_size1']].values, side)]) - merged['size']
    merged['diff_with_side_full_volume'] = np.array([row[s] for row, s in zip(merged[['ask_volume', 'bid_volume']].values, side)]) - merged['size']
    
    features_arr.append({
        'M_cnt': len(merged),
        'M_mean_side': np.mean(side)
    })
    
    # Running balance of sides: +1 for side == 1, -1 for side == 0
    cside = np.cumsum(2 * side - 1)
    features_arr.append(calc_array_feats(cside, 'M_cside', q_cnt, False))
    
    for col in ['diff_with_side_volume1', 'diff_with_side_full_volume', 'diff_with_bid', 'diff_with_ask']:
        features_arr.append(calc_array_feats(merged[col].values, 'M_' + col, q_cnt, False)) 
      
    # ==========================================
    # Flatten the list of dicts into one dict and prefix every feature with the window start
    features = dict(ChainMap(*features_arr))
    features = {str(last_s) + '_' + k: features[k] for k in features}
    
    # Identifier keys start with '#' and are not prefixed
    features['#stock_id'] = stock_id
    features['#time_id'] = time_id
    features['#row_id'] = '{}-{}'.format(stock_id, time_id)

    return features

In [None]:
def calc_features_dataset_for_stock(stock_id, test_flg = False, debug = False):
    """Compute the feature table of one stock, one row per time_id.

    Args:
        stock_id: stock whose book/trade parquet partitions are read from INPUT_PATH.
        test_flg: read the 'test' partitions instead of the 'train' ones.
        debug: stop after the first time_id (quick smoke run).

    Returns:
        DataFrame with alphabetically sorted feature columns (prefixed '0_' for the whole
        bucket and '300_' for seconds >= 300) plus 'stock_id', 'time_id' and 'row_id'.
    """
    part = 'test' if test_flg else 'train'

    bt = pd.read_parquet(INPUT_PATH + "book_{}.parquet/stock_id={}".format(part, stock_id))
    bt['wap'] = calc_wap(bt)
    bt['wap2'] = calc_wap2(bt)
    bt['mean_price'] = calc_mean_price(bt)

    # Log returns must be taken inside each time_id: the buckets are independent, and a diff
    # over the whole frame would give the first row of every bucket a return relative to the
    # last price of the previous bucket. Per-group diff leaves that first row as NaN instead.
    for price_col, return_col in [('wap', 'log_return'),
                                  ('wap2', 'log_return2'),
                                  ('mean_price', 'log_return_mean_price')]:
        bt[return_col] = bt.groupby('time_id')[price_col].transform(log_return)

    bt['abs_wap_balance'] = abs(bt['wap'] - bt['wap2'])
    bt['wap_balance'] = bt['wap'] - bt['wap2']

    bt['price_spread'] = 2 * (bt['ask_price1'] - bt['bid_price1']) / (bt['ask_price1'] + bt['bid_price1'])
    bt['bid_spread'] = (bt['bid_price1'] - bt['bid_price2']) / bt['bid_price1']
    bt['ask_spread'] = (bt['ask_price1'] - bt['ask_price2']) / bt['ask_price1']

    bt['total_volume'] = (bt['ask_size1'] + bt['ask_size2']) + (bt['bid_size1'] + bt['bid_size2'])
    bt['bid_volume'] = bt['bid_size1'] + bt['bid_size2']
    bt['ask_volume'] = bt['ask_size1'] + bt['ask_size2']
    bt['abs_volume_imbalance'] = abs(bt['bid_volume'] - bt['ask_volume'])
    bt['volume_imbalance'] = bt['bid_volume'] - bt['ask_volume']

    book_groups = bt.groupby('time_id')
    trade_groups = pd.read_parquet(INPUT_PATH + "trade_{}.parquet/stock_id={}".format(part, stock_id)).groupby('time_id')

    feats_arr = []
    bg_keys = book_groups.groups.keys()
    tr_keys = set(trade_groups.groups.keys())
    # Empty stand-in used for buckets that have book rows but no trades at all
    sample_trades_df = pd.DataFrame(columns = ['time_id', 'seconds_in_bucket', 'price', 'size', 'order_count'])
    for time_id in tqdm(bg_keys):
        b_gr = book_groups.get_group(time_id)
        # The trade group does not depend on the window, so look it up once per bucket
        if time_id in tr_keys:
            t_gr = trade_groups.get_group(time_id)
        else:
            t_gr = sample_trades_df.copy()

        # Two feature sets per bucket: the whole bucket (start 0) and its second half (start 300)
        window_feats = [create_features(stock_id, time_id, b_gr, t_gr, window_start)
                        for window_start in [0, 300]]
        feats_arr.append(dict(ChainMap(*window_feats)))
        if debug:
            break

    df = pd.DataFrame(feats_arr)
    # '#'-prefixed identifier keys sort first; give them their final names afterwards
    df = df[sorted(df.columns)].rename({'#stock_id': 'stock_id', '#time_id': 'time_id', '#row_id': 'row_id'}, axis = 1)
    print('Stock {} ready'.format(stock_id))
    return df

In [None]:
%%time
# Smoke test: debug = True stops after the first time_id of stock 0.
df = calc_features_dataset_for_stock(stock_id = 0, test_flg = False, debug = True)
df

# Multiprocessed preprocessor wrapper

In [None]:
def multiprocessed_df_creation(stock_ids, n_jobs = 4, test_flg = False, debug = False):
    """Compute per-stock feature tables in parallel and stack them into one DataFrame.

    Args:
        stock_ids: iterable of stock ids to process.
        n_jobs: number of joblib workers.
        test_flg: forwarded to calc_features_dataset_for_stock (read test instead of train data).
        debug: forwarded to calc_features_dataset_for_stock (first time_id only).

    Returns:
        Concatenation of all per-stock frames with a fresh 0..n-1 index.
    """
    jobs = [delayed(calc_features_dataset_for_stock)(sid, test_flg, debug) for sid in stock_ids]
    per_stock_frames = Parallel(n_jobs=n_jobs, verbose=1)(jobs)
    return pd.concat(per_stock_frames).reset_index(drop = True)


In [None]:
# Smoke test of the parallel wrapper on six stocks (debug = True: one time_id per stock).
stock_ids = [0,1,2,3,4,5]
multiprocessed_df_creation(stock_ids = stock_ids, 
                           n_jobs = N_THREADS, 
                           test_flg = False, 
                           debug = True)

# Generate full train

In [None]:
# Train table from train.csv; it is merged with the features on stock_id/time_id below and supplies 'target'.
train = pd.read_csv(INPUT_PATH + 'train.csv')
train.head()

In [None]:
# Distinct stocks present in train; one feature job is run per stock.
train_stock_ids = train.stock_id.unique()
train_stock_ids

In [None]:
%%time
# Full feature computation for every train stock (debug = False: all time_ids).
train_data = multiprocessed_df_creation(train_stock_ids, 
                                        n_jobs = N_THREADS, 
                                        test_flg = False, 
                                        debug = False)

In [None]:
# Left merge keeps every row of train.csv; rows without computed features get NaN feature values.
train_data = pd.merge(train, train_data, on = ['stock_id', 'time_id'], how = 'left')

In [None]:
# Display the merged train table.
train_data

In [None]:
# (rows, columns) of the merged train table.
train_data.shape

# Generate full test

In [None]:
# Test table from test.csv; it is merged with the features on stock_id/time_id/row_id below.
test = pd.read_csv(INPUT_PATH + 'test.csv')
test.head()

In [None]:
# DEBUG is True when test.csv has exactly 3 rows
# (presumably the small public placeholder test set — TODO confirm); it selects the hold-out flow below.
DEBUG = (test.shape[0] == 3)
DEBUG

In [None]:
# Distinct stocks present in test; one feature job is run per stock.
test_stock_ids = test.stock_id.unique()
test_stock_ids

In [None]:
%%time
# Same feature pipeline as for train, reading the test parquet partitions (test_flg = True).
test_data = multiprocessed_df_creation(test_stock_ids, 
                                       n_jobs = N_THREADS, 
                                       test_flg = True, 
                                       debug = False)

In [None]:
# Left merge keeps every row of test.csv; rows without computed features get NaN feature values.
test_data = pd.merge(test, test_data, on = ['stock_id', 'time_id', 'row_id'], how = 'left')
test_data

# Create LightAutoML model

In [None]:
# Passed to the LightAutoML reader as 'cv' in create_and_train_model.
N_FOLDS = 10
# Passed to TabularAutoML as 'timeout' (24 * 3600, presumably seconds, i.e. 24 hours — TODO confirm unit).
TIMEOUT = 24 * 3600

# Default params for LGBM models
# (passed as lgb_params['default_params'] together with freeze_defaults = True)
lgbm_params = {
      "objective": "rmse", 
      "metric": "rmse", 
      "boosting_type": "gbdt",
      'early_stopping_rounds': 30,
      'learning_rate': 0.01,
      'lambda_l1': 1.0,
      'lambda_l2': 1.0,
      'feature_fraction': 0.8,
      'bagging_fraction': 0.8,
}

In [None]:
def create_additional_feats(tr_data, te_data):
    """Add stock-level target-encoding and realized-volatility ratio features, in place.

    For each of 'target', '0_rv_1' and '300_rv_1' the per-stock mean, min and max are added as
    'stock_id_enc_<agg>_<column>' (out-of-fold for ``tr_data``, full-train for ``te_data``).
    Both frames also get the difference and the ratio of '0_rv_1' and '300_rv_1'.
    Progress is printed; nothing is returned.
    """
    aggregations = [('mean', np.mean), ('min', np.min), ('max', np.max)]
    for t_col in ['target', '0_rv_1', '300_rv_1']:
        print(t_col)
        for name, func in aggregations:
            print('\t', name)
            feat_name = 'stock_id_enc_{}_{}'.format(name, t_col)
            tr_col, te_col = create_targ_enc_feature('stock_id', t_col, func, tr_data, te_data, 20, 3)
            tr_data[feat_name] = tr_col
            te_data[feat_name] = te_col

    for frame in (tr_data, te_data):
        rv_full, rv_tail = frame['0_rv_1'], frame['300_rv_1']
        frame['0rv1_diff_300rv1'] = rv_full - rv_tail
        frame['0rv1_del_300rv1'] = rv_full / rv_tail
    

In [None]:
# In DEBUG mode hold out 20% of the train rows as a labelled validation set (te_data).
if DEBUG:
    tr_data, te_data = train_test_split(train_data, 
                                         test_size = 0.2, 
                                         random_state = 42)
    print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'
                  .format(tr_data.shape, te_data.shape))

In [None]:
def create_and_train_model(tr_data, te_data):
    """Fit a LightAutoML TabularAutoML regressor on ``tr_data`` and predict ``te_data``.

    Side effects: adds a 'weight' column to ``tr_data`` in place, prints the out-of-fold and
    test predictions and the model description, and draws the top-100 fast feature importances.

    Returns:
        (oof_pred, te_pred, automl): first column of the out-of-fold predictions, first column
        of the predictions for ``te_data``, and the fitted TabularAutoML object.
    """
    # Object weights 1 / target^2 make the weighted squared error equal to the squared
    # percentage error, so the default regression loss optimizes rmspe.
    tr_data['weight'] = 1 / tr_data['target'] ** 2

    # Task setup - regression task with its default loss and metric
    task = Task('reg',)

    # Columns roles setup
    roles = {
        'target': 'target',
        'drop': ['row_id', 'time_id'],
        'category': 'stock_id',
        'weights': 'weight'
    }

    # Train LightAutoML model
    general_params = {'use_algos': [['lgb', 'lgb_tuned', 'cb_tuned']]}
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS}
    tuning_params = {'max_tuning_time': 600}
    lgb_params = {'default_params': lgbm_params, 'freeze_defaults': True}
    automl = TabularAutoML(task = task,
                           timeout = TIMEOUT,
                           cpu_limit = N_THREADS,
                           general_params = general_params,
                           reader_params = reader_params,
                           tuning_params = tuning_params,
                           lgb_params = lgb_params,
                           verbose = 3)

    oof_pred = automl.fit_predict(tr_data, roles = roles)
    print('OOF prediction for tr_data:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

    # Fast feature importances calculation
    importances = automl.get_feature_scores('fast')
    top_importances = importances.set_index('Feature')['Importance'].head(100)
    top_importances.plot.bar(figsize = (50, 10), grid = True)

    # Let's see how the final model looks like
    print(automl.create_model_str_desc())

    # Test data prediction
    te_pred = automl.predict(te_data)
    print('Prediction for te_data:\n{}\nShape = {}'.format(te_pred, te_pred.shape))

    return oof_pred.data[:, 0], te_pred.data[:, 0], automl

In [None]:
# DEBUG flow: train on the 80% split, score on the 20% hold-out, then predict the real test set.
if DEBUG:
    create_additional_feats(tr_data, te_data)
    
    oof_pred, valid_pred, automl = create_and_train_model(tr_data, te_data)
    
    # Check scores
    print('OOF RMSPE score = {:.5f}'.format(rmspe(tr_data['target'], oof_pred)))
    print('TEST RMSPE score = {:.5f}'.format(rmspe(te_data['target'], valid_pred)))
    
    # The real test set needs the same encodings, computed from the training part
    create_additional_feats(tr_data, test_data)
    
    test_pred = automl.predict(test_data)
    # Explicit copy: a column is assigned next, which must not target a selection of test_data
    submission = test_data[['row_id']].copy()
    submission['target'] = test_pred.data[:, 0]
    submission.to_csv('submission.csv', index = False)

In [None]:
# Full flow: train on all train rows and predict the test set.
if not DEBUG:
    create_additional_feats(train_data, test_data)
    
    oof_pred, test_pred, automl = create_and_train_model(train_data, test_data)
    
    # Check scores
    print('OOF RMSPE score = {:.5f}'.format(rmspe(train_data['target'], oof_pred)))
    
    # Explicit copy: a column is assigned next, which must not target a selection of test_data
    submission = test_data[['row_id']].copy()
    submission['target'] = test_pred
    submission.to_csv('submission.csv', index = False)

# Bonus. Feature importances and model structure 

In [None]:
# Fast feature importances calculation
# (uses the `automl` object left by whichever of the two training cells above ran)
fast_fi = automl.get_feature_scores('fast')
fast_fi.set_index('Feature')['Importance'].head(100).plot.bar(figsize = (50, 10), grid = True)

In [None]:
# Let's see how the final model looks like
# (prints the description string produced by automl.create_model_str_desc())
print(automl.create_model_str_desc())