# Trading at the Close - Inference
-----------------------
-----------------------

## Imports
-----------

In [1]:
import sys
import numpy as np
from pathlib import Path
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
utils_path = Path("/", "kaggle", "input", "optiver-inference-utils")
if str(utils_path) not in sys.path:
    sys.path = [str(utils_path),] + sys.path
    
print(sys.path)

['/kaggle/input/optiver-inference-utils', '/kaggle/working', '/kaggle/lib/kagglegym', '/kaggle/lib', '/kaggle/input/optiver-trading-at-the-close', '/opt/conda/lib/python310.zip', '/opt/conda/lib/python3.10', '/opt/conda/lib/python3.10/lib-dynload', '', '/root/.local/lib/python3.10/site-packages', '/opt/conda/lib/python3.10/site-packages', '/root/src/BigQuery_Helper']


In [3]:
class CFG:
    LOCAL = True
    JOBS_PATH = Path("/", "kaggle", "input", "optiver-trained-artifacts", "job_artifacts")
    FEATURES_PATH = JOBS_PATH.joinpath("optiver-feature_selection-0008")
    FEATURES_NAME = "feature_names.json"
    MEMORY_HORIZON = 5 # Retain MEMORY_HORIZON date_ids
    MODEL_PATH = JOBS_PATH.joinpath("optiver-tuning_lgbmregressor-0009")


## Inference
--------------

In [4]:
def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices - std_error * step

    return out

In [5]:
import optiver2023
optiver2023.make_env.func_dict['__called__'] = False

env = optiver2023.make_env()
iter_test = env.iter_test()

In [6]:
def merge_df(df1, df2, on):

    if len(df1) == 0:
        return df2
    elif len(df2) == 0:
        return df1
    else:
        df3=pd.merge(df1, df2, on=on, how='outer')
        #now we have a mess to fix
        cols=[x[:-2] for x in df3.columns if x.endswith('_x')]
        for i_col in cols:
            df3.loc[:,i_col+'_x']=df3[i_col+'_x'].combine_first(df3[i_col+'_y'])
            df3.rename(columns={i_col+'_x':i_col},inplace=True)
            df3.drop(columns=[i_col+'_y'],inplace=True)
        return df3

In [7]:
def apply_aggregations(df: pd.DataFrame, aggregations_date = None, reduce_memory: bool = False):

    # Perform aggregations and features dependent on these
    if aggregations_date is not None:
        for key, value in aggregations_date.items():
            aggr_index = ["date_id", "seconds_in_bucket"]
            aggr_colum = pd.Series(value, name=key + "_date_aggr")
            aggr_colum.index = aggr_colum.index.set_names(aggr_index)
            df = merge_df(df, aggr_colum, on=aggr_index)


    return df

In [8]:
def load_model(
    model_type,
    booster_file
):
    model = model_type(model_file=str(booster_file.with_suffix(".txt")))
    
    return model

In [9]:
from utils.files import read_json
from utils.features import make_features, select_features, make_rolling_features
from utils.aggregations import aggregate
from lightgbm import Booster
from timeit import default_timer as timer

features = read_json(CFG.FEATURES_PATH.joinpath(CFG.FEATURES_NAME))
selected_features = features["selected_features"]
selected_features.remove("target") # In case it has been included
stock_weights = features["stock_weights"]
counter = 0
predictions = []

models_boosters = CFG.MODEL_PATH.glob("**/*.txt")
models = [load_model(Booster, path) for path in models_boosters][:1]

feat_prev = None
previous_targets = []
cummulative_feat = None
for (test, revealed_targets, sample_prediction) in iter_test:

    start = timer()
    
    #################################################
    # Accumulate data
    #################################################
    revealed_feat = revealed_targets.copy()
    current_date_id = test["date_id"].iloc[0]
    current_second_in_bucket = test["seconds_in_bucket"].iloc[0]

    # Accumulate features over time for rolling features
    cummulative_feat = test if cummulative_feat is None else \
            pd.concat([cummulative_feat, test], ignore_index=True, axis=0)

    # Drop features with certain age (this maintains the memory usage stable)
    cummulative_feat = cummulative_feat[cummulative_feat["date_id"] > \
                                        (current_date_id - CFG.MEMORY_HORIZON)]

    # Add previous targets to the rolling cummulative features
    if revealed_feat.shape[0] > 1:
        revealed_feat = revealed_feat[["stock_id", "revealed_date_id", "seconds_in_bucket", "revealed_target"]]
        revealed_feat = revealed_feat.rename(columns={"revealed_target": "target", "revealed_date_id": "date_id"})
        cummulative_feat = pd.merge(revealed_feat, cummulative_feat, on=["stock_id", "date_id", "seconds_in_bucket"], how="outer")

    for col in cummulative_feat.columns:
        cummulative_feat[col] = cummulative_feat[col].apply(lambda x: x if not pd.isna(x) else np.nan)

    elapsed = timer() - start
    print(f"Preparation (took {elapsed:.2f}s)")

    #################################################
    # Feature engineering
    #################################################

    start = timer()
    
    # Create new features
    cummulative_feat = make_features(cummulative_feat, reduce_memory=True)

    # Apply (stock-wise) aggregations
    _, aggregations = aggregate(cummulative_feat, weights=stock_weights)
    cummulative_feat = apply_aggregations(cummulative_feat, aggregations_date=aggregations, reduce_memory=True)

    # Get rolling features
    cummulative_feat = make_rolling_features(cummulative_feat, reduce_memory=True)

    # Get only the features from the current date and second
    feat = (cummulative_feat[(cummulative_feat["date_id"] == current_date_id) & \
                             (cummulative_feat["seconds_in_bucket"] == current_second_in_bucket)]).copy().reset_index()

    # Perform feature selection for inference. This also ensures the order is the same than during training
    feat = select_features(feat, selected_features, reduce_memory=True) 

    elapsed = timer() - start
    print(f"Feat Eng (took {elapsed:.2f}s)")
    
    start = timer()
    #################################################
    # Prediction
    #################################################
    # Mean ensemble
    prediction = 0
    for model in models:
        prediction += model.predict(feat)   
    prediction /= len(models)

    prediction = zero_sum(prediction, test.loc[:, "bid_size"] + test.loc[:, "ask_size"])
    elapsed = timer() - start
    print(f"Inference (took {elapsed:.2f}s)")

    #################################################
    # Storage
    #################################################
    sample_prediction["target"] = prediction
    env.predict(sample_prediction)
    counter += 1

    elapsed = timer() - start
    print(f"Date ID {current_date_id}, second in bucket {current_second_in_bucket} (took {elapsed:.2f}s)")

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Preparation (took 0.27s)
Feat Eng (took 1.25s)
Inference (took 0.02s)
Date ID 478, second in bucket 0 (took 0.02s)
Preparation (took 1.61s)
Feat Eng (took 1.37s)
Inference (took 0.02s)
Date ID 478, second in bucket 10 (took 0.02s)
Preparation (took 1.65s)
Feat Eng (took 1.35s)
Inference (took 0.02s)
Date ID 478, second in bucket 20 (took 0.02s)
Preparation (took 1.73s)
Feat Eng (took 1.46s)
Inference (took 0.02s)
Date ID 478, second in bucket 30 (took 0.02s)
Preparation (took 1.73s)
Feat Eng (took 1.37s)
Inference (took 0.02s)
Date ID 478, second in bucket 40 (took 0.02s)
Preparation (took 1.71s)
Feat Eng (took 1.40s)
Inference (took 0.02s)
Date ID 478, second in bucket 50 (took 0.02s)
Preparation (took 1.73s)
Feat Eng (took 1.51s)
Inference (took 0.02s)
Date ID 478, second in bucket 60 (took 0.02s)
Preparation (took 1.78s)
Feat Eng (took 1.44s)
Inference (took 0