# Trading at the Close - Inference
-----------------------
-----------------------

## Imports
-----------

In [1]:
import sys
import numpy as np
from pathlib import Path
import pandas as pd

import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", pd.errors.PerformanceWarning)

In [2]:
# Put the optiver-inference-utils directory (under /kaggle/input) at the front
# of the module search path so its modules win over any same-named ones.
utils_path = Path("/", "kaggle", "input", "optiver-inference-utils")
utils_dir = str(utils_path)
already_registered = utils_dir in sys.path
if not already_registered:
    # Rebind to a fresh list with the utils directory in first position
    sys.path = [utils_dir, *sys.path]

print(sys.path)

['\\kaggle\\input\\optiver-inference-utils', 'c:\\Users\\salva\\Documents\\Python Projects\\trading-at-the-close', 'C:\\Program Files\\Python\\python310.zip', 'C:\\Program Files\\Python\\DLLs', 'C:\\Program Files\\Python\\lib', 'C:\\Program Files\\Python', 'c:\\Users\\salva\\Documents\\Python Projects\\trading-at-the-close\\.venv', '', 'c:\\Users\\salva\\Documents\\Python Projects\\trading-at-the-close\\.venv\\lib\\site-packages', 'c:\\Users\\salva\\Documents\\Python Projects\\trading-at-the-close\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\salva\\Documents\\Python Projects\\trading-at-the-close\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\salva\\Documents\\Python Projects\\trading-at-the-close\\.venv\\lib\\site-packages\\Pythonwin']


In [3]:
class CFG:
    """Static settings of the inference run.

    ``JOBS_PATH``, ``INPUT_METADATA_PATH`` and ``MODEL_PATH`` are not declared
    here: the API-selection cell attaches them once the run mode is known.
    """

    # Run mode: True uses the local mock API, False the online competition API.
    LOCAL: bool = True

    # Root folder of the job artifacts for each run mode.
    JOBS_PATH_LOCAL: Path = Path(".", "job_artifacts")
    JOBS_PATH_ONLINE: Path = Path("/", "kaggle", "input", "optiver-trained-artifacts", "job_artifacts")

    # CSV file replayed through the mock API when running locally.
    TEST_PATH: Path = Path(".", "train_files", "train.csv")

    # Job folder and file name of the metadata JSON read before inference.
    INPUT_METADATA_CASE: str = "optiver-feature_selection-0010"
    INPUT_METADATA_NAME: str = "output_metadata.json"

    # Job folder searched for the model files of the ensemble.
    MODEL_CASE: str = "optiver-feature_selection-0010"

    # Number of most recent rows kept per stock in the inference cache.
    CACHE_HORIZON: int = 11

## Inference
--------------

In [4]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so that the result adds up to zero.

    The total of ``prices`` is removed proportionally to the square root of
    each entry of ``volumes``, so entries with a larger volume absorb a larger
    share of the correction.

    Args:
        prices: Array-like of values to adjust.
        volumes: Array-like of the same length used to weight the correction.

    Returns:
        ``prices`` minus the weighted correction; its sum is zero up to
        floating point error.
    """
    weights = np.sqrt(volumes)
    # Amount of correction carried by one unit of weight
    per_weight_shift = np.sum(prices) / np.sum(weights)
    return prices - weights * per_weight_shift

In [5]:
def zero_mean(prices):
    """Return ``prices`` centred around zero.

    The input is left untouched: a new array is returned instead of
    subtracting in place, so the caller's data is not silently modified and
    integer inputs (which cannot hold the float result in place) also work.

    Args:
        prices: Array-like exposing ``.mean()`` (e.g. a NumPy array).

    Returns:
        A new array holding ``prices`` minus its mean.
    """
    return prices - prices.mean()

In [6]:
def load_model(model_type, booster_file):
    """Instantiate a model from its ``.txt`` file.

    Args:
        model_type: Callable accepting a ``model_file`` keyword argument.
        booster_file: ``pathlib.Path`` of the model; its suffix is replaced by
            ``.txt`` before loading.

    Returns:
        The object returned by ``model_type``.
    """
    txt_path = booster_file.with_suffix(".txt")
    return model_type(model_file=str(txt_path))

In [7]:
# Pick the inference API and the artifacts root according to the run mode
if CFG.LOCAL:
    # Local API: replay part of the training file through the mock API
    from utils.public_timeseries_testing_util import MockApi
    import pandas as pd

    df = pd.read_csv(CFG.TEST_PATH)
    env = MockApi(df, start_date=478, end_date=480)

    CFG.JOBS_PATH = CFG.JOBS_PATH_LOCAL

else:
    # Online API
    import optiver2023
    optiver2023.make_env.func_dict['__called__'] = False # This enables running the API again if an error was produced

    env = optiver2023.make_env()

    CFG.JOBS_PATH = CFG.JOBS_PATH_ONLINE

# Both APIs hand out the test batches through the same iterator interface
iter_test = env.iter_test()

# Locations of the metadata file and of the model folder inside the chosen root
CFG.INPUT_METADATA_PATH = CFG.JOBS_PATH.joinpath(CFG.INPUT_METADATA_CASE, CFG.INPUT_METADATA_NAME)
CFG.MODEL_PATH = CFG.JOBS_PATH.joinpath(CFG.MODEL_CASE)

Creating mock API...


KeyboardInterrupt: 

In [None]:
# Use loguru when it is installed, otherwise fall back to the standard logging
# module. Either way only messages of level ERROR and above are emitted.
try:
    from loguru import logger
except ImportError:
    # Only a missing loguru triggers the fallback; any other failure (or a
    # KeyboardInterrupt) is no longer swallowed.
    import logging
    logger = logging.getLogger("__main__")
    logger.setLevel("ERROR")
else:
    # Drop the handlers registered so far and log errors to stdout only
    logger.remove()
    logger.add(sys.stdout, level="ERROR")

In [None]:
from utils.files import read_json
from utils.features import feature_engineering, select_features
from lightgbm import Booster
from timeit import default_timer as timer

# Load the metadata file: per-stock weights, per-stock clusters and the list
# of features fed to the models.
metadata = read_json(CFG.INPUT_METADATA_PATH)
# Keys are cast back to int (presumably because JSON object keys are strings)
stock_weights = {int(k): v for k, v in metadata["stock_weights"].items()}
stock_clusters = {int(k): v for k, v in metadata["stock_clusters"].items()}
selected_features = metadata["selected_features"]

counter = 0  # number of API iterations answered so far
predictions, qps = [], []  # qps collects the wall-clock seconds spent on each iteration

# Sorted so the ensemble is always accumulated in the same order (glob order is arbitrary)
models_boosters = sorted(CFG.MODEL_PATH.glob("**/*.txt"))
models = [load_model(Booster, path) for path in models_boosters]
if not models:
    # Fail early with a clear message instead of breaking later at the ensemble average
    raise FileNotFoundError(f"No model files (*.txt) found under {CFG.MODEL_PATH}")

cache_test = pd.DataFrame()
cache_revealed_targets = None
for (test, revealed_targets, sample_prediction) in iter_test:
    start = timer()

    current_date_id = test["date_id"].iloc[0]
    current_second_in_bucket = test["seconds_in_bucket"].iloc[0]

    # Save revealed target when available
    if revealed_targets.shape[0] > 1:
        cache_revealed_targets = revealed_targets

    # Cache inference data (without feature engineering for memory reasons)
    cache_test = pd.concat([cache_test, test], ignore_index=True, axis=0)
    if counter > 0:
        # Keep only the last CACHE_HORIZON rows of every stock, in chronological order
        cache_test = cache_test.groupby(["stock_id"]).tail(CFG.CACHE_HORIZON).sort_values(
                by=["date_id", "seconds_in_bucket", "stock_id"]).reset_index(drop=True)

    # Do feature engineering over cache and take only the data relevant for inference in this timestep (revealed target is introduced by other means)
    feat = feature_engineering(cache_test, revealed_target=False, weights=stock_weights, clusters=stock_clusters)[-test.shape[0]:]
    #if cache_revealed_targets is not None:
    #    feat["revealed_target"] = cache_revealed_targets["revealed_target"].loc[
    #        (cache_revealed_targets["revealed_date_id"] == current_date_id - 1) & (cache_revealed_targets["seconds_in_bucket"] == current_second_in_bucket)]
    #else:
    #    feat["revealed_target"] = np.nan

    # New addition to the inference API: batches flagged as not scored get a
    # constant zero prediction and skip the models entirely
    if not feat.currently_scored.iloc[0]:
        sample_prediction["target"] = 0
        env.predict(sample_prediction)
        counter += 1
        qps.append(timer() - start)
        if counter % 10 == 0:
            print(counter, "qps:", np.mean(qps))
        continue

    feat = select_features(feat, selected_features, reduce_memory=False)
    # Perform prediction as a mean ensemble
    prediction = 0
    for model in models:
        prediction += model.predict(feat)
    prediction /= len(models)

    # Do postprocessing over the predictions
    #prediction = zero_sum(prediction, test["bid_size"] + test["ask_size"])
    prediction = zero_mean(prediction)

    # Save prediction
    sample_prediction["target"] = prediction
    env.predict(sample_prediction)
    counter += 1

    qps.append(timer() - start)

    if counter % 10 == 0:
        print(f"Date ID {current_date_id}, second in bucket {current_second_in_bucket} qps: {np.mean(qps)}")

    # NOTE(review): 1.146 presumably converts the mean seconds per iteration into
    # hours for the full run (about 4125 iterations / 3600 s) - confirm.
    # NOTE(review): this estimate is printed on every scored iteration; confirm
    # whether it was meant to run once after the loop.
    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")

Date ID 478, second in bucket 0 (took 0.91s)
Date ID 478, second in bucket 10 (took 0.93s)
Date ID 478, second in bucket 20 (took 0.98s)
Date ID 478, second in bucket 30 (took 1.07s)
Date ID 478, second in bucket 40 (took 1.14s)
Date ID 478, second in bucket 50 (took 1.25s)
Date ID 478, second in bucket 60 (took 1.46s)
Date ID 478, second in bucket 70 (took 1.40s)
Date ID 478, second in bucket 80 (took 1.53s)
Date ID 478, second in bucket 90 (took 1.57s)
Date ID 478, second in bucket 100 (took 1.61s)
Date ID 478, second in bucket 110 (took 1.63s)
Date ID 478, second in bucket 120 (took 1.60s)
Date ID 478, second in bucket 130 (took 1.59s)
Date ID 478, second in bucket 140 (took 1.62s)
Date ID 478, second in bucket 150 (took 1.62s)
Date ID 478, second in bucket 160 (took 1.62s)
Date ID 478, second in bucket 170 (took 1.60s)
Date ID 478, second in bucket 180 (took 1.61s)
Date ID 478, second in bucket 190 (took 1.60s)
Date ID 478, second in bucket 200 (took 1.60s)
Date ID 478, second in b