# Trading at the Close - Inference
-----------------------
-----------------------


This notebook is intended to be run after the training notebook. It takes the artifacts produced by the feature-selection and hyperparameter-search jobs and generates the final predictions for the public leaderboard.

## Installs
------------

In [1]:
# Install the dependencies listed in requirements.txt; %pip (rather than !pip)
# installs into the environment of the running kernel.
%pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


## Imports

In [1]:
import numpy as np
from pathlib import Path
import pandas as pd
import seaborn as sns

sns.set_style("ticks")

In [7]:
class CFG:
    """Static settings for one inference run (paths, job names and run mode)."""

    # True -> replay the local test file through the mock API;
    # False -> use the `optiver2023` competition environment.
    LOCAL = True

    # Root folder that holds one sub-folder per job.
    JOBS_PATH = Path(".") / "job_artifacts"

    # Prefix of the folder created for this run; a numeric suffix is appended later.
    JOB_NAME = "optiver-inference_lgbmregressor"

    # CSV with the test rows used for local runs.
    TEST_PATH = Path(".") / "example_test_files" / "test.csv"

    # Folder and file name of the JSON that lists the selected features.
    FEATURES_PATH = JOBS_PATH / "optiver-feature_selection-0002"
    FEATURES_NAME = "feature_names.json"

    # Folder that is searched recursively for pickled models.
    MODEL_PATH = JOBS_PATH / "optiver-tuning_lgbmregressor-0006"

Create the artifacts folder tree incrementally: each run gets its own folder, named with the job name plus the next unused four-digit suffix.

In [11]:
CFG.JOBS_PATH.mkdir(exist_ok=True, parents=True)

# Claim the first unused "<JOB_NAME>-NNNN" folder for this run.
# Only "folder already exists" means "try the next number"; any other error
# (permissions, read-only disk, ...) must surface instead of being swallowed.
for i in range(1, 10000):
    CFG.JOB_PATH = CFG.JOBS_PATH.joinpath(f"{CFG.JOB_NAME}-{i:04d}")
    try:
        CFG.JOB_PATH.mkdir()
        break
    except FileExistsError:
        continue
else:
    # Every suffix from 0001 to 9999 is taken: fail loudly rather than leave
    # CFG.JOB_PATH pointing at a folder that belongs to another run.
    raise RuntimeError(
        f"No free job folder left for '{CFG.JOB_NAME}' under {CFG.JOBS_PATH}"
    )

## Data Loading
---------------

In [9]:
from utils.featurizers import featurize
from utils.files import read_json

test_raw = pd.read_csv(CFG.TEST_PATH)

# The feature list is stored under the "selected_features" key of the JSON file.
features_file = CFG.FEATURES_PATH / CFG.FEATURES_NAME
selected_features = read_json(features_file)["selected_features"]

# The return value is discarded and X is copied from the frame afterwards.
# NOTE(review): this presumably relies on featurize() modifying the frame in place - confirm.
featurize(test_raw, selected_features)

# Keep an independent copy as X and drop the original name.
X = test_raw.copy()
del test_raw

[32m2023-10-12 18:51:30.101[0m | [1mINFO    [0m | [36mutils.featurizers[0m:[36mfeaturize[0m:[36m16[0m - [1mCreating additional features...[0m
[32m2023-10-12 18:51:30.468[0m | [1mINFO    [0m | [36mutils.featurizers[0m:[36mfeaturize[0m:[36m64[0m - [1mDropping unnecesary features...[0m
[32m2023-10-12 18:51:30.472[0m | [1mINFO    [0m | [36mutils.featurizers[0m:[36mfeaturize[0m:[36m68[0m - [1mReducing data memory footprint...[0m
[32m2023-10-12 18:51:30.475[0m | [1mINFO    [0m | [36mutils.compression[0m:[36mdowncast[0m:[36m11[0m - [1mMemory usage of dataframe is 3.78 MB[0m
[32m2023-10-12 18:51:30.494[0m | [1mINFO    [0m | [36mutils.compression[0m:[36mdowncast[0m:[36m20[0m - [1mMemory usage after optimization is: 2.36 MB[0m
[32m2023-10-12 18:51:30.495[0m | [1mINFO    [0m | [36mutils.compression[0m:[36mdowncast[0m:[36m22[0m - [1mDecreased by 37.50%[0m


## Inference
--------------

In [5]:
def zero_sum(prices, volumes):
    """Shift `prices` so that they sum to zero, weighting the shift by sqrt(volume).

    Each element is reduced by ``sqrt(volume) * step`` where
    ``step = sum(prices) / sum(sqrt(volumes))``, so the total amount removed
    equals ``sum(prices)``.

    Parameters
    ----------
    prices : array-like
        Values to adjust.
    volumes : array-like
        Non-negative weights, same length as `prices`.

    Returns
    -------
    array-like
        Adjusted values. If all volumes are zero the shift is spread evenly
        (i.e. the mean is subtracted) instead of dividing by zero.
    """
    std_error = np.sqrt(volumes)
    total_weight = np.sum(std_error)

    if total_weight == 0:
        # All-zero (or empty) volumes would give a 0 divisor and NaN output.
        # Fall back to uniform weights; `* 0 + 1` keeps the container type.
        std_error = std_error * 0 + 1
        total_weight = max(np.size(std_error), 1)

    step = np.sum(prices) / total_weight
    out = prices - std_error * step

    return out

In [12]:

from utils.files import load_model

if CFG.LOCAL:
    # Replay the locally loaded frame through a mock of the competition API
    # and write the submission into this run's job folder.
    from utils.public_timeseries_testing_util import MockApi
    env = MockApi(X, submission_path=CFG.JOB_PATH.joinpath("submission.csv"))
else:
    import optiver2023
    env = optiver2023.make_env()

iter_test = env.iter_test()
counter = 0
predictions = []  # not filled by this loop; kept because later cells may use the name

# Load every pickled model found under MODEL_PATH (sorted for a stable order).
models_path = sorted(CFG.MODEL_PATH.glob("**/*.pkl"))
if not models_path:
    # Without this check the mean below fails with an opaque ZeroDivisionError.
    raise FileNotFoundError(f"No *.pkl models found under {CFG.MODEL_PATH}")
models = [load_model(path) for path in models_path]

for (test, revealed_targets, sample_prediction) in iter_test:
    # Read the volumes before featurizing, in case featurize() alters the frame.
    # A plain array keeps the assignment to sample_prediction positional.
    volumes = (test.loc[:, "bid_size"] + test.loc[:, "ask_size"]).to_numpy()

    # NOTE(review): the data-loading cell calls featurize(df, selected_features)
    # while this call passes no feature list - confirm the models expect this.
    feat = featurize(test)

    # Mean ensemble
    prediction = sum(model.predict(feat) for model in models) / len(models)

    # Apply the zero-sum adjustment to the ensemble output. The original code
    # read an undefined `fold_prediction` here and submitted the raw prediction.
    prediction = zero_sum(prediction, volumes)

    # Clipping to a [y_min, y_max] range is disabled: neither bound is defined
    # in this notebook.
    sample_prediction["target"] = prediction
    env.predict(sample_prediction)
    counter += 1

Creating mock API...


KeyError: "['target'] not found in axis"