In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import talib as ta
from itertools import combinations
import seaborn as sns
import os, sys, warnings
from time import time 
from create_feature import *
from glob import glob
import lightgbm as lgb
import joblib, gc
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import sys 
# add path of optiver2023 package to pythonpath 
sys.path.append(os.path.abspath('../data'))

import optiver2023 

In [2]:
def gen_test_feature(test, cache,  counter, max_ts_len):
    """Build the feature rows for the current batch served by the test API.

    The batch is run through the v1/v2 feature generators, appended to a
    rolling cache of recent timesteps, and the v3 generator is then run on
    the whole cache. Only the rows belonging to the current
    ``seconds_in_bucket`` value(s) are returned.

    Note: this function reads AND updates the module-level ``feature_dicts``
    (keys ``v1_features``, ``v1_feature_category``, ``v2_features`` and
    ``v3_features`` are overwritten on every call); it must be defined before
    the first call.

    Parameters
    ----------
    test : pandas.DataFrame
        Rows of the current timestep; must contain ``seconds_in_bucket``.
    cache : pandas.DataFrame or None
        v2-level rows of previous timesteps (empty/None at the start of a day).
    counter : int
        Zero-based index of the current timestep within the day, as
        maintained by the caller.
    max_ts_len : int
        Maximum number of distinct ``seconds_in_bucket`` values kept in cache.

    Returns
    -------
    (df_test, cache, feature_dicts)
        Feature rows of the current timestep, the updated cache and the
        updated feature dictionary.
    """
    current_sec = test['seconds_in_bucket'].unique()
    print(f"Current seconds: {current_sec}")

    test = test.fillna(0)
    test = reduce_mem_usage(test, verbose=0)

    df_v1, v1_feat, v1_feat_cat = gen_v1_features(test, feature_dicts['prices'])
    feature_dicts['v1_features'] = v1_feat
    feature_dicts['v1_feature_category'] = v1_feat_cat

    v2_feat_cols = feature_dicts['prices'] + feature_dicts['sizes'] + feature_dicts['v1_features']
    df_v2, v2_features = gen_v2_features(df_v1, v2_feat_cols)
    feature_dicts['v2_features'] = v2_features

    # Skip an empty cache instead of concatenating it: pandas deprecates
    # empty entries in concat (FutureWarning) and the result is the same.
    if cache is None or cache.empty:
        frames = [df_v2]
    else:
        frames = [cache, df_v2]
    cache = pd.concat(frames, ignore_index=True)

    # In cache, we keep only the past max_ts_len seconds of data.
    # With a zero-based counter the cache already holds max_ts_len + 1
    # timesteps when counter == max_ts_len, hence ">=" rather than ">".
    if counter >= max_ts_len:
        sec_in_buk_list = np.sort(cache['seconds_in_bucket'].unique())
        sec_to_keep = sec_in_buk_list[-max_ts_len:]
        cache = cache[cache['seconds_in_bucket'].isin(sec_to_keep)].reset_index(drop=True)

    df_v3, v3_features = gen_v3_features(
        cache,
        feature_dicts['prices'],
        feature_dicts['sizes'],
        feature_dicts['v1_features']
        )

    feature_dicts['v3_features'] = v3_features

    # Models cannot consume NaN/inf produced by the feature generators.
    df_v3.fillna(0, inplace=True)
    df_v3.replace([np.inf, -np.inf], 0, inplace=True)
    df_v3 = reduce_mem_usage(df_v3, verbose=0)

    # Explicit copy so callers can assign into the result without
    # triggering pandas' SettingWithCopyWarning.
    df_test = df_v3[df_v3['seconds_in_bucket'].isin(current_sec)].copy()

    return df_test, cache, feature_dicts

In [3]:
# Competition API: iter_test yields one batch of test rows per timestep.
env = optiver2023.make_env()
iter_test = env.iter_test()

max_ts_len = 12 # max length of ts to keep in cache

# Base column groups; the feature generators add their own keys to this dict.
feature_dicts = {
    'prices': ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"],
    'sizes':  ["matched_size", "bid_size", "ask_size", "imbalance_size"],
    "category": ["stock_id", "seconds_in_bucket", 'imbalance_buy_sell_flag']
}

# glob returns files in arbitrary order; sort so the ensemble is loaded
# deterministically, and fail fast if nothing matched (an empty ensemble
# would otherwise only surface later as a division by zero).
model_files = sorted(glob("../data/lgb_regressor_fold_*.pkl"))
if not model_files:
    raise FileNotFoundError("No model files matched ../data/lgb_regressor_fold_*.pkl")

# NOTE: joblib.load unpickles its input - only load files you trust.
models = [joblib.load(model_file) for model_file in model_files]

print(len(models))

scaler_train = joblib.load("../data/scaler.save")

5


TODO: add an if-condition to skip predicting rows that are not scored.

In [4]:
# The loop below counts timesteps 0..54 within a day, i.e. 55 per day.
STEPS_PER_DAY = 55

counter = 0
n_reveals = 0

cache = pd.DataFrame()
df_records = pd.DataFrame()
# Per-timestep feature frames; concatenated on demand instead of growing
# df_records with pd.concat on every iteration (which is quadratic).
record_frames = []

day_begin = time()

for (test, revealed_targets, sample_prediction) in iter_test:

    df_test, cache, feature_dicts = gen_test_feature(test, cache, counter, max_ts_len)

    record_frames.append(df_test)

    feature_cols, category_cols = gen_feature_cols(feature_dicts)

    scale_cols = [x for x in feature_cols if x not in category_cols]

    # Copy before assigning: writing into a column slice of another frame
    # triggers SettingWithCopyWarning and may silently not take effect.
    df_test = df_test[feature_cols].copy()
    df_test[scale_cols] = scaler_train.transform(df_test[scale_cols])

    # predict target using trained models (simple average over the folds)
    target = sum(model.predict(df_test) for model in models) / len(models)

    sample_prediction['target'] = target

    env.predict(sample_prediction)

    # after the last timestep of a day, reset the per-day state
    if counter >= STEPS_PER_DAY - 1:
        print(f"New Day! Time used: {time() - day_begin:.2f}s.")
        counter = 0
        day_begin = time()
        cache = pd.DataFrame()
    else:
        counter += 1

    # counter == 1 right after the first timestep of a day has been handled;
    # revealed_targets then refers to a previous day.
    if counter == 1:
        n_reveals += 1
        print('Targets revealed for day', revealed_targets['revealed_date_id'].unique().tolist())
        # The first reveal is skipped: no records have been collected yet
        # for the day it refers to.
        if n_reveals > 1:
            df_records = pd.concat(record_frames)
            # DataFrame.merge returns a new frame (it is not in-place), so
            # the result must be kept. It is stored separately so that
            # df_records stays free of target columns from earlier reveals.
            df_revealed = df_records.merge(
                revealed_targets,
                left_on=['date_id', 'stock_id', 'seconds_in_bucket'],
                right_on=['revealed_date_id', 'stock_id', 'seconds_in_bucket'],
                how='left')
            print(df_revealed.shape)

# Final view of everything collected during the run.
df_records = pd.concat(record_frames) if record_frames else pd.DataFrame()

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Current seconds: [0]
Targets revealed for day [477]
Current seconds: [10]
Current seconds: [20]
Current seconds: [30]
Current seconds: [40]
Current seconds: [50]
Current seconds: [60]
Current seconds: [70]
Current seconds: [80]
Current seconds: [90]
Current seconds: [100]
Current seconds: [110]
Current seconds: [120]
Current seconds: [130]
Current seconds: [140]
Current seconds: [150]
Current seconds: [160]
Current seconds: [170]
Current seconds: [180]
Current seconds: [190]
Current seconds: [200]
Current seconds: [210]
Current seconds: [220]
Current seconds: [230]
Current seconds: [240]
Current seconds: [250]
Current seconds: [260]
Current seconds: [270]
Current seconds: [280]
Current seconds: [290]
Current seconds: [300]
Current seconds: [310]
Current seconds: [320]
Current seconds: [330]
Current seconds: [340]
Current seconds: [350]
Current seconds: [360]
Curr

In [None]:
# Load the example test file via the same relative data directory the rest of
# the notebook uses, instead of a machine-specific absolute path.
# NOTE(review): assumes ../data is the directory the old absolute path pointed to - confirm.
example_test = pd.read_csv("../data/example_test_files/test.csv")

# Inspect a single batch: one date and one seconds_in_bucket value.
df_test = example_test[(example_test['date_id'] == 479)&(example_test['seconds_in_bucket'] == 130)]

df_test