In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import talib as ta
from itertools import combinations
import seaborn as sns
import os, sys, warnings
from time import time 
from create_feature import *
from glob import glob
import lightgbm as lgb
import joblib, gc
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import sys 
# add path of optiver2023 package to pythonpath 
sys.path.append(os.path.abspath('../data'))

import optiver2023 

In [2]:
def gen_section_feat(test, feature_dicts):
    """Build the v1 and v2 feature sets for one batch of rows.

    Parameters
    ----------
    test : DataFrame
        Rows to featurise; must contain a 'seconds_in_bucket' column.
    feature_dicts : dict
        Must provide the 'prices' and 'sizes' column-name lists. It is
        updated in place with the keys 'v1_features',
        'v1_feature_category' and 'v2_features'.

    Returns
    -------
    tuple
        ``(df_test, feature_dicts)`` where ``df_test`` holds the v2 feature
        rows whose 'seconds_in_bucket' occurs in the input batch.
    """
    # Remember which time steps the caller passed in so that the output can
    # be restricted to exactly those rows at the end.
    secs_now = test['seconds_in_bucket'].unique()
    print(f"Current seconds: {secs_now}")

    # Missing values are zero-filled before any feature is derived.
    filled = reduce_mem_usage(test.fillna(0), verbose=0)

    # Stage 1: v1 features are built from the price columns.
    df_v1, v1_feat, v1_feat_cat = gen_v1_features(filled, feature_dicts['prices'])
    feature_dicts['v1_features'] = v1_feat
    feature_dicts['v1_feature_category'] = v1_feat_cat

    # Stage 2: v2 features take prices, sizes and the fresh v1 features as input.
    v2_inputs = feature_dicts['prices'] + feature_dicts['sizes'] + v1_feat
    df_v2, v2_features = gen_v2_features(df_v1, v2_inputs)
    feature_dicts['v2_features'] = v2_features

    # Sanitise: NaN and +/-inf produced by the feature code become 0.
    df_v2.fillna(0, inplace=True)
    df_v2.replace([np.inf, -np.inf], 0, inplace=True)
    df_v2 = reduce_mem_usage(df_v2, verbose=0)

    keep_rows = df_v2['seconds_in_bucket'].isin(secs_now)
    return df_v2[keep_rows], feature_dicts


def gen_ts_feat(test, cache, feature_dicts, counter, max_ts_len):
    """Append a batch to the rolling cache and build v3 features from it.

    Parameters
    ----------
    test : DataFrame
        Rows for the current time step(s); needs 'seconds_in_bucket'.
    cache : DataFrame
        Rows accumulated from earlier calls (may be empty).
    feature_dicts : dict
        Must provide 'prices', 'sizes' and 'v1_features'; the key
        'v3_features' is written in place.
    counter : int
        Step counter supplied by the caller; trimming of the cache only
        starts once it exceeds ``max_ts_len``.
    max_ts_len : int
        Number of distinct 'seconds_in_bucket' values kept when trimming.

    Returns
    -------
    tuple
        ``(df_test, cache, feature_dicts)`` where ``df_test`` contains the
        v3 feature rows belonging to the time steps of ``test``.
    """
    secs_now = test['seconds_in_bucket'].unique()
    print(f"Current seconds: {secs_now}")

    # Extend the history with the new rows and renumber the index.
    cache = pd.concat([cache, test]).reset_index(drop=True)

    # In cache, we keep only the past max_ts_len seconds of data
    if counter > max_ts_len:
        ordered_secs = np.sort(cache['seconds_in_bucket'].unique())
        recent_secs = ordered_secs[-max_ts_len:]
        in_window = cache['seconds_in_bucket'].isin(recent_secs)
        cache = cache[in_window].reset_index(drop=True)

    price_cols = feature_dicts['prices']
    size_cols = feature_dicts['sizes']
    v1_cols = feature_dicts['v1_features']
    df_v3, v3_features = gen_v3_features(cache, price_cols, size_cols, v1_cols)

    feature_dicts['v3_features'] = v3_features

    # Sanitise in place: NaN and +/-inf become 0.
    df_v3.fillna(0, inplace=True)
    df_v3.replace([np.inf, -np.inf], 0, inplace=True)
    df_v3 = reduce_mem_usage(df_v3, verbose=0)

    # Only the rows of the current time step(s) are handed back as features.
    is_current = df_v3['seconds_in_bucket'].isin(secs_now)
    return df_v3[is_current], cache, feature_dicts

In [3]:
def gen_test_feature(test, cache, counter, max_ts_len, *, feature_dicts=None):
    """Build v1/v2 features for a batch and maintain the rolling cache.

    Parameters
    ----------
    test : DataFrame
        Rows for the current time step(s); needs 'seconds_in_bucket'.
    cache : DataFrame
        v2 feature rows accumulated from earlier calls (may be empty).
    counter : int
        Step counter supplied by the caller; trimming of the cache only
        starts once it exceeds ``max_ts_len``.
    max_ts_len : int
        Number of distinct 'seconds_in_bucket' values kept when trimming.
    feature_dicts : dict, optional (keyword-only)
        Must provide 'prices' and 'sizes'; the keys 'v1_features',
        'v1_feature_category' and 'v2_features' are written in place.
        When omitted, the module-level ``feature_dicts`` is used, which is
        what this function did implicitly before the parameter existed.

    Returns
    -------
    tuple
        ``(df_test, cache, feature_dicts)`` where ``df_test`` contains the
        sanitised v2 feature rows belonging to the time steps of ``test``.

    Raises
    ------
    NameError
        If ``feature_dicts`` is not passed and no module-level
        ``feature_dicts`` exists.
    """
    if feature_dicts is None:
        # Backward-compatible fallback: earlier versions silently read the
        # notebook-level global. Passing it explicitly is preferred because
        # the global is only defined in a later cell.
        try:
            feature_dicts = globals()['feature_dicts']
        except KeyError:
            raise NameError("name 'feature_dicts' is not defined") from None

    current_sec = test['seconds_in_bucket'].unique()
    print(f"Current seconds: {current_sec}")

    test = test.fillna(0)
    test = reduce_mem_usage(test, verbose=0)

    # Stage 1: v1 features are built from the price columns.
    df_v1, v1_feat, v1_feat_cat = gen_v1_features(test, feature_dicts['prices'])
    feature_dicts['v1_features'] = v1_feat
    feature_dicts['v1_feature_category'] = v1_feat_cat

    # Stage 2: v2 features take prices, sizes and the v1 features as input.
    v2_feat_cols = feature_dicts['prices'] + feature_dicts['sizes'] + feature_dicts['v1_features']
    df_v2, v2_features = gen_v2_features(df_v1, v2_feat_cols)
    feature_dicts['v2_features'] = v2_features

    # The cache stores the v2 rows as produced, i.e. before the NaN/inf
    # clean-up below is applied to df_v2.
    cache = pd.concat([cache, df_v2])
    cache.reset_index(drop=True, inplace=True)

    # In cache, we keep only the past max_ts_len seconds of data
    if counter > max_ts_len:
        sec_in_buk_list = cache['seconds_in_bucket'].unique()
        sec_in_buk_list.sort()
        sec_to_keep = sec_in_buk_list[-max_ts_len:]
        cache = cache[cache['seconds_in_bucket'].isin(sec_to_keep)]
        cache.reset_index(drop=True, inplace=True)

    # Time-series (v3) features are currently disabled:
    # df_v3, v3_features = gen_v3_features(
    #     cache,
    #     feature_dicts['prices'],
    #     feature_dicts['sizes'],
    #     feature_dicts['v1_features']
    #     )
    #
    # feature_dicts['v3_features'] = v3_features

    # Sanitise: NaN and +/-inf produced by the feature code become 0.
    df_v2.fillna(0, inplace=True)
    df_v2.replace([np.inf, -np.inf], 0, inplace=True)
    df_v2 = reduce_mem_usage(df_v2, verbose=0)

    df_test = df_v2[df_v2['seconds_in_bucket'].isin(current_sec)]

    return df_test, cache, feature_dicts

In [4]:
# Competition API: the iterator yields one
# (test, revealed_targets, sample_prediction) triple per time step.
env = optiver2023.make_env()
iter_test = env.iter_test()

# max length of ts to keep in cache
max_ts_len = 12

# Column groups consumed by the feature-generation functions.
feature_dicts = dict(
    prices=["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"],
    sizes=["matched_size", "bid_size", "ask_size", "imbalance_size"],
    category=["stock_id", "seconds_in_bucket", 'imbalance_buy_sell_flag'],
)

# One pickled model per fold.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files
# from a trusted source.
model_files = glob("../data/lgb_regressor_fold_*.pkl")
models = list(map(joblib.load, model_files))

print(len(models))

# Scaler stored next to the models; used by the (currently disabled)
# scaling step of the prediction loop.
scaler_train = joblib.load("../data/scaler.save")

5


TODO: add an if-condition to skip predicting rows that are not currently scored.

In [5]:
# --- State carried across iterations of the test loop -----------------------
counter = 0      # index of the current time step within the day (0..54)
n_reveals = 0    # number of times revealed targets have been received

cache = pd.DataFrame()

n_lookback = 50  # number of most recent dates kept as look-back window
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable data directory like the relative '../data' paths used elsewhere.
train_csv = "/home/lishi/projects/Competition/kaggle_2023/data/train.csv"

key_cols = ['date_id', 'stock_id', 'seconds_in_bucket']

# Read train.csv once; both the feature records and the known targets are
# sliced from the same frame instead of parsing the file a second time.
df_records = pd.read_csv(train_csv)

date_list = sorted(df_records['date_id'].unique().tolist())
lookback = date_list[-n_lookback:]

df_records = df_records[df_records['date_id'].isin(lookback)]

# Keys + target, in file column order (what read_csv(usecols=...) returned).
revealed_cols = [col for col in df_records.columns if col in key_cols + ['target']]
df_revealed = df_records[revealed_cols].copy()

# Non-in-place drop: df_records is a filtered slice, so an in-place drop
# would trigger pandas' chained-assignment warning.
df_records = df_records.drop(columns=['target'])

print(df_records.columns)

day_begin = time()
for (test, revealed_targets, sample_prediction) in iter_test:

    curr_date = int(test['date_id'].iloc[0])
    print("curr_date", curr_date)

    if counter == 0:
        # First time step of a day: the previous day's targets arrive here.
        print(len(df_records))
        n_reveals += 1
        print('Targets revealed for day', revealed_targets['revealed_date_id'].unique().tolist())

        # The very first reveal is not appended (presumably because those
        # targets are already part of train.csv - TODO confirm).
        if n_reveals > 1:
            newly_revealed = revealed_targets[
                ['revealed_date_id', 'stock_id', 'seconds_in_bucket', 'revealed_target']
            ].rename(columns={'revealed_date_id': 'date_id', 'revealed_target': 'target'})
            df_revealed = pd.concat([df_revealed, newly_revealed])

            # The look-back training set only depends on dates before
            # curr_date and on df_revealed, both of which change only at the
            # first step of a day, so it is rebuilt once per day rather than
            # on every time step.
            lookback = np.arange(curr_date - n_lookback, curr_date)
            x_train_past = df_records[df_records['date_id'].isin(lookback)]
            y_train_past = df_revealed[df_revealed['date_id'].isin(lookback)]

            print(x_train_past.columns)
            print(y_train_past.columns)

            xy_train_past = x_train_past.merge(y_train_past, on=key_cols, how='left')

            print(xy_train_past.columns)

            print(xy_train_past[['date_id', 'stock_id', 'seconds_in_bucket', 'target']])

    # df_test, feature_dicts = gen_section_feat(test, feature_dicts)
    # df_test, cache, feature_dicts = gen_test_feature(df_test, cache, feature_dicts, counter, max_ts_len)

    # Keep the raw rows of the current step so that they can serve as
    # training features once their targets are revealed.
    df_records = pd.concat([df_records, test])

    # feature_cols, category_cols = gen_feature_cols(feature_dicts)

    # scale_cols = [x for x in feature_cols if x not in category_cols]

    # df_test = df_test[feature_cols]
    # df_test[scale_cols] = scaler_train.transform(df_test[scale_cols])

    # # predict target using trained model
    # target = 0
    # for model in models:
    #     target += model.predict(df_test)

    # target /= len(models)

    # Placeholder prediction while the model pipeline above is disabled.
    sample_prediction['target'] = 1

    env.predict(sample_prediction)

    # counter runs 0..54; reaching 54 means this was the last step of the day
    if counter >= 54:
        # '.2f' = two decimals (the former '2f' meant width 2, default precision).
        print(f"New Day! Time used: {time() - day_begin:.2f}s.")
        counter = 0
        day_begin = time()
        cache = pd.DataFrame()
        # Slide the window: forget the oldest recorded date.
        recorded_dates = sorted(df_records['date_id'].unique().tolist())
        df_records = df_records[df_records['date_id'].isin(recorded_dates[1:])]
    else:
        counter += 1

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'time_id', 'row_id'],
      dtype='object')
This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
curr_date 478
550000
Targets revealed for day [477]
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 4

In [6]:
# Load the example test file and keep only the rows of date 479 at
# seconds_in_bucket == 130 for inspection.
df_test = pd.read_csv("/home/lishi/projects/Competition/kaggle_2023/data/example_test_files/test.csv")

on_date_479 = df_test['date_id'] == 479
at_second_130 = df_test['seconds_in_bucket'] == 130
df_test = df_test[on_date_479 & at_second_130]

df_test

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,time_id,row_id,currently_scored
13600,0,479,130,5823688.88,1,1.000086,28594597.11,,,0.999912,81606.69,1.000086,19671.84,1.000052,26358,479_130_0,False
13601,1,479,130,57825.84,-1,0.999979,5575776.31,,,0.999830,61028.00,1.000527,29129.05,1.000302,26358,479_130_1,False
13602,2,479,130,5045503.71,-1,1.000959,10242938.13,,,1.000805,22056.47,1.001215,1562.16,1.001188,26358,479_130_2,False
13603,3,479,130,43912362.67,-1,1.000016,70976105.67,,,1.000016,4361.07,1.000064,70403.52,1.000019,26358,479_130_3,False
13604,4,479,130,0.00,0,1.000608,27441848.73,,,1.000445,9746.70,1.000663,2943.04,1.000612,26358,479_130_4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13795,195,479,130,4943681.55,-1,1.000511,21459622.00,,,1.000511,33732.00,1.000630,47483.42,1.000560,26358,479_130_195,False
13796,196,479,130,1096508.81,1,0.999963,3691343.17,,,0.999714,38427.13,0.999963,18196.32,0.999883,26358,479_130_196,False
13797,197,479,130,13131110.61,-1,1.001094,4479411.09,,,1.001094,22287.30,1.001283,34711.05,1.001168,26358,479_130_197,False
13798,198,479,130,35849074.40,-1,0.999648,75347057.55,,,0.999648,210692.40,0.999888,56489.95,0.999837,26358,479_130_198,False
