# Time Separated Model
This notebook is derived from https://www.kaggle.com/code/lblhandsome/optiver-robust-best-single-model/notebook

In [1]:
from pathlib import Path
import os
import warnings
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import gc  # Garbage collection for memory management
import os  # Operating system-related functions
import time  # Time-related functions
import warnings  # Handling warnings
from itertools import combinations  # For creating combinations of elements
from warnings import simplefilter  # Simplifying warning handling

# 📦 Importing machine learning libraries
import joblib  # For saving and loading models
import lightgbm as lgb  # LightGBM gradient boosting framework
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
from sklearn.metrics import mean_absolute_error  # Metric for evaluation
from sklearn.model_selection import KFold, TimeSeriesSplit  # Cross-validation techniques

# Silence all warnings, and pandas' PerformanceWarning explicitly, so the
# notebook output stays readable.
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

max_lookback = np.nan  # Maximum lookback (not specified)
split_day = 435  # Split day for time series data

# Repeated call: registers the blanket "ignore" filter a second time.
warnings.filterwarnings("ignore")

PREV_MAX = 80  # NOTE(review): not referenced in the visible cells -- confirm it is still needed
seed = 2023  # Seed value handed to LightGBM through lgb_params['seed']

# Environment-dependent configuration.
# The presence of the KAGGLE_DATA_PROXY_TOKEN environment variable is used to
# detect the Kaggle environment; everything else is treated as a local run.
if os.environ.get("KAGGLE_DATA_PROXY_TOKEN") is not None:
    # For kaggle environment
    BASE_OUTPUT_PATH = Path('/kaggle/working')
    BASE_INPUT_PATH = Path('/kaggle/input/optiver-trading-at-the-close')

    IS_LOCAL = False  # If kaggle environment, set False
    IS_TRAIN = True  # If kaggle environment, set True
    IS_INFER = True  # If kaggle environment, set True

    stopping_rounds = 30  # Patience (in rounds) for the early-stopping callback
    num_boost_round = 3000  # Maximum number of boosting rounds
    num_folds = 5  # Number of cross-validation folds
    DEVICE = 'gpu'  # cpu or gpu
else:
    # For local environment
    BASE_OUTPUT_PATH = Path('../output')
    BASE_INPUT_PATH = Path('../kaggle/input/optiver-trading-at-the-close')

    IS_LOCAL = True
    IS_TRAIN = True
    IS_INFER = True
    # Local runs only train on this subset of stocks.
    TARGET_STOCK_IDS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

    # For training
    stopping_rounds = 10  # Patience (in rounds) for the early-stopping callback
    num_boost_round = 1000  # Maximum number of boosting rounds
    num_folds = 5  # Number of cross-validation folds
    DEVICE = 'cpu'  # cpu or gpu

TRAIN_FILE = BASE_INPUT_PATH / 'train.csv'
TEST_FILE = BASE_INPUT_PATH / 'test.csv'

# The mock API files are only needed (and only defined) for local runs.
if IS_LOCAL:
    SAMPLE_SUBMISSION_FILE = BASE_INPUT_PATH / 'sample_submission.csv'
    REVEALED_TARGETS_FILE = BASE_INPUT_PATH / 'revealed_targets.csv'

# LightGBM parameters. Both environments use the same values; only the device
# differs, so the dict is defined once.
lgb_params = {
    'task': 'train',            # Training task
    'boosting_type': 'gbdt',    # Gradient boosted decision trees
    'objective': 'regression',  # Regression objective
    'metric': 'rmse',           # Evaluation metric
    'learning_rate': 0.01,      # Learning rate
    'lambda_l1': 0.5,           # L1 regularization coefficient
    'lambda_l2': 0.5,           # L2 regularization coefficient
    'num_leaves': 10,           # Maximum number of leaves per tree
    'feature_fraction': 0.5,    # Fraction of columns sampled per tree
    'bagging_fraction': 0.5,    # Fraction of rows sampled
    'bagging_freq': 5,          # Bagging frequency
    'min_child_samples': 10,    # Minimum number of samples per leaf
    'seed': seed,               # Seed value
    "device": DEVICE,
}

# Alternative parameter set kept for reference (disabled):
# lgb_params = {
#     'task': 'train',
#     'objective': 'regression',
#     'metric': 'rmse',
#     'boosting_type': 'gbdt',
#     "n_estimators": 32,
#     "num_leaves": 64,
#     "subsample": 0.8,
#     "colsample_bytree": 0.8,
#     "learning_rate": 0.01,
#     'max_depth': 32,
#     "device": DEVICE,
#     "verbosity": -1,
#     "importance_type": "gain",
#     'lambda_l1': 0.5,
#     'lambda_l2': 0.5,
#     'bagging_freq': 5,
#     'min_child_samples': 10,
#     'seed': seed,
# }

print(f"BASE_OUTPUT_PATH: {BASE_OUTPUT_PATH}")
print(f"BASE_INPUT_PATH: {BASE_INPUT_PATH}")
print(f"TRAIN_FILE: {TRAIN_FILE}")
print(f"TEST_FILE: {TEST_FILE}")
print(f"IS_LOCAL: {IS_LOCAL}")
print(f"IS_TRAIN: {IS_TRAIN}")
print(f"IS_INFER: {IS_INFER}")


BASE_OUTPUT_PATH: ../output
BASE_INPUT_PATH: ../kaggle/input/optiver-trading-at-the-close
TRAIN_FILE: ../kaggle/input/optiver-trading-at-the-close/train.csv
TEST_FILE: ../kaggle/input/optiver-trading-at-the-close/test.csv
IS_LOCAL: True
IS_TRAIN: True
IS_INFER: True


In [2]:
%%time 

from gc import collect;
from psutil import Process;
from os import system, getpid, walk;

# Defining global configurations and functions:-

    
def GetMemUsage():
    """
    Return a formatted string with the current process's memory usage.

    The value is the first field reported by psutil's ``memory_info()`` for
    this process, divided by 2**30.
    Source-
    https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook
    """
    current_process = Process(getpid())
    memory_use = current_process.memory_info()[0] / 2. ** 30
    return f"RAM memory GB usage = {memory_use :.4}"


# Run a garbage-collection pass, then print the process memory usage.
collect();
print(GetMemUsage())

RAM memory GB usage = 0.4384
CPU times: user 75.2 ms, sys: 971 µs, total: 76.2 ms
Wall time: 75.8 ms


# Functions

In [3]:
%%time 

from typing import Sequence, Tuple
import pandas as pd

# for local execution
class MockApi:
    """
    Local stand-in for the competition API.

    Serves the configured CSV files one group (``group_id_column`` value) at a
    time through ``iter_test`` and collects predictions through ``predict``.
    """

    def __init__(self):
        '''
        Configure which files are served and how their rows are grouped.

        Attributes:
            input_paths: a list of two or more paths to the csv files to be served
            group_id_column: the column that identifies which groups of rows the API should serve.
                A call to iter_test serves all rows of all dataframes with the current group ID value.
            export_group_id_column: if true, the dataframes iter_test serves will include the group_id_column values.
        '''
        self.input_paths: Sequence[str] = [TEST_FILE, REVEALED_TARGETS_FILE, SAMPLE_SUBMISSION_FILE]
        self.group_id_column: str = 'time_id'
        self.export_group_id_column: bool = True
        # iter_test needs at least two dataframes, e.g. test and sample_submission
        assert len(self.input_paths) >= 2

        self._status = 'initialized'
        self.predictions = []

    def iter_test(self) -> Tuple[pd.DataFrame]:
        '''
        Generator: read every file in self.input_paths and, for each group id
        (in order of first appearance in the first file), yield a tuple with
        the matching rows of every file.

        After each yielded tuple `predict()` must be called; until then the
        generator keeps yielding None. Once all groups are served, the
        collected predictions are written to submission.csv.
        '''
        if self._status != 'initialized':
            raise Exception('WARNING: the real API can only iterate over `iter_test()` once.')

        group_col = self.group_id_column
        frames = [pd.read_csv(path, low_memory=False) for path in self.input_paths]
        group_order = frames[0][group_col].drop_duplicates().tolist()
        frames = [frame.set_index(group_col) for frame in frames]
        drop_group_col = not self.export_group_id_column

        for group_id in group_order:
            self._status = 'prediction_needed'
            batch = []
            for frame in frames:
                rows = frame.loc[group_id].copy()
                if not isinstance(rows, pd.DataFrame):
                    # A single matching row comes back as a Series; rebuild a
                    # one-row DataFrame indexed by the group id.
                    rows = pd.DataFrame(dict(zip(rows.index.values, rows.values)), index=[group_id])
                    rows.index.name = group_col
                batch.append(rows.reset_index(drop=drop_group_col))
            yield tuple(batch)

            while self._status != 'prediction_received':
                print('You must call `predict()` successfully before you can continue with `iter_test()`', flush=True)
                yield None

        with open('submission.csv', 'w') as f_open:
            pd.concat(self.predictions).to_csv(f_open, index=False)
        self._status = 'finished'

    def predict(self, user_predictions: pd.DataFrame):
        '''
        Store the predictions for the current group and unblock iter_test.

        Raises:
            Exception: if iteration already finished, if no group is awaiting
                a prediction, or if user_predictions is not a DataFrame.
        '''
        status = self._status
        if status == 'finished':
            raise Exception('You have already made predictions for the full test set.')
        if status != 'prediction_needed':
            raise Exception('You must get the next test sample from `iter_test()` first.')
        if not isinstance(user_predictions, pd.DataFrame):
            raise Exception('You must provide a DataFrame.')

        self.predictions.append(user_predictions)
        self._status = 'prediction_received'


def make_env():
    """Return a new MockApi; local counterpart of the `optiver2023.make_env()` call used for submissions."""
    return MockApi()

CPU times: user 49 µs, sys: 1 µs, total: 50 µs
Wall time: 52.2 µs


In [4]:
def pd_display_max():
    """Remove pandas' display limits (rows, columns, width, column width)."""
    unlimited_options = (
        'display.max_rows',      # no limit on displayed rows
        'display.max_columns',   # no limit on displayed columns
        'display.width',         # expand the display width
        'display.max_colwidth',  # no limit on column width
    )
    for option_name in unlimited_options:
        pd.set_option(option_name, None)

def pd_clear_display_max():
    """Limit displayed rows and columns to 10; width and column width stay unlimited."""
    display_settings = {
        'display.max_rows': 10,
        'display.max_columns': 10,
        'display.width': None,
        'display.max_colwidth': None,
    }
    for option_name, option_value in display_settings.items():
        pd.set_option(option_name, option_value)

In [5]:
# 🧹 Function to reduce memory usage of a Pandas DataFrame
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the non-object columns of a dataframe to reduce memory usage.

    Columns whose dtype name starts with "int" are cast to the smallest of
    int8/int16/int32/int64 whose range strictly contains the column's min and
    max. Every other non-object column is cast to float32. Object columns are
    left untouched.

    Args:
        df: DataFrame to optimize; its columns are replaced in place.
        verbose: if truthy, print the memory usage before and after.

    Returns:
        The same DataFrame with downcast columns.
    """
    # Memory usage before optimization, in MB
    start_mem = df.memory_usage().sum() / 1024**2

    integer_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        # Object columns are not numeric; skip them.
        if col_type == object:
            continue

        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            # Pick the narrowest integer type that fits (strict bounds).
            for candidate in integer_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float16 is deliberately never used: every non-integer column
            # ends up as float32 regardless of its value range.
            df[col] = df[col].astype(np.float32)

    if verbose:
        # Reported with print(), like the rest of this notebook; the previous
        # code referenced an undefined `logger` and raised NameError here.
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        print(f"Decreased by {decrease:.2f}%")

    return df

In [6]:
# 🏎️ Import Numba for just-in-time (JIT) compilation and parallel processing
from numba import njit, prange

# 📊 Function to compute triplet imbalance in parallel using Numba
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """
    Compute a triplet-imbalance ratio for every row and every column triplet.

    For each triplet (a, b, c) of column indices, the three values of a row
    are ranked into min, mid and max and the feature is
    (max - mid) / (mid - min). The result is NaN when mid equals min.

    Args:
        df_values: 2-D array indexed as [row, column].
        comb_indices: sequence of (a, b, c) column-index triplets.

    Returns:
        Array of shape (num_rows, num_combinations); column i holds the
        feature for comb_indices[i].
    """
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    # Outer loop over triplets uses prange (parallel=True is set on the decorator)
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        
        # Inner loop over the rows
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            # The middle value is what remains after removing min and max from the sum
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            # Prevent division by zero
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

# 📈 Function to calculate triplet imbalance for given price data and a DataFrame
def calculate_triplet_imbalance_numba(price, df):
    """
    Build the triplet-imbalance feature frame for the columns named in `price`.

    Every 3-combination of `price` yields one column named
    "<a>_<b>_<c>_imb2", computed by compute_triplet_imbalance on df[price].

    Args:
        price: list of column names to combine.
        df: DataFrame containing those columns.

    Returns:
        DataFrame with one column per triplet and a default integer index.
    """
    triplets = list(combinations(price, 3))

    # Column positions of each triplet inside df[price]
    comb_indices = [tuple(price.index(name) for name in triplet) for triplet in triplets]

    # Plain numpy values are handed to the Numba-compiled kernel
    features_array = compute_triplet_imbalance(df[price].values, comb_indices)

    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in triplets]
    return pd.DataFrame(features_array, columns=columns)

In [7]:
# 📊 Function to generate imbalance features
def imbalance_features(df):
    """
    Add price/size imbalance, spread, shift, return and diff features to df.

    When DEVICE is 'gpu' the frame is converted to cudf for the computation
    and converted back to pandas before returning. Otherwise the new columns
    are assigned on the frame that was passed in.

    Args:
        df: DataFrame holding the price, size, stock_id and
            imbalance_buy_sell_flag columns referenced below.

    Returns:
        The frame with the feature columns added and +/-inf replaced by 0
        (the result of `replace`, i.e. a new object).
    """
    if DEVICE == 'gpu':
        import cudf
        df = cudf.from_pandas(df)
    
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    # NOTE(review): `sizes` is not used in this function
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1 features
    # Calculate various features using the frame's eval function
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("ask_price + bid_price")/2
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("imbalance_size-matched_size")/df.eval("matched_size+imbalance_size")
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    
    # Create features for pairwise price imbalances: (p1 - p2) / (p1 + p2)
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
        
    # V2 features
    # Calculate additional features
    # NOTE(review): the groupby below uses stock_id only, so diffs/shifts span
    # whatever rows of a stock are present in df -- confirm that is intended
    # when df covers several dates.
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    df['match_balance'] = ( df['matched_size']  + (df['imbalance_buy_sell_flag'] * df['imbalance_size'])) / df['matched_size']
    
    # Row-wise statistical aggregations are computed in numba_imb_features
    
        
    # V3 features
    # Calculate shifted and return features for specific columns
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    if DEVICE == 'gpu':
        # Back to pandas for the remaining (CPU) feature steps
        df = df.to_pandas()
    # Replace infinite values with 0
    return df.replace([np.inf, -np.inf], 0)

def numba_imb_features(df):
    """
    Add row-wise price/size statistics and triplet-imbalance features to df.

    Columns are assigned on `df` itself, which is also returned.
    """
    price_cols = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    size_cols = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # Row-wise mean/std/skew/kurt across the price columns and the size columns
    for stat in ("mean", "std", "skew", "kurt"):
        price_stat = df[price_cols].agg(stat, axis=1)
        df[f"all_prices_{stat}"] = price_stat
        size_stat = df[size_cols].agg(stat, axis=1)
        df[f"all_sizes_{stat}"] = size_stat

    # Triplet imbalance features from the Numba-backed helper
    triplet_groups = (['ask_price', 'bid_price', 'wap', 'reference_price'], size_cols)
    for group in triplet_groups:
        triplet_feature = calculate_triplet_imbalance_numba(group, df)
        # Assign by position (.values); the helper's frame has its own index
        df[triplet_feature.columns] = triplet_feature.values

    return df

# 📅 Function to generate time and stock-related features
def other_features(df):
    """
    Add calendar/time-bucket features and per-stock global statistics to df.

    Reads the module-level `global_stock_id_feats` mapping; each of its values
    is converted with `to_dict()` and mapped onto `stock_id`.
    """
    bucket_seconds = df["seconds_in_bucket"]

    # date_id modulo 5 (presumably a day-of-week proxy -- confirm)
    df["dow"] = df["date_id"] % 5
    df["seconds"] = bucket_seconds % 60   # seconds within the minute
    df["minute"] = bucket_seconds // 60   # whole minutes

    # One "global_<name>" column per precomputed per-stock statistic
    for feat_name, per_stock in global_stock_id_feats.items():
        lookup = per_stock.to_dict()
        df[f"global_{feat_name}"] = df["stock_id"].map(lookup)

    return df


def cal_vix(df, group_key, target_col, period):
    """
    Rolling standard deviation of log-differences of `target_col` per group.

    Args:
        df: source DataFrame.
        group_key: column name(s) to group by.
        target_col: column whose log-differences are measured.
        period: rolling window length.

    Returns:
        Series aligned with df's index.
    """
    def _rolling_log_diff_std(values):
        log_change = np.log(values).diff()
        return log_change.rolling(period).std()

    grouped = df.groupby(group_key)[target_col]
    return grouped.transform(_rolling_log_diff_std)


def generate_historical_features(df, is_train):
    """
    Add per-stock diff features and per-(stock, date) rolling-volatility
    features for 'wap' and 'match_balance'.

    `is_train` is accepted but not used. Returns a frame in which +/-inf have
    been replaced by 0.
    """
    diff_windows = (1, 2, 3, 10)
    vix_periods = (5,)
    vix_group_keys = ['stock_id', 'date_id']

    for col in ('wap', 'match_balance'):
        # NaNs produced by diff are kept as they are (no fill)
        for window in diff_windows:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
        for period in vix_periods:
            df[f"{col}_vix_{period}"] = cal_vix(df, vix_group_keys, col, period)

    return df.replace([np.inf, -np.inf], 0)

# 🚀 Function to generate all features by combining imbalance and other features
def generate_all_features(df, is_train):
    """
    Run the full feature pipeline on df.

    Returns:
        (df, generated_feature_name): the feature frame and the list of
        column names that were not present in the input (unordered).
    """
    original_columns = set(df.columns)

    # Feature steps, applied in this order
    for step in (imbalance_features, numba_imb_features):
        df = step(df)
    df = generate_historical_features(df, is_train)
    df = other_features(df)

    generated_feature_name = list(set(df.columns) - original_columns)

    # Free intermediates created by the feature steps
    gc.collect()

    return df, generated_feature_name

def normarized_features(df):
    """
    Add z-score columns for 'wap' and 'match_balance'.

    Uses the 'mean' and 'std' entries of the module-level
    `global_population_wap` and `global_population_mathch_balance`.
    """
    wap_stats = global_population_wap
    balance_stats = global_population_mathch_balance

    centered_wap = df['wap'] - wap_stats['mean']
    df['normalized_wap'] = centered_wap / wap_stats['std']

    centered_balance = df['match_balance'] - balance_stats['mean']
    df['normalized_match_balance'] = centered_balance / balance_stats['std']
    return df

# Generating train dataset

In [8]:
def load_train_dataset():
    """Read TRAIN_FILE, drop rows without a target, and return it with a fresh 0..n-1 index."""
    raw = pd.read_csv(TRAIN_FILE)
    with_target = raw.dropna(subset=["target"])
    return with_target.reset_index(drop=True)


In [9]:
%%time

# Load the training rows that have a target.
df_train = load_train_dataset()

# Local runs train only on the stocks listed in TARGET_STOCK_IDS.
if IS_LOCAL:
    df_train = df_train[df_train["stock_id"].isin(TARGET_STOCK_IDS)]

# Printed in every mode.
print("Online mode")

if IS_TRAIN:
    # Per-stock summary statistics, later mapped onto rows by other_features.
    by_stock = df_train.groupby("stock_id")
    bid_size, ask_size = by_stock["bid_size"], by_stock["ask_size"]
    bid_price, ask_price = by_stock["bid_price"], by_stock["ask_price"]

    global_stock_id_feats = {
        "median_size": bid_size.median() + ask_size.median(),
        "std_size": bid_size.std() + ask_size.std(),
        "ptp_size": bid_size.max() - bid_size.min(),
        "median_price": bid_price.median() + ask_price.median(),
        "std_price": bid_price.std() + ask_price.std(),
        # NOTE(review): max(bid_price) - min(ask_price); ptp_size uses bid
        # for both terms -- confirm the asymmetry is intended.
        "ptp_price": bid_price.max() - ask_price.min(),
    }

    df_train, generated_feature_name = generate_all_features(df_train, True)

    # Population statistics consumed by normarized_features (currently not applied).
    global_population_wap = df_train['wap'].describe()
    global_population_mathch_balance = df_train['match_balance'].describe()

    print("Build Online Train Feats Finished.")

    df_train = reduce_mem_usage(df_train)

collect()
print(GetMemUsage())

Online mode
Build Online Train Feats Finished.
RAM memory GB usage = 1.585
CPU times: user 10.1 s, sys: 1.46 s, total: 11.6 s
Wall time: 11.7 s


In [10]:
# Feature selection: the columns handed to the models, both when training
# (cross_train) and when predicting.
feature_name = [
    "wap_vix_5", "reference_price_shift_10", "matched_size_ret_10",
    "matched_size_shift_10", "match_balance_vix_5", "ask_price_bid_price_reference_price_imb2",
    "seconds_in_bucket", "match_balance_diff_10", "imbalance_size_ret_10",
    "ask_size_diff_10", "imbalance_size_shift_10", "reference_price",
    "ask_price_bid_price_imb", "reference_price_ret_10", "all_sizes_mean",
    "matched_size", "bid_size_diff_10", "volume", "reference_price_shift_3",
    "bid_size", "bid_price_wap_reference_price_imb2", "ask_size",
    "ask_size_diff_3", "reference_price_bid_price_imb", "reference_price_wap_imb",
    "all_prices_kurt", "matched_size_bid_size_ask_size_imb2", "bid_price",
    "wap_diff_10", "bid_size_diff_3", "all_prices_std", "bid_size_ask_size_imbalance_size_imb2",
    "all_prices_skew", "all_sizes_skew", "bid_price_wap_imb",
    "ask_price_diff_10", "ask_price_wap_imb", "imbalance_size_ret_3",
    "matched_size_shift_3", "reference_price_shift_1", "all_prices_mean",
    "imbalance_size", "matched_size_ret_3", "reference_price_shift_2",
    "ask_price", "ask_size_diff_2", "bid_size_diff_2", "price_pressure",
    "reference_price_ask_price_imb", "matched_size_shift_1", "bid_size_diff_1",
    "market_urgency", "wap_diff_3", "price_spread", "all_sizes_std",
    "matched_size_shift_2", "imbalance_size_shift_3", "bid_price_diff_10",
    "ask_size_diff_1", "far_price", "reference_price_ret_3", "match_balance",
    "wap_diff_2", "wap_diff_1", "matched_imbalance", "wap"
]
# Echo the list as the cell output
feature_name

['wap_vix_5',
 'reference_price_shift_10',
 'matched_size_ret_10',
 'matched_size_shift_10',
 'match_balance_vix_5',
 'ask_price_bid_price_reference_price_imb2',
 'seconds_in_bucket',
 'match_balance_diff_10',
 'imbalance_size_ret_10',
 'ask_size_diff_10',
 'imbalance_size_shift_10',
 'reference_price',
 'ask_price_bid_price_imb',
 'reference_price_ret_10',
 'all_sizes_mean',
 'matched_size',
 'bid_size_diff_10',
 'volume',
 'reference_price_shift_3',
 'bid_size',
 'bid_price_wap_reference_price_imb2',
 'ask_size',
 'ask_size_diff_3',
 'reference_price_bid_price_imb',
 'reference_price_wap_imb',
 'all_prices_kurt',
 'matched_size_bid_size_ask_size_imb2',
 'bid_price',
 'wap_diff_10',
 'bid_size_diff_3',
 'all_prices_std',
 'bid_size_ask_size_imbalance_size_imb2',
 'all_prices_skew',
 'all_sizes_skew',
 'bid_price_wap_imb',
 'ask_price_diff_10',
 'ask_price_wap_imb',
 'imbalance_size_ret_3',
 'matched_size_shift_3',
 'reference_price_shift_1',
 'all_prices_mean',
 'imbalance_size',
 'ma

## global_stock_id_feats

In [11]:
# Tabulate the per-stock global statistics (one row per stock_id) for inspection
df_global_stock_id_feats = pd.DataFrame(global_stock_id_feats)
df_global_stock_id_feats

Unnamed: 0_level_0,median_size,std_size,ptp_size,median_price,std_price,ptp_price
stock_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,42739.16,132986.92003,5898989.29,1.999695,0.003353,0.017414
1,25548.5,66444.908534,693898.57,1.999827,0.005588,0.02937
2,26228.1,75674.654248,1069837.58,2.0002,0.005333,0.051622
3,41667.0,93875.77052,1928848.21,1.99998,0.002903,0.018551
4,34014.58,80670.27455,1604065.54,1.999816,0.003717,0.017379
5,28287.94,108045.498152,2265456.7,2.000082,0.006301,0.072807
6,24720.61,109732.90446,949838.58,1.999907,0.004339,0.018155
7,29798.0,139523.98599,1383196.54,1.999987,0.005259,0.022833
8,57259.95,387634.914143,3047677.2,1.999989,0.004006,0.017607
9,33052.0,69988.944079,831560.6,1.999803,0.003837,0.021646


## df_train_feats

In [12]:
# Display the training frame with all generated feature columns
df_train

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,match_balance_vix_5,dow,seconds,minute,global_median_size,global_std_size,global_ptp_size,global_median_price,global_std_price,global_ptp_price
0,0,0,0,3.180603e+06,1,0.999812,13380277.00,,,0.999812,...,,0,0,0,42739.160156,132986.921875,5.898990e+06,1.999695,0.003353,0.017414
1,1,0,0,1.666039e+05,-1,0.999896,1642214.25,,,0.999896,...,,0,0,0,25548.500000,66444.906250,6.938986e+05,1.999827,0.005588,0.029370
2,2,0,0,3.028799e+05,-1,0.999561,1819368.00,,,0.999403,...,,0,0,0,26228.099609,75674.656250,1.069838e+06,2.000200,0.005333,0.051622
3,3,0,0,1.191768e+07,-1,1.000171,18389746.00,,,0.999999,...,,0,0,0,41667.000000,93875.773438,1.928848e+06,1.999980,0.002903,0.018551
4,4,0,0,4.475500e+05,-1,0.999532,17860614.00,,,0.999394,...,,0,0,0,34014.578125,80670.273438,1.604066e+06,1.999816,0.003717,0.017379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237697,5,480,540,1.063551e+06,1,0.998403,11476997.00,0.999396,0.999396,0.998119,...,0.011378,0,0,9,28287.939453,108045.500000,2.265457e+06,2.000082,0.006301,0.072807
5237698,6,480,540,9.985981e+05,1,1.000118,13713463.00,1.000495,1.000469,0.999984,...,0.018601,0,0,9,24720.609375,109732.906250,9.498386e+05,1.999907,0.004339,0.018155
5237699,7,480,540,2.469864e+06,-1,0.996621,73054344.00,0.996304,0.996335,0.996573,...,0.013453,0,0,9,29798.000000,139523.984375,1.383196e+06,1.999987,0.005259,0.022833
5237700,8,480,540,8.815596e+05,-1,1.000927,84052704.00,1.000055,1.000602,1.000703,...,0.002727,0,0,9,57259.949219,387634.906250,3.047677e+06,1.999989,0.004006,0.017607


# Model Training

In [16]:
%%time

# 📦 Import necessary libraries
import numpy as np
#import lightgbm as lgb
import optuna.integration.lightgbm as lgb
import optuna
from sklearn.metrics import mean_absolute_error
import gc
import os
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import lightgbm as lgb
import numpy as np
from dataclasses import dataclass

# Set Optuna's log level to WARNING so that its INFO messages are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

@dataclass
class Model:
    """One trained fold model together with the metadata recorded by cross_train."""
    model: lgb.Booster  # booster returned by train_model
    fold: int  # fold index from the KFold enumeration (starts at 0)
    # NOTE(review): cross_train stores the raw result of
    # Booster.feature_importance() here -- confirm the DataFrame annotation.
    feature_importance: pd.DataFrame
    score: float  # mean absolute error on the fold's validation rows
    best_iteration: int  # the booster's best_iteration
    train_time: float = None  # seconds spent training and scoring the fold
    weight: float = None  # cross_train sets this to 1 / n_splits

def train_model(train_x, train_y, val_x, val_y):
    """
    Train a LightGBM booster with early stopping.

    Uses the module-level `lgb_params`, `num_boost_round` and
    `stopping_rounds`.

    Args:
        train_x, train_y: training features and target.
        val_x, val_y: validation features and target.

    Returns:
        The trained booster returned by `lgb.train`.
    """
    trains = lgb.Dataset(train_x, train_y)
    valids = lgb.Dataset(val_x, val_y, reference=trains)

    verbose_eval = 0  # period 0: no per-iteration evaluation logging
    model = lgb.train(
        lgb_params,
        trains,
        valid_sets=[valids, trains],
        # Names must follow the order of valid_sets. They were swapped before,
        # which labelled the validation set 'Train' and the training set
        # 'Valid' in the logs and in best_score.
        valid_names=['Valid', 'Train'],
        num_boost_round=num_boost_round,
        callbacks=[
            lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=True),
            lgb.log_evaluation(verbose_eval),
        ],
    )
    return model

def cross_train(df, key, n_splits, feature_name, valid_name):
    """
    Train one model per shuffled K-fold split of `df` and score each fold.

    Args:
        df: DataFrame holding the feature columns and the target column.
            It is not modified.
        key: identifier of this data slice; used in log messages and returned
            unchanged.
        n_splits: number of K-fold splits.
        feature_name: list of feature column names.
        valid_name: name of the target column.

    Returns:
        (key, models) where models is a list with one `Model` per fold. Each
        score is the mean absolute error on the fold's validation rows and
        each weight is 1 / n_splits.
    """
    print(f"Cross Train key id {key}: start, shape: {df.shape}, n_splits: {n_splits}")

    # Work on a re-indexed copy so the caller's frame keeps its own index.
    df = df.reset_index(drop=True)
    # Select features and target once instead of once per fold.
    features = df[feature_name]
    targets = df[valid_name]

    models = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for fold, (train_indices, valid_indices) in enumerate(kf.split(df)):
        print(f"{key}: {fold} start")
        now_time = time.time()
        # KFold yields positional indices, so use .iloc for features and target alike.
        X_train, X_valid = features.iloc[train_indices], features.iloc[valid_indices]
        y_train, y_valid = targets.iloc[train_indices], targets.iloc[valid_indices]
        print(f"X_train: {X_train.shape}, X_valid: {X_valid.shape}, y_train: {y_train.shape}, y_valid: {y_valid.shape}")

        model = train_model(X_train, y_train, X_valid, y_valid)

        y_valid_pred = model.predict(X_valid)

        score = mean_absolute_error(y_valid, y_valid_pred)
        train_time = time.time() - now_time
        m = Model(model, fold, model.feature_importance(), score, model.best_iteration, train_time, weight=1 / n_splits)

        models.append(m)
        print(f"{key}: {fold} end, score: {score}, time: {train_time}")

        # Release the fold slices before the next fold allocates its own.
        del X_train, X_valid, y_train, y_valid
        gc.collect()

    return key, models

# Train an independent set of fold models for every seconds_in_bucket value.
models = df_train.groupby("seconds_in_bucket").apply(
    lambda bucket: cross_train(
        df=bucket,
        key=bucket.name,
        n_splits=num_folds,
        feature_name=feature_name,
        valid_name="target",
    )
)
# Convert the (key, models) pairs into a dict keyed by seconds_in_bucket
models = dict(pair for pair in models)

# The training frame is no longer needed; free it and report memory usage.
del df_train
collect()
print(GetMemUsage())

Cross Train key id 0: start, shape: (4810, 127), n_splits: 5
0: 0 start
X_train: (3848, 66), X_valid: (962, 66), y_train: (3848,), y_valid: (962,)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000961 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15544
[LightGBM] [Info] Number of data points in the train set: 3848, number of used features: 61
[LightGBM] [Info] Start training from score 0.023362
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[164]	Valid's rmse: 8.79946	Train's rmse: 8.76824
0: 0 end, score: 6.602760484129706, time: 0.38181304931640625
0: 1 start
X_train: (3848, 66), X_valid: (962, 66), y_train: (3848,), y_valid: (962,)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000910 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15540
[LightGBM] [Info] Nu

In [None]:
# check model quality: one row per (key, fold) model, then summary statistics
data = [
    {
        "key": key,
        "fold": model.fold,
        "score": model.score,
        "best_iteration": model.best_iteration,
        "train_time": model.train_time,
    }
    for key, i_models in models.items()
    for model in i_models
]

df_model = pd.DataFrame(data)
df_model.describe()

Unnamed: 0,key,fold,score,best_iteration,train_time
count,275.0,275.0,275.0,275.0,275.0
mean,270.0,2.0,5.932889,128.570909,0.283512
std,159.034496,1.416792,1.021308,53.201496,0.101406
min,0.0,0.0,4.468954,1.0,0.043392
25%,130.0,1.0,5.329459,92.0,0.213265
50%,270.0,2.0,5.685547,126.0,0.276404
75%,410.0,3.0,6.059091,163.5,0.345327
max,540.0,4.0,9.252994,276.0,0.583217


In [None]:
# Start from an empty importance table indexed by feature name
aggregated_importance = pd.DataFrame(index=feature_name, columns=['importance'])

# Sum the importances of every fold model of every key
for i_models in models.values():
    for fitted in i_models:
        per_model = pd.DataFrame({'feature': feature_name, 'importance': fitted.feature_importance})
        per_model = per_model.set_index('feature')
        aggregated_importance = aggregated_importance.add(per_model, fill_value=0)

# Turn the sums into a mean over all models (df_model has one row per model)
if models:
    aggregated_importance['importance'] = aggregated_importance['importance'] / len(df_model)

# Call pd_display_max() first to see every row.
# Sort the features by importance, most important first
aggregated_importance = aggregated_importance.sort_values(by='importance', ascending=False)
aggregated_importance

Unnamed: 0,importance
market_urgency,88.305455
reference_price_wap_imb,60.010909
bid_price_wap_imb,38.56
ask_price_wap_imb,38.403636
wap_diff_2,30.487273
...,...
imbalance_size,6.541818
matched_imbalance,6.414545
matched_size_shift_2,5.672727
matched_size_shift_1,5.349091


In [None]:
# Drop the importance table built in the previous cell, then tidy up.
del aggregated_importance
# Helper defined elsewhere in the notebook; presumably restores the pandas
# display limits changed by `pd_display_max()` -- confirm.
pd_clear_display_max()
# `collect` is defined elsewhere; presumably `gc.collect` -- confirm.
collect()

0

# Infer

In [None]:
%%time

# 📉 Define a function to adjust prices based on volumes
def zero_sum(prices, volumes):
    """Shift `prices` so they sum to zero, weighting the shift by sqrt(volume).

    Every element is reduced by ``sqrt(volume) * step``, where ``step`` is
    chosen so that the adjusted values add up to zero.

    Args:
        prices: Array-like of values to adjust.
        volumes: Array-like of non-negative volumes, same length as `prices`.

    Returns:
        The adjusted values, in whatever container ``prices - sqrt(volumes)``
        produces (a pandas Series when `volumes` is a Series).
    """
    std_error = np.sqrt(volumes)  # 🧮 Per-element weight derived from volume
    total_error = np.sum(std_error)
    if total_error == 0 and np.size(std_error) > 0:
        # Every volume is zero, so the weighted step would be x/0 and the
        # result NaN/inf. Fall back to equal weights, which still yields a
        # finite, zero-sum output (plain mean subtraction).
        std_error = std_error + 1.0
        total_error = np.sum(std_error)
    step = np.sum(prices) / total_error  # 🧮 Shift per unit of weight
    out = prices - std_error * step  # 💰 Remove each element's share of the total
    return out

# Predictions are clipped to this range before being submitted.
y_min, y_max = -64, 64

if IS_INFER:
    # Use the local replay environment or the competition API.
    if IS_LOCAL:
        print("Infer Local")
        env = make_env()
    else:
        print("Infer Submission")
        import optiver2023
        env = optiver2023.make_env()
    iter_test = env.iter_test()

    # Not populated in this cell; kept because other cells may reference them.
    df_prediction = pd.DataFrame()
    df_revealed_targets = pd.DataFrame()

    for (test, revealed_targets, sample_prediction) in iter_test:
        df_test, generated_feature_name = generate_all_features(test, False)
        seconds = df_test['seconds_in_bucket'].to_numpy()

        # One slot per feature row. Each bucket fills only its own rows, and
        # rows are never re-sorted, so position i of `predictions` lines up
        # with row i of `df_test` (and therefore of `test`, which the code
        # below combines positionally). Previously only the last bucket's
        # predictions survived, in stock_id-sorted order.
        predictions = np.full(len(df_test), np.nan)

        for seconds_in_bucket in np.unique(seconds):  # ascending, like groupby
            # `.iloc[0]` is positional, so this works whatever index `test` carries.
            print(f"prdict: {test['date_id'].iloc[0]}, {seconds_in_bucket}")
            if seconds_in_bucket not in models:
                raise Exception(f"seconds_in_bucket {seconds_in_bucket} is not in models")

            target_models = models[seconds_in_bucket]
            rows = seconds == seconds_in_bucket
            features = df_test.loc[rows, feature_name]
            # Average the predictions of every model trained for this bucket.
            predictions[rows] = np.mean([m.model.predict(features) for m in target_models], axis=0)

        zerosum_predictions = zero_sum(predictions, test['bid_size'] + test['ask_size'])
        clipped_predictions = np.clip(zerosum_predictions, y_min, y_max)  # 📏 Clip predictions within a specified range
        v_clipped_predictions = clipped_predictions.values
        sample_prediction['target'] = v_clipped_predictions
        del df_test
        collect()
        env.predict(sample_prediction)

collect()
print(GetMemUsage())

Infer Local
prdict: 478, 0
prdict: 478, 10
prdict: 478, 20
prdict: 478, 30
prdict: 478, 40
prdict: 478, 50
prdict: 478, 60
prdict: 478, 70
prdict: 478, 80
prdict: 478, 90
prdict: 478, 100
prdict: 478, 110
prdict: 478, 120
prdict: 478, 130
prdict: 478, 140
prdict: 478, 150
prdict: 478, 160
prdict: 478, 170
prdict: 478, 180
prdict: 478, 190
prdict: 478, 200
prdict: 478, 210
prdict: 478, 220


KeyboardInterrupt: 

In [19]:
%%time

# 📉 Define a function to adjust prices based on volumes
def zero_sum(prices, volumes):
    """Shift `prices` so they sum to zero, weighting the shift by sqrt(volume).

    Every element is reduced by ``sqrt(volume) * step``, where ``step`` is
    chosen so that the adjusted values add up to zero.

    Args:
        prices: Array-like of values to adjust.
        volumes: Array-like of non-negative volumes, same length as `prices`.

    Returns:
        The adjusted values, in whatever container ``prices - sqrt(volumes)``
        produces (a pandas Series when `volumes` is a Series).
    """
    std_error = np.sqrt(volumes)  # 🧮 Per-element weight derived from volume
    total_error = np.sum(std_error)
    if total_error == 0 and np.size(std_error) > 0:
        # Every volume is zero, so the weighted step would be x/0 and the
        # result NaN/inf. Fall back to equal weights, which still yields a
        # finite, zero-sum output (plain mean subtraction).
        std_error = std_error + 1.0
        total_error = np.sum(std_error)
    step = np.sum(prices) / total_error  # 🧮 Shift per unit of weight
    out = prices - std_error * step  # 💰 Remove each element's share of the total
    return out

# Predictions are clipped to this range before being submitted.
y_min, y_max = -64, 64

if IS_INFER:
    # Use the local replay environment or the competition API.
    if IS_LOCAL:
        print("Infer Local")
        env = make_env()
    else:
        print("Infer Submission")
        import optiver2023
        env = optiver2023.make_env()
    iter_test = env.iter_test()

    # Not populated in this cell; kept because other cells may reference them.
    df_prediction = pd.DataFrame()
    df_revealed_targets = pd.DataFrame()

    for (test, revealed_targets, sample_prediction) in iter_test:
        df_test, generated_feature_name = generate_all_features(test, False)

        # The whole batch is scored with the models of its first row's bucket
        # (assumes one `seconds_in_bucket` per batch -- confirm against the API).
        # `.iloc[0]` is positional, so it works whatever index `test` carries.
        seconds_in_bucket = test['seconds_in_bucket'].iloc[0]
        if seconds_in_bucket not in models:
            # Still a KeyError, as with the bare dict lookup, but now it says why.
            raise KeyError(f"seconds_in_bucket {seconds_in_bucket} is not in models")
        target_models = models[seconds_in_bucket]

        # Average the predictions of every model trained for this bucket.
        predictions = np.mean([m.model.predict(df_test[feature_name]) for m in target_models], axis=0)

        zerosum_predictions = zero_sum(predictions, test['bid_size'] + test['ask_size'])
        clipped_predictions = np.clip(zerosum_predictions, y_min, y_max)  # 📏 Clip predictions within a specified range
        v_clipped_predictions = clipped_predictions.values
        sample_prediction['target'] = v_clipped_predictions
        del df_test
        collect()
        env.predict(sample_prediction)

collect()
print(GetMemUsage())

Infer Local
