# Time Lag Model
This origin comming from https://www.kaggle.com/code/lblhandsome/optiver-robust-best-single-model/notebook

In [44]:
from pathlib import Path
import os
import warnings
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import gc  # Garbage collection for memory management
import os  # Operating system-related functions
import time  # Time-related functions
import warnings  # Handling warnings
from itertools import combinations  # For creating combinations of elements
from warnings import simplefilter  # Simplifying warning handling

# 📦 Importing machine learning libraries
import joblib  # For saving and loading models
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
from sklearn.metrics import mean_absolute_error  # Metric for evaluation
from sklearn.model_selection import KFold, TimeSeriesSplit  # Cross-validation techniques

# 🤐 Disable warnings to keep the code clean
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

max_lookback = np.nan  # Maximum lookback (not specified)

warnings.filterwarnings("ignore")

seed = 2023
num_folds = 4 # クロスバリデーションの分割数
# valid_split_day = 435  # Split day for time series data


# For kaggle environment
if os.environ.get("KAGGLE_DATA_PROXY_TOKEN") != None:
    BASE_OUTPUT_PATH = Path(f'/kaggle/working')
    BASE_INPUT_PATH = Path(f'/kaggle/input/optiver-trading-at-the-close')
    TRAIN_FILE = Path(f'{BASE_INPUT_PATH}/train.csv')
    TEST_FILE = Path(f'{BASE_INPUT_PATH}/test.csv')
    
    IS_LOCAL = False # If kaggle environment, set False
    IS_INFER = True # If kaggle environment, set True
    IS_USE_SAVED_MODEL = False # Use saved model or not
    USE_OPTUNA = True # Use optuna or not

    if IS_LOCAL:
        SAMPLE_SUBMISSION_FILE = Path(f'{BASE_INPUT_PATH}/sample_submission.csv')
        REVEALED_TARGETS_FILE = Path(f'{BASE_INPUT_PATH}/revealed_targets.csv')

    stopping_rounds = 10 # early_stopping用コールバック関数
    num_boost_round = 3000 # 計算回数
    DEVICE = 'gpu' # cpu or gpu
    OPTUNA_TIME_BUDGET = 60 * 60 * 2 # 2 hours

    optuna_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',         # 回帰
        'metric': 'rmse',                  # 損失（誤差）
        'verbosity': -1,
        'deterministic':True, #再現性確保用のパラメータ
        'force_row_wise':True,  #再現性確保用のパラメータ
        'device': DEVICE
    }

    lgb_params = {
        'task': 'train',                   # 学習
        'boosting_type': 'gbdt',           # GBDT
        'objective': 'regression',         # 回帰
        'metric': 'rmse',                  # 損失（誤差）
        'learning_rate': 0.01,             # 学習率
        'lambda_l1': 0.5,                  # L1正則化項の係数
        'lambda_l2': 0.5,                  # L2正則化項の係数
        'num_leaves': 10,                  # 最大葉枚数
        'feature_fraction': 0.5,           # ランダムに抽出される列の割合
        'bagging_fraction': 0.5,           # ランダムに抽出される標本の割合
        'bagging_freq': 5,                 # バギング実施頻度
        'min_child_samples': 10,           # 葉に含まれる最小データ数
        'seed': seed,                       # シード値
        "device": DEVICE,
        'verbosity': -1
    }


    """
    lgb_params = {
        'task': 'train',                   # 学習
        'objective': 'regression',                # 目的関数の種類。ここでは回帰タスクを指定
        'metric': 'rmse',                          # 評価指標
        'boosting_type': 'gbdt',                  # ブースティングタイプ。勾配ブースティング決定木
        "n_estimators": 32,                        # ブースティングに使用する木の数。多いほど性能が向上するが計算コストが増加
        "num_leaves": 64,                         # 木に存在する最大の葉の数。大きい値は精度を向上させるが過学習のリスクが増加
        "subsample": 0.8,                         # 各木のトレーニングに使用されるデータの割合。過学習を防ぐために一部のデータをサンプリング
        "colsample_bytree": 0.8,                  # 木を構築する際に使用される特徴の割合。特徴のサブセットを使用し過学習を防ぐ
        "learning_rate": 0.01,                 # 学習率。小さい値は堅牢なモデルを生成するが収束に時間がかかる
        'max_depth': 32,                           # 木の最大の深さ。深い木は複雑なモデルを作成するが過学習のリスクがある
        "device": DEVICE,                         # トレーニングに使用するデバイス（CPUまたはGPU）
        "verbosity": -1,                          # LightGBMのログ出力のレベル。-1はログを出力しないことを意味する
        "importance_type": "gain",                # 特徴重要度を計算する際の指標。"gain"は分割による平均情報利得
        'lambda_l1': 0.5,                         # L1正則化項の係数。過学習を防ぐためにモデルの複雑さにペナルティを課す
        'lambda_l2': 0.5,                         # L2正則化項の係数。同じく過学習を防ぐ
        'bagging_freq': 5,                 # バギング実施頻度
        'min_child_samples': 10,           # 葉に含まれる最小データ数
        'seed': seed,                       # シード値
    }
    """

# For local environment
else:
    BASE_OUTPUT_PATH = Path(f'../output')
    BASE_INPUT_PATH = Path(f'../kaggle/input/optiver-trading-at-the-close')
    TRAIN_FILE = Path(f'{BASE_INPUT_PATH}/train.csv')
    TEST_FILE = Path(f'{BASE_INPUT_PATH}/test.csv')

    SAMPLE_SUBMISSION_FILE = Path(f'{BASE_INPUT_PATH}/sample_submission.csv')
    REVEALED_TARGETS_FILE = Path(f'{BASE_INPUT_PATH}/revealed_targets.csv')

    IS_LOCAL = True
    IS_INFER = True
    IS_USE_SAVED_MODEL = False # Use saved model or not
    # TARGET_STOCK_IDS = [0,1,2,3,4,5,6,7,8,9]
    USE_OPTUNA = True # Use optuna or not
    TARGET_STOCK_IDS = [0, 1]

    # For training
    stopping_rounds = 1 # early_stopping用コールバック関数
    num_boost_round = 1 # 計算回数
    DEVICE = 'cpu' # cpu or gpu
    OPTUNA_TIME_BUDGET = 60 # 1 min

    optuna_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',         # 回帰
        'metric': 'rmse',                  # 損失（誤差）
        'verbosity': -1,
        'deterministic':True, #再現性確保用のパラメータ
        'force_row_wise':True,  #再現性確保用のパラメータ
        'device': DEVICE
    }

    lgb_params = {
        'task': 'train',                   # 学習
        'boosting_type': 'gbdt',           # GBDT
        'objective': 'regression',         # 回帰
        'metric': 'rmse',                  # 損失（誤差）
        'learning_rate': 0.01,             # 学習率
        'lambda_l1': 0.5,                  # L1正則化項の係数
        'lambda_l2': 0.5,                  # L2正則化項の係数
        'num_leaves': 10,                  # 最大葉枚数
        'feature_fraction': 0.5,           # ランダムに抽出される列の割合
        'bagging_fraction': 0.5,           # ランダムに抽出される標本の割合
        'bagging_freq': 5,                 # バギング実施頻度
        'min_child_samples': 10,           # 葉に含まれる最小データ数
        'seed': seed,                       # シード値
        "device": DEVICE,
        'verbosity': -1
    }


print(f"BASE_OUTPUT_PATH: {BASE_OUTPUT_PATH}")
print(f"BASE_INPUT_PATH: {BASE_INPUT_PATH}")
print(f"TRAIN_FILE: {TRAIN_FILE}")
print(f"TEST_FILE: {TEST_FILE}")
print(f"IS_LOCAL: {IS_LOCAL}")
print(f"IS_INFER: {IS_INFER}")
print(f"IS_USE_SAVED_MODEL: {IS_USE_SAVED_MODEL}")


BASE_OUTPUT_PATH: ../output
BASE_INPUT_PATH: ../kaggle/input/optiver-trading-at-the-close
TRAIN_FILE: ../kaggle/input/optiver-trading-at-the-close/train.csv
TEST_FILE: ../kaggle/input/optiver-trading-at-the-close/test.csv
IS_LOCAL: True
IS_INFER: True
IS_USE_SAVED_MODEL: False


In [45]:
%%time 

from gc import collect;
from psutil import Process;
from os import system, getpid, walk;

# Defining global configurations and functions:-

    
def GetMemUsage():
    """
    This function defines the memory usage across the kernel. 
    Source-
    https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook
    """;
    
    pid = getpid();
    py = Process(pid);
    memory_use = py.memory_info()[0] / 2. ** 30;
    return f"RAM memory GB usage = {memory_use :.4}";


collect();
print(GetMemUsage())

RAM memory GB usage = 0.3525
CPU times: user 108 ms, sys: 73.7 ms, total: 181 ms
Wall time: 222 ms


# Functions

In [46]:
%%time 

from typing import Sequence, Tuple
import pandas as pd

# for local execution
class MockApi:
    def __init__(self):
        '''
        YOU MUST UPDATE THE FIRST THREE LINES of this method.
        They've been intentionally left in an invalid state.

        Variables to set:
            input_paths: a list of two or more paths to the csv files to be served
            group_id_column: the column that identifies which groups of rows the API should serve.
                A call to iter_test serves all rows of all dataframes with the current group ID value.
            export_group_id_column: if true, the dataframes iter_test serves will include the group_id_column values.
        '''
        self.input_paths: Sequence[str] = [TEST_FILE, REVEALED_TARGETS_FILE, SAMPLE_SUBMISSION_FILE]
        self.group_id_column: str = 'time_id'
        self.export_group_id_column: bool = True
        # iter_test is only designed to support at least two dataframes, such as test and sample_submission
        assert len(self.input_paths) >= 2

        self._status = 'initialized'
        self.predictions = []

    def iter_test(self) -> Tuple[pd.DataFrame]:
        '''
        Loads all of the dataframes specified in self.input_paths,
        then yields all rows in those dataframes that equal the current self.group_id_column value.
        '''
        if self._status != 'initialized':

            raise Exception('WARNING: the real API can only iterate over `iter_test()` once.')

        dataframes = []
        for pth in self.input_paths:
            dataframes.append(pd.read_csv(pth, low_memory=False))
        group_order = dataframes[0][self.group_id_column].drop_duplicates().tolist()
        dataframes = [df.set_index(self.group_id_column) for df in dataframes]

        for group_id in group_order:
            self._status = 'prediction_needed'
            current_data = []
            for df in dataframes:
                cur_df = df.loc[group_id].copy()
                # returning single line dataframes from df.loc requires special handling
                if not isinstance(cur_df, pd.DataFrame):
                    cur_df = pd.DataFrame({a: b for a, b in zip(cur_df.index.values, cur_df.values)}, index=[group_id])
                    cur_df.index.name = self.group_id_column
                cur_df = cur_df.reset_index(drop=not(self.export_group_id_column))
                current_data.append(cur_df)
            yield tuple(current_data)

            while self._status != 'prediction_received':
                print('You must call `predict()` successfully before you can continue with `iter_test()`', flush=True)
                yield None

        with open('submission.csv', 'w') as f_open:
            pd.concat(self.predictions).to_csv(f_open, index=False)
        self._status = 'finished'

    def predict(self, user_predictions: pd.DataFrame):
        '''
        Accepts and stores the user's predictions and unlocks iter_test once that is done
        '''
        if self._status == 'finished':
            raise Exception('You have already made predictions for the full test set.')
        if self._status != 'prediction_needed':
            raise Exception('You must get the next test sample from `iter_test()` first.')
        if not isinstance(user_predictions, pd.DataFrame):
            raise Exception('You must provide a DataFrame.')

        self.predictions.append(user_predictions)
        self._status = 'prediction_received'


def make_env():
    return MockApi()

CPU times: user 24 µs, sys: 4 µs, total: 28 µs
Wall time: 27.9 µs


In [47]:
def pd_display_max():
    pd.set_option('display.max_rows', None)  # 行の最大表示数を無制限に設定
    pd.set_option('display.max_columns', None)  # 列の最大表示数を無制限に設定
    pd.set_option('display.width', None)  # 表示幅を拡張
    pd.set_option('display.max_colwidth', None)  # 列の幅を最大に設定

def pd_clear_display_max():
    pd.set_option('display.max_rows', 10)
    pd.set_option('display.max_columns', 10)
    pd.set_option('display.width', None)  # 表示幅を拡張
    pd.set_option('display.max_colwidth', None)  # 列の幅を最大に設定

In [48]:
# 🧹 Function to reduce memory usage of a Pandas DataFrame
def reduce_mem_usage(df, name: str):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    
    # 📏 Calculate the initial memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2

    # 🔄 Iterate through each column in the DataFrame
    for col in df.columns:
        col_type = df[col].dtype

        # Check if the column's data type is not 'object' (i.e., numeric)
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            # Check if the column's data type is an integer
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Check if the column's data type is a float
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)


    print(f"Memory usage of {name} is {start_mem:.2f} MB")
    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    decrease = 100 * (start_mem - end_mem) / start_mem
    print(f"Decreased by {decrease:.2f}%")

    # 🔄 Return the DataFrame with optimized memory usage

    start_mem = df.memory_usage().sum() / 1024**2
    return df

In [49]:
# 🏎️ Import Numba for just-in-time (JIT) compilation and parallel processing
from numba import njit, prange

# 📊 Function to compute triplet imbalance in parallel using Numba
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    # 🔁 Loop through all combinations of triplets
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        
        # 🔁 Loop through rows of the DataFrame
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            # 🚫 Prevent division by zero
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

# 📈 Function to calculate triplet imbalance for given price data and a DataFrame
def calculate_triplet_imbalance_numba(price, df):
    # Convert DataFrame to numpy array for Numba compatibility
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]

    # Calculate the triplet imbalance using the Numba-optimized function
    features_array = compute_triplet_imbalance(df_values, comb_indices)

    # Create a DataFrame from the results
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)

    return features

In [50]:
# 📊 Function to generate imbalance features
def imbalance_features(df):
    if DEVICE == 'gpu':
        import cudf
        df = cudf.from_pandas(df)
    
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1 features
    # Calculate various features using Pandas eval function
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("ask_price + bid_price")/2
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("imbalance_size-matched_size")/df.eval("matched_size+imbalance_size")
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    
    # Create features for pairwise price imbalances
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
        
    # V2 features
    # Calculate additional features
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    # Calculate the imbalance ratio
    df['match_balance'] = ( df['matched_size']  + (df['imbalance_buy_sell_flag'] * df['imbalance_size'])) / df['matched_size']
    
    # Calculate various statistical aggregation features
    
        
    # V3 features
    # Calculate shifted and return features for specific columns
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    if DEVICE == 'gpu':
        df = df.to_pandas()
    # Replace infinite values with 0
    return df.replace([np.inf, -np.inf], 0)

def numba_imb_features(df):
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        
    # Calculate triplet imbalance features using the Numba-optimized function
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
    return df

# 📅 Function to generate time and stock-related features
def other_features(df):
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  # Seconds
    df["minute"] = df["seconds_in_bucket"] // 60  # Minutes

    # Map global features to the DataFrame
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df

# 🚀 Function to generate all features by combining imbalance and other features
def generate_basic_features(df):
    prev_cols = list(df.columns)

    # Generate imbalance features
    df = imbalance_features(df)
    df = numba_imb_features(df)

    df = reduce_mem_usage(df, "generate_basic_features")

    collect()  # Perform garbage collection to free up memory

    return df

# Generationg train dataset

In [51]:
def load_train_dataset():
    df = pd.read_csv(TRAIN_FILE)
    # 🧹 Remove rows with missing values in the "target" column
    df = df.dropna(subset=["target"])
    # 🔁 Reset the index of the DataFrame and apply the changes in place
    df.reset_index(drop=True, inplace=True)
    return df


In [52]:
%%time


# Check if the code is running in offline or online mode
print("Load train dataset")

df_train = load_train_dataset()

if IS_LOCAL:
    # In local mode, stock id TARGET_STOCK_ID is used for training
    df_train = df_train[df_train["stock_id"].isin(TARGET_STOCK_IDS)]
features = [c for c in df_train.columns if c not in ["row_id", "target", "time_id", "row_id", "date_id"]]
print(features)

collect();
print(GetMemUsage())

Load train dataset
['stock_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']
RAM memory GB usage = 1.918
CPU times: user 3.97 s, sys: 989 ms, total: 4.96 s
Wall time: 5.49 s


In [53]:
%%time

print("Step1. Generate general Global Stock ID Features and basic features")
prev_cols = list(df_train.columns)
global_stock_id_feats = {
    "median_size": df_train.groupby("stock_id")["bid_size"].median() + df_train.groupby("stock_id")["ask_size"].median(),
    "std_size": df_train.groupby("stock_id")["bid_size"].std() + df_train.groupby("stock_id")["ask_size"].std(),
    "ptp_size": df_train.groupby("stock_id")["bid_size"].max() - df_train.groupby("stock_id")["bid_size"].min(),
    "median_price": df_train.groupby("stock_id")["bid_price"].median() + df_train.groupby("stock_id")["ask_price"].median(),
    "std_price": df_train.groupby("stock_id")["bid_price"].std() + df_train.groupby("stock_id")["ask_price"].std(),
    "ptp_price": df_train.groupby("stock_id")["bid_price"].max() - df_train.groupby("stock_id")["ask_price"].min(),
}

df_train = generate_basic_features(df_train)

generated_feature_name = list(set(df_train.columns) - set(prev_cols))
features += generated_feature_name
print(generated_feature_name)

collect()
print(GetMemUsage())

Step1. Generate general Global Stock ID Features and basic features
Memory usage of generate_basic_features is 44.00 MB
Memory usage after optimization is: 21.80 MB
Decreased by 50.46%
['reference_price_wap_imb', 'imbalance_size_shift_1', 'bid_size_diff_3', 'imbalance_size_shift_2', 'imbalance_buy_sell_flag_shift_1', 'ask_price_diff_1', 'reference_price_ret_3', 'far_price_ask_price_imb', 'ask_price_diff_10', 'bid_price_diff_3', 'match_balance', 'imbalance_size_ret_3', 'imbalance_buy_sell_flag_ret_1', 'ask_price_wap_imb', 'far_price_bid_price_imb', 'imbalance_momentum', 'imbalance_size_shift_3', 'bid_size_diff_1', 'reference_price_shift_10', 'bid_size_diff_2', 'all_prices_kurt', 'market_urgency', 'reference_price_far_price_imb', 'imbalance_buy_sell_flag_ret_3', 'all_prices_std', 'near_price_wap_imb', 'imbalance_buy_sell_flag_ret_10', 'reference_price_ret_10', 'ask_price_bid_price_imb', 'all_prices_mean', 'bid_price_wap_reference_price_imb2', 'ask_price_diff_2', 'all_sizes_mean', 'matche

In [54]:
%%time

print("Step2. Generate enhanced features")
prev_cols = list(df_train.columns)

def generate_historical_features(df):
    def cal_vix(df, group_key, target_col, period):
        return df.groupby(group_key)[target_col].transform(lambda x: np.log(x).diff().rolling(period).std())

    print("generate_historical_features")
    for col in ['wap', 'match_balance']:
        for window in [1, 2, 3, 10]:
            col_name = f"{col}_diff_{window}"
            df[col_name] = df.groupby("stock_id")[col].diff(window)
            #df[col_name] = df[col_name].fillna(0)  # NaNを0で置き換える
        for period in [5]:
            col_name = f"{col}_vix_{period}"
            df[col_name] = cal_vix(df, ['stock_id', 'date_id'], col, period)

    df = df.replace([np.inf, -np.inf], 0)
    return df

def generate_index_features(df):
    print("generate_index_features")
     # Calculating mean and std for 'wap' and 'match_balance'
    wap_stats = df.groupby(['date_id', 'seconds_in_bucket'])['wap'].agg(['mean', 'std']).reset_index()
    match_balance_stats = df.groupby(['date_id', 'seconds_in_bucket'])['match_balance'].agg(['mean', 'std']).reset_index()

    # Adding prefix and suffix
    wap_stats = wap_stats.add_prefix('index_').add_suffix('_wap')
    match_balance_stats = match_balance_stats.add_prefix('index_').add_suffix('_match_balance')

    # Adjusting column names for merging
    wap_stats = wap_stats.rename(columns={'index_date_id_wap': 'date_id', 'index_seconds_in_bucket_wap': 'seconds_in_bucket'})
    match_balance_stats = match_balance_stats.rename(columns={'index_date_id_match_balance': 'date_id', 'index_seconds_in_bucket_match_balance': 'seconds_in_bucket'})

    # Merging with the original dataframe
    df = df.merge(wap_stats, on=['date_id', 'seconds_in_bucket'], how='left')
    df = df.merge(match_balance_stats, on=['date_id', 'seconds_in_bucket'], how='left')

    del wap_stats, match_balance_stats
    return df

def generate_enhance_features(df):
    df = generate_historical_features(df)

    df = reduce_mem_usage(df, "generate_historical_features")
    collect()

    df = generate_index_features(df)
    df = reduce_mem_usage(df, "generate_index_features")
    collect()
    return df

df_train = generate_enhance_features(df_train)
generated_feature_name = list(set(df_train.columns) - set(prev_cols))
features += generated_feature_name
print(generated_feature_name)

collect()
print(GetMemUsage())

Step2. Generate enhanced features
generate_historical_features
Memory usage of generate_historical_features is 24.22 MB
Memory usage after optimization is: 23.82 MB
Decreased by 1.67%
generate_index_features
Memory usage of generate_index_features is 24.22 MB
Memory usage after optimization is: 24.22 MB
Decreased by 0.00%
['wap_diff_1', 'index_std_match_balance', 'match_balance_diff_2', 'match_balance_diff_10', 'index_mean_wap', 'wap_diff_2', 'wap_diff_10', 'match_balance_vix_5', 'wap_diff_3', 'match_balance_diff_3', 'wap_vix_5', 'match_balance_diff_1', 'index_std_wap', 'index_mean_match_balance']
RAM memory GB usage = 1.966
CPU times: user 535 ms, sys: 14.1 ms, total: 549 ms
Wall time: 554 ms


In [55]:
%%time

print("Step3. Normalized features")
prev_cols = list(df_train.columns)


# Global features for normarization
global_wap = df_train['wap'].describe()
global_mathch_balance = df_train['match_balance'].describe()
global_target = df_train['target'].describe()
global_reference_price = df_train['reference_price'].describe()

# 📈 Function to generate normalized features
def generate_normalized_features(df, is_train):
    print("generate_normalized_features")
    if is_train:
        df['n_target'] = (df['target'] - global_target['mean']) / global_target['std']
    df['n_wap'] = (df['wap'] - global_wap['mean']) / global_wap['std']
    df['n_match_balance'] = (df['match_balance'] - global_mathch_balance['mean']) / global_mathch_balance['std']
    df['n_reference_price'] = (df['reference_price'] - global_reference_price['mean']) / global_reference_price['std']
    
    df = reduce_mem_usage(df, "generate_normalized_features")
    return df

df_train = generate_normalized_features(df_train, True)
generated_feature_name = list(set(df_train.columns) - set(prev_cols))
generated_feature_name = [c for c in generated_feature_name if c not in ["n_target"]]

features += generated_feature_name
print(generated_feature_name)

collect();
print(GetMemUsage())

Step3. Normalized features
generate_normalized_features
Memory usage of generate_normalized_features is 25.03 MB
Memory usage after optimization is: 25.03 MB
Decreased by 0.00%
['n_wap', 'n_reference_price', 'n_match_balance']
RAM memory GB usage = 1.967
CPU times: user 114 ms, sys: 1.82 ms, total: 115 ms
Wall time: 114 ms


In [56]:
df_train = reduce_mem_usage(df_train, 'train')

collect();
print(GetMemUsage())

Memory usage of train is 25.03 MB
Memory usage after optimization is: 25.03 MB
Decreased by 0.00%
RAM memory GB usage = 1.967


# Feature selection

In [57]:
# feature selection
"""
feature_name = [
    "wap_vix_5", "reference_price_shift_10", "matched_size_ret_10",
    "matched_size_shift_10", "match_balance_vix_5", "ask_price_bid_price_reference_price_imb2",
    "seconds_in_bucket", "match_balance_diff_10", "imbalance_size_ret_10",
    "ask_size_diff_10", "imbalance_size_shift_10", "reference_price",
    "ask_price_bid_price_imb", "reference_price_ret_10", "all_sizes_mean",
    "matched_size", "bid_size_diff_10", "volume", "reference_price_shift_3",
    "bid_size", "bid_price_wap_reference_price_imb2", "ask_size",
    "ask_size_diff_3", "reference_price_bid_price_imb", "reference_price_wap_imb",
    "all_prices_kurt", "matched_size_bid_size_ask_size_imb2", "bid_price",
    "wap_diff_10", "bid_size_diff_3", "all_prices_std", "bid_size_ask_size_imbalance_size_imb2",
    "all_prices_skew", "all_sizes_skew", "bid_price_wap_imb",
    "ask_price_diff_10", "ask_price_wap_imb", "imbalance_size_ret_3",
    "matched_size_shift_3", "reference_price_shift_1", "all_prices_mean",
    "imbalance_size", "matched_size_ret_3", "reference_price_shift_2",
    "ask_price", "ask_size_diff_2", "bid_size_diff_2", "price_pressure",
    "reference_price_ask_price_imb", "matched_size_shift_1", "bid_size_diff_1",
    "market_urgency", "wap_diff_3", "price_spread", "all_sizes_std",
    "matched_size_shift_2", "imbalance_size_shift_3", "bid_price_diff_10",
    "ask_size_diff_1", "far_price", "reference_price_ret_3", "match_balance",
    "wap_diff_2", "wap_diff_1", "matched_imbalance", "wap"
]
"""
features

['stock_id',
 'seconds_in_bucket',
 'imbalance_size',
 'imbalance_buy_sell_flag',
 'reference_price',
 'matched_size',
 'far_price',
 'near_price',
 'bid_price',
 'bid_size',
 'ask_price',
 'ask_size',
 'wap',
 'reference_price_wap_imb',
 'imbalance_size_shift_1',
 'bid_size_diff_3',
 'imbalance_size_shift_2',
 'imbalance_buy_sell_flag_shift_1',
 'ask_price_diff_1',
 'reference_price_ret_3',
 'far_price_ask_price_imb',
 'ask_price_diff_10',
 'bid_price_diff_3',
 'match_balance',
 'imbalance_size_ret_3',
 'imbalance_buy_sell_flag_ret_1',
 'ask_price_wap_imb',
 'far_price_bid_price_imb',
 'imbalance_momentum',
 'imbalance_size_shift_3',
 'bid_size_diff_1',
 'reference_price_shift_10',
 'bid_size_diff_2',
 'all_prices_kurt',
 'market_urgency',
 'reference_price_far_price_imb',
 'imbalance_buy_sell_flag_ret_3',
 'all_prices_std',
 'near_price_wap_imb',
 'imbalance_buy_sell_flag_ret_10',
 'reference_price_ret_10',
 'ask_price_bid_price_imb',
 'all_prices_mean',
 'bid_price_wap_reference_pri

# Model Training

In [58]:
# 📦 Import necessary libraries
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import gc
import os
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import numpy as np
from dataclasses import dataclass
import sys
import shutil
import lightgbm as lgb

from warnings import simplefilter
simplefilter("ignore", category=RuntimeWarning)

@dataclass
class Model:
    booster: lgb.Booster
    fold: int
    feature_importance: pd.DataFrame
    score: float
    best_iteration: int
    train_time: float = None
    weight: float = None
    mem_usage: float = None
    train_func: str = None

def train_model(train_x, train_y, val_x, val_y, best_params=None):
    print("------ Model training start ------")
    trains = lgb.Dataset(train_x, train_y)
    valids = lgb.Dataset(val_x, val_y, reference=trains)

    verbose_eval = 0
    if best_params is None:
        params = lgb_params
    else:
        params = best_params

    print("Use params:")
    print(params)

    booster = lgb.train(
        params,
        trains,
        valid_sets=[valids, trains], # 検証データ
        valid_names=['Train', 'Valid'],    # データセット名前
        num_boost_round=num_boost_round,
        callbacks=[
                lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=True),
                lgb.log_evaluation(verbose_eval)
        ]
    )

    print("------ Model training end ------")
    return model

def cross_train(df, key, n_splits, features, valid_name, best_params=None):
    """ For Cross Train

    Args:
        df (_type_): _description_
        n_splits (_type_): _description_

    Returns:
        _type_: _description_
    """
    print("----------------------------------------")
    print(f"Cross Train key id {key}: start, shape: {df.shape}, n_splits: {n_splits}")
    print(f"num_boost_round: {num_boost_round}, stopping_rounds: {stopping_rounds}, folds: {num_folds}")

    models = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    df.reset_index(drop=True, inplace=True)
    
    for fold, (train_indices, valid_indices) in enumerate(kf.split(df)):
        print(f"{key}: {fold} start")
        now_time = time.time()
        X_train, X_valid = df[features].iloc[train_indices], df[features].iloc[valid_indices]
        y_train, y_valid = df[valid_name].loc[train_indices], df[valid_name].loc[valid_indices]
        print(f"X_train: {X_train.shape}, X_valid: {X_valid.shape}, y_train: {y_train.shape}, y_valid: {y_valid.shape}")


        booster = train_model(X_train, y_train, X_valid, y_valid, best_params)

        y_valid_pred = booster.predict(X_valid)
        
        score = mean_absolute_error(y_valid, y_valid_pred)
        train_time = time.time() - now_time
        mem_usage = sys.getsizeof(booster) / (1024 * 1024) # MB
        model = Model(booster, fold, booster.feature_importance(), score, booster.best_iteration, train_time, weight= 1 / n_splits, mem_usage=mem_usage, train_func="lightgbm")
        print(f"{key}: {fold} end, score: {score}, time: {model.train_time}, best_iteration: {model.best_iteration}, memory usage: {model.mem_usage}")
        
        models.append(model)
        
        del X_train, X_valid, y_train, y_valid
        gc.collect()

    print(f"Cross train {key} model len {len(models)}")
    print("----------------------------------------")
    return key, models

# Optuna tuner

In [59]:
%%time

import optuna.integration.lightgbm as optuna_lgb
import optuna
import lightgbm
optuna.logging.set_verbosity(optuna.logging.ERROR)


class TunerCVCheckpointCallback(object):
    """Optuna の LightGBMTunerCV から学習済みモデルを取り出すためのコールバック"""

    def __init__(self):
        # Models
        self.models = []
        self.counter = 0

    def get_models(self):
        # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster
        return self.models

    def __call__(self, env: lightgbm.callback.CallbackEnv):
        """_summary_

        Args:
            env (lightgbm.callback.CallbackEnv): _description_
            "model",
            "params",
            "iteration",
            "begin_iteration",
            "end_iteration",
            "evaluation_result_list"
        """
        print("")

        self.counter += 1
        print("-------------------")
        print(f"Counter: {self.counter}")
        print(f"Iteration: {env.iteration}")
        print(f"Begin_iteration: {env.begin_iteration}")
        print(f"End_iteration: {env.end_iteration}")
        print(f"Evaluation_result_list: {env.evaluation_result_list}")
        print(f"Model best_iteration: {env.model.best_iteration}")
        print("Params: ", env.params)
        #self.models.append(env.model)
        del env

        collect();
        print(GetMemUsage())

def optuna_tuning(df, n_splits, features, valid_name, model_save_path):
    #df_train = df[df["date_id"] <= valid_split_day]
    #df_valid = df[df["date_id"] > valid_split_day]
    df_train = df

    train_x = df_train[features]
    train_y = df_train[valid_name] 

    #valid_x = df_valid[features]
    #valid_y = df_valid[valid_name] 

    del df
    trains = optuna_lgb.Dataset(train_x, train_y)
    
    print("------- Optuna Tuning Start -------")
    now_time = time.time()
    print(f"num_boost_round: {num_boost_round}, stopping_rounds: {stopping_rounds}, folds: {num_folds}")

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    checkpoint_cb = TunerCVCheckpointCallback()
    
    verbose_eval = 0
    # https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.lightgbm.LightGBMTunerCV.html
    tuner = optuna_lgb.LightGBMTunerCV(
        optuna_params,
        trains,
        num_boost_round=num_boost_round,
        folds=folds,
        show_progress_bar=False,
        return_cvbooster=True,
        verbosity=-1,
        model_dir=model_save_path,
        optuna_seed=seed,
        time_budget=OPTUNA_TIME_BUDGET,
        callbacks=[
                lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=True),
                lgb.log_evaluation(verbose_eval),
                checkpoint_cb
        ]
    )
    
    tuner.run()
    best_params = tuner.best_params
    
    print("Params: ")
    for key, value in best_params.items():
        print(" {}: {}".format(key, value))

    print("")
    print("len(tuner.study.trials): ", len(tuner.study.trials))
    #print("len(checkpoint_cb.cv_boosters): ", len(checkpoint_cb.models))
    print("Tuner best_params", tuner.best_params)
    print("Tuner best score: ", tuner.best_score)
   
    # 最も良かったパラメータをキーにして学習済みモデルを取り出す
    best_booster = tuner.get_best_booster()
    # best_boosters = best_booster.boosters
    # valid_y_pred = best_booster.predict(valid_x)
    # print("len valid_y_pred: ", len(valid_y_pred))

    # 予測値の平均をとる
    # valid_y_pred = np.mean(valid_y_pred, axis=0)

    # score = mean_absolute_error(valid_y, valid_y_pred)
    scoure = -1
    train_time = time.time() - now_time
    mem_usage = sys.getsizeof(best_booster) / (1024 * 1024) # MB
    feature_importance = np.mean(best_booster.feature_importance(), axis=0)

    best_model = Model(best_booster, 1, feature_importance, score, best_booster.best_iteration, train_time, weight= 1, mem_usage=mem_usage, train_func="optuna_lgb")
    print("------- Optuna Tuning End -------")
    return best_params, best_model


CPU times: user 61 µs, sys: 19 µs, total: 80 µs
Wall time: 81.8 µs


In [60]:
%%time

KEY = "-1"

# Train
best_params = None
key_models = None
if USE_OPTUNA:
    model_save_base_path = f"{BASE_OUTPUT_PATH}/model"
    if os.path.exists(model_save_base_path):
        print(f"{model_save_base_path} already exists, clean up it.")
        shutil.rmtree(model_save_base_path)
    os.makedirs(model_save_base_path)
    print(f"model_save_base_path: {model_save_base_path}")

    best_params, best_model = optuna_tuning(df=df_train, n_splits=num_folds, features=features, valid_name="target", model_save_path=model_save_base_path)
    key_models = [(KEY, [best_model])]
else:
    #key_models = df_train.groupby("seconds_in_bucket").apply(lambda x: cross_train(df=x, key=x.name, n_splits=num_folds, feature_name=feature_name, valid_name="target", best_params=best_params))
    key_models = [cross_train(df=df_train, key=KEY, n_splits=num_folds, features=features, valid_name="target", best_params=best_params)]
    if IS_USE_SAVED_MODEL:
        model_save_base_path = f"{BASE_OUTPUT_PATH}/model"
        if os.path.exists(model_save_base_path):
            print(f"{model_save_base_path} already exists, clean up it.")
            shutil.rmtree(model_save_base_path)
        os.makedirs(model_save_base_path)

        key_model_paths = []
        for key, models in key_models:
            model_save_path = f"{model_save_base_path}/{key}"
            os.makedirs(model_save_path)
            model_paths = []
            for model in models:
                model_save_fullpath = f"{model_save_path}/model_{key}_{model.fold}.txt"
                model.model.save_model(model_save_fullpath)
                model_paths.append(model_save_fullpath)
            key_model_paths.append((key, model_paths))

        model_dict_saved = {key: model_paths for key, model_paths in key_model_paths}
        print(model_dict_saved)


model_dict = {key: model for key, model in key_models}

del df_train
collect()
print(GetMemUsage())

../output/model already exists, clean up it.
model_save_base_path: ../output/model
------- Optuna Tuning Start -------
num_boost_round: 1, stopping_rounds: 1, folds: 2

-------------------
Counter: 1
Iteration: 0
Begin_iteration: 0
End_iteration: 1
Evaluation_result_list: [('cv_agg', 'valid rmse', 9.193271630339488, False, 0.0997464711957905)]
Model best_iteration: -1
Params:  {'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'rmse', 'verbosity': -1, 'deterministic': True, 'force_row_wise': True, 'device': 'cpu', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 31, 'feature_fraction': 0.8999999999999999, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 20, 'num_iterations': 1}
RAM memory GB usage = 2.009
Training until validation scores don't improve for 1 rounds
Did not meet early stopping. Best iteration is:
[1]	cv_agg's valid rmse: 9.19327 + 0.0997465

-------------------
Counter: 2
Iteration: 0
Begin_iteration: 0
End_iterat

In [61]:
# Show results
print("")
print(f"Total model len {(sum([len(models) for key, models in key_models]))}")
print(f"Total model mem usage {sum([sum([model.mem_usage for model in models]) for key, models in key_models]):2f} MB")
print(f"Model dict len {len(model_dict)}")


Total model len 1
Total model mem usage 0.000053 MB
Model dict len 1


In [62]:
# Check model quality
data = []

for key, i_models in model_dict.items():
    for model in i_models:
        score = model.score
        best_iteration = model.best_iteration
        fold = model.fold
        train_time = model.train_time
        data.append({"key": key, "fold": fold, "score": score, "best_iteration": best_iteration, "train_time": train_time})

df_model = pd.DataFrame(data)
df_model.describe()

Unnamed: 0,fold,score,best_iteration,train_time
count,1.0,1.0,1.0,1.0
mean,1.0,6.021438,1.0,22.644154
std,,,,
min,1.0,6.021438,1.0,22.644154
25%,1.0,6.021438,1.0,22.644154
50%,1.0,6.021438,1.0,22.644154
75%,1.0,6.021438,1.0,22.644154
max,1.0,6.021438,1.0,22.644154


In [63]:
# Initialize an empty DataFrame for aggregated importances
aggregated_importance = pd.DataFrame(index=features, columns=['importance'])

# Aggregate the importances from each model
for key, i_models in model_dict.items():
    for model in i_models:
        importance = pd.DataFrame({'feature': features, 'importance': model.feature_importance})
        aggregated_importance = aggregated_importance.add(importance.set_index('feature'), fill_value=0)

aggregated_importance['importance'] /= len(df_model)

pd_display_max()
# Sort the features by importance
aggregated_importance = aggregated_importance.sort_values(by='importance', ascending=False)
aggregated_importance

Unnamed: 0,importance
market_urgency,9.5
wap_vix_5,9.5
index_mean_match_balance,7.5
matched_size_ret_10,7.5
reference_price_bid_price_imb,6.5
index_std_match_balance,5.5
reference_price_shift_10,5.5
reference_price_ask_price_imb,5.5
match_balance_diff_10,5.0
index_std_wap,5.0


In [64]:
# Clean up
del key_models
if IS_USE_SAVED_MODEL:
    print("Delete model_dict")
    del model_dict
collect()
pd_clear_display_max()
collect()
print(GetMemUsage())

RAM memory GB usage = 2.622


# Infer

In [67]:
%%time

# 📉 Define a function to adjust prices based on volumes
def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)  # 🧮 Calculate standard error based on volumes
    step = np.sum(prices) / np.sum(std_error)  # 🧮 Calculate the step size based on prices and standard error
    out = prices - std_error * step  # 💰 Adjust prices by subtracting the standardized step size
    return out

def model_infer(key, df_feat):
    def predictor(boosters):
        print(f"Predictor target models len {len(boosters)}")
        #print(f"Predictor Feat len {len(df_feat)}")
        predictions = np.zeros(len(df_feat))
        predictions = np.mean([np.mean(booster.predict(df_feat[features]), 0) for booster in boosters], 0)
        return predictions
    
    if IS_USE_SAVED_MODEL:
        model_paths = model_dict_saved[key]
        models = [lgb.Booster(model_file=model_path) for model_path in model_paths]
        predictions = predictor(models)
        del models
    else:
        boosters = [m.booster for m in model_dict[key]]
        predictions = predictor(boosters)
    collect()
    return predictions


y_min, y_max = -64, 64
predictions = []
cache = pd.DataFrame()
result = pd.DataFrame()

counter = 0

if IS_INFER:
    if IS_LOCAL:
        print("Infer Local")
        env = make_env()
    else:
        print("Infer Submission")
        import optiver2023
        env = optiver2023.make_env()
    iter_test = env.iter_test()

    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        print(f"------- counter {counter} start -------")

        # It faults due to test is iterator
        #seconds_in_bucket = test['seconds_in_bucket'][0]
        #print(f"prdict: {test['date_id'][0]}, {seconds_in_bucket}")
        
        # Generate cahce
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        cache = reduce_mem_usage(cache, 'cache')
        if counter > 0:
            # 🔄 If not the first iteration, limit the cache to the last 21 rows for each stock
            cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
        print(f"cache len {len(cache)}")

        # 📊 Generate features
        df_test = generate_basic_features(cache)
        df_test = generate_enhance_features(df_test)
        df_test = generate_normalized_features(df_test, False)
        df_test = df_test[-len(test):].reset_index(drop=True)

        df_feat = df_test[features]
        print(f"df_feat len {len(df_feat)}")

        # Get seconds_in_bucket and date_id
        seconds_in_bucket = df_test['seconds_in_bucket'][0]
        date_id = df_test['date_id'][0]
        print(f"Predict: {date_id}, {seconds_in_bucket}")
        
        # Predict
        predictions = model_infer(KEY, df_feat)

        # Adjust the predictions based on the order book imbalance
        predictions = zero_sum(predictions, test['bid_size'] + test['ask_size'])
        predictions = np.clip(predictions, y_min, y_max)
        predictions.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

        # For save
        df_feat['pred'] = predictions
        result = pd.concat([result, df_feat], ignore_index=True, axis=0)

        # Submit
        sample_prediction['target'] = predictions
        env.predict(sample_prediction)
        
        # Clean up
        del df_feat, df_test, predictions
        collect()
        counter += 1

collect()
print(GetMemUsage())
result.to_csv(f"{BASE_OUTPUT_PATH}/result.csv", index=False)

Infer Local
------- counter 0 start -------
Memory usage of cache is 0.02 MB
Memory usage after optimization is: 0.01 MB
Decreased by 55.96%
cache len 200
Memory usage of generate_basic_features is 0.09 MB
Memory usage after optimization is: 0.08 MB
Decreased by 13.21%
generate_historical_features
Memory usage of generate_historical_features is 0.09 MB
Memory usage after optimization is: 0.09 MB
Decreased by 1.71%
generate_index_features
Memory usage of generate_index_features is 0.09 MB
Memory usage after optimization is: 0.09 MB
Decreased by 0.00%
generate_normalized_features
Memory usage of generate_normalized_features is 0.09 MB
Memory usage after optimization is: 0.09 MB
Decreased by 0.00%
df_feat len 200
Predict: 478, 0
Predictor target models len 1
------- counter 1 start -------
Memory usage of cache is 0.18 MB
Memory usage after optimization is: 0.14 MB
Decreased by 22.59%
cache len 400
Memory usage of generate_basic_features is 0.18 MB
Memory usage after optimization is: 0.16