In [18]:
import os
import polars as pl

def generate_test_data(
    base_path: str,
    id_col: str, 
    date_id: str,  # 例: ("date_id", int)
    time_id: str,  # 例: ("time_id", int)
    pred_cols: list, # 例: [("responder_0", float)]
    input_cols: list,# 例: [("feature_00", float)]
    num_partitions=4,
    num_id=2,
    date_per_partition=2
):
    """
    id_col, date_id, time_id, pred_cols, input_colsをもとにテストデータを作成し、
    partition_id=0～(num_partitions-1)までのParquetファイルに出力する。

    各パーティションは、以下の条件でデータを生成：
      - id_col: 0～(num_id-1)までのIDを用意
      - date_id: partition i は [i*date_per_partition, (i+1)*date_per_partition-1]
      - time_id: 0～24 (25個)
    よって1パーティションあたり: date_per_partition * num_id * 25行。

    pred_cols, input_colsはタプルのリスト [(col_name, type), ...]で指定し、
    値は日付・ID・time_idに基づき決定的に割り当てる。
    """
    
    os.makedirs(base_path, exist_ok=True)

    # カラム名抽出
    id_col_name = id_col[0]
    date_col_name = date_id[0]
    time_col_name = time_id[0]

    # 1パーティションあたりの行数
    time_per_day = 25
    rows_per_partition = date_per_partition * num_id * time_per_day

    # 全ての列名
    all_col_names = [id_col_name, date_col_name, time_col_name] + [c[0] for c in pred_cols] + [c[0] for c in input_cols]

    for i in range(num_partitions):
        date_start = i * date_per_partition
        date_end = date_start + date_per_partition

        # データ生成
        # 二重三重ループで決定的に割り当て
        date_ids_list = []
        id_list = []
        time_ids_list = []

        for d in range(date_start, date_end):
            for sid in range(num_id):
                for t in range(time_per_day):
                    date_ids_list.append(d)
                    id_list.append(sid)
                    time_ids_list.append(t)

        # ベースとなる数値計算用
        # responder_0やfeature_00に割り当てるための基礎値
        # 例：val = d*10000 + sid*100 + t
        base_vals = []
        for d_id, s_id, t_id in zip(date_ids_list, id_list, time_ids_list):
            val = d_id * 10000 + s_id * 100 + t_id
            base_vals.append(val)

        # DataFrame用辞書
        data = {
            id_col_name: id_list,
            date_col_name: date_ids_list,
            time_col_name: time_ids_list,
        }

        # pred_cols割り当て（例：responder_0 = base_val + index_of_col）
        for idx, (c_name, c_type) in enumerate(pred_cols):
            # 例: responder_0 = base_val + idx
            c_values = [v + idx for v in base_vals]
            data[c_name] = c_values

        # input_cols割り当て（例：feature_00 = base_val/1000）
        for idx, (c_name, c_type) in enumerate(input_cols):
            # 例: feature_00 = base_val / 1000.0
            c_values = [v / 1000.0 + idx for v in base_vals]
            data[c_name] = c_values

        df = pl.DataFrame(data)
        # ソート（id_col_name, date_col_name, time_col_name順）
        df = df.sort([id_col_name, date_col_name, time_col_name])

        partition_dir = os.path.join(base_path, f"partition_id={i}")
        os.makedirs(partition_dir, exist_ok=True)
        df.write_parquet(os.path.join(partition_dir, "part-0.parquet"))

    print("Test data generated without rows_per_partition, using given columns.")

In [19]:
# 入力変数選択
id_col = ("symbol_id", int)
date_id = ("date_id", int)
time_id = ("time_id", int)

#preds_cols = [f"responder_{i}" for i in range(9)] # 目的変数
pred_cols = [("responder_0", float)]
input_cols = [("feature_00", float)] # 説明変数

target_cols = [id_col, date_id, time_id] + pred_cols + input_cols
target_cols

[('symbol_id', int),
 ('date_id', int),
 ('time_id', int),
 ('responder_0', float),
 ('feature_00', float)]

In [20]:
generate_test_data("test", id_col, date_id, time_id, pred_cols, input_cols, num_partitions=2, num_id=1, date_per_partition=3)

Test data generated without rows_per_partition, using given columns.


In [21]:
df = (
    pl.scan_parquet("/kaggle/working/test/partition_id=*/part-*.parquet", glob=True)
      .filter(pl.col("symbol_id") == 0)
      .collect()
)
df

symbol_id,date_id,time_id,responder_0,feature_00
i64,i64,i64,i64,f64
0,0,0,0,0.0
0,0,1,1,0.001
0,0,2,2,0.002
0,0,3,3,0.003
0,0,4,4,0.004
…,…,…,…,…
0,5,20,50020,50.02
0,5,21,50021,50.021
0,5,22,50022,50.022
0,5,23,50023,50.023


In [22]:
import os
import glob
import polars as pl
import numpy as np
import gc
import time

def create_metadata(input_base):
    parquet_files = sorted(glob.glob(os.path.join(input_base, "partition_id=*", "*.parquet")))
    meta = []
    for pi, f in enumerate(parquet_files):
        df_all = pl.read_parquet(f, columns=["date_id"])
        min_id = df_all.select(pl.col("date_id").min()).item()
        max_id = df_all.select(pl.col("date_id").max()).item()
        meta.append((pi, min_id, max_id))
    print("meta: partition, min date, max date", meta)
    return meta

def partitions_for_range(meta, start_id: int, end_id: int):
    needed = []
    for (pi, mini, maxi) in meta:
        if maxi >= start_id and mini <= end_id:
            needed.append(pi)
    return needed

In [23]:
import os
import glob
import gc
import polars as pl

def run_walk_forward(
    input_base: str,
    output_base: str,
    id_col, date_id, time_id, pred_cols, input_cols,
    train_length, valid_length, train_shift, retroactive_size,
    add_feature_func
):
    """
    1) Kaggleパーティションファイルを読み込み、指定期間でtrainとvalidを作る。
    2) add_feature_funcの処理の前にgc.collect()でメモリ解放を試みる。
    3) 前回と今回の generated_feature, generated_predsに差分がある場合にExceptionを投げる。
    4) 書き出し時に zstd圧縮(level=9)を指定。
    
    Returns:
        (last_generated_feature, last_generated_preds)
    """
    original_cols = [id_col, date_id, time_id] + pred_cols + input_cols
    print("original_cols", original_cols)

    meta = create_metadata(input_base)
    max_date_id_global = max(m[2] for m in meta)

    start_of_train = 0
    sprint_num = 1

    parquet_files = sorted(glob.glob(os.path.join(input_base, "partition_id=*", "*.parquet")))

    # 前回のgeneratedを記録する変数(Noneで初期化)
    prev_gen_feature_str = None
    prev_gen_preds_str = None

    generated_feature = None
    generated_preds = None

    while True:
        if start_of_train > max_date_id_global:
            break

        train_start = start_of_train
        train_end = start_of_train + train_length
        valid_start = train_end
        valid_end = train_end + valid_length

        if valid_start > max_date_id_global:
            break
        if valid_end > max_date_id_global + 1:
            valid_end = max_date_id_global + 1

        extended_start = max(train_start - retroactive_size, 0)
        required_start = extended_start
        required_end = valid_end - 1

        needed_partitions = partitions_for_range(meta, required_start, required_end)
        if len(needed_partitions) == 0:
            break

        df_current = None
        for p in needed_partitions:
            parquet_files_matched = [f for f in parquet_files if f"partition_id={p}" in f]
            for fpart in parquet_files_matched:
                print("read_parquet", fpart)
                # 存在しない列がある場合に備えてスキーマ確認
                df_schema = pl.read_parquet(fpart, n_rows=1).schema
                actual_cols = df_schema.keys()
                cols_to_read = []
                for col in original_cols:
                    if isinstance(col, tuple):
                        col_name = col[0]
                    else:
                        col_name = col
                    if col_name in actual_cols:
                        cols_to_read.append(col_name)
                
                df_part = pl.read_parquet(fpart, columns=cols_to_read)
                if df_current is None:
                    df_current = df_part
                else:
                    df_current = pl.concat([df_current, df_part], how="vertical")

        if df_current is None:
            break

        print("extended_start", extended_start)
        print("valid_end", valid_end)

        df_current = df_current.filter((pl.col("date_id") >= extended_start) & (pl.col("date_id") < valid_end))
        
        # 1) add_feature_func の実行前にメモリ解放
        gc.collect()

        # 2) 特徴量生成
        start_time = time.time()
        df_current, generated_feature, generated_preds = add_feature_func(
            df_current,
            pred_cols=pred_cols,
            input_cols=input_cols,
            original_cols=[col[0] if isinstance(col, tuple) else col for col in original_cols]
        )
        end_time = time.time()
        print(f"add_feature_func Execution time: {end_time - start_time:.4f} seconds")

        # 3) 前回と今回の generated_feature/generated_predsを比較
        # リストのリストであるため、repr()で簡易比較
        print("Check generated col diff.")
        current_gen_feature_str = repr(generated_feature)
        current_gen_preds_str   = repr(generated_preds)
        if prev_gen_feature_str is not None:
            # 差分があるかどうか判定
            if current_gen_feature_str != prev_gen_feature_str:
                raise Exception("generated_feature differs from previous sprint!")
        if prev_gen_preds_str is not None:
            if current_gen_preds_str != prev_gen_preds_str:
                raise Exception("generated_preds differs from previous sprint!")

        # 今回を前回として保存
        prev_gen_feature_str = current_gen_feature_str
        prev_gen_preds_str   = current_gen_preds_str

        # train/valid抽出
        print("Check generated col diff.")
        train_df = df_current.filter((pl.col("date_id") >= train_start) & (pl.col("date_id") < train_end))
        valid_df = df_current.filter((pl.col("date_id") >= valid_start) & (pl.col("date_id") < valid_end))

        sprint_dir = os.path.join(output_base, f"sprint{sprint_num}")
        os.makedirs(sprint_dir, exist_ok=True)

        train_path = os.path.join(sprint_dir, "train.parquet")
        valid_path = os.path.join(sprint_dir, "valid.parquet")

        print("write_parquet start.")
        gc.collect()
        start_time = time.time()
        train_df.collect().write_parquet(train_path, compression="zstd")
        #train_df.sink_parquet(train_path, compression="snappy")
        gc.collect()

        valid_df.collect().write_parquet(valid_path, compression="zstd")
        #valid_df.sink_parquet(valid_path, compression="snappy")
        end_time = time.time()
        print(f"  write_parquet Execution time: {end_time - start_time:.4f} seconds")

        print(f"Sprint {sprint_num}:")
        print(f"  Train: date_id in [{train_start}, {train_end}) -> {train_path}")
        print(f"  Valid: date_id in [{valid_start}, {valid_end}) -> {valid_path}")
        print(f"  Saved to {sprint_dir}\n")

        del df_current, train_df, valid_df
        gc.collect()

        start_of_train += train_shift
        sprint_num += 1

    print("Done.")
    gc.collect()

    return generated_feature, generated_preds

In [24]:
def add_rolling_preds(df: pl.DataFrame, pred_cols: list[str]) -> pl.DataFrame:
    """
    """
    agg_expr = []
    for c in pred_cols:
        agg_expr.append(pl.col(c).first().alias(f"{c}_prev_first"))
        agg_expr.append(pl.col(c).last().alias(f"{c}_prev_last"))
        agg_expr.append(pl.col(c).max().alias(f"{c}_prev_max"))
        agg_expr.append(pl.col(c).min().alias(f"{c}_prev_min"))
        agg_expr.append(pl.col(c).std().alias(f"{c}_prev_std"))

    return (
        df.group_by(["symbol_id", "date_id"])
        .agg(agg_expr)
        .with_columns((pl.col("date_id") + 1).alias("date_id_next"))
        .drop("date_id")
        .rename({"date_id_next": "date_id"})
    )

window_size = 7
def add_feature(
    df: pl.DataFrame, 
    pred_cols, 
    input_cols,
    original_cols
) -> (pl.DataFrame, pl.DataFrame):
    # add_rolling_preds_funcを使用してprev_day_aggs相当を計算

    df = df.lazy()
    
    df_prev = add_rolling_preds(df, pred_cols)
    df = df.join(df_prev, on=["symbol_id", "date_id"], how="left")
    df = df.sort(["symbol_id","date_id","time_id"])

    # pred_cols[0]でrolling_meanする例
    first_pred = pred_cols[0]
    rolling_col = f"{first_pred}_prev_last"

    df = df.with_columns(
        pl.col(rolling_col)
        .rolling_mean(window_size=window_size)
        .over("symbol_id")
        .alias(f"{rolling_col}_{window_size}day_mean")
    )
    return df, [], []

In [25]:
output_base = "test_datasets"
os.makedirs(output_base, exist_ok=True)

test_train_length = 2
test_valid_length = 1
test_train_shift = 1
test_retroactive_size = 1

run_walk_forward(
    input_base="/kaggle/working/test",
    output_base=output_base,
    id_col=id_col[0],
    date_id=date_id[0], 
    time_id=time_id[0],
    pred_cols=[c[0] for c in pred_cols],
    input_cols=[c[0] for c in input_cols],
    train_length=test_train_length,
    valid_length=test_valid_length,
    train_shift=test_train_shift, 
    retroactive_size=test_retroactive_size,
    add_feature_func=add_feature
)

original_cols ['symbol_id', 'date_id', 'time_id', 'responder_0', 'feature_00']
meta: partition, min date, max date [(0, 0, 2), (1, 3, 5)]
read_parquet /kaggle/working/test/partition_id=0/part-0.parquet
extended_start 0
valid_end 3
add_feature_func Execution time: 0.0003 seconds
Check generated col diff.
Check generated col diff.
write_parquet start.
  write_parquet Execution time: 0.0740 seconds
Sprint 1:
  Train: date_id in [0, 2) -> test_datasets/sprint1/train.parquet
  Valid: date_id in [2, 3) -> test_datasets/sprint1/valid.parquet
  Saved to test_datasets/sprint1

read_parquet /kaggle/working/test/partition_id=0/part-0.parquet
read_parquet /kaggle/working/test/partition_id=1/part-0.parquet
extended_start 0
valid_end 4
add_feature_func Execution time: 0.0004 seconds
Check generated col diff.
Check generated col diff.
write_parquet start.
  write_parquet Execution time: 0.0781 seconds
Sprint 2:
  Train: date_id in [1, 3) -> test_datasets/sprint2/train.parquet
  Valid: date_id in [3, 

([], [])

In [26]:
train1 = pl.read_parquet("/kaggle/working/test_datasets/sprint1/train.parquet")
train1.filter(pl.col("symbol_id") == 0)

symbol_id,date_id,time_id,responder_0,feature_00,responder_0_prev_first,responder_0_prev_last,responder_0_prev_max,responder_0_prev_min,responder_0_prev_std,responder_0_prev_last_7day_mean
i64,i64,i64,i64,f64,i64,i64,i64,i64,f64,f64
0,0,0,0,0.0,,,,,,
0,0,1,1,0.001,,,,,,
0,0,2,2,0.002,,,,,,
0,0,3,3,0.003,,,,,,
0,0,4,4,0.004,,,,,,
…,…,…,…,…,…,…,…,…,…,…
0,1,20,10020,10.02,0,24,24,0,7.359801,24.0
0,1,21,10021,10.021,0,24,24,0,7.359801,24.0
0,1,22,10022,10.022,0,24,24,0,7.359801,24.0
0,1,23,10023,10.023,0,24,24,0,7.359801,24.0


In [27]:
valid1 = pl.read_parquet("/kaggle/working/test_datasets/sprint1/valid.parquet")
valid1.filter(pl.col("symbol_id") == 0)

symbol_id,date_id,time_id,responder_0,feature_00,responder_0_prev_first,responder_0_prev_last,responder_0_prev_max,responder_0_prev_min,responder_0_prev_std,responder_0_prev_last_7day_mean
i64,i64,i64,i64,f64,i64,i64,i64,i64,f64,f64
0,2,0,20000,20.0,10000,10024,10024,10000,7.359801,1452.571429
0,2,1,20001,20.001,10000,10024,10024,10000,7.359801,2881.142857
0,2,2,20002,20.002,10000,10024,10024,10000,7.359801,4309.714286
0,2,3,20003,20.003,10000,10024,10024,10000,7.359801,5738.285714
0,2,4,20004,20.004,10000,10024,10024,10000,7.359801,7166.857143
…,…,…,…,…,…,…,…,…,…,…
0,2,20,20020,20.02,10000,10024,10024,10000,7.359801,10024.0
0,2,21,20021,20.021,10000,10024,10024,10000,7.359801,10024.0
0,2,22,20022,20.022,10000,10024,10024,10000,7.359801,10024.0
0,2,23,20023,20.023,10000,10024,10024,10000,7.359801,10024.0


In [28]:
train2 = pl.read_parquet("/kaggle/working/test_datasets/sprint2/train.parquet")
train2.filter(pl.col("symbol_id") == 0)

symbol_id,date_id,time_id,responder_0,feature_00,responder_0_prev_first,responder_0_prev_last,responder_0_prev_max,responder_0_prev_min,responder_0_prev_std,responder_0_prev_last_7day_mean
i64,i64,i64,i64,f64,i64,i64,i64,i64,f64,f64
0,1,0,10000,10.0,0,24,24,0,7.359801,
0,1,1,10001,10.001,0,24,24,0,7.359801,
0,1,2,10002,10.002,0,24,24,0,7.359801,
0,1,3,10003,10.003,0,24,24,0,7.359801,
0,1,4,10004,10.004,0,24,24,0,7.359801,
…,…,…,…,…,…,…,…,…,…,…
0,2,20,20020,20.02,10000,10024,10024,10000,7.359801,10024.0
0,2,21,20021,20.021,10000,10024,10024,10000,7.359801,10024.0
0,2,22,20022,20.022,10000,10024,10024,10000,7.359801,10024.0
0,2,23,20023,20.023,10000,10024,10024,10000,7.359801,10024.0


In [29]:
valid2 = pl.read_parquet("/kaggle/working/test_datasets/sprint2/valid.parquet")
valid2.filter(pl.col("symbol_id") == 0)

symbol_id,date_id,time_id,responder_0,feature_00,responder_0_prev_first,responder_0_prev_last,responder_0_prev_max,responder_0_prev_min,responder_0_prev_std,responder_0_prev_last_7day_mean
i64,i64,i64,i64,f64,i64,i64,i64,i64,f64,f64
0,3,0,30000,30.0,20000,20024,20024,20000,7.359801,11452.571429
0,3,1,30001,30.001,20000,20024,20024,20000,7.359801,12881.142857
0,3,2,30002,30.002,20000,20024,20024,20000,7.359801,14309.714286
0,3,3,30003,30.003,20000,20024,20024,20000,7.359801,15738.285714
0,3,4,30004,30.004,20000,20024,20024,20000,7.359801,17166.857143
…,…,…,…,…,…,…,…,…,…,…
0,3,20,30020,30.02,20000,20024,20024,20000,7.359801,20024.0
0,3,21,30021,30.021,20000,20024,20024,20000,7.359801,20024.0
0,3,22,30022,30.022,20000,20024,20024,20000,7.359801,20024.0
0,3,23,30023,30.023,20000,20024,20024,20000,7.359801,20024.0


In [30]:
train3 = pl.read_parquet("/kaggle/working/test_datasets/sprint3/train.parquet")
train3.filter(pl.col("symbol_id") == 0)

symbol_id,date_id,time_id,responder_0,feature_00,responder_0_prev_first,responder_0_prev_last,responder_0_prev_max,responder_0_prev_min,responder_0_prev_std,responder_0_prev_last_7day_mean
i64,i64,i64,i64,f64,i64,i64,i64,i64,f64,f64
0,2,0,20000,20.0,10000,10024,10024,10000,7.359801,
0,2,1,20001,20.001,10000,10024,10024,10000,7.359801,
0,2,2,20002,20.002,10000,10024,10024,10000,7.359801,
0,2,3,20003,20.003,10000,10024,10024,10000,7.359801,
0,2,4,20004,20.004,10000,10024,10024,10000,7.359801,
…,…,…,…,…,…,…,…,…,…,…
0,3,20,30020,30.02,20000,20024,20024,20000,7.359801,20024.0
0,3,21,30021,30.021,20000,20024,20024,20000,7.359801,20024.0
0,3,22,30022,30.022,20000,20024,20024,20000,7.359801,20024.0
0,3,23,30023,30.023,20000,20024,20024,20000,7.359801,20024.0


In [31]:
valid3 = pl.read_parquet("/kaggle/working/test_datasets/sprint3/valid.parquet")
valid3.filter(pl.col("symbol_id") == 0)

symbol_id,date_id,time_id,responder_0,feature_00,responder_0_prev_first,responder_0_prev_last,responder_0_prev_max,responder_0_prev_min,responder_0_prev_std,responder_0_prev_last_7day_mean
i64,i64,i64,i64,f64,i64,i64,i64,i64,f64,f64
0,4,0,40000,40.0,30000,30024,30024,30000,7.359801,21452.571429
0,4,1,40001,40.001,30000,30024,30024,30000,7.359801,22881.142857
0,4,2,40002,40.002,30000,30024,30024,30000,7.359801,24309.714286
0,4,3,40003,40.003,30000,30024,30024,30000,7.359801,25738.285714
0,4,4,40004,40.004,30000,30024,30024,30000,7.359801,27166.857143
…,…,…,…,…,…,…,…,…,…,…
0,4,20,40020,40.02,30000,30024,30024,30000,7.359801,30024.0
0,4,21,40021,40.021,30000,30024,30024,30000,7.359801,30024.0
0,4,22,40022,40.022,30000,30024,30024,30000,7.359801,30024.0
0,4,23,40023,40.023,30000,30024,30024,30000,7.359801,30024.0


In [32]:
import os
import glob
import polars as pl
import numpy as np

output_base = "datasets"
os.makedirs(output_base, exist_ok=True)

data_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
parquet_files = sorted(glob.glob(os.path.join(data_path, "partition_id=*", "*.parquet")))

# 入力変数選択（事前定義）
id_col = "symbol_id"    # ID列
date_id = "date_id"     # 日付列
time_id = "time_id"     # 時間列
pred_cols = [f"responder_{i}" for i in range(9)]  # 目的変数

# 最初のファイルからカラム一覧とdtype取得
if not parquet_files:
    raise FileNotFoundError("No parquet files found in the specified data path.")

df_test = pl.read_parquet(parquet_files[9], n_rows=1)
all_cols = df_test.columns
schema = df_test.schema  # {col_name: polars.DataType}

# 除外する列
exclude_cols = {id_col, date_id, time_id} | set(pred_cols)

# input_colsを自動的に決定
input_cols = []
for col in all_cols:
    if col not in exclude_cols:
        polars_dtype = schema[col]
        print(f"{col}, {polars_dtype}")
        input_cols.append(col)

# target_cols作成
target_cols = [id_col, date_id, time_id] + pred_cols + input_cols
print("target_cols:", target_cols)

weight, Float32
feature_00, Float32
feature_01, Float32
feature_02, Float32
feature_03, Float32
feature_04, Float32
feature_05, Float32
feature_06, Float32
feature_07, Float32
feature_08, Float32
feature_09, Int8
feature_10, Int8
feature_11, Int16
feature_12, Float32
feature_13, Float32
feature_14, Float32
feature_15, Float32
feature_16, Float32
feature_17, Float32
feature_18, Float32
feature_19, Float32
feature_20, Float32
feature_21, Float32
feature_22, Float32
feature_23, Float32
feature_24, Float32
feature_25, Float32
feature_26, Float32
feature_27, Float32
feature_28, Float32
feature_29, Float32
feature_30, Float32
feature_31, Float32
feature_32, Float32
feature_33, Float32
feature_34, Float32
feature_35, Float32
feature_36, Float32
feature_37, Float32
feature_38, Float32
feature_39, Float32
feature_40, Float32
feature_41, Float32
feature_42, Float32
feature_43, Float32
feature_44, Float32
feature_45, Float32
feature_46, Float32
feature_47, Float32
feature_48, Float32
feature_49, 

In [33]:
import polars as pl

# Globalデータ
# max_time_idを取得
max_time_id = pl.read_parquet(parquet_files[9]).select(pl.col("time_id").max()).item()
segment_size = (max_time_id + 1) // 4  # 4等分のサイズを計算

# 4分割の閾値を計算
threshold1 = segment_size
threshold2 = segment_size * 2
threshold3 = segment_size * 3

max_time_id, threshold1, threshold2, threshold3

(967, 242, 484, 726)

In [34]:
def add_prev_feature(ldf: pl.LazyFrame, cols: list[str]) -> tuple[pl.LazyFrame, list[str]]:
    """
    前日集計(_prev_*)を生成する関数 (LazyFrame版).
    
    Args:
        ldf: LazyFrame
        cols: 前日分の集計対象となる列（文字列のみ）
    Returns:
        (ldf_joined, generated_cols): 
          ldf_joined → ldfに_prev_*列をleft joinしたLazyFrame
          generated_cols → 生成された_prev_*列の名前一覧
    """
    agg_expr = []
    for c in cols:
        agg_expr.extend([
            pl.col(c).first().alias(f"{c}_prev_first"),
            pl.col(c).last().alias(f"{c}_prev_last"),
            pl.col(c).max().alias(f"{c}_prev_max"),
            pl.col(c).min().alias(f"{c}_prev_min"),
            pl.col(c).mean().alias(f"{c}_prev_mean"),
            pl.col(c).std().alias(f"{c}_prev_std")
        ])

    ldf_agg = (
        ldf
        .group_by(["symbol_id", "date_id"])
        .agg(agg_expr)
        .with_columns((pl.col("date_id") + 1).alias("date_id_next"))
        .drop("date_id")
        .rename({"date_id_next": "date_id"})
    )

    # 生成された列名(除く symbol_id, date_id)
    generated_cols = [
        c for c in ldf_agg.collect_schema().keys()
        if c not in ("symbol_id","date_id")
    ]

    # 元の ldf に left join
    ldf_joined = ldf.join(ldf_agg, on=["symbol_id","date_id"], how="left")
    return ldf_joined, generated_cols

def add_diff_n_by_id_date_time(ldf: pl.LazyFrame, n: int, cols: list[str]) -> tuple[pl.LazyFrame, list[str]]:
    """
    symbol_id, date_id, time_id でソートした時系列上で、
    n行前との差分(col - shift(n))を計算。time_id=0は前のdate_id末尾が参照対象.
    
    Args:
        ldf: LazyFrame
        n: 何行前とのdiffをとるか
        cols: 差分を計算したい列（文字列リスト）
    Returns:
        (ldf_with_diff, diff_col_names): 
          ldf_with_diff → diff列を追加したLazyFrame
          diff_col_names → 生成されたdiff列名
    """
    ldf_sorted = ldf.sort(["symbol_id","date_id","time_id"])

    diff_exprs = []
    diff_col_names = []

    for c in cols:
        diff_name = f"{c}_diff{n}"
        expr = (
            pl.col(c) - pl.col(c).shift(n)
        ).over("symbol_id").alias(diff_name)
        diff_exprs.append(expr)
        diff_col_names.append(diff_name)

    ldf_with_diff = ldf_sorted.with_columns(diff_exprs)
    return ldf_with_diff, diff_col_names

def add_id_time_id_group_feature(ldf: pl.LazyFrame, base_features: list[str]) -> tuple[pl.LazyFrame, list[str]]:
    """
    time_id を threshold1,2,3 で4分割し、
    group_first_ratio, group_expanding_mean(60)を追加.
    
    Returns:
        (ldf_with_cols, generated_cols)
    """
    ldf_sorted = ldf.sort(["symbol_id","date_id","time_id"])

    # time_id_group 列を追加
    ldf_grouped = ldf_sorted.with_columns(
        pl.when(pl.col("time_id") < threshold1).then(0)
        .when(pl.col("time_id") < threshold2).then(1)
        .when(pl.col("time_id") < threshold3).then(2)
        .otherwise(3)
        .cast(pl.Int32)
        .alias("time_id_group")
    )

    group_first_ratio_exprs = []
    group_expanding_mean_exprs = []
    generated_cols = []

    for col in base_features:
        gfr_name = f"{col}_group_first_ratio"
        gem_name = f"{col}_group_expanding_mean60"
        expr_gfr = (
            (pl.col(col).first() / pl.col(col))
            .over(['date_id','time_id_group','symbol_id'])
            .cast(pl.Float32)
            .alias(gfr_name)
        )
        expr_gem = (
            (pl.col(col).rolling_mean(60, min_periods=1) / pl.col(col))
            .over(['date_id','time_id_group','symbol_id'])
            .cast(pl.Float32)
            .alias(gem_name)
        )
        group_first_ratio_exprs.append(expr_gfr)
        group_expanding_mean_exprs.append(expr_gem)
        generated_cols.extend([gfr_name, gem_name])

    ldf_with_cols = ldf_grouped.with_columns(group_first_ratio_exprs + group_expanding_mean_exprs)
    return ldf_with_cols, generated_cols

def add_date_time_feature(ldf: pl.LazyFrame, base_features: list[str]) -> tuple[pl.LazyFrame, list[str]]:
    """
    date_id, time_id ごとの mean ratio / rank を追加
    Returns:
        (ldf_with_cols, generated_cols)
    """
    group_mean_ratio_exprs = []
    group_rank_exprs = []
    generated_cols = []

    for col in base_features:
        gmr_name = f"{col}_time_id_group_mean_ratio"
        grk_name = f"{col}_time_id_group_rank"

        expr_gmr = (
            (pl.col(col).mean() / pl.col(col))
            .over(['date_id','time_id'])
            .cast(pl.Float32)
            .alias(gmr_name)
        )
        expr_grk = (
            (pl.col(col).rank(descending=True, method='ordinal') / pl.col(col).count())
            .over(['date_id','time_id'])
            .cast(pl.Float32)
            .alias(grk_name)
        )
        group_mean_ratio_exprs.append(expr_gmr)
        group_rank_exprs.append(expr_grk)
        generated_cols.extend([gmr_name, grk_name])

    ldf_with_cols = ldf.with_columns(group_mean_ratio_exprs + group_rank_exprs)
    return ldf_with_cols, generated_cols

In [35]:
import polars as pl
from typing import List, Tuple


def add_symbol_date_lag(
    ldf: pl.LazyFrame, 
    cols: list[str],
    n: int = 1
) -> tuple[pl.LazyFrame, list[str]]:
    """
    symbol_id × date_id でgroupbyし、colsの統計量を計算して
    "date_id + n" に対応付ける -> 次の日(あるいは n日後)が参照する形でリークを防ぐ。
    
    戻り値:
      (ldf_joined, generated_cols)
        ldf_joined: ldfに統計量をleft joinしたLazyFrame
        generated_cols: 新たに生成された列名一覧
    """
    agg_expr = []
    generated_cols = []
    for c in cols:
        # 例として mean, std, min, maxあたりを計算
        agg_expr.extend([
            pl.col(c).mean().alias(f"{c}_lag{n}_mean"),
            pl.col(c).std().alias(f"{c}_lag{n}_std"),
            pl.col(c).min().alias(f"{c}_lag{n}_min"),
            pl.col(c).max().alias(f"{c}_lag{n}_max"),
        ])
        generated_cols.extend([
            f"{c}_lag{n}_mean",
            f"{c}_lag{n}_std",
            f"{c}_lag{n}_min",
            f"{c}_lag{n}_max",
        ])
    
    # groupby symbol_id, date_id
    ldf_agg = (
        ldf.group_by(["symbol_id","date_id"])
           .agg(agg_expr)
           .with_columns((pl.col("date_id") + n).alias("date_id_next"))
           .drop("date_id")
           .rename({"date_id_next":"date_id"})
    )
    # これで "n日後" の行に統計量をjoin可能

    # left join
    ldf_joined = ldf.join(ldf_agg, on=["symbol_id","date_id"], how="left")
    return ldf_joined, generated_cols

def add_symbol_timeidgroup_lag(
    ldf: pl.LazyFrame,
    cols: List[str],
    agg_funcs: List[str] = ["mean","std"],
    group_size: int = 4  # time_id_group が [0,1,2,3] の場合
) -> Tuple[pl.LazyFrame, List[str]]:
    """
    time_id_groupが 0 の行は (date_id-1, group=group_size-1) を“直前”として参照し、
    group>0 の行は (date_id, group-1) を“直前”として参照。
    これを実現するために:
      1) ldfに date_id_prev, time_id_group_prev を計算
      2) group_by(["symbol_id","date_id_prev","time_id_group_prev"]) で集計 (cols, agg_funcs)
      3) main ldf に on=["symbol_id","date_id","time_id_group"] で left join

    戻り値:
      (ldf_joined, generated_cols)
    """

    # 1) date_id_prev, time_id_group_prev を列として作る
    #    time_id_group=0 → (date_id-1, group=3)
    #    time_id_group>0 → (date_id, group=time_id_group-1)
    # LazyFrameに列を追加
    ldf_prep = ldf.with_columns([
        # date_id_prev
        pl.when(pl.col("time_id_group") == 0)
          .then(pl.col("date_id") - 1)
          .otherwise(pl.col("date_id"))
          .alias("date_id_prev"),

        # time_id_group_prev
        pl.when(pl.col("time_id_group") == 0)
          .then(group_size - 1)  # group=3 if group_size=4
          .otherwise(pl.col("time_id_group") - 1)
          .alias("time_id_group_prev")
    ])

    # 2) group_by(["symbol_id","date_id_prev","time_id_group_prev"]) で集計
    agg_exprs = []
    generated_cols = []
    for c in cols:
        for f in agg_funcs:
            alias_name = f"{c}_lag_{f}"  # 例: c_lag_mean, c_lag_std
            generated_cols.append(alias_name)
            if f=="mean":
                agg_exprs.append(pl.col(c).mean().alias(alias_name))
            elif f=="std":
                agg_exprs.append(pl.col(c).std().alias(alias_name))
            elif f=="min":
                alias_name = f"{c}_lag_min"
                generated_cols.append(alias_name)
                agg_exprs.append(pl.col(c).min().alias(alias_name))
            elif f=="max":
                alias_name = f"{c}_lag_max"
                generated_cols.append(alias_name)
                agg_exprs.append(pl.col(c).max().alias(alias_name))
            # 必要に応じて他の集約関数も追加

    ldf_agg = (
        ldf_prep.group_by(["symbol_id","date_id_prev","time_id_group_prev"])
                .agg(agg_exprs)
    )

    # 3) main ldf と join
    # main ldf のキー: (symbol_id, date_id, time_id_group)
    # ldf_agg のキー:  (symbol_id, date_id_prev, time_id_group_prev)
    # => join するには ldf_prep と rename などで合わせる
    #  あるいは on left side: "symbol_id", "date_id_prev as date_id", "time_id_group_prev as time_id_group"
    #   → ここは left sideを下準備 or right side rename
    #   ここでは ldf_agg 側を rename して "date_id_prev->date_id", "time_id_group_prev->time_id_group"
    ldf_agg_renamed = ldf_agg.rename({"date_id_prev":"date_id","time_id_group_prev":"time_id_group"})

    # now we can join on (symbol_id, date_id, time_id_group)
    ldf_joined = ldf_prep.join(
        ldf_agg_renamed,
        on=["symbol_id","date_id","time_id_group"],
        how="left"
    )

    return ldf_joined, generated_cols

# 汎用特徴量作成関数

In [36]:
import re
import polars as pl
from typing import List, Tuple

def create_stat_features_by(
    ldf: pl.LazyFrame,
    cols: List[str],
    key_by: List[str],
    agg_funcs: List[str],
) -> Tuple[pl.LazyFrame, List[str]]:
    """
    ldf を key_by で group_by し、cols に対して指定された agg_funcs を実行する汎用関数。
    例:
      agg_funcs に ["mean","std","min","max","median","sum","count","n_unique",
                    "last","first","skew","kurtosis","cv", "q0.25","q0.75" ...]
      のような文字列を指定できる。

    戻り値:
      (ldf_agg, generated_cols):
        ldf_agg : group_by(key_by).agg(...) の結果 (key_by + 各集計列 を持つ LazyFrame)
        generated_cols : 新しく生成された列名のリスト

    注意:
      - "q0.25" 等のquantile形式は "q0.x" のフォーマットで xをfloatとして解釈し expr.quantile(x)。
      - "cv" (coefficient of variation) = std/mean (mean=0に注意)
      - skew, kurtosis は Polars のバージョンによっては使えない場合あり
      - rolling_meanなどのウィンドウ関数はこのgroup_by集約とは別物

    例:
      aggregator_mapに含まれる文字列一覧を "agg_funcs" で指定
        -> "mean","std","cv","q0.25","skew"など
    """

    # Polarsでサポートする単一スカラー集約を文字列→lambda で定義
    def parse_quantile(alias: str, cexpr: pl.Expr) -> pl.Expr:
        # 例: alias == "q0.25" → 0.25
        #     alias == "q0.75" → 0.75
        m = re.match(r"q0\.(\d+)", alias)  # 例: "q0.25" -> group(1)=="25"
        if not m:
            # 不正形式ならそのまま col
            return cexpr
        float_str = "0." + m.group(1)
        q = float(float_str)
        return cexpr.quantile(q)  # quantile(0.25 / 0.75 etc.)

    aggregator_map = {
        "mean":      lambda cexpr: cexpr.mean(),
        "std":       lambda cexpr: cexpr.std(),
        "min":       lambda cexpr: cexpr.min(),
        "max":       lambda cexpr: cexpr.max(),
        "median":    lambda cexpr: cexpr.median(),
        "sum":       lambda cexpr: cexpr.sum(),
        "count":     lambda cexpr: cexpr.count(),
        "n_unique":  lambda cexpr: cexpr.n_unique(),
        "last":      lambda cexpr: cexpr.last(),
        "first":     lambda cexpr: cexpr.first(),
        "skew":      lambda cexpr: cexpr.skew(),
        "kurtosis":  lambda cexpr: cexpr.kurtosis(),
        # 変動係数 (coefficient of variation)
        "cv":        lambda cexpr: (cexpr.std() / cexpr.mean()),  # mean=0注意
    }

    agg_exprs = []
    generated_cols: List[str] = []

    for c in cols:
        for f in agg_funcs:
            alias_name = f"{c}_{'_'.join(key_by)}_{f}"

            # 1) quantile系:  "q0.25", "q0.50" etc.
            if f.startswith("q0."):
                expr = parse_quantile(f, pl.col(c)).alias(alias_name)
                agg_exprs.append(expr)
                generated_cols.append(alias_name)
                continue

            # 2) aggregator_map にあるか？
            aggregator = aggregator_map.get(f, None)
            if aggregator is not None:
                expr = aggregator(pl.col(c)).alias(alias_name)
                agg_exprs.append(expr)
                generated_cols.append(alias_name)
            else:
                raise Exception(f"create_stat_features_by: not defined {f}")

    # group_by and agg
    ldf_agg = ldf.group_by(key_by).agg(agg_exprs)

    return ldf_agg, generated_cols

# rolling/window系特徴量

In [37]:
import polars as pl
from typing import List, Tuple


def add_avg_change_and_volatility(
    ldf: pl.LazyFrame,
    cols: List[str],
    sort_keys: List[str],
    group_keys: List[str],
    n: int = 5,
    use_log_return: bool = False,
) -> Tuple[pl.LazyFrame, List[str]]:
    """
    LazyFrameに対して:
      (1) t-nとの変化率 (log/ratio) ×100
      (2) 過去n期間 rolling_std(ボラティリティ)
    をcolsの各カラムについて計算し、(ldf_out, generated_cols)を返す。

    Parameters
    ----------
    ldf : pl.LazyFrame
        処理対象のLazyFrame
    cols : List[str]
        計算対象の数値カラム (["feature_00","feature_01"]など)
    n : int
        shiftやrollingのウィンドウサイズ
    use_log_return : bool
        Trueならlogリターンで計算
    sort_keys : List[str], optional
        時系列順を保証するためにソートするカラムのリスト
        例: ["symbol_id","date_id","time_id"]
    group_keys : List[str], optional
        shiftやrollingを「どのカラム単位でパーティション切る」か
        例: ["symbol_id"]

    Returns
    -------
    (ldf_out, generated_cols):
      ldf_out       : 新しい列を追加したLazyFrame
      generated_cols: 作成された列の名前一覧

    Note
    ----
    - sort_keys が指定されれば ldf.sort(by=sort_keys) で並べ替えてから計算
    - group_keys が指定されれば shift(n).over(group_keys), rolling_std(...).over(group_keys)
      → 銘柄単位、などで独立計算が可能
    """

    # 1) ソート
    if sort_keys:
        ldf = ldf.sort(by=sort_keys)

    # group_keys が空でなければ partition 単位で over() を使う
    def shift_n_expr(col: str, periods: int):
        if group_keys:
            return pl.col(col).shift(periods).over(group_keys)
        else:
            return pl.col(col).shift(periods)

    def rolling_std_expr(col: str, window_size: int):
        if group_keys:
            return pl.col(col).rolling_std(window_size=window_size).over(group_keys)
        else:
            return pl.col(col).rolling_std(window_size=window_size)

    generated_cols: List[str] = []
    tmp_cols: List[str] = []

    # (A) まず、一時列(1期間リターン)を作成
    exprs_step1 = []
    for col_name in cols:
        tmp_return_col = f"__tmp_return_{col_name}_{n}_{'log' if use_log_return else 'ratio'}"
        tmp_cols.append(tmp_return_col)

        # 1期前の log 差 or ratio
        if use_log_return:
            # (log(col) - log(col.shift(1))) * 100
            exprs_step1.append(
                (
                    (pl.col(col_name).log() - shift_n_expr(col_name, 1).log()) * 100
                ).alias(tmp_return_col)
            )
        else:
            # ((col / col.shift(1)) - 1) * 100
            exprs_step1.append(
                (
                    ((pl.col(col_name) / shift_n_expr(col_name, 1)) - 1) * 100
                ).alias(tmp_return_col)
            )

    # 一時列を先に追加
    ldf = ldf.with_columns(exprs_step1)

    # (B) 追加で (a) t-n との変化率, (b) rolling_std を計算
    exprs_step2 = []
    for col_name in cols:
        avg_change_col = f"{col_name}_avg_change_{n}"
        vol_col = f"{col_name}_volatility_{n}"
        generated_cols.extend([avg_change_col, vol_col])

        tmp_return_col = f"__tmp_return_{col_name}_{n}_{'log' if use_log_return else 'ratio'}"

        # (a) t-n との変化率
        if use_log_return:
            # log リターン: (log(col) - log(col.shift(n))) * 100
            exprs_step2.append(
                (
                    (pl.col(col_name).log() - shift_n_expr(col_name, n).log()) * 100
                ).alias(avg_change_col)
            )
        else:
            # ratio リターン: ((col / col.shift(n)) - 1) * 100
            exprs_step2.append(
                (
                    ((pl.col(col_name) / shift_n_expr(col_name, n)) - 1) * 100
                ).alias(avg_change_col)
            )

        # (b) rolling_std (過去 n 期間)
        exprs_step2.append(
            rolling_std_expr(tmp_return_col, n).alias(vol_col)
        )

    # 二段階目の計算を実行し、一時列を drop
    ldf_out = (
        ldf
        .with_columns(exprs_step2)
        .drop(tmp_cols)
    )

    return ldf_out, generated_cols

# 特徴量処理

In [38]:
train_length = 180
valid_length = 30
train_shift = 120
retroactive_size = 60 # rollingやwindowの最大lags数

## Fature Today Rolling 処理

In [39]:
def feature_today_rolling_func(
    df: pl.DataFrame,
    pred_cols: List[str],
    input_cols: List[str],
    original_cols: List[str] | None = None
) -> Tuple[pl.DataFrame, List[List[str]], List[str]]:
    """
    featureに関する特徴量を計算
    """
    generated_features: List[str] = []
    generated_preds: List[str] = []

    # Lazy化
    ldf = df.lazy()

    ldf.drop(pred_cols)

    #  sort
    ldf = ldf.sort(["symbol_id", "date_id", "time_id"])

    # 当日特徴量
    ## rolling特徴量
    ldf, generated_cols = add_avg_change_and_volatility(ldf, input_cols, n = 4, group_keys=["symbol_id"], sort_keys=["date_id", "time_id"])
    generated_features.extend(generated_cols)

    ldf, generated_cols = add_avg_change_and_volatility(ldf, input_cols, n = 28, group_keys=["symbol_id"], sort_keys=["date_id", "time_id"])
    generated_features.extend(generated_cols)

    # drop
    ldf.drop(input_cols)

    return ldf, generated_features, generated_preds

In [None]:
output_base = "feature_today_rolling_datasets"
os.makedirs(output_base, exist_ok=True)

generated_feature_lag, _ = run_walk_forward(
    input_base=data_path,
    output_base=output_base,
    id_col=id_col,
    date_id=date_id,
    time_id=time_id,
    pred_cols=pred_cols,
    input_cols=input_cols,
    train_length=train_length,
    valid_length=valid_length,
    train_shift=train_shift, 
    retroactive_size=retroactive_size,
    add_feature_func=feature_today_rolling_func
)

original_cols ['symbol_id', 'date_id', 'time_id', 'responder_0', 'responder_1', 'responder_2', 'responder_3', 'responder_4', 'responder_5', 'responder_6', 'responder_7', 'responder_8', 'weight', 'feature_00', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'featur

## Feature Today Stat処理

In [None]:
def feature_today_stat_func(
    df: pl.DataFrame,
    pred_cols: List[str],
    input_cols: List[str],
    original_cols: List[str] | None = None
) -> Tuple[pl.DataFrame, List[List[str]], List[str]]:
    """
    featureに関する特徴量を計算
    """
    generated_features: List[str] = []
    generated_preds: List[str] = []

    # Lazy化
    ldf = df.lazy()

    #  sort
    ldf = ldf.sort(["symbol_id", "date_id", "time_id"])
    
    # date_id, time_id単位
    df_datetime, datetime_feature_cols = create_stat_features_by(ldf=ldf, cols=input_cols, key_by=["date_id", "time_id"], agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(datetime_feature_cols)
    
    df_datetime, datetime_feature_cols = add_avg_change_and_volatility(df_datetime, datetime_feature_cols, n = 7, group_keys=["date_id"], sort_keys=["time_id"])
    generated_features.extend(datetime_feature_cols)
    
    return df_datetime, generated_features, generated_preds

In [None]:
output_base = "feature_today_stat_datasets"
os.makedirs(output_base, exist_ok=True)

generated_feature_lag, _ = run_walk_forward(
    input_base=data_path,
    output_base=output_base,
    id_col=id_col,
    date_id=date_id,
    time_id=time_id,
    pred_cols=pred_cols,
    input_cols=input_cols,
    train_length=train_length,
    valid_length=valid_length,
    train_shift=train_shift, 
    retroactive_size=retroactive_size,
    add_feature_func=feature_today_stat_func
)

## Responder処理

In [None]:
import polars as pl
from typing import List, Tuple

def create_raw_responder_lag(
    df: pl.DataFrame,
    pred_cols: List[str]
) -> Tuple[pl.LazyFrame, List[str]]:
    """
    1) 'symbol_id', 'date_id', 'w' と pred_cols の列のみ取り出す
    2) w * responder_6 の列を新たに作る ("w_responder_6")
    3) w を削除
    4) date_id を (date_id - 1) シフト
    5) pred_cols + ["w_responder_6"] を "_lag_1" にリネーム
    戻り値:
      (lazy_frame, generated_cols):
        lazy_frame : 上記処理結果をLazyFrameに変換したもの
        generated_cols : リネーム後の列名(例: "responder_6_lag_1", "w_responder_6_lag_1"など)
    """

    # 1) 必要列のみ抽出 (Eager DataFrame)
    target_cols = ["symbol_id", "date_id", "time_id_group", "weight"] + pred_cols
    df_lags = df.select(target_cols)

    # 2) w * responder を作成
    w_responder_cols = []
    for c in pred_cols:
        w_c = f"w_{c}"
        w_responder_cols.append(w_c)
        df_lags = df_lags.with_columns(
            (pl.col("weight") * pl.col(c)).alias(w_c)
        )
    new_pred_cols = pred_cols + w_responder_cols
    
    # 3) wを削除
    df_lags = df_lags.drop(["weight"])

    # 4) date_id = date_id + 1
    df_lags = df_lags.with_columns(
        (pl.col("date_id") + 1).alias("date_id")
    )

    # 5) 上記 new_pred_cols を "colName_lag_1" にリネーム
    rename_map = {}
    for c in new_pred_cols:
        rename_map[c] = f"{c}_lag_1"

    df_lags = df_lags.rename(rename_map)

    # リネーム後の列名一覧
    generated_cols = list(rename_map.values())
    
    keys = ["symbol_id", "date_id", "time_id_group"]
    return df_lags, generated_cols, keys

In [None]:
def create_responder_lag_feature(ldf, cols):
    generated_features = []
    df_raw_lags, lags_cols, raw_lags_keys = create_raw_responder_lag(ldf, cols)

    # symbol_id, date_id, time_id_group 単位
    df_lags, gen_cols_symdate_group = create_stat_features_by(ldf=df_raw_lags, cols=lags_cols, key_by=["symbol_id", "date_id", "time_id_group"], agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(gen_cols_symdate_group)

    ## rolling特徴量
    ### 4 time_id_groupまで見る
    df_lags, generated_cols = add_avg_change_and_volatility(df_lags, gen_cols_symdate_group, n = 4, group_keys=["symbol_id"], sort_keys=["date_id", "time_id_group"])
    generated_features.extend(generated_cols)

    ### 4 * 7 time_id_groupまで見る
    df_lags, generated_cols = add_avg_change_and_volatility(df_lags, gen_cols_symdate_group, n = 28, group_keys=["symbol_id"], sort_keys=["date_id", "time_id_group"])
    generated_features.extend(generated_cols)

    # date_id単位でラグ列を集計
    df_lags_dateid, gen_cols_date = create_stat_features_by(ldf=df_raw_lags, cols=lags_cols, key_by=["date_id"], agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(gen_cols_date)

    df_lags_dateid, generated_cols = add_avg_change_and_volatility(df_lags_dateid, gen_cols_date, n = 7, group_keys=["date_id"], sort_keys=[])
    generated_features.extend(generated_cols)
    
    df_lags = df_lags.join(df_lags_dateid, on=["date_id"], how="left")

    # symbol_id, date_id単位
    df_lags_symdate, gen_cols_symdate = create_stat_features_by(ldf=df_raw_lags, cols=lags_cols,key_by=["symbol_id", "date_id"],agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(gen_cols_symdate)
    
    df_lags_symdate, generated_cols = add_avg_change_and_volatility(df_lags_symdate, gen_cols_symdate, n = 7, group_keys=["symbol_id"], sort_keys=["date_id"])
    generated_features.extend(generated_cols)

    df_lags = df_lags.join(df_lags_symdate, on=["symbol_id", "date_id"], how="left")

    # date_id, time_group_id単位
    df_lags_datetime, gen_cols_datetime = create_stat_features_by(ldf=df_raw_lags, cols=lags_cols, key_by=["date_id", "time_id_group"],agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(gen_cols_datetime)
   
    df_lags_datetime, generated_cols = add_avg_change_and_volatility(df_lags_datetime, gen_cols_datetime, n = 7, group_keys=["date_id"], sort_keys=["time_id_group"])
    generated_features.extend(generated_cols)
    
    df_lags = df_lags.join(df_lags_datetime, on=["date_id","time_id_group"], how="left")

    return df_lags, generated_features

In [None]:
def responder_lag_feature_func(
    df: pl.DataFrame,
    pred_cols: List[str],
    input_cols: List[str],
    original_cols: List[str] | None = None
) -> Tuple[pl.DataFrame, List[List[str]], List[str]]:
    ## [lags.parquetの扱い]
    ## 当日と前日のtiem_idが異なる可能性なので、単純なdate_id, time_idのlagは有用ではない
    ## また、前日の全てのlagは当日のtime_id == 0のタイミングで提供される
    ## すなわち、lagsが提供された場合で統計量の処理を行い、date_id - 1のresponderの統計量として扱う。
    
    generated_features: List[str] = []
    generated_preds: List[str] = []

    # Lazy化
    ldf = df.lazy()

    # 1) sort
    ldf = ldf.sort(["symbol_id", "date_id", "time_id"])

    # 2) time_id_group (4分割)
    ldf = ldf.with_columns(
        pl.when(pl.col("time_id") < threshold1).then(0)
          .when(pl.col("time_id") < threshold2).then(1)
          .when(pl.col("time_id") < threshold3).then(2)
          .otherwise(3)
          .cast(pl.Int32)
          .alias("time_id_group")
    )
    generated_features.extend(["time_id_group"])

    # responder-lag特徴量
    ldf, cols = create_responder_lag_feature(ldf, pred_cols)
    generated_features.extend(cols)

    return result_df, generated_features, generated_preds

In [None]:
output_base = "responder_lag_datasets"
os.makedirs(output_base, exist_ok=True)

responder_lag_features, generated_preds = run_walk_forward(
    input_base=data_path,
    output_base=output_base,
    id_col=id_col,
    date_id=date_id,
    time_id=time_id,
    pred_cols=pred_cols,
    input_cols=input_cols,
    train_length=train_length,
    valid_length=valid_length,
    train_shift=train_shift, 
    retroactive_size=retroactive_size,
    add_feature_func=responder_lag_feature_func
)

In [None]:
responder_lag_features

## Feature(LAG)処理

In [None]:
import polars as pl
from typing import List, Tuple

def create_raw_feature_lag(
    df: pl.DataFrame,
    pred_cols: List[str]
) -> Tuple[pl.LazyFrame, List[str]]:
    target_cols = ["symbol_id", "date_id", "time_id_group"] + pred_cols
    df_lags = df.select(target_cols)

    # 4) date_id = date_id + 1
    df_lags = df_lags.with_columns(
        (pl.col("date_id") + 1).alias("date_id")
    )
    rename_map = {}
    for c in pred_cols:
        rename_map[c] = f"{c}_lag_1"

    df_lags = df_lags.rename(rename_map)

    # リネーム後の列名一覧
    generated_cols = list(rename_map.values())
    
    keys = ["symbol_id", "date_id", "time_id_group"]
    return df_lags, generated_cols, keys

def create_feature_lag(ldf, cols):
    generated_features = []
    df_raw_lags, lags_cols, raw_lags_keys = create_raw_feature_lag(ldf, cols)

    # symbol_id, date_id, time_id_group 単位
    df_lags, gen_cols_symdate_group = create_stat_features_by(ldf=df_raw_lags, cols=lags_cols, key_by=["symbol_id", "date_id", "time_id_group"], agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(gen_cols_symdate_group)

    ## rolling特徴量
    ### 4 time_id_groupまで見る
    df_lags, generated_cols = add_avg_change_and_volatility(df_lags, gen_cols_symdate_group, n = 4, group_keys=["symbol_id"], sort_keys=["date_id", "time_id_group"])
    generated_features.extend(generated_cols)

    ### 4 * 7 time_id_groupまで見る
    df_lags, generated_cols = add_avg_change_and_volatility(df_lags, gen_cols_symdate_group, n = 28, group_keys=["symbol_id"], sort_keys=["date_id", "time_id_group"])
    generated_features.extend(generated_cols)

    # date_id単位でラグ列を集計
    df_lags_dateid, gen_cols_date = create_stat_features_by(ldf=df_raw_lags, cols=lags_cols, key_by=["date_id"], agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(gen_cols_date)

    df_lags_dateid, generated_cols = add_avg_change_and_volatility(df_lags_dateid, gen_cols_date, n = 7, group_keys=["date_id"], sort_keys=[])
    generated_features.extend(generated_cols)
    
    df_lags = df_lags.join(df_lags_dateid, on=["date_id"], how="left")

    # symbol_id, date_id単位
    df_lags_symdate, gen_cols_symdate = create_stat_features_by(ldf=df_raw_lags, cols=lags_cols,key_by=["symbol_id", "date_id"],agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(gen_cols_symdate)
    
    df_lags_symdate, generated_cols = add_avg_change_and_volatility(df_lags_symdate, gen_cols_symdate, n = 7, group_keys=["symbol_id"], sort_keys=["date_id"])
    generated_features.extend(generated_cols)

    df_lags = df_lags.join(df_lags_symdate, on=["symbol_id", "date_id"], how="left")

    # date_id, time_group_id単位
    df_lags_datetime, gen_cols_datetime = create_stat_features_by(ldf=df_raw_lags, cols=lags_cols, key_by=["date_id", "time_id_group"],agg_funcs=["mean", "std", "skew", "kurtosis", "cv", "last"])
    generated_features.extend(gen_cols_datetime)
   
    df_lags_datetime, generated_cols = add_avg_change_and_volatility(df_lags_datetime, gen_cols_datetime, n = 7, group_keys=["date_id"], sort_keys=["time_id_group"])
    generated_features.extend(generated_cols)
    
    df_lags = df_lags.join(df_lags_datetime, on=["date_id","time_id_group"], how="left")

    return df_lags, generated_features

In [None]:
def feature_lag_func(
    df: pl.DataFrame,
    pred_cols: List[str],
    input_cols: List[str],
    original_cols: List[str] | None = None
) -> Tuple[pl.DataFrame, List[List[str]], List[str]]:
    """
    featureに関する特徴量を計算
    """
    generated_features: List[str] = []
    generated_preds: List[str] = []

    # Lazy化
    ldf = df.lazy()

    # 1) sort
    ldf = ldf.sort(["symbol_id", "date_id", "time_id"])

    # 2) time_id_group (4分割)
    ldf = ldf.with_columns(
        pl.when(pl.col("time_id") < threshold1).then(0)
          .when(pl.col("time_id") < threshold2).then(1)
          .when(pl.col("time_id") < threshold3).then(2)
          .otherwise(3)
          .cast(pl.Int32)
          .alias("time_id_group")
    )
    generated_features.extend(["time_id_group"])

    #前日特徴量(symbol_id, date_id -1, time_group_id)
    ldf, generated_cols = create_feature_lag(ldf, input_cols)
    generated_features.extend(generated_cols)

    return result_df, generated_features, generated_preds

In [None]:
output_base = "feature_lag_datasets"
os.makedirs(output_base, exist_ok=True)

generated_feature_lag, _ = run_walk_forward(
    input_base=data_path,
    output_base=output_base,
    id_col=id_col,
    date_id=date_id,
    time_id=time_id,
    pred_cols=pred_cols,
    input_cols=input_cols,
    train_length=train_length,
    valid_length=valid_length,
    train_shift=train_shift, 
    retroactive_size=retroactive_size,
    add_feature_func=feature_lag_func
)

In [None]:
generated_feature_lag