In [2]:
from hydra import compose, initialize
import polars as pl
from sklearn.metrics import mean_absolute_error
from src.processing import feature_engineering, preprocessing
from run.train import train_cv_for_ensemble

# Compose the `train` Hydra config found under ../run/conf, then override the
# number of CV splits to 3 for this notebook (presumably to keep runs short).
with initialize(version_base=None, config_path="../run/conf"):
    cfg = compose(config_name="train")
cfg.cv.n_splits = 3

# Read the training CSV and pass it straight through the project's
# `preprocessing` step; show the resulting shape and the first rows.
df = preprocessing(pl.read_csv("../data/input/train.csv"))
print(df.shape)
df.head()

(5237892, 17)


stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
u16,u16,u16,f32,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u32,str
0,0,0,3180600.0,1,0.999812,13380277.0,,,0.999812,60651.5,1.000026,8493.030273,1.0,-3.029704,0,"""0_0_0"""
1,0,0,166603.90625,-1,0.999896,1642200.0,,,0.999896,3233.040039,1.00066,20605.089844,1.0,-5.519986,0,"""0_0_1"""
2,0,0,302879.875,-1,0.999561,1819368.0,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,"""0_0_2"""
3,0,0,11917682.0,-1,1.000171,18389746.0,,,0.999999,2324.899902,1.000214,479032.40625,1.0,-4.010201,0,"""0_0_3"""
4,0,0,447549.96875,-1,0.999532,17860614.0,,,0.999394,16485.539062,1.000016,434.100006,1.0,-7.349849,0,"""0_0_4"""


`stock_wap_t60`を目的変数として学習し、予測結果から計算で`target`を求める方法を試す。  
`target`より、`stock_wap_t60`の方がより直接的で予測しやすいのでは？という仮説を試す。

In [3]:
# Label: the WAP found 6 rows further down within the same (stock_id, date_id)
# group. NOTE(review): the "_t60" suffix suggests 6 rows == 60 seconds, which
# assumes 10-second buckets in chronological row order -- confirm.
stock_wap_t60_expr = (
    pl.col("wap").shift(-6).over("stock_id", "date_id").alias("stock_wap_t60")
)
df = df.with_columns(stock_wap_t60_expr)
# Rows with no future WAP (at least the last 6 of every group) cannot be
# used as training samples, so discard them.
df = df.drop_nulls(subset=["stock_wap_t60"])

# Build features (keeping stock_id) and fix the row order; the predictions
# returned below are later attached to `df` positionally.
df = feature_engineering(df, maintain_stock_id=True).sort("stock_id", "time_id")

# Both the original `target` and the new label are excluded from the features.
y = df.get_column("stock_wap_t60")
X = df.drop(["target", "stock_wap_t60"])

# TODO: `train_cv_for_ensemble` has since been removed, so this call needs to be rewritten.
_, _, output = train_cv_for_ensemble(cfg, X, y, model_names=["lgb"])

  0%|          | 0/3 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 0.000722449
lgb fold 0 score: 0.0007224488757729102


 33%|███▎      | 1/3 [01:03<02:06, 63.04s/it]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 0.000724886
lgb fold 1 score: 0.0007248863309102523


 67%|██████▋   | 2/3 [02:06<01:03, 63.15s/it]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[499]	valid_0's l1: 0.000703659
lgb fold 2 score: 0.0007036591726609144


100%|██████████| 3/3 [03:09<00:00, 63.04s/it]

ensemble CV score: 0.0007169981264480256
best iters: {'lgb': 499}





`target`に換算して評価しても（次のセル参照）、スコアは上がらない。つまり、別に`target`を直接予測することにそれほど問題はないことになる。  
`target`はやや複雑な計算式で導かれるので、単純な`stock_wap_t60`の方が予測しやすいかと思ったが、そうではなかった。

In [4]:
# Derive `pred_target` from the predicted future stock WAP (`pred_stock_wap_t60`).
# `output` is attached positionally, so it must be in the same row order as `df`.
df = df.with_columns(pl.Series(output).alias("pred_stock_wap_t60"))

# Keep the cell re-runnable: if it has already been executed, `df` still holds
# the previously joined index columns. Joining again would make polars append
# "_right"-suffixed duplicates (including a copy of the ground-truth
# `index_wap_t60`, which would then leak into later feature sets) and a third
# run would fail on the duplicate name. Drop the old columns first.
stale_cols = [c for c in ("pred_index_wap_t60", "index_wap_t60") if c in df.columns]
if stale_cols:
    df = df.drop(stale_cols)

# Weighted sum over all rows sharing a time_id, for both the predicted and the
# actual future stock WAP.
# NOTE(review): `weight` and `index_wap` are not created in this notebook;
# presumably they come from `feature_engineering` -- confirm.
index_wap_df = df.group_by("time_id").agg(
    (pl.col("pred_stock_wap_t60") * pl.col("weight")).sum().alias("pred_index_wap_t60"),
    (pl.col("stock_wap_t60") * pl.col("weight")).sum().alias("index_wap_t60"),
)
df = df.join(index_wap_df, on="time_id").sort("stock_id", "time_id")

# pred_target = (predicted stock WAP ratio - predicted index WAP ratio) * 10000
df = df.with_columns(
    (
        (
            pl.col("pred_stock_wap_t60") / pl.col("wap")
            - pl.col("pred_index_wap_t60") / pl.col("index_wap")
        )
        * 10000
    ).alias("pred_target")
)

print(f'MAE: {mean_absolute_error(df["target"], df["pred_target"])}')

MAE: 6.393566097605141


`stock_wap_t60`に対する予測結果を特徴量として、今度は`target`に対する学習を行う。  
いわゆる蒸留作業（予測値を特徴量として再利用するので、実質的にはスタッキングに近い）をやってみる。
- 結果
  - 精度は上がらず。このアプローチは中止し、普通に`target`を目的変数とする方針に戻す。

In [6]:
# Second-stage training on the original `target`. Only the target itself and
# the two columns built from the true future WAP are removed from the
# features; the first-stage prediction columns stay in as inputs.
y_distillated = df.get_column("target")
X_distillated = df.drop(["target", "stock_wap_t60", "index_wap_t60"])
# The trailing semicolon suppresses the notebook's display of the return value.
train_cv_for_ensemble(cfg, X_distillated, y_distillated, model_names=["lgb"]);

  0%|          | 0/3 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 6.36464
lgb fold 0 score: 6.364635592344532


 33%|███▎      | 1/3 [01:04<02:09, 64.68s/it]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 6.34843
lgb fold 1 score: 6.348427552225072


 67%|██████▋   | 2/3 [02:11<01:05, 65.85s/it]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 6.1993
lgb fold 2 score: 6.19930025531353


100%|██████████| 3/3 [03:20<00:00, 66.98s/it]

ensemble CV score: 6.304121133294378
best iters: {'lgb': 500}



