In [1]:
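# NOTE: The batches were shuffled, so the NN predictions are not in the same row order as the original dataframe; any ensemble that includes them therefore gives wrong results.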

In [2]:
import polars as pl
from typing import Tuple
# Read only the two key columns and the label; no other column is requested here.
train_eng = pl.read_parquet(
    "data/train_eng.parquet",
    columns=["date_id", "stock_id", "target"],
)

def split_by_date(df: pl.DataFrame, dates: Tuple[int, int]) -> pl.DataFrame:
    """Keep the rows of ``df`` whose ``date_id`` falls inside ``dates``.

    Args:
        df: Frame containing a ``date_id`` column.
        dates: ``(first, last)`` pair of ``date_id`` values; both bounds are inclusive.

    Returns:
        The rows of ``df`` with ``first <= date_id <= last``.
    """
    first_day = dates[0]
    last_day = dates[1]
    date_id = pl.col("date_id")
    in_window = (date_id >= first_day) & (date_id <= last_day)
    return df.filter(in_window)
# Hold-out window: date_id 401 through 480, both ends included by split_by_date.
dates_test = (401, 480)
# The row order of this frame matters: a later cell attaches these rows to the
# model predictions side by side, without joining on any key.
test_data = split_by_date(df=train_eng, dates=dates_test)
test_data

date_id,stock_id,target
u16,u8,f32
401,0,-1.32978
401,1,0.150204
401,2,2.750158
401,3,5.559921
401,4,-9.959936
…,…,…
480,195,2.310276
480,196,-8.220077
480,197,1.169443
480,198,-1.540184


In [3]:
import pandas as pd
models = ["lgb", "xgb", "catboost", "cnn", "gru", "lstm"]

# One "target" Series of predictions per model, in the same order as `models`.
dfs = [
    pd.read_parquet(f"./output/{model}_predictions.parquet")["target"]
    for model in models
]

# Put the Series side by side, one column per model name. pandas aligns them on
# their index, so this assumes every file stores its rows in the same order
# -- TODO confirm for the NN models (cnn / gru / lstm).
df = pd.concat(dfs, axis=1, keys=models)

In [4]:
# Preview the combined predictions: one column per entry in `models`.
df

Unnamed: 0,lgb,xgb,catboost,cnn,gru,lstm
0,1.053054,1.108541,1.122630,-6.911349,6.508505,-1.642773
1,-5.334840,-4.092080,-5.519453,1.779648,-2.046038,1.413247
2,1.804386,1.892154,1.688945,-1.375195,-1.917746,-0.160318
3,0.127272,-0.159102,-0.025283,0.792497,0.385002,0.087721
4,-3.650655,-3.687573,-3.586555,-1.622277,0.634913,0.011543
...,...,...,...,...,...,...
879995,-2.224638,-2.148515,-1.749194,-0.784402,-0.304411,-1.676512
879996,-0.904131,-1.430885,-0.986348,0.280511,-0.685694,-0.980154
879997,0.186629,0.265843,-0.050948,0.318113,-0.584958,1.479759
879998,1.512423,1.640055,1.708537,-0.519688,-0.636625,1.236915


In [5]:
# Weighted blend of the three gradient-boosting models (0.1 + 0.6 + 0.3 = 1.0).
# The terms are accumulated in a fixed left-to-right order: catboost, lgb, xgb.
blend = df["catboost"] * 0.1
blend = blend + df["lgb"] * 0.6
blend = blend + df["xgb"] * 0.3
df["ensemble"] = blend
# Alternative blends kept for reference; swap one in for the assignment above to score it.
# df["ensemble"] = df["cnn"] * 0.3 + df["lstm"] * 0.3 + df["gru"] * 0.4
# df["ensemble"] = df["cnn"]

In [6]:
# Attach the hold-out keys and labels (date_id, stock_id, target) to the predictions.
labels = test_data.to_pandas()

# The pairing is purely positional, so a row-count mismatch means predictions and
# labels cannot be matched up; fail loudly instead of letting concat pad with NaN.
if len(labels) != len(df):
    raise ValueError(
        f"row count mismatch: {len(df)} prediction rows vs {len(labels)} label rows"
    )

# Drop label columns left over from an earlier run of this cell, so that re-running
# it does not append duplicate date_id / stock_id / target columns.
predictions = df.drop(columns=labels.columns, errors="ignore")

# Give the labels the predictions' index so rows are paired one-to-one by position.
# NOTE(review): this is only meaningful if both tables are in the same row order
# -- confirm, especially for the NN prediction files.
labels.index = predictions.index
df = pd.concat([predictions, labels], axis=1)

In [7]:
from sklearn.metrics import mean_absolute_error

# scikit-learn's signature is mean_absolute_error(y_true, y_pred): labels go first.
ensemble_mae = mean_absolute_error(df["target"], df["ensemble"])
print(f"{ensemble_mae:.5f}")

5.83871


In [8]:
import json
# Load the weights inside a context manager so the file handle is always closed.
with open("data/weight.json") as weight_file:
    weight = json.load(weight_file)
# Key the first 200 entries by their position so they can be looked up by stock_id.
# Presumably entry i of the JSON belongs to stock_id i -- verify against the file.
weight = dict(zip(range(200), weight))

df["stock_weights"] = df["stock_id"].map(weight)

# Weighted mean of the ensemble prediction, taken over every row of df at once.
weighted_mean = (df["ensemble"] * df["stock_weights"]).sum() / df["stock_weights"].sum()

# Shift every prediction by that single constant.
# NOTE(review): the offset is one global value, not one per group, and the recorded
# MAE is 5.83871 both before and after this adjustment. If a per-time-step
# adjustment was intended, it needs a groupby on the time key -- confirm.
df["ensemble_diff"] = df["ensemble"] - weighted_mean

In [9]:
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): labels go first.
adjusted_mae = mean_absolute_error(df["target"], df["ensemble_diff"])
print(f"{adjusted_mae:.5f}")

5.83871
