In [9]:
import gc
import json
import os
from typing import Tuple, List, Dict

import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [10]:
# (first, last) `date_id` bounds of the training split; split_by_date treats
# both ends as inclusive.
dates_train = (0, 400)
# (first, last) `date_id` bounds of the hold-out split, also inclusive. This
# split is used both as the XGBoost eval_set and for the reported MAE.
dates_test = (401, 480)
# Number of models to train per model family; the training loop reads the
# "xgb" entry and seeds model i with 42 + i.
num_models = {"xgb": 1}
# Directory where fitted models are saved and, if already present, reloaded
# from instead of being retrained.
models_path = "./models"

In [11]:
# @title params dashboard
# Keyword arguments unpacked into XGBRegressor(**params) by the training cell.
# The training loop also writes params["random_state"] before building each
# model, so this dict is mutated after this cell runs.
# The value lists in the trailing comments look like alternative settings that
# were tried earlier -- TODO confirm.
params = {
    "eta": 0.005,  # 0.005,0.05
    "max_depth": 14,  # 14
    "n_estimators": 5000,
    "max_leaves": 1023,  # 511,31,1023
    "objective": "reg:absoluteerror",
    "subsample": 0.2,
    "colsample_bytree": 0.3,
    "nthread": 6,
    # The feature frames cast `categorical_features` to pandas "category" dtype
    # before fitting, which is what this flag is paired with.
    "enable_categorical": True,
    "eval_metric": "mae",
    # Early stopping is evaluated on the eval_set passed to fit() in the
    # training cell.
    "early_stopping_rounds": 250,
    # NOTE(review): presumably requires a GPU-enabled XGBoost build -- confirm
    # before running on a CPU-only machine.
    "device": "gpu",
    # 'reg_alpha'         : 0.1,
    # 'reg_lambda'        : 4,
}

In [26]:
def split_by_date(df: pl.DataFrame, dates: Tuple[int, int]) -> pl.DataFrame:
    """Return the rows of ``df`` whose ``date_id`` falls inside ``dates``.

    Args:
        df: Frame containing a ``date_id`` column.
        dates: ``(first, last)`` bounds; both ends are inclusive.

    Returns:
        The filtered frame.
    """
    first_day, last_day = dates[0], dates[1]
    date_id = pl.col("date_id")
    inside_window = (date_id >= first_day) & (date_id <= last_day)
    return df.filter(inside_window)


def make_predictions(models, X_test, model="nn"):
    """Average the predictions of an ensemble of same-family models.

    Args:
        models: Iterable of fitted models, all belonging to the family named
            by ``model``.
        X_test: Feature matrix, forwarded unchanged to each model's
            ``predict`` method.
        model: Family of the models, one of ``"nn"``, ``"lgb"``, ``"xgb"`` or
            ``"cat"``. It selects the extra arguments passed to ``predict``.

    Returns:
        The mean over models (``np.mean(..., axis=0)``) of the per-model
        predictions.

    Raises:
        ValueError: If ``model`` is not one of the supported family names.
    """
    # The comprehension variable is named `estimator` so that it no longer
    # shadows the `model` argument that selects the branch.
    if model == "nn":
        all_predictions = [
            estimator.predict(X_test, batch_size=16384) for estimator in models
        ]
    elif model == "lgb":
        all_predictions = [
            estimator.predict(X_test, num_iteration=estimator.best_iteration)
            for estimator in models
        ]
    elif model == "xgb":
        # Predict with the trees up to and including the best iteration.
        all_predictions = [
            estimator.predict(
                X_test,
                iteration_range=(0, estimator.get_booster().best_iteration + 1),
            )
            for estimator in models
        ]
    elif model == "cat":
        all_predictions = [estimator.predict(X_test) for estimator in models]
    else:
        # Previously an unknown name fell through every branch and failed
        # later with an UnboundLocalError on `all_predictions`.
        raise ValueError(
            f"Unsupported model type {model!r}; "
            "expected one of 'nn', 'lgb', 'xgb', 'cat'"
        )
    return np.mean(all_predictions, axis=0)

In [14]:
# Columns that are never used as model inputs.
excluded_columns = ["row_id", "date_id", "time_id", "target", "stock_return"]

train_eng = pl.read_parquet("data/train_eng.parquet")

# Despite the `lgb_` prefix, these are the feature columns fed to XGBoost:
# every column of the engineered frame that is not explicitly excluded.
lgb_features = [name for name in train_eng.columns if name not in excluded_columns]
categorical_features = ["seconds_in_bucket"]

print("we have {} xgb features".format(len(lgb_features)))

# Both splits are date-based slices of the same engineered frame.
train_data = split_by_date(train_eng, dates_train)
test_data = split_by_date(train_eng, dates_test)

# Features and target are converted to pandas separately; the targets stay
# single-column DataFrames.
X_train = train_data.select(lgb_features).to_pandas()
y_train = train_data.select("target").to_pandas()
X_test = test_data.select(lgb_features).to_pandas()
y_test = test_data.select("target").to_pandas()

we have 272 xgb features


In [15]:
# Cast the categorical feature columns to pandas "category" dtype; `params`
# sets enable_categorical=True for the XGBoost models.
X_train[categorical_features] = X_train[categorical_features].astype("category")
X_test[categorical_features] = X_test[categorical_features].astype("category")

# The polars frames are not used again once the pandas frames are built, so
# drop them and force a collection to release memory. Re-running this cell on
# its own raises NameError because the names are already deleted.
# `gc` is imported in the import cell at the top of the notebook.
del train_data, test_data, train_eng
gc.collect()

3563

In [27]:
# Train (or reload) the XGBoost ensemble, one model per seed.
os.makedirs(models_path, exist_ok=True)
models = []
for i in range(num_models["xgb"]):
    rnd_state = 42 + i
    print(
        f"Training model {i+1} out of {num_models['xgb']} with seed {rnd_state}"
    )
    print("---------------------------------------")

    # The seed is written into the shared `params` dict, so `params` keeps the
    # last seed after the loop.
    params["random_state"] = rnd_state

    model = XGBRegressor(**params)

    # A model file found on disk is reused as-is. The check looks only at the
    # file name, so a cached model is loaded even if `params` or the training
    # data have changed since it was saved.
    model_file = f"{models_path}/model_xgb_{i}.json"
    if os.path.exists(model_file):
        model.load_model(model_file)
    else:
        # NOTE(review): the hold-out split doubles as the early-stopping
        # eval_set, so the MAE reported below is measured on data that also
        # picked the stopping iteration.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            verbose=1,
        )
        model.save_model(model_file)
    models.append(model)

# Hold-out evaluation is skipped when the training range ends at date_id 480.
if dates_train[1] != 480:
    predictions = make_predictions(models, X_test, model="xgb")
    print(
        f"XGB Ensemble Mean Absolute Error: {mean_absolute_error(y_test, predictions):.5f}"
    )
    prediction_df = pd.DataFrame(
        {
            "stock_id": X_test["stock_id"],
            "target": predictions.flatten(),
        }
    )

    # Read the weights with a context manager so the file handle is closed.
    # Presumably a list of per-stock weights indexed by stock_id -- TODO confirm.
    with open("data/weight.json") as weight_file:
        stock_weight_values = json.load(weight_file)
    weight = dict(zip(range(200), stock_weight_values))

    # Post-processing: subtract the weight-weighted mean prediction. The sums
    # run over the whole hold-out frame, so one scalar is subtracted from
    # every row.
    # NOTE(review): if the intent is to de-mean within each time step, this
    # needs a group-by -- confirm.
    prediction_df["stock_weights"] = prediction_df["stock_id"].map(weight)
    prediction_df["target"] = (
        prediction_df["target"]
        - (prediction_df["target"] * prediction_df["stock_weights"]).sum()
        / prediction_df["stock_weights"].sum()
    )

    print(
        f"XGB Ensemble + PP Mean Absolute Error: {mean_absolute_error(y_test, prediction_df['target']):.5f}"
    )
    # Create the output directory so the write works on a fresh checkout.
    os.makedirs("output", exist_ok=True)
    prediction_df.to_parquet("output/xgb_predictions.parquet")

Training model 1 out of 1 with seed 42
---------------------------------------


XGB Ensemble Mean Absolute Error: 5.84710
XGB Ensemble + PP Mean Absolute Error: 5.84709


In [28]:
# Repeats the hold-out evaluation done at the end of the training cell, but
# unconditionally (no check on dates_train).
predictions = make_predictions(models, X_test, model="xgb")
print(
    f"XGB Ensemble Mean Absolute Error: {mean_absolute_error(y_test, predictions):.5f}"
)
prediction_df = pd.DataFrame(
    {
        "stock_id": X_test["stock_id"],
        "target": predictions.flatten(),
    }
)

# Read the weights with a context manager so the file handle is closed.
# Presumably a list of per-stock weights indexed by stock_id -- TODO confirm.
with open("data/weight.json") as weight_file:
    stock_weight_values = json.load(weight_file)
weight = dict(zip(range(200), stock_weight_values))

# Post-processing: subtract the weight-weighted mean prediction. The sums run
# over the whole hold-out frame, so one scalar is subtracted from every row.
prediction_df["stock_weights"] = prediction_df["stock_id"].map(weight)
prediction_df["target"] = (
    prediction_df["target"]
    - (prediction_df["target"] * prediction_df["stock_weights"]).sum()
    / prediction_df["stock_weights"].sum()
)

print(
    f"XGB Ensemble + PP Mean Absolute Error: {mean_absolute_error(y_test, prediction_df['target']):.5f}"
)
# Create the output directory so the write works on a fresh checkout.
os.makedirs("output", exist_ok=True)
prediction_df.to_parquet("output/xgb_predictions.parquet")

XGB Ensemble Mean Absolute Error: 5.84710
XGB Ensemble + PP Mean Absolute Error: 5.84709
