In [1]:
import polars as pl
from typing import Tuple, List, Dict
import os
from sklearn.metrics import mean_absolute_error
import numpy as np
from catboost import CatBoostRegressor
import pandas as pd
import json

In [2]:
# (first, last) date_id bounds of the training and hold-out splits; both ends
# are inclusive because split_by_date filters with >= and <=.
dates_train, dates_test = (0, 400), (401, 480)
# Number of models to train, keyed by model family.
num_models = dict(cat=1)
# Directory in which trained models are stored and looked up.
models_path = "./models"

In [3]:
# @title params dashboard
# Keyword arguments that are unpacked into CatBoostRegressor(**params).
# Trailing notes on individual values are kept from the original author.
params = dict(
    learning_rate=0.005,  # 0.005,0.05
    depth=14,  # 14
    iterations=10000,
    l2_leaf_reg=0.1,  # 511,31,1023
    loss_function="MAE",
    subsample=0.2,
    rsm=0.3,
    thread_count=7,
    cat_features=["seconds_in_bucket"],
    # Disabled options, kept for reference:
    # device="gpu",
    # reg_alpha=0.1,
    # reg_lambda=4,
)

In [4]:
def split_by_date(df: pl.DataFrame, dates: Tuple[int, int]) -> pl.DataFrame:
    """Return the rows of ``df`` whose ``date_id`` lies within ``dates``.

    Args:
        df: Frame containing a ``date_id`` column.
        dates: ``(first, last)`` bounds; both ends are inclusive.

    Returns:
        A new frame holding only the rows with
        ``dates[0] <= date_id <= dates[1]``.
    """
    first_day, last_day = dates[0], dates[1]
    date_id = pl.col("date_id")
    in_range = (date_id >= first_day) & (date_id <= last_day)
    return df.filter(in_range)


def make_predictions(models, X_test, model="nn"):
    """Average the predictions of an ensemble of fitted models.

    Args:
        models: Iterable of fitted estimators, all of the same family.
        X_test: Feature matrix handed unchanged to each estimator's ``predict``.
        model: Family of the estimators, which selects how ``predict`` is
            called. One of ``"nn"``, ``"lgb"``, ``"xgb"`` or ``"cat"``.

    Returns:
        The element-wise mean (``np.mean(..., axis=0)``) of the per-model
        predictions.

    Raises:
        ValueError: If ``model`` is not one of the supported families.
    """
    # The loop variable is named ``estimator`` so it does not shadow the
    # ``model`` argument, which holds the family name.
    if model == "nn":
        all_predictions = [
            estimator.predict(X_test, batch_size=16384) for estimator in models
        ]
    elif model == "lgb":
        all_predictions = [
            estimator.predict(X_test, num_iteration=estimator.best_iteration)
            for estimator in models
        ]
    elif model == "xgb":
        all_predictions = [
            estimator.predict(
                X_test,
                iteration_range=(0, estimator.get_booster().best_iteration + 1),
            )
            for estimator in models
        ]
    elif model == "cat":
        all_predictions = [estimator.predict(X_test) for estimator in models]
    else:
        # Previously an unknown family left ``all_predictions`` unbound and
        # surfaced as an UnboundLocalError on the np.mean call below.
        raise ValueError(
            f"Unknown model type {model!r}; expected one of 'nn', 'lgb', 'xgb', 'cat'"
        )
    return np.mean(all_predictions, axis=0)

In [5]:
# Columns that are not fed to the model as features.
excluded_columns = ["row_id", "date_id", "time_id", "target", "stock_return"]

train_eng = pl.read_parquet("data/train_eng.parquet")
# Every remaining column is a model input (the name is kept although the
# features are used for CatBoost here).
lgb_features = [name for name in train_eng.columns if name not in excluded_columns]
categorical_features = ["seconds_in_bucket"]

print("we have {} catboost features".format(len(lgb_features)))

# Carve out the training and hold-out periods by date_id.
train_data = split_by_date(train_eng, dates_train)
test_data = split_by_date(train_eng, dates_test)

# Convert to pandas; the targets are kept as single-column frames.
X_train = train_data.select(lgb_features).to_pandas()
y_train = train_data.select("target").to_pandas()
X_test = test_data.select(lgb_features).to_pandas()
y_test = test_data.select("target").to_pandas()

we have 272 catboost features


In [6]:
import gc

# Give the categorical feature columns the pandas "category" dtype in both splits.
for _frame in (X_train, X_test):
    _frame[categorical_features] = _frame[categorical_features].astype("category")

# The polars frames are no longer needed once the pandas copies exist;
# drop them and run a collection pass to release the memory.
del train_data, test_data, train_eng

gc.collect()

0

In [7]:
# Train one CatBoost model per seed, or reload it from disk if it was already
# saved, and collect the models for ensembling.
os.makedirs(models_path, exist_ok=True)
models = []
for i in range(num_models["cat"]):
    rnd_state = 42 + i
    model_file = f"{models_path}/model_cat_{i}.cbm"
    print(f"Training model {i+1} out of {num_models['cat']} with seed {rnd_state}")
    print("---------------------------------------")

    # Only the seed changes between ensemble members.
    params.update(random_state=rnd_state)
    model = CatBoostRegressor(**params)

    if not os.path.exists(model_file):
        # The hold-out split doubles as the evaluation set for early stopping.
        model.fit(
            X_train,
            y_train,
            eval_set=(X_test, y_test),
            verbose_eval=50,
            early_stopping_rounds=250,
        )
        model.save_model(model_file)
    else:
        # A model file for this index already exists: reuse it.
        model.load_model(model_file)
    models.append(model)

# Evaluate the ensemble on the hold-out split and persist its predictions.
# Skipped when the training range ends at date_id 480.
if dates_train[1] != 480:
    predictions = make_predictions(models, X_test, model="cat")
    print(
        f"Catboost Ensemble Mean Absolute Error: {mean_absolute_error(y_test, predictions):.5f}"
    )
    prediction_df = pd.DataFrame(
        {
            "stock_id": X_test["stock_id"],
            "target": predictions.flatten(),
        }
    )

    # Use a context manager so the weights file is closed deterministically
    # instead of leaking the handle.
    with open("data/weight.json") as weight_file:
        weight = json.load(weight_file)
    # Map stock_id (0..199) to its weight by position in the loaded sequence.
    weight = dict(zip(range(200), weight))

    prediction_df["stock_weights"] = prediction_df["stock_id"].map(weight)
    # Post-processing: subtract the weight-averaged prediction from every row.
    # NOTE(review): the weighted mean is computed over all hold-out rows at
    # once, not per time bucket; confirm this is the intended adjustment.
    weighted_mean = (
        prediction_df["target"] * prediction_df["stock_weights"]
    ).sum() / prediction_df["stock_weights"].sum()
    prediction_df["target"] = prediction_df["target"] - weighted_mean

    print(
        f"Catboost Ensemble + PP Mean Absolute Error: {mean_absolute_error(y_test, prediction_df['target']):.5f}"
    )

    # Create the output directory first so the write cannot fail on a
    # missing folder after a long training run.
    output_path = "output/catboost_predictions.parquet"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    prediction_df.to_parquet(output_path)

Training model 1 out of 1 with seed 42
---------------------------------------
0:	learn: 6.4891808	test: 5.9965605	best: 5.9965605 (0)	total: 1.34s	remaining: 3h 43m 31s
50:	learn: 6.4427142	test: 5.9628494	best: 5.9628494 (50)	total: 1m 23s	remaining: 4h 32m 11s
100:	learn: 6.4086925	test: 5.9399041	best: 5.9399041 (100)	total: 2m 42s	remaining: 4h 24m 48s
150:	learn: 6.3827563	test: 5.9240821	best: 5.9240821 (150)	total: 4m 2s	remaining: 4h 23m 35s
200:	learn: 6.3626064	test: 5.9131365	best: 5.9131365 (200)	total: 5m 20s	remaining: 4h 20m 39s
250:	learn: 6.3461164	test: 5.9054165	best: 5.9054165 (250)	total: 6m 40s	remaining: 4h 19m 14s
300:	learn: 6.3318799	test: 5.8993696	best: 5.8993696 (300)	total: 8m 1s	remaining: 4h 18m 41s
350:	learn: 6.3195062	test: 5.8948430	best: 5.8948430 (350)	total: 9m 20s	remaining: 4h 16m 34s
400:	learn: 6.3085778	test: 5.8914760	best: 5.8914760 (400)	total: 10m 37s	remaining: 4h 14m 17s
450:	learn: 6.2983699	test: 5.8885774	best: 5.8885774 (450)	total