In [None]:
import polars as pl
import lightgbm as lgb
from typing import Tuple, List, Dict
import os
from sklearn.metrics import mean_absolute_error
import numpy as np
import json
import pandas as pd

In [None]:
# Inclusive (first, last) date_id bounds handed to split_by_date: one range for
# the rows the models are fitted on, one for the held-out rows.
dates_train = (0, 400)
dates_test = (401, 480)
# Number of models to train per model family; every LightGBM model gets its own seed.
num_models = {"lgb": 1}
# Directory where trained models are saved and looked up before retraining.
models_path = "./models"

In [None]:
# @title params dashboard
# Hyper-parameters passed to lgb.train for every LightGBM model; each model is
# trained with these settings plus its own "random_state".
# NOTE(review): per the LightGBM parameter docs, bagging is only applied when
# bagging_freq (alias subsample_freq) is > 0. It is not set here, so "subsample"
# presumably has no effect - confirm before relying on it.
# The values in trailing comments look like alternatives tried earlier.
lgb_params = {
    "learning_rate": 0.005,  # 0.005,0.05
    "max_depth": 14,  # 14
    "n_estimators": 5000,
    "num_leaves": 1023,  # 511,31,1023
    "objective": "mae",
    "subsample": 0.2,
    "colsample_bytree": 0.3,
    "num_threads": 7,
    # "device": "gpu",
    # 'reg_alpha'         : 0.1,
    # 'reg_lambda'        : 4,
}

In [None]:
def split_by_date(df: pl.DataFrame, dates: Tuple[int, int]) -> pl.DataFrame:
    """Keep the rows of ``df`` whose ``date_id`` falls inside a closed range.

    Args:
        df: Frame that contains a ``date_id`` column.
        dates: ``(first, last)`` bounds; rows equal to either bound are kept.

    Returns:
        The filtered frame.
    """
    first_day = dates[0]
    last_day = dates[1]
    date_id = pl.col("date_id")
    in_window = (date_id >= first_day) & (date_id <= last_day)
    return df.filter(in_window)


def make_predictions(models, X_test, model="nn"):
    """Average the predictions of an ensemble of same-kind models.

    Args:
        models: Iterable of fitted models, all of the kind named by ``model``.
        X_test: Feature matrix passed unchanged to every model's ``predict``.
        model: Kind of the models in ``models``; one of ``"nn"``, ``"lgb"``,
            ``"xgb"`` or ``"cat"``. It only selects which keyword arguments
            are passed to ``predict``.

    Returns:
        The element-wise mean of the individual predictions
        (``np.mean`` over the model axis).

    Raises:
        ValueError: If ``model`` is not a supported kind, or if ``models``
            yields no model (the mean would otherwise silently be NaN).
    """
    # The loop variable is named ``m`` so it does not shadow the ``model`` argument.
    if model == "nn":
        all_predictions = [m.predict(X_test, batch_size=16384) for m in models]
    elif model == "lgb":
        # Predict with the iteration count selected during training.
        all_predictions = [
            m.predict(X_test, num_iteration=m.best_iteration) for m in models
        ]
    elif model == "xgb":
        # iteration_range is end-exclusive, hence the + 1.
        all_predictions = [
            m.predict(
                X_test, iteration_range=(0, m.get_booster().best_iteration + 1)
            )
            for m in models
        ]
    elif model == "cat":
        all_predictions = [m.predict(X_test) for m in models]
    else:
        raise ValueError(
            f"Unsupported model kind {model!r}; expected 'nn', 'lgb', 'xgb' or 'cat'"
        )

    if not all_predictions:
        raise ValueError("make_predictions needs at least one model")

    return np.mean(all_predictions, axis=0)

In [None]:
# Columns that are never used as model inputs.
excluded_columns = ["row_id", "date_id", "time_id", "target", "stock_return"]

train_eng = pl.read_parquet("data/train_eng.parquet")
# Every remaining column of the engineered frame is a LightGBM feature.
lgb_features = [name for name in train_eng.columns if name not in excluded_columns]
# Currently informational only: it is not passed to the Datasets built below.
categorical_features = ["seconds_in_bucket"]

print(f"we have {len(lgb_features)} lgb features")

# Fit rows and held-out rows are selected by their date_id ranges.
train_data = split_by_date(train_eng, dates_train)
test_data = split_by_date(train_eng, dates_test)

feature_expr = pl.col(lgb_features)
target_expr = pl.col("target")

X_train = train_data.select(feature_expr).to_numpy()
y_train = train_data.select(target_expr).to_numpy()
X_test = test_data.select(feature_expr).to_numpy()
y_test = test_data.select(target_expr).to_numpy()

# free_raw_data=False keeps the numpy matrices attached to the Datasets.
train_set = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
test_set = lgb.Dataset(X_test, label=y_test, free_raw_data=False)

In [None]:
# Train one LightGBM booster per seed, or reload it if it was saved earlier.
os.makedirs(models_path, exist_ok=True)

lgb_models = []
for i in range(num_models["lgb"]):
    rnd_state = 42 + i
    model_file = f"{models_path}/model_lgb_{i}.txt"

    if os.path.exists(model_file):
        # A saved model is reused as-is: it is NOT retrained when lgb_params,
        # the features or the date ranges change. Delete the file to retrain.
        lgb_model = lgb.Booster(model_file=model_file)
        print(f"Loaded model {i+1} out of {num_models['lgb']}")
    else:
        print(
            f"Training model {i+1} out of {num_models['lgb']} with seed {rnd_state}"
        )
        print("---------------------------------------")
        # Work on a copy so the shared lgb_params dict is not mutated and the
        # cell gives the same result when re-run.
        seeded_params = {**lgb_params, "random_state": rnd_state}
        # The hold-out set doubles as the early-stopping validation set, so the
        # MAE later reported on it is measured on data used to pick the
        # stopping iteration.
        lgb_model = lgb.train(
            seeded_params,
            train_set,
            valid_sets=[train_set, test_set],
            callbacks=[lgb.log_evaluation(50), lgb.early_stopping(250)],
        )
        lgb_model.save_model(model_file)
    lgb_models.append(lgb_model)

# Evaluate the ensemble on the held-out dates. The block is skipped when the
# training range ends at date_id 480.
# NOTE(review): 480 is presumably the last available date_id, i.e. there would
# be no hold-out rows left - confirm.
if dates_train[1] != 480:
    predictions = make_predictions(lgb_models, X_test, model="lgb")
    ensemble_mae = mean_absolute_error(y_test, predictions)
    print(f"LGB Ensemble Mean Absolute Error: {ensemble_mae:.5f}")

    prediction_df = pd.DataFrame(
        {
            "stock_id": test_data["stock_id"],
            "target": predictions.flatten(),
        }
    )

    # Read the per-stock weights; the context manager closes the file handle.
    with open("data/weight.json") as weight_file:
        weight = json.load(weight_file)
    # Position in the JSON list is used as the stock_id (first 200 entries).
    weight = dict(zip(range(200), weight))

    prediction_df["stock_weights"] = prediction_df["stock_id"].map(weight)

    # Post-processing: subtract the weight-averaged prediction so the weighted
    # sum of the adjusted predictions is zero.
    # NOTE(review): the average is taken over ALL hold-out rows at once. If the
    # zero-sum property is meant to hold per time bucket, this should be
    # computed per group instead - confirm.
    weighted_mean = (
        prediction_df["target"] * prediction_df["stock_weights"]
    ).sum() / prediction_df["stock_weights"].sum()
    prediction_df["target"] = prediction_df["target"] - weighted_mean

    post_processed_mae = mean_absolute_error(y_test, prediction_df["target"])
    print(f"LGB Ensemble + PP Mean Absolute Error: {post_processed_mae:.5f}")

In [None]:
# Persist the hold-out predictions. prediction_df only exists when the
# evaluation cell ran, so the same condition is applied here.
if dates_train[1] != 480:
    # Create the target directory first; to_parquet does not create it.
    os.makedirs("output", exist_ok=True)
    prediction_df.to_parquet("output/lgb_predictions.parquet")

In [None]:
# Top-20 features of the first booster, ranked by gain importance.
gain_by_feature = dict(
    zip(
        lgb_features,
        lgb_models[0].feature_importance(importance_type="gain"),
    )
)
fi = pd.Series(gain_by_feature).sort_values(ascending=False).iloc[:20]

ax = fi.plot(kind="barh", figsize=(10, 20))
# barh draws the first row at the bottom; invert so the largest gain is on top.
ax.invert_yaxis()
ax.set_xlabel("gain")
ax.set_title("Top 20 LightGBM features by gain");

In [None]:
# Same top-20 gain importances as a vertical bar chart.
fi.plot(kind="bar", figsize=(20, 10))