In [1]:
import pickle

import optuna
import pandas as pd
import xgboost as xgb

In [None]:
# Load the pickled training frame.
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
train_df: pd.DataFrame = pd.read_pickle("../data/training_needed_df.pkl")

# Discard the identifier, raw-text and intermediate-label columns listed below.
train_df = train_df.drop(
    columns=[
        "MessageID",
        "ChannelID",
        "issuerid",
        "SentimentScore",
        "MessageText",
        "llm_issuerid",
        "llm_SentimentScore",
    ]
)

# Explode both list-valued columns in lockstep: one output row per (id, score) pair.
# The original row labels are repeated for the exploded rows.
train_df = train_df.explode(["final_ids", "final_scores"])

# Flatten every embedding into a 1-D vector and spread it over one column per
# component ("embedding_0", "embedding_1", ...). The width is taken from the data
# instead of being hard-coded, and the columns are attached with a single concat
# rather than hundreds of individual inserts (which fragments the frame).
embedding_df = pd.DataFrame(
    [x.reshape(-1) for x in train_df["embedding"].tolist()], index=train_df.index
).add_prefix("embedding_")
train_df = pd.concat([train_df.drop(columns=["embedding"]), embedding_df], axis=1)

# Exploded values come back as generic objects; make the ids proper integers.
train_df = train_df.astype({"final_ids": int})

In [None]:
# Feature matrix: every column of the prepared frame except the label.
X = train_df.drop(columns=["final_scores"]).to_numpy()

# Labels: shift the scores down by one and clamp them into the range 0..4.
# pd.to_numeric turns the object-dtype column left by `explode` into a numeric one
# so the shift and clamp run vectorised instead of through per-element lambdas.
y = pd.to_numeric(train_df["final_scores"]).sub(1).clip(lower=0, upper=4).to_numpy()

In [None]:
# Shared identifier: used as the Optuna study name and as the filename prefix
# of the per-trial model pickles.
study_name = "xgboost_study"


class SaveBestModel(xgb.callback.TrainingCallback):
    """Callback that hands the per-fold boosters back to the caller.

    The caller supplies a list; when training ends, that same list object is
    overwritten in place with the ``bst`` attribute of every entry in
    ``model.cvfolds``.
    """

    def __init__(self, cvboosters):
        # Caller-owned list; it is mutated in place, never rebound.
        self._cvboosters = cvboosters

    def after_training(self, model):
        """Replace the list's contents with one booster per fold and return ``model``."""
        fold_boosters = []
        for fold in model.cvfolds:
            fold_boosters.append(fold.bst)
        # Slice assignment keeps the identity of the caller's list.
        self._cvboosters[:] = fold_boosters
        return model


def objective(trial: optuna.Trial):
    """Optuna objective: cross-validate one sampled XGBoost configuration.

    Samples hyperparameters from ``trial``, runs ``xgb.cv`` on the module-level
    ``X`` / ``y`` arrays while reporting ``test-auc`` to Optuna for pruning,
    pickles the booster of the last CV fold under ``../models/`` and returns
    the mean test AUC of the final boosting round.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate scores to the pruner.

    Returns:
        Mean cross-validated test AUC after the last boosting round, as a float.

    Raises:
        Any exception raised inside ``xgb.cv`` propagates, including the signal
        the pruning callback uses to stop an unpromising trial; in that case no
        model file is written.
    """
    dtrain = xgb.DMatrix(X, label=y)

    # The native xgb.cv API takes the number of boosting rounds as the
    # `num_boost_round` argument; an "n_estimators" entry in the params dict is
    # not read. The Optuna parameter keeps its name so existing studies stay
    # comparable.
    num_boost_round = trial.suggest_int("n_estimators", 50, 500, log=True)

    param = {
        "verbosity": 0,
        # NOTE(review): XGBoost's docs recommend "multi:softprob" when the
        # evaluation metric is multiclass AUC. Kept as-is so the pickled boosters
        # still predict class labels -- confirm which behaviour is wanted.
        "objective": "multi:softmax",
        "eval_metric": "auc",
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.1, log=True),
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "num_class": 5,
        # NOTE(review): hard-coded to the second CUDA device; adjust per machine.
        "device": "cuda:1",
    }

    if param["booster"] in ("gbtree", "dart"):
        # Tree-booster-only settings. `eta` is deliberately not sampled here:
        # it is an alias of `learning_rate`, which is already set above, and
        # supplying both gives the booster two conflicting step sizes.
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        # Dropout-specific settings for the DART booster.
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # The save callback fills `cvboosters` in place once cross-validation ends.
    cvboosters = []
    savemodel_callback = SaveBestModel(cvboosters)
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-auc")
    history = xgb.cv(
        param,
        dtrain,
        num_boost_round=num_boost_round,
        callbacks=[pruning_callback, savemodel_callback],
    )

    # Persist the booster of the last CV fold, one file per trial number.
    with open(f"../models/{study_name}_{trial.number}.pickle", "wb") as fout:
        pickle.dump(cvboosters[-1], fout)

    # Score of the trial: mean test AUC at the final boosting round.
    return float(history["test-auc-mean"].iloc[-1])

In [None]:
# Create (or reopen) the SQLite-backed study and run the hyperparameter search.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
study = optuna.create_study(
    study_name=study_name,
    storage="sqlite:///optuna_study.db",
    load_if_exists=True,  # reuse the stored study of the same name if there is one
    direction="maximize",
    pruner=pruner,
)
study.optimize(objective, n_trials=100)

# Report how many trials the study holds and the best one found.
print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")