In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [None]:
# Read the fold-annotated training set, the test set and the sample submission file.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30-days-of-ml-5-folds/train_5_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [None]:
# Model inputs are every column of `train` other than "id", "target" and "fold".
feature_columns = [name for name in train.columns if name not in ("id", "target", "fold")]
# Feature columns whose name begins with "cat" are the ones that get ordinal-encoded later.
categorical_columns = list(filter(lambda name: name.startswith("cat"), feature_columns))

In [None]:
# Index of the fold held out for validation during tuning; all other folds are used for fitting.
fold = 0


def objective(trial):
    """Train one XGBoost candidate for an Optuna trial and return its validation RMSE.

    Rows of the module-level ``train`` frame whose ``fold`` column equals the
    module-level ``fold`` form the validation set; the remaining rows are used
    for fitting. Categorical columns are ordinal-encoded with an encoder that
    is fitted on the fitting rows only.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters; it is also handed to the
        XGBoost pruning callback, which watches the validation RMSE.

    Returns
    -------
    float
        Root mean squared error of the fitted model on the validation fold.
    """
    is_valid = train.fold == fold

    # A single .loc selection followed by .copy() yields independent frames, so
    # the encoded values assigned below never target a slice of `train`
    # (chained slicing here would trigger pandas' SettingWithCopyWarning).
    X_train = train.loc[~is_valid, feature_columns].copy()
    X_valid = train.loc[is_valid, feature_columns].copy()
    y_train = train.loc[~is_valid, "target"]
    y_valid = train.loc[is_valid, "target"]

    # Fit the encoder on the fitting rows only, then reuse it for validation.
    ordinal_encoder = OrdinalEncoder()
    X_train[categorical_columns] = ordinal_encoder.fit_transform(X_train[categorical_columns])
    X_valid[categorical_columns] = ordinal_encoder.transform(X_valid[categorical_columns])

    # The first argument of each suggest_* call is the name under which the
    # value appears in study.best_params (note "lambda"/"alpha" differ from
    # the XGBRegressor keyword names "reg_lambda"/"reg_alpha").
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 10000),
        "max_depth": trial.suggest_int("max_depth", 2, 6),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("lambda", 1e-8, 100.0, log=True),
        "reg_alpha": trial.suggest_float("alpha", 1e-8, 100.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }

    xgb_regressor = XGBRegressor(**params,
                                 tree_method="gpu_hist",
                                 random_state=33,
                                 gpu_id=0,
                                 predictor="gpu_predictor",
                                 verbosity=0)

    # "validation_0-rmse" refers to the RMSE on the first (and only) eval_set entry.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-rmse")

    xgb_regressor.fit(X_train,
                      y_train,
                      eval_set=[(X_valid, y_valid)],
                      eval_metric="rmse",
                      early_stopping_rounds=10,
                      verbose=False,
                      callbacks=[pruning_callback])

    y_pred = xgb_regressor.predict(X_valid)
    # Take the square root explicitly instead of passing squared=False, which
    # newer scikit-learn releases deprecate and then remove.
    rmse = float(np.sqrt(mean_squared_error(y_valid, y_pred)))

    return rmse

In [None]:
# Median-based pruning of unpromising trials; the first 5 reported steps of a trial are never pruned.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# TPESampler is the sampler Optuna uses by default for single-objective studies,
# so only the seeding changes; 33 matches the model's random_state.
sampler = optuna.samplers.TPESampler(seed=33)
study = optuna.create_study(
    sampler=sampler,
    pruner=pruner,
    study_name="xgbr-study",
    direction="minimize",
)
# Stop after 100 trials or 600 seconds, whichever limit is reached first.
study.optimize(objective, n_trials=100, timeout=600)

In [None]:
# Display the hyperparameters of the best trial, keyed by the names passed to
# trial.suggest_* in `objective` (so "lambda"/"alpha", not "reg_lambda"/"reg_alpha").
study.best_params

In [None]:
# Fit the final model on every training row with the best hyperparameters from
# the study, then predict the test set.
# Selecting the feature columns and copying once gives independent frames, so
# the encoded values below are not assigned into a slice (which pandas flags
# with SettingWithCopyWarning) and `train`/`test` are not duplicated in full.
X_train = train[feature_columns].copy()
X_test = test[feature_columns].copy()

y_train = train.target

# Fit the encoder on the full training data and reuse it for the test data.
ordinal_encoder = OrdinalEncoder()
X_train[categorical_columns] = ordinal_encoder.fit_transform(X_train[categorical_columns])
X_test[categorical_columns] = ordinal_encoder.transform(X_test[categorical_columns])

# study.best_params is keyed by the trial parameter names; translate the two
# that differ from the XGBRegressor keyword names used during tuning.
param_aliases = {"lambda": "reg_lambda", "alpha": "reg_alpha"}
best_params = {param_aliases.get(name, name): value
               for name, value in study.best_params.items()}

# Use the same GPU tree method and device as the tuning objective so the final
# model is trained the way the hyperparameters were evaluated.
# NOTE(review): the objective trains with early stopping, whereas this fit
# builds all `n_estimators` trees -- confirm that is intended.
xgb_regressor = XGBRegressor(**best_params,
                             tree_method="gpu_hist",
                             random_state=33,
                             gpu_id=0,
                             predictor="gpu_predictor")

xgb_regressor.fit(X_train, y_train)

y_pred = xgb_regressor.predict(X_test)

In [None]:
# Bracket assignment always writes a "target" column; attribute-style assignment
# would only set a plain Python attribute if that column were missing from the
# sample submission, leaving the predictions out of the CSV.
submission["target"] = y_pred
submission.to_csv("submission.csv", index=False)