In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [2]:
# Load the fold-annotated training data, the test set and the submission template.
df = pd.read_csv("../input/30-days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Model inputs: every column except the identifier, the label and the fold id.
useful_features = [c for c in df.columns if c not in ("id", "target", "KFold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the raw test data (avoids SettingWithCopyWarning).
df_test = df_test[useful_features].copy()

# Fold ids come from the data instead of a hard-coded range(5), so the loop and
# the averaging divisor below always agree with the folds file.
fold_ids = sorted(df["KFold"].unique())

# Out-of-fold target encoding: for every categorical column, each fold's rows
# receive the per-category target mean computed on the *other* folds only, so a
# row never sees its own target. The test set gets the average of the per-fold
# mappings.
for col in object_cols:
    encoded_folds = []
    test_encoding = None
    for fold in fold_ids:
        xtrain = df[df.KFold != fold].reset_index(drop=True)
        xvalid = df[df.KFold == fold].reset_index(drop=True)

        # category -> mean target, learned from the training part only.
        # A category missing from this mapping becomes NaN after .map().
        feat = xtrain.groupby(col)["target"].agg("mean").to_dict()

        xvalid[f"tar_enc_{col}"] = xvalid[col].map(feat)
        encoded_folds.append(xvalid)

        fold_test_encoding = df_test[col].map(feat)
        if test_encoding is None:
            test_encoding = fold_test_encoding
        else:
            test_encoding = test_encoding + fold_test_encoding

    test_encoding /= len(fold_ids)
    df_test[f"tar_enc_{col}"] = test_encoding
    # Re-assemble the training frame fold by fold; ignore_index avoids the
    # duplicated 0..n index labels that the per-fold reset_index would leave.
    df = pd.concat(encoded_folds, ignore_index=True)


# Recompute the feature list so it includes the new tar_enc_* columns, and
# align the test columns (and their order) with the training features.
useful_features = [c for c in df.columns if c not in ("id", "target", "KFold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]

In [3]:
def run(trial):
    """Optuna objective: fit XGBoost on every fold except fold 0, score on fold 0.

    Reads the module-level ``df``, ``useful_features`` and ``object_cols``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        RMSE of the fitted model on the validation fold (lower is better).

    Notes
    -----
    The validation fold is also the early-stopping ``eval_set``, so the
    returned score is measured on the same data that chose the stopping point.
    """
    fold = 0

    # Sampling order and parameter names are kept stable so studies stay comparable.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform and
    # matches the learning_rate call.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    train_part = df[df.KFold != fold].reset_index(drop=True)
    valid_part = df[df.KFold == fold].reset_index(drop=True)

    ytrain = train_part.target
    yvalid = valid_part.target

    # .copy(): the encoder output is written back into these frames, which must
    # not be slices of the shared module-level df (avoids SettingWithCopyWarning).
    xtrain = train_part[useful_features].copy()
    xvalid = valid_part[useful_features].copy()

    # Fit the category -> integer mapping on the training part only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )

    preds_valid = model.predict(xvalid)
    # RMSE as sqrt(MSE): same value as squared=False, but does not depend on
    # that keyword, which newer scikit-learn releases deprecate and remove.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    return rmse

In [4]:
# Seed the sampler so the sequence of proposed hyper-parameters is repeatable
# across "Restart & Run All" (given identical objective values). TPESampler is
# Optuna's default sampler, so only the seeding changes, not the algorithm.
SAMPLER_SEED = 42
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(run, n_trials=5)

[32m[I 2021-08-26 16:15:59,066][0m A new study created in memory with name: no-name-3806b7db-569a-4f8b-9dd1-20832f72615f[0m


[0]	validation_0-rmse:7.65351
[1000]	validation_0-rmse:0.73181
[2000]	validation_0-rmse:0.72792
[3000]	validation_0-rmse:0.72555
[4000]	validation_0-rmse:0.72395
[5000]	validation_0-rmse:0.72281
[6000]	validation_0-rmse:0.72198
[6999]	validation_0-rmse:0.72132


[32m[I 2021-08-26 16:16:18,457][0m Trial 0 finished with value: 0.7213158068508109 and parameters: {'learning_rate': 0.016395899887688282, 'reg_lambda': 3.950060997010027e-05, 'reg_alpha': 46.62293465296621, 'subsample': 0.531664605970771, 'colsample_bytree': 0.7178841248029837, 'max_depth': 2}. Best is trial 0 with value: 0.7213158068508109.[0m


[0]	validation_0-rmse:7.45208
[1000]	validation_0-rmse:0.72050
[1251]	validation_0-rmse:0.72069


[32m[I 2021-08-26 16:16:29,035][0m Trial 1 finished with value: 0.720465536987354 and parameters: {'learning_rate': 0.042633753418169815, 'reg_lambda': 2.7304672710196803e-08, 'reg_alpha': 3.671062805122271e-08, 'subsample': 0.8289721744413217, 'colsample_bytree': 0.8182474591264499, 'max_depth': 6}. Best is trial 1 with value: 0.720465536987354.[0m


[0]	validation_0-rmse:7.01334
[730]	validation_0-rmse:0.72119


[32m[I 2021-08-26 16:16:36,178][0m Trial 2 finished with value: 0.720129012045899 and parameters: {'learning_rate': 0.09951730771605123, 'reg_lambda': 0.014774039575400275, 'reg_alpha': 0.015555623072094056, 'subsample': 0.9309222409822783, 'colsample_bytree': 0.27296880769346366, 'max_depth': 6}. Best is trial 2 with value: 0.720129012045899.[0m


[0]	validation_0-rmse:7.62459
[1000]	validation_0-rmse:0.72252
[2000]	validation_0-rmse:0.72003
[3000]	validation_0-rmse:0.71962
[3918]	validation_0-rmse:0.71959


[32m[I 2021-08-26 16:16:56,877][0m Trial 3 finished with value: 0.7195422586441621 and parameters: {'learning_rate': 0.02019006541464487, 'reg_lambda': 9.692599175478158e-08, 'reg_alpha': 0.11644115036710795, 'subsample': 0.9466540872869719, 'colsample_bytree': 0.7141025389129902, 'max_depth': 5}. Best is trial 3 with value: 0.7195422586441621.[0m


[0]	validation_0-rmse:7.37001
[1000]	validation_0-rmse:0.71998
[2000]	validation_0-rmse:0.71930
[2115]	validation_0-rmse:0.71935


[32m[I 2021-08-26 16:17:06,476][0m Trial 4 finished with value: 0.7192786870407952 and parameters: {'learning_rate': 0.05319921803896438, 'reg_lambda': 4.061902345841237e-07, 'reg_alpha': 11.85067358866691, 'subsample': 0.5259431029448453, 'colsample_bytree': 0.8300629816653685, 'max_depth': 4}. Best is trial 4 with value: 0.7192786870407952.[0m


In [5]:
# Display the hyper-parameter values of the study's best trial.
study.best_params

{'learning_rate': 0.05319921803896438,
 'reg_lambda': 4.061902345841237e-07,
 'reg_alpha': 11.85067358866691,
 'subsample': 0.5259431029448453,
 'colsample_bytree': 0.8300629816653685,
 'max_depth': 4}