In [10]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [7]:
# Load the pre-folded training data, the test set and the submission template.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
# .copy() makes df_test an independent frame, so the column assignments below
# do not write into a slice of the frame returned by read_csv
# (avoids SettingWithCopyWarning and ambiguous write semantics).
df_test = df_test[useful_features].copy()

# Derive the fold ids from the data instead of hard-coding 5 in two places.
fold_ids = sorted(df.kfold.unique())

# Out-of-fold target encoding: each validation fold is encoded with category
# means computed on the remaining folds only, so a row never sees its own
# target. The test set gets the average of the per-fold encodings.
for col in object_cols:
    encoded_col = f"tar_enc_{col}"
    encoded_folds = []
    test_encoding_sum = None
    for fold in fold_ids:
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        category_means = xtrain.groupby(col)["target"].mean()
        # A category missing from the training folds would map to NaN (and a
        # single NaN would poison the summed test encoding); fall back to the
        # training folds' overall target mean in that case.
        fallback = xtrain["target"].mean()

        xvalid[encoded_col] = xvalid[col].map(category_means).fillna(fallback)
        encoded_folds.append(xvalid)

        test_encoding = df_test[col].map(category_means).fillna(fallback)
        if test_encoding_sum is None:
            test_encoding_sum = test_encoding
        else:
            test_encoding_sum = test_encoding_sum + test_encoding

    df_test[encoded_col] = test_encoding_sum / len(fold_ids)
    # ignore_index=True: every fold frame was re-indexed from 0, so a plain
    # concat would leave duplicate index labels in df.
    df = pd.concat(encoded_folds, ignore_index=True)


# Recompute the feature lists so they include the new tar_enc_* columns.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features].copy()

In [8]:
def run(trial):
    """Optuna objective: train an XGBoost regressor and score it on fold 0.

    Hyperparameters are sampled from ``trial``. The model is fitted on every
    row of the module-level ``df`` whose ``kfold`` is not 0 and evaluated on
    the rows whose ``kfold`` is 0, with early stopping on that same fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RMSE of the fitted model on the validation fold.
    """
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    # suggest_loguniform is deprecated; suggest_float(..., log=True) samples
    # from the same log-uniform range and matches the learning_rate call above.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() so the ordinal-encoding assignment below writes into independent
    # frames rather than column slices (avoids SettingWithCopyWarning).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training rows only, then reuse it for validation.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # constructor instead of fit(); kept here to match the notebook's runtime.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` flag is deprecated/removed in newer scikit-learn releases,
    # while this form yields the same RMSE on every version.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    return rmse

In [11]:
# Search for the hyperparameters that minimise the value returned by `run`.
study = optuna.create_study(direction="minimize")
# Five trials only; each trial fits one model inside `run`.
study.optimize(func=run, n_trials=5)

[32m[I 2021-08-31 19:27:59,074][0m A new study created in memory with name: no-name-6ffbb452-4910-4ef1-a53a-bf3e5577936c[0m


[0]	validation_0-rmse:6.13383
[405]	validation_0-rmse:0.72946


[32m[I 2021-08-31 19:28:04,972][0m Trial 0 finished with value: 0.722921174433389 and parameters: {'learning_rate': 0.213870093168849, 'reg_lambda': 6.516690872098945e-06, 'reg_alpha': 0.03270892991130916, 'subsample': 0.8319697579143283, 'colsample_bytree': 0.3277113074287743, 'max_depth': 6}. Best is trial 0 with value: 0.722921174433389.[0m


[0]	validation_0-rmse:7.68267
[1000]	validation_0-rmse:0.72327
[2000]	validation_0-rmse:0.72096
[3000]	validation_0-rmse:0.72060
[3405]	validation_0-rmse:0.72062


[32m[I 2021-08-31 19:28:28,201][0m Trial 1 finished with value: 0.7205222356228401 and parameters: {'learning_rate': 0.012638734258461837, 'reg_lambda': 0.1654682444195007, 'reg_alpha': 8.464204988794859e-07, 'subsample': 0.2814943186672778, 'colsample_bytree': 0.7410401444936071, 'max_depth': 6}. Best is trial 1 with value: 0.7205222356228401.[0m


[0]	validation_0-rmse:6.84710
[614]	validation_0-rmse:0.72288


[32m[I 2021-08-31 19:28:33,273][0m Trial 2 finished with value: 0.7216804907513846 and parameters: {'learning_rate': 0.12133025733597931, 'reg_lambda': 0.014541905001821521, 'reg_alpha': 3.158537981108844e-05, 'subsample': 0.5599409012134213, 'colsample_bytree': 0.40823839586330135, 'max_depth': 5}. Best is trial 1 with value: 0.7205222356228401.[0m


[0]	validation_0-rmse:7.47173
[1000]	validation_0-rmse:0.72059
[2000]	validation_0-rmse:0.71885
[2765]	validation_0-rmse:0.71881


[32m[I 2021-08-31 19:28:45,090][0m Trial 3 finished with value: 0.7187393044726648 and parameters: {'learning_rate': 0.040075760420698166, 'reg_lambda': 0.0004699745046141965, 'reg_alpha': 1.3655450307414496, 'subsample': 0.7222794496202382, 'colsample_bytree': 0.5074652573342793, 'max_depth': 4}. Best is trial 3 with value: 0.7187393044726648.[0m


[0]	validation_0-rmse:7.35820
[1000]	validation_0-rmse:0.73060
[2000]	validation_0-rmse:0.72820
[3000]	validation_0-rmse:0.72667
[4000]	validation_0-rmse:0.72562
[5000]	validation_0-rmse:0.72480
[6000]	validation_0-rmse:0.72412
[6999]	validation_0-rmse:0.72357


[32m[I 2021-08-31 19:29:01,146][0m Trial 4 finished with value: 0.7235507300754 and parameters: {'learning_rate': 0.05472940162092398, 'reg_lambda': 3.979142205866442e-06, 'reg_alpha': 0.0004941142482206257, 'subsample': 0.5413367489580537, 'colsample_bytree': 0.8220457595772027, 'max_depth': 1}. Best is trial 3 with value: 0.7187393044726648.[0m


In [12]:
# Show the hyperparameters of the best trial (the study minimises the value returned by `run`).
study.best_params

{'learning_rate': 0.040075760420698166,
 'reg_lambda': 0.0004699745046141965,
 'reg_alpha': 1.3655450307414496,
 'subsample': 0.7222794496202382,
 'colsample_bytree': 0.5074652573342793,
 'max_depth': 4}

In [13]:
# Bug fix: the original cell assigned the optuna Study object itself to the
# target column, so the submission contained no predictions. Instead, retrain
# XGBoost with the best hyperparameters found by the study on each fold split
# and average the resulting test-set predictions.
best_params = study.best_params
test_preds = []

for fold in sorted(df.kfold.unique()):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    ytrain = xtrain.target
    yvalid = xvalid.target

    # Work on copies so the encoded columns never write back into df/df_test.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()
    xtest = df_test[useful_features].copy()

    # Same preprocessing as the objective: encoder fitted on training rows only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Fixed settings mirror the ones used inside `run`; tuned ones come from the study.
    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=7000,
        **best_params,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    test_preds.append(model.predict(xtest))

# One column per fold model -> row-wise mean is the final prediction.
final_preds = np.mean(np.column_stack(test_preds), axis=1)
assert len(final_preds) == len(sample_submission), "prediction/submission length mismatch"

sample_submission["target"] = final_preds
sample_submission.to_csv("submission.csv", index=False)