In [None]:
from functools import partial

import joblib
import numpy as np
import optuna
import pandas as pd
from optuna.samplers import TPESampler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
# Raise Optuna's log threshold to WARNING so lower-severity messages are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

In [None]:
# Read the competition's train and test tables, then preview the first training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
    )
)
train.head()

In [None]:
# Separate the model inputs from the target. `id` is dropped from both frames
# (presumably a row identifier rather than a feature — confirm against the data).
Xtrain = train.drop(columns=["id", "loss"])
ytrain = train["loss"]
test1 = test.drop(columns="id")

# Compare like with like: both frames below hold feature columns only, so the
# column counts should match. Printing the raw `test` shape would still include
# `id` and always be off by one.
print(Xtrain.shape, test1.shape)
assert list(Xtrain.columns) == list(test1.columns), "train/test feature columns differ"

# Distribution of the target values.
print(train['loss'].value_counts())

In [None]:
# Standardise the features: the scaler's statistics come from the training
# features only and are then reused unchanged for the test features.
ss = StandardScaler()
Xtrain = ss.fit_transform(Xtrain)
test1 = ss.transform(test1)

In [None]:
# Reserve 15% of the labelled rows as a hold-out set; the fixed seed keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    Xtrain,
    ytrain,
    test_size=0.15,
    random_state=44,
)

In [None]:
def getXgbHyperparameters(trial):
    """Build the XGBoost parameter dict for one Optuna trial.

    Fixed settings (tree method, metric, booster, learning rate) are hard-coded;
    everything else is drawn from ``trial`` via ``suggest_int`` / ``suggest_float``.

    Args:
        trial: object exposing Optuna's ``suggest_int`` and ``suggest_float``.

    Returns:
        dict: keyword arguments suitable for ``XGBRegressor(**params)``.
    """
    xgb_param = {
            # NOTE(review): newer XGBoost releases deprecate "gpu_hist" in favour of
            # tree_method="hist" with device="cuda" — confirm against the installed version.
            "tree_method": "gpu_hist",
            "eval_metric": "rmse",
            # Candidates are 7500, 9000 and 10500 trees. `high` has to be reachable
            # from `low` in whole steps; an upper bound of 11000 is not, so Optuna
            # would warn and silently truncate it to 10500. `step` is passed by
            # keyword because positional use is deprecated in newer Optuna.
            'n_estimators': trial.suggest_int('n_estimators', 7500, 10500, step=1500),
            "booster": 'gbtree',
            "reg_lambda": trial.suggest_int("reg_lambda", 1, 100),
            "reg_alpha": trial.suggest_int("reg_alpha", 1, 100),
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
            "max_depth": trial.suggest_int("max_depth", 3, 9),
            "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
            # Held constant across trials rather than tuned.
            "learning_rate": 0.010154255408501112,
            "gamma": trial.suggest_float("gamma", 0, 20)
        }
    return xgb_param

In [None]:
def optimize(trial,X,y):
    """Optuna objective: fit one XGBoost candidate and score it on a validation split.

    Args:
        trial: Optuna trial used to draw the candidate's hyperparameters.
        X: feature matrix to be divided into fitting and validation parts.
        y: target values aligned with ``X``.

    Returns:
        Root-mean-squared error of the candidate on the 20% validation part.
    """
    candidate = XGBRegressor(**getXgbHyperparameters(trial))
    # Fixed seed: every trial is scored on the same 80/20 partition of (X, y).
    X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.20, random_state=44)
    candidate.fit(X_fit, y_fit)
    val_pred = candidate.predict(X_val)
    val_mse = mean_squared_error(val_pred, y_val)
    return np.sqrt(val_mse)

In [None]:
# Bind the objective to the training portion only. The hold-out rows
# (X_test / y_test) are used later to score the final model, so they must not
# take part in the hyperparameter search; passing the full Xtrain / ytrain here
# would leak them into tuning and make that final score optimistic.
opt_func = partial(optimize, X=X_train, y=y_train)

# `func` is the name handed to study.optimize; keep it as an alias of the
# leak-free objective.
func = opt_func


In [None]:
def logging_callback(study, frozen_trial):
    """Print progress only when the study's best value has changed.

    The last reported best value is remembered in the study's user attributes
    under ``"previous_best_value"``.

    Args:
        study: the running study; read for ``best_value`` / ``best_params``.
        frozen_trial: the trial that just finished; its number and value are printed.
    """
    last_reported = study.user_attrs.get("previous_best_value", None)
    current_best = study.best_value
    if last_reported == current_best:
        # Nothing new to report.
        return
    study.set_user_attr("previous_best_value", current_best)
    print(study.best_params)
    message = "Trial {} finished with best value: {}. ".format(frozen_trial.number, frozen_trial.value)
    print(message)
        

In [None]:
# Create a minimisation study driven by a seeded TPE sampler.
tpe_sampler = TPESampler(seed=13)
study = optuna.create_study(study_name='xgb', direction="minimize", sampler=tpe_sampler)

# Run the search under a four-hour time budget (expressed in seconds),
# reporting through the callback after each trial.
search_budget_seconds = 4 * 60 * 60
study.optimize(func, callbacks=[logging_callback], timeout=search_budget_seconds)

In [None]:
# Summarise the search: best score first, then one line per tuned parameter.
print(f"\tBest value (rmse): {study.best_value:.5f}")
print("\tBest params:")
for param_name, param_value in study.best_params.items():
    print(f"\t\t{param_name}: {param_value}")

In [None]:
# Keep the best trial's hyperparameters; they are unpacked into the final XGBRegressor.
best_params = study.best_params

In [None]:
# Display the selected hyperparameters as this cell's output.
best_params

In [None]:
# Persist the finished study to disk so it can be reloaded without repeating
# the search. `joblib` is imported with the other dependencies in the
# notebook's first cell.
joblib.dump(study, "xgb_study.pkl")

In [None]:
# Refit one model with the tuned hyperparameters. `best_params` holds only the
# values Optuna suggested, so the fixed settings used during the search are
# passed again explicitly.
xgb = XGBRegressor(**best_params,
                   tree_method = "gpu_hist",
                   eval_metric = "rmse",
                   booster = 'gbtree',
                   learning_rate=0.010154255408501112)

# The metric is configured on the estimator above. Passing `eval_metric` to
# fit() as well is deprecated since XGBoost 1.6 and rejected by later releases,
# so it is not repeated here.
xgb.fit(X_train, y_train, verbose=True)

In [None]:
# Score the refitted model on the hold-out rows; the RMSE is the cell's output.
pred = xgb.predict(X_test)
holdout_mse = mean_squared_error(pred, y_test)
np.sqrt(holdout_mse)

In [None]:
# Predict the scaled test features and write the submission file with the
# original `id` column next to the predicted `loss`.
predictions = pd.Series(xgb.predict(test1), name='loss')
final = pd.concat([test.id, predictions], axis=1)
final.to_csv('predictions.csv', index=False)