In [None]:
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the three competition CSVs (train, test, sample submission) in one pass;
# the generator yields the frames in the same order as the names on the left.
train , test , sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
        '../input/tabular-playground-series-feb-2021/sample_submission.csv',
    )
)

In [None]:
# Show the first training row in two pieces (columns 0-10, then 11 onward)
# so the wide frame is not truncated in the rendered output.
display(train.head(1).iloc[: , :11])
train.head(1).iloc[: , 11:]

In [None]:
# Names of all training columns whose label contains the substring 'cat'.
cat = list(filter(lambda name: 'cat' in name , train.columns))
cat

In [None]:
# Names of all training columns whose label contains the substring 'cont'.
cont = list(filter(lambda name: 'cont' in name , train.columns))
cont

In [None]:
# Replace every categorical column with integer codes, in both train and test.
# The encoder is fitted on the union of the train and test values: fitting on
# train alone makes `transform` raise ValueError as soon as the test set holds a
# category the training set never saw. When test has no such category the
# resulting codes are identical to a train-only fit.
for col in cat:
    le = LabelEncoder()
    le.fit(pd.concat([train[col] , test[col]] , ignore_index = True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Feature matrix: the categorical columns followed by the continuous ones.
data = train.loc[: , cat + cont]
data.head(1)

In [None]:
# Regression target column, kept as a Series alongside `data`.
target = train.loc[: , 'target']
target.head(1)

In [None]:
def objective(trial , data = data , target = target):
    """Optuna objective: fit one XGBoost regressor and return its hold-out RMSE.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample every hyperparameter in ``params`` below.
    data : feature frame
        Defaults to the module-level ``data`` as it exists when this function
        is defined (default arguments are bound at definition time).
    target : target values aligned with ``data``
        Defaults to the module-level ``target`` at definition time.

    Returns
    -------
    float
        Root-mean-squared error on the held-out split; the study minimises it.

    Notes
    -----
    The same held-out split is passed as ``eval_set`` for early stopping and is
    then used to compute the returned score.
    """
    # Fixed seed, so every trial is scored on the same split.
    # NOTE(review): the origin of this exact test_size value is not visible here.
    train_x , test_x , train_y , test_y = train_test_split(
        data , target , test_size = 0.1925517854845237 , random_state = 42)
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log = True reproduces the former log-uniform sampling. The order of the
    # suggest calls is kept as in the original.
    params = {
        'tree_method' : 'gpu_hist',
        'n_estimators' : trial.suggest_int('n_estimators' , 1 , 9999),
        'lambda' : trial.suggest_float('lambda' , 1e-3 , 10.0 , log = True),
        'alpha' : trial.suggest_float('alpha' , 1e-3 , 10.0 , log = True),
        'colsample_bytree' : trial.suggest_float('colsample_bytree' , 1e-3 , 1.0 , log = True),
        'subsample': trial.suggest_float('subsample' , 0.1 , 1.0),
        'learning_rate' : trial.suggest_float('learning_rate' , 0.005 , 0.02),
        'max_depth' : trial.suggest_int('max_depth' , 1 , 20),
        'random_state' : trial.suggest_categorical('random_state' ,[0,42,2021]),
        'min_child_weight' : trial.suggest_int('min_child_weight' , 1 ,300),
    }
    model = xgb.XGBRegressor(**params)
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit() -- confirm against the installed
    # version before upgrading.
    model.fit(train_x , train_y , eval_set = [(test_x , test_y)] , early_stopping_rounds = 100 , verbose = False)
    preds = model.predict(test_x)
    # sqrt(MSE) instead of mean_squared_error(..., squared = False): the
    # `squared` keyword is deprecated in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(test_y , preds))
    return rmse

In [None]:
# Run the hyperparameter search. A seeded TPESampler (TPE is also the default
# sampler) makes the sequence of suggested parameters repeatable across runs.
# NOTE(review): the objective values themselves may still vary slightly between
# runs if GPU training is not bit-for-bit deterministic -- confirm.
study = optuna.create_study(
    direction = 'minimize' ,
    study_name = 'xgb' ,
    sampler = optuna.samplers.TPESampler(seed = 42) ,
)
study.optimize(objective , n_trials = 50)
print('number of the finished trials:' , len(study.trials))
print('the parametors of best trial:' , study.best_trial.params)

In [None]:
# Hard-coded XGBoost hyperparameter set #1 (same keys as the search space in
# `objective`). Presumably copied from an earlier tuning run -- TODO confirm.
Best_params1 = {
    'n_estimators': 4505,
    'lambda': 1.4902632994856266,
    'alpha': 5.0535366370981984,
    'colsample_bytree': 0.20164640846117046,
    'subsample': 0.654423243609363,
    'learning_rate': 0.010513893882452396,
    'max_depth': 6,
    'random_state': 42,
    'min_child_weight': 216,
    'tree_method': 'gpu_hist',
}

In [None]:
# Hard-coded XGBoost hyperparameter set #2 (same keys as the search space in
# `objective`). NOTE(review): random_state 41 is outside the [0, 42, 2021]
# choices in `objective`, so this likely came from a different search -- confirm.
Best_params2 = {
    'n_estimators': 8881,
    'lambda': 1.2298109918165678,
    'alpha': 0.359577921491942,
    'colsample_bytree': 0.2533060803686332,
    'subsample': 0.6566758518175597,
    'learning_rate': 0.019187373383606838,
    'max_depth': 9,
    'random_state': 41,
    'min_child_weight': 185,
    'tree_method': 'gpu_hist',
}

In [None]:
# Hard-coded XGBoost hyperparameter set #3 (same keys as the search space in
# `objective`). NOTE(review): random_state 33 is outside the [0, 42, 2021]
# choices in `objective`, so this likely came from a different search -- confirm.
Best_params3 = {
    'n_estimators': 3267,
    'lambda': 1.8219449574236453,
    'alpha': 1.8133711798780716,
    'colsample_bytree': 0.3142208841368521,
    'subsample': 0.35286963371872676,
    'learning_rate': 0.0075213341845336,
    'max_depth': 15,
    'random_state': 33,
    'min_child_weight': 238,
    'tree_method': 'gpu_hist',
}

In [None]:
# Hard-coded XGBoost hyperparameter set #4 (same keys as the search space in
# `objective`). This is the set the K-fold training cell passes to XGBRegressor.
Best_params4 = {
    'n_estimators': 2752,
    'lambda': 6.423065559703725,
    'alpha': 0.020341970739886955,
    'colsample_bytree': 0.4125103556750885,
    'subsample': 0.882333488316686,
    'learning_rate': 0.01742867512405962,
    'max_depth': 11,
    'random_state': 2021,
    'min_child_weight': 180,
    'tree_method': 'gpu_hist',
}

In [None]:
# Hard-coded XGBoost hyperparameter set #5 (same keys as the search space in
# `objective`). Presumably copied from an earlier tuning run -- TODO confirm.
Best_params5 = {
    'n_estimators': 8237,
    'lambda': 0.0025684307904619967,
    'alpha': 0.04547002052513876,
    'colsample_bytree': 0.24030337794848314,
    'subsample': 0.7789787184252718,
    'learning_rate': 0.008159219557435779,
    'max_depth': 10,
    'random_state': 0,
    'min_child_weight': 231,
    'tree_method': 'gpu_hist',
}

In [None]:
# 20-fold cross-validated training with Best_params4. Each fold's model is
# early-stopped on its own validation fold, scored on that fold, and its test
# predictions are averaged into `preds` (each fold contributes 1 / n_splits).
preds = np.zeros(test.shape[0])
kf = KFold(n_splits = 20 , random_state = 42 , shuffle = True)
rmse = []
# The test feature matrix does not change between folds, so build it once.
test_X = test[cat+cont]
for fold , (trn_idx , val_idx) in enumerate(kf.split(data , target) , start = 1):
    train_X , train_y = data.iloc[trn_idx] , target.iloc[trn_idx]
    val_X , val_y = data.iloc[val_idx] , target.iloc[val_idx]
    model = xgb.XGBRegressor(**Best_params4)
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit() -- confirm before upgrading.
    model.fit(train_X , train_y , eval_set = [(val_X , val_y)] , early_stopping_rounds = 100 , verbose = False)
    preds += model.predict(test_X) / kf.n_splits
    # sqrt(MSE) with arguments in (y_true, y_pred) order; avoids the deprecated
    # `squared = False` keyword and yields the same value.
    rmse.append(np.sqrt(mean_squared_error(val_y , model.predict(val_X))))
    print(fold , rmse[-1])
# Summarise the per-fold scores collected above.
print('mean rmse:' , np.mean(rmse))

In [None]:
# Put the `preds` array into the submission frame's 'target' column and write
# the result to CSV without the index.
sub = sub.assign(target = preds)
sub.to_csv('xgbsubmission4-2.9.csv' , index = False)