In [None]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import pickle

In [None]:
# Load the training table, using the `id` column as the row index.
data = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv', index_col='id')

# Extra feature: number of missing cells in each row. It is added before the
# feature/target split below, so it ends up as a column of X.
data['n_missing'] = data.isna().sum(axis=1)

# Resume the previously saved tuning study (presumably an Optuna study, given
# that `.optimize` / `.best_params` are called on it later).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load study files that come from a trusted source.
# The context manager guarantees the file is closed even if unpickling fails.
with open('../input/xgboost-tuning/xgb_optimizing_study.pickle', 'rb') as study_file:
    study = pickle.load(study_file)

# Features are every column except the target `claim`.
X, y = data.drop('claim', axis=1), data['claim']

In [None]:
def objective(trial, X, y, n_splits=5, random_state=42):
    """Score one hyperparameter trial by out-of-fold ROC AUC.

    An XGBClassifier is trained on each stratified fold with early stopping on
    that fold's validation part. The validation predictions are gathered into
    a single out-of-fold vector, which is scored once against `y`.

    Parameters
    ----------
    trial : object exposing `suggest_loguniform`, `suggest_uniform` and
        `suggest_int` (an Optuna trial in this notebook).
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series or 1-D array-like
        Binary target aligned row-by-row with `X`.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed used for both the fold shuffling and the XGBoost model.

    Returns
    -------
    float
        ROC AUC of the out-of-fold positive-class probabilities.
    """
    # Honour the caller's fold count (it used to be hard-coded to 5).
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # NOTE(review): `suggest_loguniform` / `suggest_uniform`, and passing
    # `early_stopping_rounds` / `eval_metric` to `fit`, are believed to be
    # deprecated in newer Optuna / XGBoost releases - confirm against the
    # installed versions before upgrading.
    params = {
        'tree_method':'gpu_hist',
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'learning_rate': 0.01,
        # Upper bound only; early stopping in `fit` decides when to stop.
        'n_estimators': 50000,
        'use_label_encoder': False,
        'max_depth': trial.suggest_int('max_depth', 1, 12),
        'random_state': random_state,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300)
    }

    # `skf.split` yields POSITIONAL row numbers, so every lookup below is
    # positional (`iloc` / plain ndarray indexing). Label-based `.loc` would
    # only be correct when the index happens to be exactly 0..n-1.
    y_values = np.asarray(y)
    oof = np.empty(len(y_values), dtype='float64')

    for train_idx, val_idx in skf.split(X, y_values):
        model = XGBClassifier(**params)
        X_train, y_train = X.iloc[train_idx], y_values[train_idx]
        X_val, y_val = X.iloc[val_idx], y_values[val_idx]
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=200,
                  verbose=False, eval_metric='auc')
        # Keep the positive-class probability for the held-out rows.
        oof[val_idx] = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_values, oof)

In [None]:
# Cross-validation settings forwarded to `objective`.
random_state = 42
n_splits = 5

# Search budget handed to `study.optimize`: five hours expressed in seconds
# (Optuna documents `timeout` in seconds), and no explicit trial cap.
timeout = 5 * 60 * 60
n_trials = None

study.optimize(
    lambda trial: objective(trial, X, y, n_splits=n_splits, random_state=random_state),
    timeout=timeout,
    n_trials=n_trials,
)

In [None]:
def get_params(study, seed=None):
    """Build the full XGBoost parameter dict from a study's best trial.

    The tuned values in `study.best_params` are combined with the fixed
    settings used during the search (tree method, learning rate, estimator
    cap, label-encoder flag and seed). Fixed settings win on a key clash.

    Parameters
    ----------
    study : object exposing a `best_params` mapping (an Optuna study in this
        notebook).
    seed : int, optional
        Value stored under 'random_state'. When omitted, the notebook-level
        `random_state` variable is used, which is the original behaviour.

    Returns
    -------
    dict
        A new dict; the mapping returned by `study.best_params` is left
        untouched.
    """
    # Copy first so whatever object the study hands back is never mutated.
    params = dict(study.best_params)
    params.update({
        'tree_method': 'gpu_hist',
        'learning_rate': 0.01,
        'n_estimators': 50000,
        'use_label_encoder': False,
        # The global is only looked up when no explicit seed is supplied.
        'random_state': random_state if seed is None else seed,
    })
    return params

In [None]:
# Combine the study's best tuned values with the fixed XGBoost settings
# (tree method, learning rate, estimator cap, seed) via `get_params`.
params = get_params(study)

In [None]:
# Persist the study to the working directory. The context manager makes sure
# the file is flushed and closed, instead of leaving the handle to be cleaned
# up whenever the interpreter gets around to it.
with open('xgb_optimizing_study.pickle', 'wb') as study_out:
    pickle.dump(study, study_out)

In [None]:
# Header and parameter dict on separate lines, emitted by a single call.
print('XGBoost best params:', params, sep='\n')