This is an XGB version of the following notebook: https://www.kaggle.com/rmiperrier/lgb-optuna

It uses Label Encoding (LE) and GPU acceleration.

In [None]:
# Libraries
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from IPython.display import display

In [None]:

# Load the competition files. train/test use their first CSV column as the
# index; the sample submission is read with the default integer index.
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv', index_col=0)
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv', index_col=0)
submission = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')
# Preview the submission template.
submission.head()

In [None]:
# Split the column index: the last column is the label, every column before
# it is a model input.
predictors, target = train.columns[:-1], train.columns[-1]
predictors

In [None]:
def label_encode(train_df, test_df, column):
    """Add an integer-encoded copy of ``column`` to both frames, in place.

    One ``LabelEncoder`` is fitted on the distinct values of ``column`` found
    in ``train_df`` followed by those found in ``test_df``, so a value that
    occurs in only one frame still receives a code. The codes are stored in a
    new ``<column>_le`` column on each frame.

    Returns the name of the new column.
    """
    encoded_name = f"{column}_le"

    # Distinct values from train first, then from test.
    known_values = train_df[column].unique().tolist()
    known_values += test_df[column].unique().tolist()

    encoder = LabelEncoder().fit(known_values)
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

In [None]:
# Partition the predictors by name: columns containing 'cat' versus columns
# containing 'cont'.
cat_cols = [name for name in predictors if 'cat' in name]
cont_cols = [name for name in predictors if 'cont' in name]

In [None]:
# Label-encode each 'cat' column (label_encode adds the encoded column to both
# frames) and keep the names of the new columns.
le_cols = [label_encode(train, test, cat_col) for cat_col in cat_cols]

In [None]:
# Preview the training frame after the label-encoding cell has run.
train.head()

In [None]:
# Model inputs: the label-encoded columns followed by the 'cont' columns.
cols = [*le_cols, *cont_cols]

In [None]:
# Number of model input columns.
len(cols)

In [None]:
# Number of original predictor columns, for comparison with len(cols).
len(predictors)

In [None]:
# Functions for KFold evaluation
def create(hyperparams):
    """Build an unfitted ``XGBClassifier`` from a mapping of keyword arguments."""
    return XGBClassifier(**hyperparams)

def fit(model, X, y):
    """Fit ``model`` on ``(X, y)`` with no validation set.

    Returns the ``model`` object that was passed in; whatever ``model.fit``
    itself returns is ignored.
    """
    train = model.fit
    train(X, y)
    return model

def fit_with_stop(model, X, y, X_val, y_val, early_stopping_rounds=200, verbose=300):
    """Train ``model`` with early stopping monitored on a validation set.

    Args:
        model: Estimator whose ``fit`` accepts the ``eval_set``,
            ``early_stopping_rounds`` and ``verbose`` keyword arguments.
        X, y: Training features and labels.
        X_val, y_val: Held-out features and labels, passed as the single
            entry of ``eval_set``.
        early_stopping_rounds: Forwarded to ``model.fit``. Defaults to 200,
            the value that used to be hard-coded here.
        verbose: Forwarded to ``model.fit``. Defaults to 300, the value that
            used to be hard-coded here.

    Returns:
        The same ``model`` object, after fitting.

    NOTE(review): recent XGBoost releases reportedly expect
    ``early_stopping_rounds`` on the estimator constructor rather than on
    ``fit`` -- confirm against the installed version.
    """
    model.fit(
        X, y,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=early_stopping_rounds,
        verbose=verbose,
    )
    return model

def evaluate(model, X, y):
    """Return the ROC AUC of ``model`` on ``(X, y)``.

    The score is computed from column 1 of ``model.predict_proba(X)``.
    """
    return roc_auc_score(y, model.predict_proba(X)[:, 1])

def kfold_evaluation(X, y, k, hyperparams):
    """Score one hyper-parameter set with a k-fold evaluation.

    For every split produced by ``KFold(k)`` a fresh model is created from
    ``hyperparams``, trained with early stopping on the held-out part, and
    scored (AUC) on both the training and the held-out part. Progress is
    printed along the way.

    Returns a DataFrame with one row per fold and the columns
    ``'train score'`` and ``'validation score'``.
    """
    fold_scores = []

    print(f"\n------ {k}-fold evaluation -----")
    print(hyperparams)

    for fold, (fit_idx, val_idx) in enumerate(KFold(k).split(X)):
        print(f"----- FOLD {fold} -----")

        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = fit_with_stop(create(hyperparams), X_fit, y_fit, X_val, y_val)

        train_auc = evaluate(model, X_fit, y_fit)
        val_auc = evaluate(model, X_val, y_val)
        fold_scores.append((train_auc, val_auc))

        print(f"Eval AUC: {val_auc}")

    return pd.DataFrame(fold_scores, columns=['train score', 'validation score'])

def kfold_prediction(X, y, X_test, k, hyperparams):
    """Predict ``X_test`` with the average of the models trained on each fold.

    For every split produced by ``KFold(k)`` a fresh model is created from
    ``hyperparams`` and trained with early stopping on the held-out part.
    Each model contributes ``1 / k`` of column 1 of its
    ``predict_proba(X_test)`` to the result.

    Returns a 1-D array with one accumulated value per row of ``X_test``.
    """
    averaged = np.zeros(len(X_test))

    for fit_idx, val_idx in KFold(k).split(X):
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = fit_with_stop(create(hyperparams), X_fit, y_fit, X_val, y_val)
        fold_proba = model.predict_proba(X_test)[:, 1]
        averaged += fold_proba / k

    return averaged

In [None]:
# Constants shared by the tuning and prediction cells
K = 5  # number of folds
X = train[cols]
Y = train[target]
X_TEST = test[cols]
# Fixed (non-tuned) XGBoost settings
BEST_PARAMS = dict(
    learning_rate=0.03,
    eval_metric='auc',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)

In [None]:
# Display the feature matrix used for training.
X

In [None]:
# Objective function
def objective(trial):
    """Optuna objective: mean validation AUC of one sampled configuration.

    Samples a set of XGBoost hyper-parameters from ``trial``, adds the fixed
    settings held in the global ``BEST_PARAMS`` and scores the result with
    ``kfold_evaluation(X, Y, K, ...)``.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        The mean of the ``'validation score'`` column over the folds.
    """
    # Search spaces
    hyperparams = {
        'seed': 137,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'use_label_encoder': False,
        'max_bin': trial.suggest_int('max_bin', 2, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 31),
        'alpha': trial.suggest_float('alpha', 1E-16, 12),
        'gamma': trial.suggest_float('gamma', 1E-16, 12),
        'reg_lambda': trial.suggest_float('reg_lambda', 1E-16, 12),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1E-16, 1.0),
        'subsample': trial.suggest_float('subsample', 1E-16, 1.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 1E-16, 12),
    }

    # Add BEST_PARAMS, but never on top of a value sampled by this trial.
    # BEST_PARAMS is a mutable global that a later cell extends with the best
    # trial's values; a plain update() would then replace every sampled value
    # and make all trials identical if the study is run again.
    sampled_names = set(trial.params)
    for name, value in BEST_PARAMS.items():
        if name not in sampled_names:
            hyperparams[name] = value

    # Evaluation
    scores = kfold_evaluation(X, Y, K, hyperparams)

    return scores['validation score'].mean()

In [None]:
# Optimization: maximize the objective, with a two-hour timeout
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=2 * 60 * 60)

In [None]:
# Best score: the highest objective value (mean validation AUC) found by the study
study.best_value

In [None]:
# Optimization history plot
plot_optimization_history(study)


In [None]:
# Hyper-parameter importance plot
plot_param_importances(study)

In [None]:
# Best parameters: merge the best trial's values into BEST_PARAMS (in place)
# so the prediction cell can reuse them.
# NOTE(review): best_params presumably holds only the values drawn through
# trial.suggest_*; the fixed 'seed' and 'use_label_encoder' entries set inside
# objective() would then not be carried over -- confirm.
BEST_PARAMS.update(study.best_params)
BEST_PARAMS

In [None]:
# Standalone classifier built from the tuned parameters.
# NOTE(review): `model` is not referenced by the remaining cells shown here;
# kfold_prediction creates its own models from BEST_PARAMS -- confirm whether
# this cell is still needed.
model = XGBClassifier(**BEST_PARAMS, use_label_encoder=False)

In [None]:
%%time
# Predictions on test set and submission
# kfold_prediction trains one model per fold with BEST_PARAMS and accumulates
# their test-set probabilities; the result fills the 'target' column.
submission['target'] = kfold_prediction(X, Y, X_TEST, K, BEST_PARAMS)
# Write the file without the DataFrame index.
submission.to_csv('submission.csv', index=False)