# XGBoost Hyperparameter Search

In this notebook we optimize an XGBoost model using the Optuna library along with a [pruner](https://optuna.readthedocs.io/en/stable/reference/pruners.html). For each set of parameters, we perform k-fold cross-validation. After every fold, the pruner compares that fold's AUC with the AUCs reported by past trials on the same fold and ends unpromising trials early (i.e. if the AUC on a given fold is too low).

In [None]:
# Global settings; lower them to test changes to this notebook quickly
RANDOM_SEED = 0      # seed for the holdout split, the CV folds and XGBoost
NUM_FOLDS = 3        # CV folds trained per trial while scoring
MAX_TREES = 20_000   # n_estimators passed to XGBoost (training also uses early stopping)
EARLY_STOP = 150     # early_stopping_rounds passed to fit
NUM_TRIALS = 50      # number of Optuna trials in the search

In [None]:
# General imports
import numpy as np
import pandas as pd
import datatable as dt
import time
import gc

# Model and evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier
import xgboost as xgb

# Optuna
import optuna
from optuna.visualization import plot_param_importances, plot_parallel_coordinate
from optuna.pruners import PercentilePruner

# Hide warnings (makes optuna output easier to parse)
import warnings
warnings.filterwarnings('ignore')

# Preparing the Data

1. Load data with `datatable` and convert to `pandas`
2. Reduce memory usage by downcasting datatypes
3. Get holdout set from training data using a stratified scheme

In [None]:
# Helper function for downcasting
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Columns whose dtype name starts with ``int`` are downcast with
    ``pd.to_numeric(..., downcast='integer')``, ``bool`` columns are cast to
    ``int8`` and columns whose dtype name starts with ``float`` are downcast
    with ``pd.to_numeric(..., downcast='float')``. All other columns are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    # Series.iteritems() was removed in pandas 2.0; items() exists in old and new versions.
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype.name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Avoid dividing by zero if the frame reported no memory to begin with.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
%%time

# Read the training CSV with datatable, convert to pandas and downcast it
train = reduce_memory_usage(
    dt.fread(r'../input/tabular-playground-series-oct-2021/train.csv').to_pandas()
)

# Split off a stratified holdout half for testing our models
train, holdout = train_test_split(
    train,
    test_size=0.5,
    shuffle=True,
    stratify=train['target'],
    random_state=RANDOM_SEED,
)

# Give both halves a fresh 0..n-1 index (the old index is dropped)
train = train.reset_index(drop=True)
holdout = holdout.reset_index(drop=True)

# Feature columns: everything except the row id and the label
features = [col for col in train.columns if col not in ('id', 'target')]

# XGBoost

We create a function to train an XGBoost model and return the holdout AUC.

## 1. Default Parameters

In [None]:
# Baseline XGBoost settings shared by ALL models in this notebook.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' need a GPU-enabled XGBoost; newer
# XGBoost releases deprecate them in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
default_params = {
    'random_state': RANDOM_SEED,
    'n_estimators': MAX_TREES,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
}

## 2. Scoring Function

* `model_params` - parameters passed to `XGBClassifier`
* `fit_params` - parameters passed to the `fit` method

In [None]:
def score_xgboost(trial = None, model_params = None, fit_params = None):
    """Cross-validate one XGBoost configuration and return its holdout AUC.

    A fresh ``XGBClassifier`` is trained on each of the ``NUM_FOLDS`` stratified
    folds of the global ``train`` frame, with early stopping evaluated on that
    fold's validation split. Every fold model also predicts the global
    ``holdout`` frame; those predictions are averaged and scored once at the end.

    Parameters
    ----------
    trial : optuna.trial.Trial, optional
        When given, each fold's validation AUC is reported to the trial
        (step = fold index) and the trial is pruned if ``should_prune()`` says so.
    model_params : dict, optional
        Keyword arguments for ``XGBClassifier``. They are merged over
        ``default_params``, so a key present in both takes the caller's value.
    fit_params : dict, optional
        Extra keyword arguments for ``fit``. They are merged over this
        function's defaults (``verbose``, ``eval_metric``,
        ``early_stopping_rounds``), so the caller's value wins on a clash.

    Returns
    -------
    float
        ROC AUC of the fold-averaged predictions on ``holdout``.

    Raises
    ------
    optuna.TrialPruned
        If ``trial`` is given and ``trial.should_prune()`` is true after a fold.
    """
    # None sentinels replace the mutable ``{}`` defaults. Merging into one dict
    # (instead of ``**a, **b``) avoids a TypeError when a key appears in both.
    model_kwargs = {**default_params, **(model_params or {})}
    # NOTE(review): recent XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than on fit() - confirm
    # against the installed version.
    fit_kwargs = {
        'verbose': False,
        'eval_metric': "logloss",
        'early_stopping_rounds': EARLY_STOP,
        **(fit_params or {}),
    }

    # Store the holdout predictions (averaged over the fold models)
    holdout_preds = np.zeros((holdout.shape[0],))

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['target'])):

        # Training and Validation Sets
        start = time.time()
        X_train, y_train = train[features].iloc[train_idx], train['target'].iloc[train_idx]
        X_valid, y_valid = train[features].iloc[valid_idx], train['target'].iloc[valid_idx]

        # Define Model
        model = XGBClassifier(**model_kwargs)
        gc.collect()

        model.fit(
            X_train, y_train,
            eval_set = [(X_valid, y_valid)],
            **fit_kwargs
        )

        # validation/holdout predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        holdout_preds += model.predict_proba(holdout[features])[:, 1] / NUM_FOLDS
        valid_auc = roc_auc_score(y_valid, valid_preds)
        end = time.time()

        print(f'Fold {fold} AUC: {round(valid_auc, 6)} in {round((end-start) / 60, 2)} minutes.')

        # Short pause kept from the original; presumably lets the fold line
        # appear before Optuna's own log output - TODO confirm.
        time.sleep(0.5)
        if trial is not None:
            # Use pruning on fold AUC
            trial.report(
                value = valid_auc,
                step = fold
            )
            # The pruner only sees the reported fold AUC; elapsed time is not a factor
            if trial.should_prune():
                raise optuna.TrialPruned()

    return roc_auc_score(holdout['target'], holdout_preds)

# Hyperparameter Search

To tweak the pruner consider adding/adjusting the following keyword arguments:

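* `percentile` - percentile cutoff (0-100): a trial is pruned when its value at a given step falls outside the top `percentile`% of trials at that step
* `n_startup_trials` - number of trials (parameter sets evaluated) that must finish before pruning starts
* `n_warmup_steps` - number of steps (here, folds) within a trial before pruning checks begin
* `interval_steps` - number of steps (here, folds) between pruning checks
* `n_min_trials` - skip the pruning check if fewer than this many trials have reported a value at the current step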

In [None]:
# Percentile Pruner settings
# A "step" is one CV fold: score_xgboost reports the fold AUC with step = fold index.
pruner = PercentilePruner(
    percentile = 66,        # per the Optuna docs, the top `percentile`% of trials at a step are kept
    n_startup_trials = 5,   # no pruning until this many trials have finished
    n_warmup_steps = 0,     # pruning checks may start from the first fold
    interval_steps = 1,     # check after every reported fold
    n_min_trials = 5,       # need at least this many reported values at a step before pruning
)

In [None]:
def parameter_search(trials):
    """Run an Optuna study over XGBoost hyperparameters and return it.

    Each trial samples one set of model parameters and is scored with
    ``score_xgboost``; the study maximizes the value that function returns.
    Two hand-picked parameter sets are enqueued so they are evaluated first.
    The study uses the module-level ``pruner`` and a TPE sampler seeded with
    ``RANDOM_SEED``.

    Parameters
    ----------
    trials : int
        Number of trials passed to ``study.optimize`` (enqueued trials count
        towards it).

    Returns
    -------
    optuna.study.Study
        The finished study.
    """

    # Optuna objective function
    def objective(trial):
        # suggest_float(log=True) / suggest_float(step=...) are the current
        # spellings of the deprecated suggest_loguniform / suggest_discrete_uniform.
        model_params = dict(
            # default 6
            max_depth = trial.suggest_int(
                "max_depth", 2, 12
            ),
            # default 0.3
            learning_rate = trial.suggest_float(
                "learning_rate", 0.01, 0.3, log = True
            ),
            # default 0
            gamma = trial.suggest_float(
                "gamma", 1e-10, 100, log = True
            ),
            # default 1
            min_child_weight = trial.suggest_float(
                "min_child_weight", 1e-2, 1e2, log = True
            ),
            # default 1
            subsample = trial.suggest_float(
                "subsample", 0.2, 1.0, step = 0.01
            ),
            # default 1
            colsample_bytree = trial.suggest_float(
                "colsample_bytree", 0.2, 1.0, step = 0.01
            ),
            # default 1
            colsample_bylevel = trial.suggest_float(
                "colsample_bylevel", 0.2, 1.0, step = 0.01
            ),
            # default 1
            reg_lambda = trial.suggest_float(
                "reg_lambda", 1e-10, 100, log = True
            ),
            # default 0
            reg_alpha = trial.suggest_float(
                "reg_alpha", 1e-10, 100, log = True
            ),
        )

        return score_xgboost(trial, model_params)


    # NOTE: this changes Optuna's logging verbosity for the whole session
    optuna.logging.set_verbosity(optuna.logging.DEBUG)
    study = optuna.create_study(
        # Seed the sampler so the sequence of suggested parameters is repeatable,
        # matching the other uses of RANDOM_SEED in this notebook
        sampler = optuna.samplers.TPESampler(seed = RANDOM_SEED),
        pruner = pruner,
        direction = "maximize",
    )

    # (nearly) defaults: gamma / reg_alpha use the lower bound of their log ranges
    study.enqueue_trial({
        "max_depth": 6,
        'learning_rate': 0.3,
        'gamma': 1e-10,
        'min_child_weight': 1.0,
        'subsample': 1.0,
        'colsample_bytree': 1.0,
        'colsample_bylevel': 1.0,
        'reg_alpha': 1e-10,
        'reg_lambda': 1.0,
    })
    # high auc from previous run
    study.enqueue_trial({
        'max_depth': 4,
        'learning_rate': 0.010283092300598066,
        'gamma': 0.03506917176837801,
        'min_child_weight': 0.3878531236460043,
        'subsample': 0.8900000000000001,
        'colsample_bytree': 0.69,
        'colsample_bylevel': 0.24000000000000002,
        'reg_lambda': 5.051637651463356e-07,
        'reg_alpha': 30.170712609605435
    })
    study.optimize(objective, n_trials=trials)
    return study

In [None]:
# Hide output
# Runs the whole search: NUM_TRIALS trials, each training up to NUM_FOLDS models
# (fewer when the trial is pruned).
study = parameter_search(NUM_TRIALS)

# Evaluation

## 1. Best Parameters

In [None]:
# Parameters of the trial with the highest objective value (the holdout AUC returned by score_xgboost)
print("Best Parameters:", study.best_params)

## 2. Parameter Importances

In [None]:
# Bare expression on purpose: the returned figure is rendered as the cell output
plot_param_importances(study)

## 3. Parallel Coordinate Plot

Click and drag along the vertical axes to see how certain parameter ranges affected the scores.

In [None]:
# Likely broken on GitHub, view on Kaggle for interactive version
# Bare expression on purpose: the returned figure is rendered as the cell output
plot_parallel_coordinate(study)

# Make Submission

In [None]:
%%time
# Re-read the FULL training set. This rebinds the global `train`, which until
# now held only the non-holdout half produced by train_test_split.
# NOTE(review): re-running score_xgboost after this cell would train on rows
# that are also in `holdout`; restart the kernel before repeating the search.
train = dt.fread(r'../input/tabular-playground-series-oct-2021/train.csv').to_pandas()
test = dt.fread(r'../input/tabular-playground-series-oct-2021/test.csv').to_pandas()
submission = dt.fread(r'../input/tabular-playground-series-oct-2021/sample_submission.csv').to_pandas()

# Downcast both frames before training the submission models, then free what we can
train = reduce_memory_usage(train)
test = reduce_memory_usage(test)
gc.collect()

In [None]:
# Similar to scoring function but trains on full data and predicts on test
def train_xgboost(folds, model_params = None, fit_params = None):
    """Train one XGBoost model per CV fold and return averaged test predictions.

    The global ``train`` frame is split into ``folds`` stratified folds. For
    each fold a fresh ``XGBClassifier`` is fitted with early stopping on the
    fold's validation split, its validation AUC is printed, and its predicted
    probabilities for the global ``test`` frame are added to a running average.

    Parameters
    ----------
    folds : int
        Number of stratified folds (and therefore models) to train.
    model_params : dict, optional
        Keyword arguments for ``XGBClassifier``. They are merged over
        ``default_params``, so a key present in both takes the caller's value.
    fit_params : dict, optional
        Extra keyword arguments for ``fit``. They are merged over this
        function's defaults (``verbose``, ``eval_metric``,
        ``early_stopping_rounds``), so the caller's value wins on a clash.

    Returns
    -------
    numpy.ndarray
        One averaged positive-class probability per row of ``test``.
    """
    # None sentinels replace the mutable ``{}`` defaults. Merging into one dict
    # (instead of ``**a, **b``) avoids a TypeError when a key appears in both.
    model_kwargs = {**default_params, **(model_params or {})}
    # NOTE(review): recent XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than on fit() - confirm
    # against the installed version.
    fit_kwargs = {
        'verbose': False,
        'eval_metric': "logloss",
        'early_stopping_rounds': EARLY_STOP,
        **(fit_params or {}),
    }

    # Store the test predictions (averaged over the fold models)
    test_preds = np.zeros((test.shape[0],))
    print('')

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = folds, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['target'])):

        # Training and Validation Sets
        start = time.time()
        X_train, y_train = train[features].iloc[train_idx], train['target'].iloc[train_idx]
        X_valid, y_valid = train[features].iloc[valid_idx], train['target'].iloc[valid_idx]

        # Define Model
        model = XGBClassifier(**model_kwargs)
        gc.collect()

        model.fit(
            X_train, y_train,
            eval_set = [(X_valid, y_valid)],
            **fit_kwargs
        )

        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(test[features])[:, 1] / folds

        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} AUC: {round(fold_auc, 6)} in {round((end-start) / 60, 2)} minutes.')

    return test_preds

In [None]:
# Make submission
# 6-fold CV on the full training set using the study's best parameters; the
# 'target' column receives the test predictions averaged over the 6 fold models.
submission['target'] = train_xgboost(6, model_params = study.best_params)
submission.to_csv('xgboost_submission.csv', index=False)

I hope you found this notebook useful. Feel free to fork it and adapt it for your own use.