# CatBoost Hyperparameter Search

In this notebook we use Optuna to perform a hyperparameter search on a CatBoost model with a custom [pruner](https://optuna.readthedocs.io/en/stable/reference/pruners.html). Unfortunately, CatBoost does not yet have a built-in [integration](https://optuna.readthedocs.io/en/stable/reference/integration.html) like LightGBM and XGBoost.

We check each set of parameters using k-fold cross-validation. After each fold, our pruner compares the validation AUC to the AUCs that previously seen models reached on the same fold; if the current validation AUC falls below the pruner's percentile threshold (i.e. it sits in the lower-performing portion of seen models), we exit the trial early (prune), thus saving some time by not training as many unpromising models.

**Note:** This notebook will take several hours to run. To shorten the runtime adjust `NUM_TRIALS` below.

In [None]:
# Global variables for testing changes to this notebook quickly
# Seed shared by the holdout split, the k-fold splitters and CatBoost itself
RANDOM_SEED = 0
# Number of cross-validation folds used while scoring each trial
NUM_FOLDS = 3
# Upper bound on boosting rounds (passed to CatBoost as `n_estimators`)
MAX_TREES = 20000
# Passed to CatBoost as `early_stopping_rounds`
EARLY_STOP = 150
# Number of Optuna trials to run; lower this to shorten the runtime
NUM_TRIALS = 50

In [None]:
# General imports
import numpy as np
import pandas as pd
import datatable as dt
import time
import gc

# Model and evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from catboost import CatBoostClassifier
import catboost

# Optuna
import optuna
from optuna.visualization import plot_param_importances, plot_parallel_coordinate
from optuna.pruners import PercentilePruner

# Hide warnings (makes optuna output easier to parse)
import warnings
warnings.filterwarnings('ignore')

# Preparing the Data

1. Load data with `datatable` and convert to `pandas`
2. Reduce memory usage by downcasting datatypes
3. Get holdout set from training data using a stratified scheme
4. Save categorical features

In [None]:
# Helper function for downcasting 
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to shrink its memory footprint.

    Integer columns are downcast with ``pd.to_numeric(downcast='integer')``,
    boolean columns are converted to ``int8`` and float columns are downcast
    with ``pd.to_numeric(downcast='float')``. Columns of any other dtype are
    left untouched.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: When true, print the final size in Mb and the percentage
            saved relative to the starting size.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    # `Series.iteritems()` was removed in pandas 2.0; `items()` is the
    # equivalent that exists in both old and new pandas versions.
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype.name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes prints 0% rather than nan
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
%%time

# Read the training CSV with datatable, convert to pandas and downcast dtypes
train = reduce_memory_usage(
    dt.fread(r'../input/tabular-playground-series-oct-2021/train.csv').to_pandas()
)

# Stratified 50/50 split: one half is used for cross-validation, the other
# half is held out for scoring the models
train, holdout = train_test_split(
    train,
    test_size = 0.5,
    shuffle = True,
    stratify = train['target'],
    random_state = RANDOM_SEED,
)

# Give both halves a fresh 0..n-1 index
train = train.reset_index(drop = True)
holdout = holdout.reset_index(drop = True)

# Every column except the row id and the label is a model feature
features = [col for col in train.columns if col not in ['id', 'target']]

# Positions (within `features`) of the integer-typed columns, which are
# passed to CatBoost as categorical features
categorical_features = [
    pos
    for pos, col in enumerate(features)
    if train[col].dtype.name.startswith("int")
]

# CatBoost

We create a function to train a CatBoost model and return the holdout AUC.

## 1. Default Parameters

* `Bernoulli` bootstrap
* `Plain` boosting type

In [None]:
# Baseline CatBoost settings shared by every model in this notebook;
# per-trial parameters are merged on top of these
default_params = {
    'random_state': RANDOM_SEED,
    'n_estimators': MAX_TREES,
    'early_stopping_rounds': EARLY_STOP,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'eval_metric': 'Logloss',
    'task_type': 'GPU',
}

## 2. Scoring Function

We define a scoring function which performs cross-validation on the training set and predicts on the holdout set. We prune based on the cross-validation fold scores and evaluate using the holdout score.

* `trial` - optuna trial object passed if used as part of an optuna trial
* `model_params` - parameters passed to `CatBoostClassifier`
* `fit_params` - parameters passed to the `fit` method.

In [None]:
def score_catboost(trial = None, model_params = None, fit_params = None, max_fold_seconds = 480):
    """Cross-validate a CatBoost configuration and score it on the holdout set.

    A fresh ``CatBoostClassifier`` is trained on each stratified fold of the
    global ``train`` frame. Each fold's model predicts the global ``holdout``
    frame, and the fold predictions are averaged before computing the AUC.

    Args:
        trial: Optional Optuna trial. When given, each fold's validation AUC
            is reported to it and the trial may be pruned.
        model_params: Parameters merged on top of ``default_params`` and
            passed to ``CatBoostClassifier``. ``None`` means no overrides.
        fit_params: Extra keyword arguments forwarded to ``fit``. ``None``
            means no extras.
        max_fold_seconds: A trial is pruned when a single fold (training plus
            prediction) takes longer than this many seconds.

    Returns:
        ROC AUC of the fold-averaged predictions on the holdout set.

    Raises:
        optuna.TrialPruned: If ``trial`` is given and either the pruner
            rejects the reported fold AUC or the fold exceeded
            ``max_fold_seconds``.
    """
    # Avoid mutable default arguments; None stands for "no extra parameters"
    model_params = {} if model_params is None else model_params
    fit_params = {} if fit_params is None else fit_params

    # Column selections do not depend on the fold, so build them once
    X_all, y_all = train[features], train['target']
    X_holdout = holdout[features]

    # Running average of the holdout predictions across folds
    holdout_preds = np.zeros((holdout.shape[0],))

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, y_all)):

        # Training and Validation Sets
        X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
        X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

        start = time.time()

        # Define Model (trial parameters override the shared defaults)
        model = CatBoostClassifier(**{**default_params, **model_params})
        gc.collect()

        model.fit(
            X_train, y_train,
            verbose = False,
            eval_set = [(X_valid, y_valid)],
            cat_features = categorical_features,
            use_best_model = True,
            **fit_params
        )

        # validation/holdout predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        holdout_preds += model.predict_proba(X_holdout)[:, 1] / NUM_FOLDS
        valid_auc = roc_auc_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} AUC: {round(valid_auc, 6)} in {round((end-start) / 60, 2)} minutes.')

        if trial is not None:
            # Use pruning on fold AUC; the fold index is the pruning step
            trial.report(
                value = valid_auc,
                step = fold
            )
            # prune slow trials and bad fold AUCs
            if trial.should_prune() or round(end - start, 1) > max_fold_seconds:
                raise optuna.TrialPruned()

        # Short pause between folds, kept from the original notebook
        time.sleep(0.5)

    return roc_auc_score(holdout['target'], holdout_preds)

## 3. Pruning

There's no built-in integration for CatBoost, but we can still prune based on the fold AUC, which should still save a decent amount of time.

* `n_startup_trials` - number of trials (models trained) before pruning starts
* `n_warmup_steps` - number of steps (here, folds) within a trial before pruning checks begin
* `interval_steps` - number of steps (here, folds) between pruning checks
* `n_min_trials` - skip pruning check if too few trials

In [None]:
# Tweak Pruner settings
# NOTE: Optuna expects `percentile` on a 0-100 scale (e.g. 25.0 keeps the top
# 25% of trials). The previous value of 0.66 therefore kept only the top
# 0.66% and pruned almost every trial; 66.0 keeps the top 66% at each step.
pruner = PercentilePruner(
    percentile = 66.0,
    n_startup_trials = 5,
    n_warmup_steps = 0,
    interval_steps = 1,
    n_min_trials = 5,
)

# Hyperparameter Search

In [None]:
def parameter_search(trials):
    """Run an Optuna study over CatBoost hyperparameters.

    Each trial samples a parameter set and scores it with ``score_catboost``,
    which may prune the trial early. The study uses the module-level
    ``pruner`` and maximizes the value returned by ``score_catboost``. A
    near-default configuration is enqueued before optimization starts.

    Args:
        trials: Number of trials, passed to ``study.optimize`` as ``n_trials``.

    Returns:
        The ``optuna`` study object after optimization has finished.
    """

    # Optuna objective function
    def objective(trial):
        # `suggest_float` replaces the deprecated `suggest_loguniform`,
        # `suggest_uniform` and `suggest_discrete_uniform` with the same ranges.
        model_params = dict(
            # enqueued baseline uses 6
            max_depth = trial.suggest_int(
                "max_depth", 2, 8
            ),
            # default 0.03
            learning_rate = trial.suggest_float(
                "learning_rate", 0.009, 0.03, log = True
            ),
            # enqueued baseline uses 1
            # NOTE(review): CatBoost documents min_data_in_leaf as applying only
            # to the Lossguide/Depthwise grow policies - confirm it has an
            # effect with the grow policy used here.
            min_child_samples = trial.suggest_int(
                "min_child_samples", 1, 20000
            ),
            # enqueued baseline uses 1
            random_strength = trial.suggest_float(
                "random_strength", 1, 100
            ),
            # enqueued baseline uses 10
            leaf_estimation_iterations = trial.suggest_int(
                "leaf_estimation_iterations", 1, 20
            ),
            # sampled on a 0.001 grid; enqueued baseline uses 1.0
            subsample = trial.suggest_float(
                'subsample', 0.2, 1.0, step = 0.001
            ),
            # default 3.0
            reg_lambda = trial.suggest_float(
                'reg_lambda', 1e-10, 100, log = True
            ),
        )

        return score_catboost(trial, model_params = model_params)

    optuna.logging.set_verbosity(optuna.logging.DEBUG)
    study = optuna.create_study(pruner = pruner, direction = "maximize")
    # (nearly) default configuration, queued so it is evaluated as a baseline
    study.enqueue_trial({
        'max_depth': 6,
        'learning_rate': 0.0125730000436306,
        'min_child_samples': 1,
        'random_strength': 1,
        'leaf_estimation_iterations': 10,
        'subsample': 1.0,
        'reg_lambda': 3,
    })
    study.optimize(objective, n_trials=trials)
    return study

In [None]:
# Run the hyperparameter search for NUM_TRIALS trials and keep the study for evaluation
study = parameter_search(NUM_TRIALS)

# Evaluation

## 1. Best Parameters

In [None]:
# Parameters of the best trial; the study maximizes the holdout AUC returned by score_catboost
print("Best Parameters:", study.best_params)

## 2. Parameter Importances

In [None]:
# Visualize Optuna's estimate of how much each hyperparameter influenced the objective
plot_param_importances(study)

## 3. Parallel Coordinate Plot

In [None]:
# Visualize the sampled parameter combinations of the study's trials against the objective value
plot_parallel_coordinate(study)

# Make Submission

In [None]:
%%time
# Drop the search-time train/holdout halves before loading the full data
del train, holdout
gc.collect()

# Full training set and test set, both downcast to save memory
train = reduce_memory_usage(
    dt.fread(r'../input/tabular-playground-series-oct-2021/train.csv').to_pandas()
)
test = reduce_memory_usage(
    dt.fread(r'../input/tabular-playground-series-oct-2021/test.csv').to_pandas()
)

# Submission template whose `target` column is filled in later
submission = dt.fread(r'../input/tabular-playground-series-oct-2021/sample_submission.csv').to_pandas()
gc.collect()

In [None]:
# Similar to scoring function but trains on full data and predicts on test
def train_catboost(folds, model_params = None):
    """Train CatBoost with k-fold cross-validation and predict the test set.

    A fresh ``CatBoostClassifier`` is trained on each stratified fold of the
    global ``train`` frame, using the fold's validation part as the eval set.
    Each fold's model predicts the global ``test`` frame and the predictions
    are averaged across folds.

    Args:
        folds: Number of stratified folds (and therefore models) to train.
        model_params: Parameters merged on top of ``default_params`` and
            passed to ``CatBoostClassifier``. ``None`` means no overrides.

    Returns:
        1-D numpy array with the fold-averaged positive-class probability
        for each row of ``test``.
    """
    # Avoid a mutable default argument; None stands for "no extra parameters"
    model_params = {} if model_params is None else model_params

    # Column selections do not depend on the fold, so build them once
    X_all, y_all = train[features], train['target']
    X_test = test[features]

    # Running average of the test predictions across folds
    test_preds = np.zeros((test.shape[0],))
    print('')

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = folds, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, y_all)):

        # Training and Validation Sets
        start = time.time()
        X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
        X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

        # Define Model (supplied parameters override the shared defaults)
        model = CatBoostClassifier(**{**default_params, **model_params})
        gc.collect()

        model.fit(
            X_train, y_train,
            verbose = False,
            eval_set = [(X_valid, y_valid)],
            cat_features = categorical_features,
            use_best_model = True,
        )

        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / folds

        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} AUC: {round(fold_auc, 6)} in {round((end-start) / 60, 2)} minutes.')

    return test_preds

In [None]:
# Make submission
# Train 6 folds on the full training data with the best parameters from the
# search and store the fold-averaged test predictions as the target column
submission['target'] = train_catboost(6, study.best_params)
# Write the submission file without the DataFrame index
submission.to_csv('catboost_submission.csv', index=False)



Hope you found this notebook useful. Feel free to fork it and adapt it to your own uses.