In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

One nice feature of XGBoost is early stopping. However, this functionality does not integrate well with scikit-learn's API, so I wrote a custom function that performs grid search and early stopping at the same time.

By the way, if you choose not to specify `early_stopping_rounds`, this custom function reduces to an ordinary grid search.

The logic and implementation of this custom function might not be ideal and any suggestion is welcome.

In [None]:
from copy import deepcopy
from itertools import product
from collections import defaultdict

from xgboost import XGBRegressor

In [None]:
def GridSearchCV_XGB_early_stoppping(param_grid, fit_params, scorer, cv, X, y, estimator_cls=None):
    """Grid-search XGBoost parameters with cross-validation and optional early stopping.

    Every combination in ``param_grid`` is fitted once per CV fold. The held-out
    fold is passed to ``fit`` as ``eval_set`` (so early stopping, when enabled,
    monitors it) and is also the data the ``scorer`` is evaluated on; the
    cross-validated score is therefore somewhat optimistic when early stopping
    is active. The combination with the highest mean score wins.

    The returned ``n_estimators`` is the mean, over folds, of the number of
    boosting rounds actually kept for the winning combination. With early
    stopping this is ``best_iteration + 1`` because ``best_iteration`` is a
    0-based round index; without it, it is ``get_num_boosting_rounds()``.

    NOTE(review): ``fit_params`` is forwarded to ``fit`` unchanged. Newer XGBoost
    releases expect ``early_stopping_rounds`` / ``eval_metric`` on the estimator
    constructor instead -- confirm against the installed version.

    Args:
        param_grid (dict): Maps parameter name to an iterable of candidate values.
            An empty dict evaluates a single model built with default parameters.
        fit_params (dict): Keyword arguments forwarded to ``fit``. Any ``eval_set``
            entry is overwritten per fold. Early stopping is considered active
            only when ``early_stopping_rounds`` is present and not ``None``.
            ``None`` is treated as an empty dict. The dict is never mutated.
        scorer (callable): Called as ``scorer(estimator, X, y)``; higher is better
            (e.g. the result of ``sklearn.metrics.make_scorer``).
        cv: Object exposing ``split(X, y)`` that yields ``(train_index, test_index)``.
        X (DataFrame or ndarray): The input data matrix.
        y (Series or ndarray): The ground truth label.
        estimator_cls (type, optional): Estimator class to instantiate for each
            candidate. Defaults to ``XGBRegressor``.

    Returns:
        dict: The best set of parameters found, with ``n_estimators`` replaced by
        the averaged number of boosting rounds.

    Raises:
        ValueError: If ``cv`` yields no splits.
    """
    if estimator_cls is None:
        estimator_cls = XGBRegressor
    if fit_params is None:
        fit_params = {}

    # Positional indexing with the CV indices below requires plain arrays.
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    # Built separately (rather than unpacking zip(*items)) so an empty grid is valid.
    param_names = tuple(param_grid)
    param_values = tuple(param_grid[name] for name in param_names)

    # An explicit ``early_stopping_rounds=None`` means "disabled", same as omitting it.
    uses_early_stopping = fit_params.get('early_stopping_rounds') is not None

    cv_best_iterations = defaultdict(list)
    cv_results = defaultdict(list)

    for train_index, test_index in cv.split(X, y):
        X_in, X_out = X[train_index], X[test_index]
        y_in, y_out = y[train_index], y[test_index]

        # Copy so the caller's dict is not modified when eval_set is injected.
        fit_params_cv = deepcopy(fit_params)
        fit_params_cv['eval_set'] = [(X_out, y_out)]

        for value_combination in product(*param_values):
            # A tuple of (name, value) pairs is hashable, so it can key the result dicts.
            candidate = tuple(zip(param_names, value_combination))
            model = estimator_cls(**dict(candidate))
            model.fit(X_in, y_in, **fit_params_cv)

            if uses_early_stopping:
                # best_iteration is a 0-based index; the number of trees is one more.
                n_rounds = model.best_iteration + 1
            else:
                n_rounds = model.get_num_boosting_rounds()
            cv_best_iterations[candidate].append(n_rounds)

            cv_results[candidate].append(scorer(model, X_out, y_out))

    if not cv_results:
        raise ValueError("cv.split(X, y) produced no train/test splits")

    best_candidate, score_list = max(cv_results.items(), key=lambda item: np.array(item[1]).mean())

    # Note that our XGBoost model may stop early,
    # so we calculate the mean of the actual number of estimators in each fold,
    # in place of the originally planned n_estimators after finishing cross validation.
    n_estimators = int(round(np.array(cv_best_iterations[best_candidate]).mean()))

    best_params_xgb = dict(best_candidate)
    best_params_xgb['n_estimators'] = n_estimators

    print ("Best score: {:.3f}".format(np.array(score_list).mean()))
    print ("Best Parameters: {}".format(best_params_xgb))

    return best_params_xgb

Let's try it on a real [dataset](https://www.kaggle.com/c/home-data-for-ml-course).

In [None]:
from sklearn.model_selection import train_test_split

seed = 0

# Read the data; for simplicity drop every object-dtype (categorical) column,
# then discard rows whose target value is missing.
X = (
    pd.read_csv('../input/house-prices-data/train.csv', index_col='Id')
    .select_dtypes(exclude=['object'])
    .dropna(axis=0, subset=['SalePrice'])
)

# Separate the target from the predictors.
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Hold out 20% of the rows as a validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

In [None]:
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import KFold

# Candidate values for each model parameter; only n_estimators is actually varied.
param_grid = dict(
    objective=['reg:squarederror'],
    n_estimators=[200, 500, 1000, 1500, 2000],
    max_depth=[2],
    learning_rate=[0.1],
    random_state=[seed],
)

# Keyword arguments forwarded to fit() for every candidate.
fit_params = dict(eval_metric="rmse", early_stopping_rounds=100, verbose=False)

# greater_is_better=False makes the scorer return negated MAE, so larger is better.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

best_params_xgb = GridSearchCV_XGB_early_stoppping(
    param_grid, fit_params, scorer, kf, X_train, y_train
)

# Refit on the whole training split with the selected parameters (no early stopping here),
# then score on the held-out validation split.
best_xgb = XGBRegressor(**best_params_xgb)
best_xgb.fit(X_train, y_train, eval_metric=fit_params['eval_metric'], verbose=False)
best_score = scorer(best_xgb, X_valid, y_valid)
print("The best score for XGBoost on validation set is {:.3f}".format(best_score))

# Persist the chosen parameters so they can be reloaded later.
np.save('best_params_xgb.npy', best_params_xgb)

In [None]:
# The previously found parameters can be read back from disk to rebuild the model.
# np.save stored the dict as a pickled 0-d object array, hence allow_pickle and .item().
saved_params = np.load('best_params_xgb.npy', allow_pickle=True).item()
final_xgb = XGBRegressor(**saved_params)

This notebook ends here. If you found it helpful, please consider giving it an upvote. You are also welcome to check out my other notebooks.