ENH Adds Categorical Support to Histogram Gradient Boosting #16909

Closed
wants to merge 89 commits

Changes shown below are from the first 7 commits.

Commits (89)
02d89d7
ENH Adds categorical support
thomasjpfan Apr 13, 2020
8472f60
DOC Improves english
thomasjpfan Apr 13, 2020
1198340
REV Less diffs
thomasjpfan Apr 13, 2020
63f56fd
DOC Adds comment
thomasjpfan Apr 13, 2020
f34087e
DOC Adds performance comment
thomasjpfan Apr 13, 2020
0b2ed9c
DOC Adds performance comment
thomasjpfan Apr 13, 2020
5eaf099
STY Fix
thomasjpfan Apr 13, 2020
43822ab
ENH Much faster bin mapping when transforming categories
thomasjpfan Apr 13, 2020
0d6012a
CLN Removes uneeded code
thomasjpfan Apr 13, 2020
b22151f
BUG Code actually is being used
thomasjpfan Apr 13, 2020
8432bac
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan Apr 29, 2020
ae9be56
CLN Address comments
thomasjpfan Apr 30, 2020
d0557a5
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan May 4, 2020
7692325
WIP Address more comments
thomasjpfan May 5, 2020
590d95f
CLN Address comments
thomasjpfan May 6, 2020
95e79f2
CLN Address comments
thomasjpfan May 6, 2020
e62479b
STY Linting
thomasjpfan May 6, 2020
9086fad
ENH Adds new method to binner
thomasjpfan May 7, 2020
3e323b2
CLN Binner refactor once again
thomasjpfan May 7, 2020
197fac0
CLN Address comments
thomasjpfan May 7, 2020
63af0d5
CLN More comments lol
thomasjpfan May 7, 2020
7ef6a8d
CLN Adds test for predict
thomasjpfan May 7, 2020
e6a03c6
ENH Adds categorical indicies support
thomasjpfan May 7, 2020
eabcfae
ENH Fix qsort
thomasjpfan May 7, 2020
2abe579
CLN Move missing_go_left code into grower
thomasjpfan May 7, 2020
9a5a3f4
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan May 8, 2020
ebb68e5
BUG Fix
thomasjpfan May 8, 2020
470c146
DOC More comments
thomasjpfan May 8, 2020
0fc4c24
DOC Update failing example
thomasjpfan May 8, 2020
cebd6c0
BUG Fixes
thomasjpfan May 8, 2020
d1478ba
DOC Update comment
thomasjpfan May 8, 2020
ba00644
BUG Fix
thomasjpfan May 8, 2020
1806c2b
BUG Fix
thomasjpfan May 8, 2020
95919e3
DOC Fix
thomasjpfan May 8, 2020
38966d5
WIP Try 32 bit
thomasjpfan May 8, 2020
c4869ba
WIP Fix bug
thomasjpfan May 8, 2020
f63ad6a
WIP Fix bug
thomasjpfan May 8, 2020
26d0796
WIP Fix bug
thomasjpfan May 8, 2020
17afb0f
WIP Fix bug
thomasjpfan May 8, 2020
5246cc1
REV Revert
thomasjpfan May 8, 2020
60523a3
DOC Fix
thomasjpfan May 8, 2020
b014d6e
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan May 25, 2020
96d0687
CLN Address comments
thomasjpfan May 28, 2020
dc0a3a4
WIP Updates binning
thomasjpfan May 28, 2020
e10b346
WIP Address more comments
thomasjpfan May 28, 2020
af58498
DOC Fix
thomasjpfan May 29, 2020
3c2f672
WIP moving to a predictor for bitset
thomasjpfan May 30, 2020
c8f31f9
ENH Fix binning tests
thomasjpfan May 30, 2020
fe16b42
ENH Removes binning in predict
thomasjpfan May 30, 2020
cf5bb6d
WIP Do not look still iterating lol
thomasjpfan May 31, 2020
3d9e449
WIP Do not look still iterating lol
thomasjpfan May 31, 2020
3615dc2
WIP Do not look still iterating lol
thomasjpfan May 31, 2020
6608715
WIP
thomasjpfan Jun 1, 2020
2c384e6
WIP Adds unknown category encoding
thomasjpfan Jun 1, 2020
8c6e985
WIP Removes binning during predict
thomasjpfan Jun 1, 2020
2357ae9
STY Update
thomasjpfan Jun 1, 2020
52048af
CLN Clean up commets
thomasjpfan Jun 1, 2020
f70416e
DOC Improve doc
thomasjpfan Jun 1, 2020
a4159cf
BUG Fix test
thomasjpfan Jun 1, 2020
a398786
ENH Only go in one direction when finding best split
thomasjpfan Jun 1, 2020
8ea46cc
ENH Do not include bitset if the split is not categorical
thomasjpfan Jun 1, 2020
3dcbd31
Fix
thomasjpfan Jun 2, 2020
c3b5eef
WIP Test seg fault
thomasjpfan Jun 2, 2020
280784a
BUG Fix
thomasjpfan Jun 2, 2020
019de8a
DOC Update doc
thomasjpfan Jun 5, 2020
24d0711
DOC Address comments
thomasjpfan Jun 5, 2020
2d0e79d
ENH Enables openmp
thomasjpfan Jun 5, 2020
966379c
BUG Fix
thomasjpfan Jun 5, 2020
1c920f7
Some comments
NicolasHug Jul 18, 2020
3966432
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ca…
NicolasHug Jul 20, 2020
6c1af62
pep8
NicolasHug Jul 20, 2020
f535c33
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan Jul 31, 2020
2afca55
CLN Apply suggestions
thomasjpfan Aug 1, 2020
9b44d82
CLN More comments
thomasjpfan Aug 8, 2020
9f3fa46
categorical => categorical_features
ogrisel Aug 10, 2020
1054754
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ca…
NicolasHug Aug 17, 2020
bbae955
Added grower test for OHE equivalent
NicolasHug Aug 17, 2020
c003b76
ENH Change sorting ordre to match lightgbm
thomasjpfan Aug 24, 2020
40c3f9b
CLN Less than equal
thomasjpfan Aug 24, 2020
bb0e899
CLN Adds splitting in both directions
thomasjpfan Aug 24, 2020
f47da15
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan Aug 24, 2020
bb5877d
CLN Fixes merge conflicts
thomasjpfan Aug 25, 2020
c3061b5
ENH Uses mask instead of pandas features in benchmark
thomasjpfan Aug 25, 2020
8762e88
DOC Remove reference to pandas in user guide
thomasjpfan Aug 25, 2020
6d7ec60
DOC Benchmark update
thomasjpfan Aug 25, 2020
69f3f9a
ENH Update benchmark parameters
thomasjpfan Aug 25, 2020
f9f837c
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ca…
NicolasHug Aug 27, 2020
b913ff1
Remove pandas support for categorical features
NicolasHug Aug 27, 2020
730d69f
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ca…
NicolasHug Sep 4, 2020
38 changes: 38 additions & 0 deletions doc/modules/ensemble.rst
@@ -1095,6 +1095,44 @@ supported for multiclass context.

* :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py`

.. _categorical_support_gbdt:

Categorical Support
-------------------

For datasets with categorical data, :class:`HistGradientBoostingClassifier`
and :class:`HistGradientBoostingRegressor` have native support for splitting
on categorical features. This is often better than one-hot encoding because
it leads to faster training times and shallower trees. When splitting a node,
a categorical feature is split into two subsets: one going to the left child
and the other going to the right child. To find the best split, the histogram
of each categorical feature is first sorted according to the ratio
`sum of gradients / sum of hessians` in each bin; candidate splits are then
evaluated along the sorted histogram.
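
For illustration, a minimal sketch of this split strategy (the per-bin
statistics below are hypothetical, and the actual implementation is in
Cython)::

    import numpy as np

    # Hypothetical per-category histogram statistics.
    sum_gradients = np.array([-4.0, 1.5, -0.5, 3.0])
    sum_hessians = np.array([2.0, 1.0, 1.0, 2.0])

    # Sort categories by their sum_gradient / sum_hessian ratio.
    order = np.argsort(sum_gradients / sum_hessians)

    # Candidate splits send the first i sorted categories to the left
    # child and the rest to the right child; a real implementation
    # evaluates the gain of each candidate and keeps the best one.
    for i in range(1, len(order)):
        left, right = order[:i], order[i:]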

If the cardinality of a categorical feature is greater than `max_bins`, the
`max_bins` most frequent categories are kept and the less frequent categories
are treated as missing. If there are missing values during training, the
missing values are treated as their own category. At prediction time,
categories that were unknown during fit are also treated as missing.
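
As a rough sketch of this encoding rule (illustrative only; the data and
`max_bins` value below are made up)::

    import numpy as np
    import pandas as pd

    max_bins = 3
    col = pd.Series(["a", "a", "a", "b", "b", "c", "c", "d", np.nan])

    # Keep the max_bins most frequent categories; everything else,
    # including NaN and categories unseen at fit time, maps to missing.
    kept = col.value_counts().index[:max_bins]   # "a", "b", "c"
    encoded = col.where(col.isin(kept))          # "d" and NaN become missing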

To enable categorical support, a boolean mask can be passed to the
`categorical` parameter. In the following, the first feature is treated as
categorical and the second feature as numerical::

>>> gbdt = HistGradientBoostingRegressor(categorical=[True, False])

Another way to enable categorical support is to pass `'pandas'` to the
`categorical` parameter. This will infer the categorical features from
pandas' categorical dtype during `fit`.

>>> gbdt = HistGradientBoostingRegressor(categorical='pandas')

.. topic:: Examples:

* :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`

Low-level parallelism
---------------------

93 changes: 93 additions & 0 deletions examples/ensemble/plot_gradient_boosting_categorical.py
@@ -0,0 +1,93 @@
"""
========================================
Categorical Support in Gradient Boosting
========================================

.. currentmodule:: sklearn

In this example, we will compare the performance of

:class:`~ensemble.HistGradientBoostingRegressor` using one hot encoding
and with native categorical support.

We will work with the Ames Iowa Housing dataset, which consists of numerical
and categorical features, where the target is the houses' sale prices.
Comment on lines +12 to +13

Contributor: Are the categorical features useful for this regression task? It may be worth adding another example where the categorical features are dropped: training should be faster but predictive performance should be worse. Dropping categorical features is another (dummy) way to deal with them.

Member Author: For this dataset, the categories do not matter as much, so I will be on the lookout for a nicer dataset.

"""
##############################################################################
# Load Ames Housing dataset
# -------------------------
# First, we load the Ames Housing data as a pandas dataframe. The features
# are either categorical or numerical:
print(__doc__)

from sklearn.datasets import fetch_openml

X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)

n_features = X.shape[1]
n_categorical_features = (X.dtypes == 'category').sum()
n_numerical_features = (X.dtypes == 'float').sum()
print(f"Number of features: {X.shape[1]}")
print(f"Number of categorical featuers: {n_categorical_features}")
print(f"Number of numerical featuers: {n_numerical_features}")

##############################################################################
# Create gradient boosting estimator with one hot encoding
# --------------------------------------------------------
# Next, we create a pipeline that will one hot encode the categorical
# features and let the rest of the numerical data pass through:

from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder

preprocessor = make_column_transformer(
(OneHotEncoder(sparse=False, handle_unknown='ignore'),
make_column_selector(dtype_include='category')),
remainder='passthrough'
)

hist_one_hot = make_pipeline(preprocessor,
HistGradientBoostingRegressor(random_state=0))

##############################################################################
# Create gradient boosting estimator with native categorical support
# ------------------------------------------------------------------
# The :class:`~ensemble.HistGradientBoostingRegressor` has native support
# for categorical features using the `categorical` parameter:

hist_native = HistGradientBoostingRegressor(categorical='pandas',
random_state=0)

##############################################################################
# Train the models with cross-validation
# --------------------------------------
# Finally, we train the models using cross-validation. Here we compare the
# models' performance in terms of :func:`~metrics.r2_score` and fit times. We
# show that fit times are faster with native categorical support and that the
# test scores and score times are comparable:

from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import numpy as np

one_hot_result = cross_validate(hist_one_hot, X, y)
native_result = cross_validate(hist_native, X, y)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 8))

plot_info = [('fit_time', 'Fit times (s)', ax1),
('score_time', 'Score times (s)', ax2),
('test_score', 'Test Scores (r2 score)', ax3)]

x, width = np.arange(2), 0.9
for key, title, ax in plot_info:
items = [native_result[key], one_hot_result[key]]
ax.bar(x, [np.mean(item) for item in items],
width, yerr=[np.std(item) for item in items],
color=['b', 'r'])
    ax.set(title=title, xticks=[0, 1],
           xticklabels=['Native', 'One Hot'])
plt.show()
11 changes: 7 additions & 4 deletions sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
@@ -34,12 +34,15 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
"""
cdef:
int feature_idx
X_DTYPE_C [:] binning_threshold

for feature_idx in range(data.shape[1]):
_map_num_col_to_bins(data[:, feature_idx],
binning_thresholds[feature_idx],
missing_values_bin_idx,
binned[:, feature_idx])
binning_threshold = binning_thresholds[feature_idx]
if binning_threshold is not None:
_map_num_col_to_bins(data[:, feature_idx],
binning_threshold,
Member: indentation needs a space

missing_values_bin_idx,
binned[:, feature_idx])


cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data,
10 changes: 10 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd
@@ -0,0 +1,10 @@
# cython: language_level=3
from .common cimport X_BITSET_DTYPE_C
from .common cimport X_BINNED_DTYPE_C


cdef void init_bitset(X_BITSET_DTYPE_C bitset) nogil

cdef void insert_bitset(X_BINNED_DTYPE_C val, X_BITSET_DTYPE_C bitset) nogil

cdef unsigned char in_bitset(X_BINNED_DTYPE_C val, X_BITSET_DTYPE_C bitset) nogil
33 changes: 33 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx
@@ -0,0 +1,33 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
# cython: language_level=3
from .common cimport X_BITSET_DTYPE_C
from .common cimport X_BINNED_DTYPE_C


cdef inline void init_bitset(X_BITSET_DTYPE_C bitset) nogil: # OUT
cdef:
unsigned int i

for i in range(8):
bitset[i] = 0

cdef inline void insert_bitset(X_BINNED_DTYPE_C val,
X_BITSET_DTYPE_C bitset) nogil: # OUT
cdef:
unsigned int i1 = val / 32
unsigned int i2 = val % 32

# It is assumed that val < 256 or i1 < 8
bitset[i1] |= (1 << i2)

cdef inline unsigned char in_bitset(X_BINNED_DTYPE_C val,
X_BITSET_DTYPE_C bitset) nogil:
cdef:
unsigned int i1 = val / 32
unsigned int i2 = val % 32

if i1 >= 8:
return 0
return (bitset[i1] >> i2) & 1
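
For intuition, here is a minimal Python sketch of the same bitset arithmetic
(illustrative only; the actual implementation is the Cython above):

    # Eight 32-bit words cover binned values in [0, 256).
    bitset = [0] * 8

    def insert_bitset(val, bitset):
        i1, i2 = val // 32, val % 32   # word index, bit index within the word
        bitset[i1] |= 1 << i2

    def in_bitset(val, bitset):
        i1, i2 = val // 32, val % 32
        if i1 >= 8:                    # values >= 256 are never in the set
            return False
        return (bitset[i1] >> i2) & 1 == 1

    insert_bitset(42, bitset)
    assert in_bitset(42, bitset) and not in_bitset(43, bitset)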
45 changes: 34 additions & 11 deletions sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
@@ -17,44 +17,59 @@ from .common cimport Y_DTYPE_C
from .common import Y_DTYPE
from .common cimport X_BINNED_DTYPE_C
from .common cimport node_struct
from ._bitset cimport in_bitset


def _predict_from_numeric_data(
node_struct [:] nodes,
const X_DTYPE_C [:, :] numeric_data,
const X_BINNED_DTYPE_C [:, :] categorical_data,
const long[:] orig_feature_to_binned_cat,
Y_DTYPE_C [:] out):

cdef:
int i

for i in prange(numeric_data.shape[0], schedule='static', nogil=True):
out[i] = _predict_one_from_numeric_data(nodes, numeric_data, i)
out[i] = _predict_one_from_numeric_data(
nodes, numeric_data, categorical_data,
orig_feature_to_binned_cat, i)


cdef inline Y_DTYPE_C _predict_one_from_numeric_data(
node_struct [:] nodes,
const X_DTYPE_C [:, :] numeric_data,
const X_BINNED_DTYPE_C [:, :] categorical_data,
const long[:] orig_feature_to_binned_cat,
const int row) nogil:
# Need to pass the whole array and the row index, else prange won't work.
# See issue Cython #2798

cdef:
node_struct node = nodes[0]
long cat_idx
Member: Any reason to use long? We usually use unsigned int.

Member Author: To match the dtype of orig_feature_to_binned_cat, but this will change when we do not bin anymore in predict.


while True:
if node.is_leaf:
return node.value

if isnan(numeric_data[row, node.feature_idx]):
if node.missing_go_to_left:
if node.is_categorical:
cat_idx = orig_feature_to_binned_cat[node.feature_idx]
if in_bitset(categorical_data[row, cat_idx], node.cat_threshold):
Member: does lightgbm also do that? I.e. bin during predict, and rely on a bitset of internally encoded features?

Member Author: lightgbm does not bin during predict. It has a dynamically sized bitset that encodes the input categorical features, so it can accept a category with any cardinality.

Currently, the implementation accepts a category with any cardinality. If the cardinality is higher than max_bins, then only the top max_bins categories are kept, ranked by frequency, and the rest are considered missing. In this way, it also handles infrequent categories. This option is more flexible, but it means I have to bin during predict, which is disappointing.

A simpler alternative would be to restrict the input to be "ints" in the range ~ [0, max_bins] and consider anything outside of that range as missing. This would not do anything special to handle infrequent categories, but it would simplify some of the code.

Member: Thoughts @ogrisel? Personally, for a first version, I would prefer keeping things as simple as possible. As such, expecting ints in [0, max_bins] sounds reasonable.

Member Author: I can go either way on this. I spoke to @amueller about this and he seems to prefer the current approach of "binning categories during predict".

node = nodes[node.left]
else:
node = nodes[node.right]
else:
if numeric_data[row, node.feature_idx] <= node.threshold:
node = nodes[node.left]
if isnan(numeric_data[row, node.feature_idx]):
if node.missing_go_to_left:
node = nodes[node.left]
else:
node = nodes[node.right]
else:
node = nodes[node.right]
if numeric_data[row, node.feature_idx] <= node.threshold:
node = nodes[node.left]
else:
node = nodes[node.right]


def _predict_from_binned_data(
@@ -85,16 +100,24 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data(
while True:
if node.is_leaf:
return node.value
if binned_data[row, node.feature_idx] == missing_values_bin_idx:
if node.missing_go_to_left:

if node.is_categorical:
if in_bitset(binned_data[row, node.feature_idx],
node.cat_threshold):
node = nodes[node.left]
else:
node = nodes[node.right]
else:
if binned_data[row, node.feature_idx] <= node.bin_threshold:
node = nodes[node.left]
if binned_data[row, node.feature_idx] == missing_values_bin_idx:
if node.missing_go_to_left:
node = nodes[node.left]
else:
node = nodes[node.right]
else:
node = nodes[node.right]
if binned_data[row, node.feature_idx] <= node.bin_threshold:
node = nodes[node.left]
else:
node = nodes[node.right]

def _compute_partial_dependence(
node_struct [:] nodes,
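
For intuition, a minimal Python sketch of the binned prediction loop above
(illustrative only; the node fields mirror the node_struct used in this diff):

    def predict_one_binned(nodes, binned_row, missing_values_bin_idx, in_bitset):
        """Walk the tree from the root until a leaf is reached."""
        node = nodes[0]
        while True:
            if node.is_leaf:
                return node.value
            value = binned_row[node.feature_idx]
            if node.is_categorical:
                # Categorical split: go left iff the bin is in the node's bitset.
                go_left = in_bitset(value, node.cat_threshold)
            elif value == missing_values_bin_idx:
                # Missing values follow the direction learned during training.
                go_left = node.missing_go_to_left
            else:
                # Numerical split on the binned value.
                go_left = value <= node.bin_threshold
            node = nodes[node.left] if go_left else nodes[node.right]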