scikit-learn-contrib · PaulWestenthanner · Oct 20, 2021 · May 31, 2021 · May 31, 2021 · May 31, 2021
diff --git a/README.md b/README.md
@@ -35,6 +35,7 @@ __Supervised:__
  * Target Encoding [7]
  * Weight of Evidence [8]
  * Quantile Encoder [13]
+ * Summary Encoder [13]
 
 Installation
 ------------

diff --git a/category_encoders/__init__.py b/category_encoders/__init__.py
@@ -23,29 +23,30 @@
 from category_encoders.james_stein import JamesSteinEncoder
 from category_encoders.cat_boost import CatBoostEncoder
 from category_encoders.glmm import GLMMEncoder
-from category_encoders.quantile_encoder import QuantileEncoder
+from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder
 
-__version__ = '2.2.2'
+__version__ = "2.2.2"
 
-__author__ = 'willmcginnis','cmougan'
+__author__ = "willmcginnis", "cmougan"
 
 __all__ = [
-    'BackwardDifferenceEncoder',
-    'BinaryEncoder',
-    'CountEncoder',
-    'HashingEncoder',
-    'HelmertEncoder',
-    'OneHotEncoder',
-    'OrdinalEncoder',
-    'SumEncoder',
-    'PolynomialEncoder',
-    'BaseNEncoder',
-    'LeaveOneOutEncoder',
-    'TargetEncoder',
-    'WOEEncoder',
-    'MEstimateEncoder',
-    'JamesSteinEncoder',
-    'CatBoostEncoder',
-    'GLMMEncoder',
-    'QuantileEncoder',
+    "BackwardDifferenceEncoder",
+    "BinaryEncoder",
+    "CountEncoder",
+    "HashingEncoder",
+    "HelmertEncoder",
+    "OneHotEncoder",
+    "OrdinalEncoder",
+    "SumEncoder",
+    "PolynomialEncoder",
+    "BaseNEncoder",
+    "LeaveOneOutEncoder",
+    "TargetEncoder",
+    "WOEEncoder",
+    "MEstimateEncoder",
+    "JamesSteinEncoder",
+    "CatBoostEncoder",
+    "GLMMEncoder",
+    "QuantileEncoder",
+    "SummaryEncoder",
 ]
diff --git a/category_encoders/quantile_encoder.py b/category_encoders/quantile_encoder.py
@@ -1,15 +1,15 @@
-"""Target Encoder"""
+"""Quantile Encoder"""
 __author__ = "david26694", "cmougan"
 
 import numpy as np
-import pandas as pd
-from sklearn.base import BaseEstimator
 from category_encoders.ordinal import OrdinalEncoder
-from category_encoders.m_estimate import MEstimateEncoder
+from sklearn.base import BaseEstimator
 import category_encoders.utils as util
-from sklearn.utils.random import check_random_state
+
+
 class QuantileEncoder(BaseEstimator, util.TransformerWithTargetMixin):
     """Quantile Encoding for categorical features.
+
     This a statistically modified version of target MEstimate encoder where selected features
     are replaced the statistical quantile instead than the mean. Replacing with the
     median is a particular case where self.quantile = 0.5. In comparison to MEstimateEncoder
@@ -18,10 +18,10 @@ class QuantileEncoder(BaseEstimator, util.TransformerWithTargetMixin):
     ----------
     verbose: int
         integer indicating verbosity of the output. 0 for none.
-    quantile: int
-        integer indicating statistical quantile. ´0.5´ for median.
-    m: int
-        integer indicating the smoothing parameter. 0 for no smoothing.
+    quantile: float
+        float indicating the statistical quantile. `0.5` for the median.
+    m: float
+        this is the "m" in the m-probability estimate. Higher values of m result in stronger shrinking. m is non-negative; 0 means no smoothing.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -40,7 +40,7 @@ class QuantileEncoder(BaseEstimator, util.TransformerWithTargetMixin):
     >>> bunch = load_boston()
     >>> y = bunch.target
     >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
-    >>> enc = QuantileEncoder(cols=['CHAS', 'RAD']).fit(X, y)
+    >>> enc = QuantileEncoder(cols=['CHAS', 'RAD'], quantile=0.5, m=1.0).fit(X, y)
     >>> numeric_dataset = enc.transform(X)
     >>> print(numeric_dataset.info())
     <class 'pandas.core.frame.DataFrame'>
@@ -169,9 +169,7 @@ def fit_quantile_encoding(self, X, y):
         mapping = {}
 
         # Calculate global statistics
-        prior = self._quantile = np.quantile(y, self.quantile)
-        self._sum = y.sum()
-        self._count = y.count()
+        prior = np.quantile(y, self.quantile)
 
         for switch in self.ordinal_encoder.category_mapping:
             col = switch.get("col")
@@ -230,7 +228,11 @@ def transform(self, X, y=None, override_return_df=False):
         # then make sure that it is the right size
         if X.shape[1] != self._dim:
             raise ValueError(
-                "Unexpected input dimension %d, expected %d" % (X.shape[1], self._dim,)
+                "Unexpected input dimension %d, expected %d"
+                % (
+                    X.shape[1],
+                    self._dim,
+                )
             )
 
         # if we are encoding the training data, we have to check the target
@@ -285,7 +287,118 @@ def get_feature_names(self):
 
         if not isinstance(self.feature_names, list):
             raise ValueError(
-                "Must fit data first. Affected feature names are not known " "before."
+                "Must fit data first. Affected feature names are not known before."
             )
         else:
             return self.feature_names
+
+
+class SummaryEncoder(BaseEstimator, util.TransformerWithTargetMixin):
+    """Summary Encoding for categorical features.
+
+    Describes every categorical column by several quantiles of the target
+    instead of a single one, which gives a richer representation than the
+    QuantileEncoder. For each requested quantile and each column in `cols`,
+    a copy of the column named `<col>_<percentile>` is added, where
+    `percentile = round(quantile * 100)`, and that copy is encoded by its own
+    QuantileEncoder created with that quantile and the shared `m`.
+
+    Parameters
+    ----------
+    cols: list
+        names of the columns to encode. Columns are read as `X[col]`, so `X`
+        must support column access and assignment by name.
+    quantiles: list
+        list of floats indicating the statistical quantiles. Each element
+        produces one new column per entry of `cols`, and the elements must
+        round to distinct percentiles so that the column names are unique.
+    m: float
+        this is the "m" in the m-probability estimate. Higher values of m
+        result in stronger shrinking. m is non-negative, 0 for no smoothing.
+        The value is passed unchanged to every underlying QuantileEncoder.
+
+    Attributes
+    ----------
+    encoder_list: list or None
+        fitted QuantileEncoder instances, one per quantile and in the order
+        of `quantiles`. None until `fit` has been called.
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> from category_encoders import SummaryEncoder
+    >>> X = pd.DataFrame({"city": ["a", "b", "a", "b"]})
+    >>> y = np.array([1.0, 2.0, 3.0, 4.0])
+    >>> enc = SummaryEncoder(cols=["city"], quantiles=[0.25, 0.75]).fit(X, y)
+    >>> encoded = enc.transform(X)  # adds the columns city_25 and city_75
+
+    References
+    ----------
+    .. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems, https://arxiv.org/abs/2105.13783
+    .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538
+    .. [3] On estimating probabilities in tree pruning, equation 1, from https://link.springer.com/chapter/10.1007/BFb0017010
+    .. [4] Additive smoothing, from https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates
+    .. [5] Target encoding done the right way https://maxhalford.github.io/blog/target-encoding/
+    """
+
+    def __init__(self, cols, quantiles, m=1.0):
+        self.cols = cols
+        self.quantiles = quantiles
+        self.m = m
+        # Populated by fit(); None marks the encoder as not fitted yet.
+        self.encoder_list = None
+
+    @staticmethod
+    def _column_name(col, quantile):
+        """Return the name of the column encoding `col` at `quantile`."""
+        return f"{col}_{round(quantile * 100)}"
+
+    def _add_quantile_columns(self, X):
+        """Return a copy of `X` with one copy of each column per quantile."""
+        X = X.copy()
+        for quantile in self.quantiles:
+            for col in self.cols:
+                X[self._column_name(col, quantile)] = X[col]
+        return X
+
+    def fit(self, X, y):
+        """Fit one QuantileEncoder per quantile.
+
+        Raises
+        ------
+        ValueError
+            if two quantiles round to the same percentile; they would share
+            one output column, which would then be encoded twice.
+        """
+        percentiles = [round(quantile * 100) for quantile in self.quantiles]
+        if len(set(percentiles)) != len(percentiles):
+            raise ValueError(
+                "Quantiles must round to distinct percentiles, got %s"
+                % (list(self.quantiles),)
+            )
+
+        X = self._add_quantile_columns(X)
+        encoder_list = []
+        for quantile in self.quantiles:
+            col_names = [self._column_name(col, quantile) for col in self.cols]
+            enc = QuantileEncoder(cols=col_names, quantile=quantile, m=self.m)
+            enc.fit(X, y)
+            encoder_list.append(enc)
+
+        # Assigned only after every encoder was fitted successfully.
+        self.encoder_list = encoder_list
+        return self
+
+    def transform(self, X, y=None):
+        """Add the per-quantile columns to a copy of `X` and encode them.
+
+        `y` is not used. Raises ValueError when called before `fit`.
+        """
+        if self.encoder_list is None:
+            raise ValueError("Must fit data first. Encoders are not known before.")
+
+        X_encoded = self._add_quantile_columns(X)
+        for encoder in self.encoder_list:
+            X_encoded = encoder.transform(X_encoded)
+        return X_encoded
diff --git a/docs/source/summary.rst b/docs/source/summary.rst
@@ -0,0 +1,5 @@
+Summary Encoder
+===============
+
+.. autoclass:: category_encoders.quantile_encoder.SummaryEncoder
+    :members:
diff --git a/tests/test_quantile_encoder.py b/tests/test_quantile_encoder.py
@@ -11,9 +11,7 @@ class TestQuantileEncoder(unittest.TestCase):
     def setUp(self):
         """Create dataframe with categories and a target variable"""
 
-        self.df = pd.DataFrame(
-            {"categories": ["a", "b", "c", "a", "b", "c", "a", "b"]}
-        )
+        self.df = pd.DataFrame({"categories": ["a", "b", "c", "a", "b", "c", "a", "b"]})
         self.target = np.array([1, 2, 0, 4, 5, 0, 6, 7])
 
     def test_median_works(self):
@@ -42,9 +40,7 @@ def test_max_works(self):
             - b max is 7
             - c max is 0
         """
-        expected_output_max = pd.DataFrame(
-            {"categories": [6.0, 7, 0, 6, 7, 0, 6, 7]}
-        )
+        expected_output_max = pd.DataFrame({"categories": [6.0, 7, 0, 6, 7, 0, 6, 7]})
 
         pd.testing.assert_frame_equal(
             encoders.QuantileEncoder(quantile=1, m=0.0).fit_transform(
@@ -65,6 +61,58 @@ def test_new_category(self):
 
         new_medians = pd.DataFrame({"categories": [3.0, 3.0]})
 
-        pd.testing.assert_frame_equal(
-            transformer_median.transform(new_df), new_medians
+        pd.testing.assert_frame_equal(transformer_median.transform(new_df), new_medians)
+
+
+class TestSummaryEncoder(unittest.TestCase):
+    """Tests for summary encoder."""
+
+    def setUp(self):
+        """Create dataframe with categories and a target variable"""
+
+        self.df = pd.DataFrame({"categories": ["a", "b", "c", "a", "b", "c", "a", "b"]})
+        self.target = np.array([1, 2, 0, 4, 5, 0, 6, 7])
+        self.col = "categories"
+
+    def assert_same_quantile(self, quantile):
+        """Check a one-quantile SummaryEncoder against the QuantileEncoder."""
+        quantile_results = encoders.QuantileEncoder(
+            cols=[self.col], quantile=quantile
+        ).fit_transform(self.df, self.target)
+
+        summary_results = encoders.SummaryEncoder(
+            cols=[self.col], quantiles=[quantile]
+        ).fit_transform(self.df, self.target)
+
+        percentile = round(quantile * 100)
+
+        np.testing.assert_allclose(
+            quantile_results[self.col].values,
+            summary_results[f"{self.col}_{percentile}"].values,
+        )
+
+    def test_single_quantiles(self):
+        """Each quantile encoded on its own matches the QuantileEncoder."""
+        for quantile in [0.1, 0.5, 0.9]:
+            self.assert_same_quantile(quantile)
+
+    def test_several_quantiles(self):
+        """Quantiles fitted together, in unsorted order, match as well."""
+        quantile_list = [0.2, 0.1, 0.8]
+
+        summary_results = encoders.SummaryEncoder(
+            cols=[self.col], quantiles=quantile_list
+        ).fit_transform(self.df, self.target)
+
+        for quantile in quantile_list:
+            # Reference values come from a QuantileEncoder fitted on its own.
+            quantile_results = encoders.QuantileEncoder(
+                cols=[self.col], quantile=quantile
+            ).fit_transform(self.df, self.target)
+
+            percentile = round(quantile * 100)
+
+            np.testing.assert_allclose(
+                quantile_results[self.col].values,
+                summary_results[f"{self.col}_{percentile}"].values,
+            )