Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quantile encoder #303

Merged
merged 32 commits into from Oct 20, 2021
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
284f378
#302 quantileEncoder and SummaryEncoder
cmougan May 31, 2021
55e00f4
#302 test for QE and SE - passing
cmougan May 31, 2021
591257d
Quantile Encoder and Summary Encoder
cmougan May 31, 2021
21bbb24
Quantile Encoder and Summary Encoder update docs
cmougan May 31, 2021
da6de04
#302 Quantile Encoder and Summary Encoder update docs
cmougan May 31, 2021
56ca905
doc QE
cmougan Jun 17, 2021
e3ea3e7
remove summary encoder
cmougan Jun 17, 2021
40a8a1c
Update quantile_encoder.py
cmougan Oct 10, 2021
b06f108
remove unnecesary imports
cmougan Oct 10, 2021
c72e73f
qe cosmetic issues
cmougan Oct 10, 2021
828e518
m bio
cmougan Oct 10, 2021
4df3bf1
formatting
cmougan Oct 10, 2021
64d1d5c
summary encoder
cmougan Oct 10, 2021
2032815
summary encoder
cmougan Oct 10, 2021
7a6da5b
e
cmougan Oct 10, 2021
ae7478a
change name Summary Encoder
cmougan Oct 12, 2021
a7bc033
cosmetic docs
cmougan Oct 12, 2021
d9ff993
Merge branch 'master' into quantileEncoder
cmougan Oct 12, 2021
70d46e5
test_summary_quantile
cmougan Oct 12, 2021
3d11c8e
Throw error in case of two quantiles with same percentile
david26694 Oct 12, 2021
3d5c91c
Merge pull request #1 from david26694/quantileEncoder
cmougan Oct 12, 2021
bbe1a15
Refactor summary encoder
david26694 Oct 15, 2021
86173c6
Fix failing tests QE
david26694 Oct 15, 2021
60ddb4f
Add default arguments to SE
david26694 Oct 15, 2021
979c774
Parametrise summary encoder
david26694 Oct 15, 2021
29f7c0d
Add summary encoder in all QE tests
david26694 Oct 15, 2021
741a21e
Merge pull request #2 from david26694/quantileEncoder
cmougan Oct 15, 2021
6618f26
fixed tests for summary encoder
Oct 17, 2021
b4b814f
Merge pull request #3 from PaulWestenthanner/quantileEncoder
cmougan Oct 17, 2021
b9bf00f
add future string to support python3.5 for summary encoder test
cmougan Oct 19, 2021
3292dd1
remove fstring from QE tests
cmougan Oct 19, 2021
c88e53e
handling coinciding quantiles for SE
cmougan Oct 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -35,6 +35,7 @@ __Supervised:__
* Target Encoding [7]
* Weight of Evidence [8]
* Quantile Encoder [13]
* Summary Encoder [13]

Installation
------------
Expand Down
43 changes: 22 additions & 21 deletions category_encoders/__init__.py
Expand Up @@ -23,29 +23,30 @@
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.glmm import GLMMEncoder
from category_encoders.quantile_encoder import QuantileEncoder
from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder

__version__ = '2.2.2'
__version__ = "2.2.2"

__author__ = 'willmcginnis','cmougan'
__author__ = "willmcginnis", "cmougan"

__all__ = [
'BackwardDifferenceEncoder',
'BinaryEncoder',
'CountEncoder',
'HashingEncoder',
'HelmertEncoder',
'OneHotEncoder',
'OrdinalEncoder',
'SumEncoder',
'PolynomialEncoder',
'BaseNEncoder',
'LeaveOneOutEncoder',
'TargetEncoder',
'WOEEncoder',
'MEstimateEncoder',
'JamesSteinEncoder',
'CatBoostEncoder',
'GLMMEncoder',
'QuantileEncoder',
"BackwardDifferenceEncoder",
"BinaryEncoder",
"CountEncoder",
"HashingEncoder",
"HelmertEncoder",
"OneHotEncoder",
"OrdinalEncoder",
"SumEncoder",
"PolynomialEncoder",
"BaseNEncoder",
"LeaveOneOutEncoder",
"TargetEncoder",
"WOEEncoder",
"MEstimateEncoder",
"JamesSteinEncoder",
"CatBoostEncoder",
"GLMMEncoder",
"QuantileEncoder",
"SummaryEncoder",
]
143 changes: 128 additions & 15 deletions category_encoders/quantile_encoder.py
@@ -1,15 +1,15 @@
"""Target Encoder"""
"""Quantile Encoder"""
__author__ = "david26694", "cmougan"

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.m_estimate import MEstimateEncoder
from sklearn.base import BaseEstimator
import category_encoders.utils as util
from sklearn.utils.random import check_random_state


class QuantileEncoder(BaseEstimator, util.TransformerWithTargetMixin):
cmougan marked this conversation as resolved.
Show resolved Hide resolved
"""Quantile Encoding for categorical features.

This is a statistically modified version of the target MEstimate encoder where selected features
are replaced by the statistical quantile instead of the mean. Replacing with the
cmougan marked this conversation as resolved.
Show resolved Hide resolved
median is a particular case where self.quantile = 0.5. In comparison to MEstimateEncoder
Expand All @@ -18,10 +18,10 @@ class QuantileEncoder(BaseEstimator, util.TransformerWithTargetMixin):
----------
verbose: int
integer indicating verbosity of the output. 0 for none.
quantile: int
integer indicating statistical quantile. ´0.5´ for median.
m: int
integer indicating the smoothing parameter. 0 for no smoothing.
quantile: float
float indicating statistical quantile. ´0.5´ for median.
m: float
this is the “m” in the m-probability estimate. A higher value of m results in stronger shrinking. m is non-negative. 0 for no smoothing.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
Expand All @@ -40,7 +40,7 @@ class QuantileEncoder(BaseEstimator, util.TransformerWithTargetMixin):
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = QuantileEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> enc = QuantileEncoder(cols=['CHAS', 'RAD'], quantile=0.5, m=1.0).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
Expand Down Expand Up @@ -169,9 +169,7 @@ def fit_quantile_encoding(self, X, y):
mapping = {}

# Calculate global statistics
prior = self._quantile = np.quantile(y, self.quantile)
self._sum = y.sum()
self._count = y.count()
prior = np.quantile(y, self.quantile)

for switch in self.ordinal_encoder.category_mapping:
col = switch.get("col")
Expand Down Expand Up @@ -230,7 +228,11 @@ def transform(self, X, y=None, override_return_df=False):
# then make sure that it is the right size
if X.shape[1] != self._dim:
raise ValueError(
"Unexpected input dimension %d, expected %d" % (X.shape[1], self._dim,)
"Unexpected input dimension %d, expected %d"
% (
X.shape[1],
self._dim,
)
)

# if we are encoding the training data, we have to check the target
Expand Down Expand Up @@ -285,7 +287,118 @@ def get_feature_names(self):

if not isinstance(self.feature_names, list):
raise ValueError(
"Must fit data first. Affected feature names are not known " "before."
"Must fit data first. Affected feature names are not known before."
)
else:
return self.feature_names


class SummaryEncoder(BaseEstimator, util.TransformerWithTargetMixin):
"""Summary Encoding for categorical features.

This method provides a broader description of a categorical variable than the quantile encoder.
It's an encoder designed for creating richer representations.
cmougan marked this conversation as resolved.
Show resolved Hide resolved
This is built by leveraging information from different quantiles by concatenating them.

This is a statistically modified version of the target MEstimate encoder where selected features
cmougan marked this conversation as resolved.
Show resolved Hide resolved
are replaced by the statistical quantile instead of the mean. Replacing with the
median is a particular case where self.quantile = 0.5. In comparison to MEstimateEncoder
it has two tunable parameters, `m` and `quantiles`.
Parameters
----------
verbose: int
integer indicating verbosity of the output. 0 for none.
quantiles: list
list of floats indicating the statistical quantiles. Each quantile yields one output column per encoded column.
m: float
this is the “m” in the m-probability estimate. A higher value of m results in stronger shrinking. m is non-negative. 0 for no smoothing.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance.
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
handle_missing: str
options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target quantile.
handle_unknown: str
options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target quantile.
Example
-------
>>> from category_encoders import *
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = SummaryEncoder(cols=["CHAS", "RAD"], quantiles=[0.25, 0.5, 0.75]).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
CHAS 506 non-null float64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
RAD 506 non-null float64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB
None
References
----------
.. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems, https://arxiv.org/abs/2105.13783
.. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538
.. [3] On estimating probabilities in tree pruning, equation 1, from https://link.springer.com/chapter/10.1007/BFb0017010
.. [4] Additive smoothing, from https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates
.. [5] Target encoding done the right way https://maxhalford.github.io/blog/target-encoding/
"""

def __init__(self, cols, quantiles, m=1.0):

self.cols = cols
self.quantiles = quantiles
self.m = m
self.encoder_list = None

def fit(self, X, y):

X = X.copy()

for quantile in self.quantiles:
for col in self.cols:
percentile = round(quantile * 100)
X[f"{col}_{percentile}"] = X[col]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you just add this in the for loop below (either before or after the `col_names.append(...)` call), you don't need the two loops here and don't need to calculate the percentile.
Note that the comment on coinciding quantiles below also applies here.


encoder_list = []
for quantile in self.quantiles:
col_names = []
for col in self.cols:
percentile = round(quantile * 100)
col_names.append(f"{col}_{percentile}")
enc = QuantileEncoder(cols=col_names, quantile=quantile, m=self.m)
enc.fit(X, y)
encoder_list.append(enc)

self.encoder_list = encoder_list

return self

def transform(self, X, y=None):
X_encoded = X.copy()

for quantile in self.quantiles:
for col in self.cols:
percentile = round(quantile * 100)
X_encoded[f"{col}_{percentile}"] = X_encoded[col]
PaulWestenthanner marked this conversation as resolved.
Show resolved Hide resolved

for encoder in self.encoder_list:
X_encoded = encoder.transform(X_encoded)
return X_encoded
5 changes: 5 additions & 0 deletions docs/source/summary.rst
@@ -0,0 +1,5 @@
Summary Encoder
cmougan marked this conversation as resolved.
Show resolved Hide resolved
===============

.. autoclass:: category_encoders.quantile_encoder.SummaryEncoder
:members:
64 changes: 56 additions & 8 deletions tests/test_quantile_encoder.py
Expand Up @@ -11,9 +11,7 @@ class TestQuantileEncoder(unittest.TestCase):
def setUp(self):
"""Create dataframe with categories and a target variable"""

self.df = pd.DataFrame(
{"categories": ["a", "b", "c", "a", "b", "c", "a", "b"]}
)
self.df = pd.DataFrame({"categories": ["a", "b", "c", "a", "b", "c", "a", "b"]})
self.target = np.array([1, 2, 0, 4, 5, 0, 6, 7])

def test_median_works(self):
Expand Down Expand Up @@ -42,9 +40,7 @@ def test_max_works(self):
- b max is 7
- c max is 0
"""
expected_output_max = pd.DataFrame(
{"categories": [6.0, 7, 0, 6, 7, 0, 6, 7]}
)
expected_output_max = pd.DataFrame({"categories": [6.0, 7, 0, 6, 7, 0, 6, 7]})

pd.testing.assert_frame_equal(
encoders.QuantileEncoder(quantile=1, m=0.0).fit_transform(
Expand All @@ -65,6 +61,58 @@ def test_new_category(self):

new_medians = pd.DataFrame({"categories": [3.0, 3.0]})

pd.testing.assert_frame_equal(
transformer_median.transform(new_df), new_medians
pd.testing.assert_frame_equal(transformer_median.transform(new_df), new_medians)


class TestSummaryEncoder(unittest.TestCase):
    """Tests for the summary encoder.

    Each test checks that the column ``SummaryEncoder`` produces for a
    quantile equals the output of a ``QuantileEncoder`` fitted with that
    same quantile.
    """

    def setUp(self):
        """Create dataframe with categories and a target variable"""

        self.df = pd.DataFrame({"categories": ["a", "b", "c", "a", "b", "c", "a", "b"]})
        self.target = np.array([1, 2, 0, 4, 5, 0, 6, 7])
        self.col = "categories"

    def assert_same_quantile(self, quantile):
        """Assert a one-quantile SummaryEncoder agrees with QuantileEncoder.

        Parameters
        ----------
        quantile: float
            quantile passed both to ``QuantileEncoder`` and, as a
            single-element list, to ``SummaryEncoder``.
        """

        quantile_results = encoders.QuantileEncoder(
            cols=[self.col], quantile=quantile
        ).fit_transform(self.df, self.target)

        summary_results = encoders.SummaryEncoder(
            cols=[self.col], quantiles=[quantile]
        ).fit_transform(self.df, self.target)

        # SummaryEncoder names its output columns "<col>_<percentile>".
        percentile = round(quantile * 100)

        np.testing.assert_allclose(
            quantile_results[self.col].values,
            summary_results[f"{self.col}_{percentile}"].values,
        )

    # NOTE: this method used to share the name ``test_several_quantiles``
    # with the test below, so the later definition replaced it in the class
    # namespace and this check never ran.
    def test_single_quantile(self):
        """Check SummaryEncoder fitted with one quantile at a time."""

        for quantile in [0.1, 0.5, 0.9]:
            self.assert_same_quantile(quantile)

    def test_several_quantiles(self):
        """Check SummaryEncoder fitted once with several unsorted quantiles."""

        quantile_list = [0.2, 0.1, 0.8]

        summary_results = encoders.SummaryEncoder(
            cols=[self.col], quantiles=quantile_list
        ).fit_transform(self.df, self.target)

        for quantile in quantile_list:

            quantile_results = encoders.QuantileEncoder(
                cols=[self.col], quantile=quantile
            ).fit_transform(self.df, self.target)

            percentile = round(quantile * 100)

            np.testing.assert_allclose(
                quantile_results[self.col].values,
                summary_results[f"{self.col}_{percentile}"].values,
            )