-
-
Notifications
You must be signed in to change notification settings - Fork 25.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FEA allow unknowns in OrdinalEncoder transform #17406
Changes from 6 commits
04b263a
38456cc
82e31d3
439b08e
ce75639
274b999
3394490
6be44fa
19e9474
1d84ec4
72a7b75
c86a834
9b5359e
d02c410
215ec88
3127688
41fb313
8c8b565
fd7c9fe
46f8f1a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -4,6 +4,7 @@ | |||||||
|
||||||||
import numpy as np | ||||||||
from scipy import sparse | ||||||||
import numbers | ||||||||
|
||||||||
from ..base import BaseEstimator, TransformerMixin | ||||||||
from ..utils import check_array | ||||||||
|
@@ -115,7 +116,7 @@ def _transform(self, X, handle_unknown='error'): | |||||||
for i in range(n_features): | ||||||||
Xi = X_list[i] | ||||||||
diff, valid_mask = _check_unknown(Xi, self.categories_[i], | ||||||||
return_mask=True) | ||||||||
return_mask=True) | ||||||||
|
||||||||
if not np.all(valid_mask): | ||||||||
if handle_unknown == 'error': | ||||||||
|
@@ -621,6 +622,18 @@ class OrdinalEncoder(_BaseEncoder): | |||||||
dtype : number type, default np.float64 | ||||||||
Desired dtype of output. | ||||||||
|
||||||||
handle_unknown : 'error' or 'use_encoded_value', default='error' | ||||||||
When set to 'error' an error will be raised in case an unknown | ||||||||
categorical feature is present during transform. When set to | ||||||||
'use_encoded_value', the encoded value of unknown categories will be | ||||||||
set to the value given for the parameter unknown_value. In | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
:meth:inverse_transform, an unknown category will be denoted as None. | ||||||||
jnothman marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
|
||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please use |
||||||||
unknown_value : int, default=None | ||||||||
When the parameter handle_unknown is set to 'use_encoded_value', this | ||||||||
parameter is mandatory and will set the encoded value of unknown | ||||||||
jnothman marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
categories. | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please specify that unknown_value should be distinct from the value used to encode any categories. |
||||||||
|
||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please use |
||||||||
Attributes | ||||||||
---------- | ||||||||
categories_ : list of arrays | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This says that categories_ will correspond with the output of transform. I suspect this is no longer true. Is there a way we can make them line up? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct, in case of unknown categories there will be outputs of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure how to change the code here. But the description, yes. |
||||||||
|
@@ -657,9 +670,12 @@ class OrdinalEncoder(_BaseEncoder): | |||||||
""" | ||||||||
|
||||||||
@_deprecate_positional_args | ||||||||
def __init__(self, *, categories='auto', dtype=np.float64): | ||||||||
def __init__(self, *, categories='auto', dtype=np.float64, | ||||||||
handle_unknown='error', unknown_value=None): | ||||||||
self.categories = categories | ||||||||
self.dtype = dtype | ||||||||
self.handle_unknown = handle_unknown | ||||||||
self.unknown_value = unknown_value | ||||||||
|
||||||||
def fit(self, X, y=None): | ||||||||
""" | ||||||||
|
@@ -678,6 +694,14 @@ def fit(self, X, y=None): | |||||||
------- | ||||||||
self | ||||||||
""" | ||||||||
if self.handle_unknown == 'use_encoded_value': | ||||||||
if self.unknown_value is None: | ||||||||
raise TypeError("Please set unknown_value to an integer " | ||||||||
"value.") | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can have both checks raise the same error (may need to wrap)
Suggested change
|
||||||||
if not isinstance(self.unknown_value, numbers.Integral): | ||||||||
raise TypeError(f"The used value for unknown_value " | ||||||||
f"{self.unknown_value} is not an integer.") | ||||||||
|
||||||||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
self._fit(X) | ||||||||
|
||||||||
return self | ||||||||
|
@@ -696,7 +720,17 @@ def transform(self, X): | |||||||
X_out : sparse matrix or a 2-d array | ||||||||
Transformed input. | ||||||||
""" | ||||||||
X_int, _ = self._transform(X) | ||||||||
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) | ||||||||
|
||||||||
# create separate category for unknown values | ||||||||
if self.handle_unknown == 'use_encoded_value': | ||||||||
for i in range(len(self.categories_)): | ||||||||
if 0 <= self.unknown_value < len(self.categories_[i]): | ||||||||
raise ValueError(f"The used value for unknown_value " | ||||||||
f"{self.unknown_value} is one of the " | ||||||||
f"values already used for encoding the " | ||||||||
f"seen categories.") | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This validation check should be in |
||||||||
X_int[~X_mask[:, i], i] = self.unknown_value | ||||||||
return X_int.astype(self.dtype, copy=False) | ||||||||
|
||||||||
def inverse_transform(self, X): | ||||||||
|
@@ -731,6 +765,13 @@ def inverse_transform(self, X): | |||||||
|
||||||||
for i in range(n_features): | ||||||||
labels = X[:, i].astype('int64', copy=False) | ||||||||
X_tr[:, i] = self.categories_[i][labels] | ||||||||
# set unknown values to None | ||||||||
if self.handle_unknown == 'use_encoded_value': | ||||||||
X_tr[:, i] = np.where( | ||||||||
labels == self.unknown_value, None, | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can set |
||||||||
self.categories_[i][np.where( | ||||||||
labels == self.unknown_value, 0, labels)]) | ||||||||
else: | ||||||||
X_tr[:, i] = self.categories_[i][labels] | ||||||||
|
||||||||
return X_tr |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -553,6 +553,48 @@ def test_ordinal_encoder_raise_missing(X): | |||||
ohe.transform(X) | ||||||
|
||||||
|
||||||
def test_ordinal_encoder_handle_unknowns(): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's add separate test function for encoding numerical values. |
||||||
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should test that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what you mean. Using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why would that be an error? What's wrong with the user wanting to conflate unknowns with a specific known category? I had thought we would want to handle that case. If we won't allow that case then we should certainly test that an error is raised if it is attempted. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a test already for checking correct raise in case of using an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And nothing wrong with conflating a specific known category with unknowns. I think it is just a matter of taste whether we want to allow conflating or not. However, I would suggest to allow it only in mode There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TBH, I don't mind whether we allow or disallow it at this point, but we need to explicitly do one or the other, and test it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. Please let me know if the test I added for this (last check in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @thomasjpfan can you confirm whether it was your expectation to allow/disallow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm personally fine with being conservative and not allowing this for now. We can allow it later without breaking backward compatibility. The test is good |
||||||
X_fit = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object) | ||||||
X_trans = np.array([['c', 'xy'], ['bla', 'y'], ['a', 'x']], dtype=object) | ||||||
enc.fit(X_fit) | ||||||
|
||||||
X_trans_enc = enc.transform(X_trans) | ||||||
exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype='int64') | ||||||
assert_array_equal(X_trans_enc, exp) | ||||||
|
||||||
X_trans_inv = enc.inverse_transform(X_trans_enc) | ||||||
inv_exp = np.array([['c', None], [None, 'y'], ['a', 'x']], dtype=object) | ||||||
assert_array_equal(X_trans_inv, inv_exp) | ||||||
|
||||||
|
||||||
def test_ordinal_encoder_handle_unknowns_raise_fit(): | ||||||
X = np.array([['a', 'x'], ['b', 'y']], dtype=object) | ||||||
|
||||||
enc = OrdinalEncoder(handle_unknown='use_encoded_value') | ||||||
msg = ("Please set unknown_value to an integer value.") | ||||||
with pytest.raises(TypeError, match=msg): | ||||||
enc.fit(X) | ||||||
|
||||||
enc = OrdinalEncoder(handle_unknown='use_encoded_value', | ||||||
unknown_value='bla') | ||||||
msg = ("The used value for unknown_value bla is not an integer.") | ||||||
with pytest.raises(TypeError, match=msg): | ||||||
enc.fit(X) | ||||||
|
||||||
|
||||||
def test_ordinal_encoder_handle_unknowns_raise_transform(): | ||||||
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=1) | ||||||
X_fit = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object) | ||||||
X_trans = np.array([['c', 'xy'], ['bla', 'y'], ['a', 'x']], dtype=object) | ||||||
enc.fit(X_fit) | ||||||
|
||||||
msg = ("The used value for unknown_value 1 is one of the values already " | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I think this is a bit less ambiguous |
||||||
"used for encoding the seen categories.") | ||||||
with pytest.raises(ValueError, match=msg): | ||||||
enc.transform(X_trans) | ||||||
|
||||||
|
||||||
def test_ordinal_encoder_raise_categories_shape(): | ||||||
|
||||||
X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.