-
-
Notifications
You must be signed in to change notification settings - Fork 25k
/
_function_transformer.py
431 lines (358 loc) · 16.2 KB
/
_function_transformer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
import warnings
import numpy as np
from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils._param_validation import StrOptions
from ..utils._set_output import ADAPTERS_MANAGER, _get_output_config
from ..utils.metaestimators import available_if
from ..utils.validation import (
_allclose_dense_sparse,
_check_feature_names_in,
_get_feature_names,
_is_pandas_df,
_is_polars_df,
check_array,
)
def _get_adapter_from_container(container):
    """Look up the output adapter registered for the library of `container`.

    The lookup key is the top-level package of the container's class, i.e. the
    part of its ``__module__`` that precedes the first dot.

    See :class:`sklearn.utils._set_output.ContainerAdapterProtocol` for more
    details.

    Parameters
    ----------
    container : object
        The container for which an adapter is requested.

    Returns
    -------
    adapter : object
        The entry of ``ADAPTERS_MANAGER.adapters`` stored under the package name.

    Raises
    ------
    ValueError
        If no adapter is registered under that package name.
    """
    module_name, _, _ = container.__class__.__module__.partition(".")
    try:
        adapter = ADAPTERS_MANAGER.adapters[module_name]
    except KeyError as exc:
        available_adapters = [*ADAPTERS_MANAGER.adapters]
        message = (
            "The container does not have a registered adapter in scikit-learn. "
            f"Available adapters are: {available_adapters} while the container "
            f"provided is: {container!r}."
        )
        raise ValueError(message) from exc
    return adapter
def _identity(X):
"""The identity function."""
return X
class FunctionTransformer(TransformerMixin, BaseEstimator):
    """Constructs a transformer from an arbitrary callable.

    A FunctionTransformer forwards its X argument (together with the optional
    keyword arguments given in `kw_args`) to a user-defined function or
    function object and returns the result of this function. This is useful
    for stateless transformations such as taking the log of frequencies, doing
    custom scaling, etc.

    Note: If a lambda is used as the function, then the resulting
    transformer will not be pickleable.

    .. versionadded:: 0.17

    Read more in the :ref:`User Guide <function_transformer>`.

    Parameters
    ----------
    func : callable, default=None
        The callable to use for the transformation. It is called with the
        (optionally validated) `X` given to `transform` and with the keyword
        arguments in `kw_args`. If func is None, then func will be the
        identity function.

    inverse_func : callable, default=None
        The callable to use for the inverse transformation. It is called with
        the `X` given to `inverse_transform` and with the keyword arguments in
        `inv_kw_args`. If inverse_func is None, then inverse_func will be the
        identity function.

    validate : bool, default=False
        Indicate that the input X array should be checked before calling
        ``func``. The possibilities are:

        - If False, there is no input validation.
        - If True, then X will be converted to a 2-dimensional NumPy array or
          sparse matrix. If the conversion is not possible an exception is
          raised.

        .. versionchanged:: 0.22
           The default of ``validate`` changed from True to False.

    accept_sparse : bool, default=False
        Indicate that func accepts a sparse matrix as input. If validate is
        False, this has no effect. Otherwise, if accept_sparse is false,
        sparse matrix inputs will cause an exception to be raised.

    check_inverse : bool, default=True
        Whether to check that ``func`` followed by ``inverse_func`` leads to
        the original inputs. It can be used for a sanity check, raising a
        warning when the condition is not fulfilled. The check only runs
        during `fit` and only when both ``func`` and ``inverse_func`` are
        provided.

        .. versionadded:: 0.20

    feature_names_out : callable, 'one-to-one' or None, default=None
        Determines the list of feature names that will be returned by the
        `get_feature_names_out` method. If it is 'one-to-one', then the output
        feature names will be equal to the input feature names. If it is a
        callable, then it must take two positional arguments: this
        `FunctionTransformer` (`self`) and an array-like of input feature names
        (`input_features`). It must return an array-like of output feature
        names. The `get_feature_names_out` method is only defined if
        `feature_names_out` is not None.

        See ``get_feature_names_out`` for more details.

        .. versionadded:: 1.1

    kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to func.

        .. versionadded:: 0.18

    inv_kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to inverse_func.

        .. versionadded:: 0.18

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    MaxAbsScaler : Scale each feature by its maximum absolute value.
    StandardScaler : Standardize features by removing the mean and
        scaling to unit variance.
    LabelBinarizer : Binarize labels in a one-vs-all fashion.
    MultiLabelBinarizer : Transform between iterable of iterables
        and a multilabel format.

    Notes
    -----
    If `func` returns an output with a `columns` attribute, then the columns are
    enforced to be consistent with the output of `get_feature_names_out`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import FunctionTransformer
    >>> transformer = FunctionTransformer(np.log1p)
    >>> X = np.array([[0, 1], [2, 3]])
    >>> transformer.transform(X)
    array([[0.       , 0.6931...],
           [1.0986..., 1.3862...]])
    """

    _parameter_constraints: dict = {
        "func": [callable, None],
        "inverse_func": [callable, None],
        "validate": ["boolean"],
        "accept_sparse": ["boolean"],
        "check_inverse": ["boolean"],
        "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
        "kw_args": [dict, None],
        "inv_kw_args": [dict, None],
    }

    def __init__(
        self,
        func=None,
        inverse_func=None,
        *,
        validate=False,
        accept_sparse=False,
        check_inverse=True,
        feature_names_out=None,
        kw_args=None,
        inv_kw_args=None,
    ):
        self.func = func
        self.inverse_func = inverse_func
        self.validate = validate
        self.accept_sparse = accept_sparse
        self.check_inverse = check_inverse
        self.feature_names_out = feature_names_out
        self.kw_args = kw_args
        self.inv_kw_args = inv_kw_args

    def _check_input(self, X, *, reset):
        """Validate `X` if requested, otherwise only record its feature metadata.

        Parameters
        ----------
        X : object
            Input passed to `fit` or `transform`.

        reset : bool
            Forwarded to the validation helpers. When `validate=False` the
            feature metadata is only recorded if `reset` is True.

        Returns
        -------
        X : object
            The output of `_validate_data` when `validate=True`, else `X`
            unchanged.
        """
        if self.validate:
            return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)
        elif reset:
            # Set feature_names_in_ and n_features_in_ even if validate=False
            # We run this only when reset==True to store the attributes but not
            # validate them, because validate=False
            self._check_n_features(X, reset=reset)
            self._check_feature_names(X, reset=reset)
        return X

    def _check_inverse_transform(self, X):
        """Check that func and inverse_func are the inverse.

        The round trip is evaluated on a strided sub-sample of the rows of `X`
        (roughly 100 rows at most). A `UserWarning` is emitted when the round
        trip does not reproduce the sub-sample.

        Raises
        ------
        ValueError
            If `X` contains non-numerical data.
        """
        idx_selected = slice(None, None, max(1, X.shape[0] // 100))
        X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))

        if hasattr(X, "dtype"):
            dtypes = [X.dtype]
        elif hasattr(X, "dtypes"):
            # Dataframes can have multiple dtypes
            dtypes = X.dtypes
        else:
            # Containers exposing neither attribute used to leave `dtypes`
            # unbound (UnboundLocalError). Fall back to the dtype NumPy infers
            # for the sub-sample that is actually compared.
            dtypes = [np.asarray(X[idx_selected]).dtype]

        if not all(np.issubdtype(d, np.number) for d in dtypes):
            raise ValueError(
                "'check_inverse' is only supported when all the elements in `X` is"
                " numerical."
            )

        if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
            warnings.warn(
                (
                    "The provided functions are not strictly"
                    " inverse of each other. If you are sure you"
                    " want to proceed regardless, set"
                    " 'check_inverse=False'."
                ),
                UserWarning,
            )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit transformer by checking X.

        If ``validate`` is ``True``, ``X`` will be checked.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            FunctionTransformer class instance.
        """
        X = self._check_input(X, reset=True)
        # The round-trip check is meaningless when either side is the identity
        # placeholder, so it is skipped unless both callables were provided.
        if self.check_inverse and not (self.func is None or self.inverse_func is None):
            self._check_inverse_transform(X)
        return self

    def transform(self, X):
        """Transform X using the forward function.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.

        Raises
        ------
        ValueError
            If the output of `func` has a `columns` attribute whose names
            differ from `get_feature_names_out` and cannot be overridden.
        """
        X = self._check_input(X, reset=False)
        out = self._transform(X, func=self.func, kw_args=self.kw_args)
        output_config = _get_output_config("transform", self)["dense"]

        if hasattr(out, "columns") and self.feature_names_out is not None:
            # check the consistency between the column provided by `transform` and
            # the column names provided by `get_feature_names_out`.
            feature_names_out = self.get_feature_names_out()
            if list(out.columns) != list(feature_names_out):
                # we can override the column names of the output if it is inconsistent
                # with the column names provided by `get_feature_names_out` in the
                # following cases:
                # * `func` preserved the column names between the input and the output
                # * the output column names are not all strings
                feature_names_in = getattr(
                    X, "feature_names_in_", _get_feature_names(X)
                )
                same_feature_names_in_out = feature_names_in is not None and list(
                    feature_names_in
                ) == list(out.columns)
                not_all_str_columns = not all(
                    isinstance(col, str) for col in out.columns
                )
                if same_feature_names_in_out or not_all_str_columns:
                    adapter = _get_adapter_from_container(out)
                    out = adapter.create_container(
                        X_output=out,
                        X_original=out,
                        columns=feature_names_out,
                        inplace=False,
                    )
                else:
                    # Reuse the names computed above instead of calling
                    # `get_feature_names_out` a second time.
                    raise ValueError(
                        "The output generated by `func` have different column names "
                        "than the ones provided by `get_feature_names_out`. "
                        f"Got output with columns names: {list(out.columns)} and "
                        "`get_feature_names_out` returned: "
                        f"{list(feature_names_out)}. "
                        "The column names can be overridden by setting "
                        "`set_output(transform='pandas')` or "
                        "`set_output(transform='polars')` such that the column names "
                        "are set to the names provided by `get_feature_names_out`."
                    )

        if self.feature_names_out is None:
            # Without `feature_names_out`, `func` itself has to produce the
            # requested dataframe type; warn when it did not.
            warn_msg = (
                "When `set_output` is configured to be '{0}', `func` should return "
                "a {0} DataFrame to follow the `set_output` API or `feature_names_out`"
                " should be defined."
            )
            if output_config == "pandas" and not _is_pandas_df(out):
                warnings.warn(warn_msg.format("pandas"))
            elif output_config == "polars" and not _is_polars_df(out):
                warnings.warn(warn_msg.format("polars"))

        return out

    def inverse_transform(self, X):
        """Transform X using the inverse function.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `inverse_func` can handle
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.
        """
        if self.validate:
            X = check_array(X, accept_sparse=self.accept_sparse)
        return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)

    @available_if(lambda self: self.feature_names_out is not None)
    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        This method is only defined if `feature_names_out` is not None.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input feature names.

            - If `input_features` is None, then `feature_names_in_` is
              used as the input feature names. If `feature_names_in_` is not
              defined, then names are generated:
              `[x0, x1, ..., x(n_features_in_ - 1)]`.
            - If `input_features` is array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.

            - If `feature_names_out` is 'one-to-one', the input feature names
              are returned (see `input_features` above). This requires
              `feature_names_in_` and/or `n_features_in_` to be defined, which
              is done automatically if `validate=True`. Alternatively, you can
              set them in `func`.
            - If `feature_names_out` is a callable, then it is called with two
              arguments, `self` and `input_features`, and its return value is
              returned by this method.

        Raises
        ------
        ValueError
            If `feature_names_out` is neither 'one-to-one' nor a callable.
        """
        # Only resolve the input names when there is something to resolve them
        # from; otherwise `input_features` (None) is passed through as-is.
        if hasattr(self, "n_features_in_") or input_features is not None:
            input_features = _check_feature_names_in(self, input_features)
        if self.feature_names_out == "one-to-one":
            names_out = input_features
        elif callable(self.feature_names_out):
            names_out = self.feature_names_out(self, input_features)
        else:
            raise ValueError(
                f"feature_names_out={self.feature_names_out!r} is invalid. "
                'It must either be "one-to-one" or a callable with two '
                "arguments: the function transformer and an array-like of "
                "input feature names. The callable must return an array-like "
                "of output feature names."
            )
        return np.asarray(names_out, dtype=object)

    def _transform(self, X, func=None, kw_args=None):
        """Call `func` (identity if None) on `X` with the optional `kw_args`."""
        if func is None:
            func = _identity

        return func(X, **(kw_args if kw_args else {}))

    def __sklearn_is_fitted__(self):
        """Return True since FunctionTransformer is stateless."""
        return True

    def _more_tags(self):
        """Return the estimator tags specific to this transformer."""
        return {"no_validation": not self.validate, "stateless": True}

    def set_output(self, *, transform=None):
        """Set output container.

        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
        for an example on how to use the API.

        Parameters
        ----------
        transform : {"default", "pandas", "polars"}, default=None
            Configure output of `transform` and `fit_transform`.

            - `"default"`: Default output format of a transformer
            - `"pandas"`: DataFrame output
            - `"polars"`: Polars output
            - `None`: Transform configuration is unchanged

            .. versionadded:: 1.4
                `"polars"` option was added.

        Returns
        -------
        self : estimator instance
            Estimator instance.
        """
        # `None` is documented as "configuration is unchanged": do not
        # overwrite a previously stored setting with None.
        if transform is None:
            return self

        if not hasattr(self, "_sklearn_output_config"):
            self._sklearn_output_config = {}

        self._sklearn_output_config["transform"] = transform
        return self