"""
Implements the TableVectorizer: a preprocessor to automatically apply
transformers/encoders to different types of data, without the need to
manually categorize them beforehand, or construct complex Pipelines.
"""
from collections import Counter
import numpy as np
import pandas as pd
from pandas.api.types import (
CategoricalDtype,
is_datetime64_any_dtype,
is_extension_array_dtype,
is_numeric_dtype,
is_object_dtype,
)
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.compose._column_transformer import _get_transformer_list
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import Bunch
from sklearn.utils.validation import check_is_fitted
from skrub import DatetimeEncoder, GapEncoder, to_datetime
from skrub._utils import clone_if_default
HIGH_CARDINALITY_TRANSFORMER = GapEncoder(n_components=30)
LOW_CARDINALITY_TRANSFORMER = OneHotEncoder(
sparse_output=False,
handle_unknown="ignore",
drop="if_binary",
)
DATETIME_TRANSFORMER = DatetimeEncoder()
def _to_numeric(X):
"""Convert the columns of a dataframe into a numeric representation.
Parameters
----------
X : pandas.DataFrame
Returns
-------
X : pandas.DataFrame
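Examples
--------
A minimal sketch of the expected behavior (the integer dtype may vary
with platform and pandas version):
>>> import pandas as pd
>>> df = pd.DataFrame({"a": ["1", "2"], "b": ["x", "y"]})
>>> _to_numeric(df).dtypes
a     int64
b    object
dtype: object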
"""
X_out = dict()
for col in X.columns:
if not is_datetime64_any_dtype(X[col]):
# We don't use errors="ignore" because it casts string
# and categories to object dtype in Pandas < 2.0.
# TODO: replace 'raise' by 'ignore' and remove the exception
# catching when the minimum pandas version of skrub is 2.0.
try:
X_out[col] = pd.to_numeric(X[col], errors="raise")
continue
except (ValueError, TypeError):
pass
X_out[col] = X[col]
return pd.DataFrame(X_out, index=X.index)
def _replace_missing_indicators(column):
"""Replace missing indicators, e.g., #NA, with np.nan and returns a copy.
Parameters
----------
column : pandas.Series
Returns
-------
column : pandas.Series
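Examples
--------
A small sketch of the expected behavior:
>>> import pandas as pd
>>> s = pd.Series(["a", "N/A", "   ", "?"])
>>> _replace_missing_indicators(s).isna().tolist()
[False, True, True, True]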
"""
# Taken from pandas.io.parsers (version 1.1.4)
STR_NA_VALUES = [
"null",
"",
"1.#QNAN",
"#NA",
"nan",
"#N/A N/A",
"-1.#QNAN",
"<NA>",
"-1.#IND",
"-nan",
"n/a",
"-NaN",
"1.#IND",
"NULL",
"NA",
"N/A",
"#N/A",
"NaN",
None,
"?",
"...",
]
# Also replace whitespace-only strings with missing values.
column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
return column
def _union_category(X_col, dtype):
"""Update a categorical dtype with new entries."""
known_categories = dtype.categories
new_categories = pd.unique(X_col.loc[X_col.notnull()])
dtype = pd.CategoricalDtype(categories=known_categories.union(new_categories))
return dtype
def _clone_during_fit(transformer, remainder, n_jobs):
if isinstance(transformer, TransformerMixin):
return _propagate_n_jobs(clone(transformer), n_jobs)
elif transformer == "remainder":
return remainder if isinstance(remainder, str) else clone(remainder)
elif transformer == "passthrough":
return transformer
else:
raise ValueError(
"'transformer' must be an instance of sklearn.base.TransformerMixin, "
f"'remainder' or 'passthrough'. Got {transformer=!r}."
)
def _check_specific_transformers(specific_transformers, n_jobs):
if (specific_transformers is None) or len(specific_transformers) == 0:
return []
else:
first_item_length = len(specific_transformers[0])
# Check that all tuples have the same length
for idx, tuple_ in enumerate(specific_transformers):
if len(tuple_) != first_item_length:
raise TypeError(
"Expected `specific_transformers` to be a list of "
"tuples with all the same length, got length "
f"{len(tuple_)} at index {idx} (elements at previous "
f"indices have {first_item_length} in length). "
)
if first_item_length == 2:
# Unnamed assignments, transform to named
specific_transformers = _get_transformer_list(specific_transformers)
elif first_item_length == 3:
# Named assignments, no-op
pass
else:
raise TypeError(
"Expected `specific_transformers` to be a list of tuples "
"of length 2 or 3, got a list of tuples of length "
f"{first_item_length}. "
)
return [
(
(name, _propagate_n_jobs(clone(transformer), n_jobs), cols)
if isinstance(transformer, TransformerMixin)
else (name, transformer, cols)
)
for name, transformer, cols in specific_transformers
]
def _propagate_n_jobs(transformer, n_jobs):
if n_jobs is not None and (
hasattr(transformer, "n_jobs") and transformer.n_jobs is None
):
transformer.set_params(n_jobs=n_jobs)
return transformer
class TableVectorizer(TransformerMixin, BaseEstimator):
"""Automatically transform a heterogeneous dataframe to a numerical array.
Easily transforms a heterogeneous data table
(such as a :obj:`pandas.DataFrame`) to a numerical array for machine
learning. To do so, the TableVectorizer transforms each column depending
on its data type.
Parameters
----------
cardinality_threshold : int, default=40
Categorical features are split into two groups based on this value:
features whose cardinality is strictly below this threshold are treated
as low cardinality, and those at or above it as high cardinality. For
example, with the default threshold of 40, a column with 39 unique
values falls in the low cardinality group.
Different transformers will be applied to these two groups,
defined by the parameters `low_cardinality_transformer` and
`high_cardinality_transformer` respectively.
Note: currently, missing values are counted as a single unique value
(so they count in the cardinality).
low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \
or Transformer, optional
Transformer used on categorical/string features with low cardinality
(threshold is defined by `cardinality_threshold`).
Can be either:
- a transformer object instance (e.g. OneHotEncoder)
- a Pipeline containing the preprocessing steps
- 'drop' for dropping the columns
- 'remainder' for applying `remainder`
- 'passthrough' to return the unencoded columns
The default transformer is
``OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False)``.
When the downstream estimator is a tree-based model
(e.g., scikit-learn HistGradientBoostingRegressor), the OneHotEncoder
may lead to lower performance than other transformers,
such as the OrdinalEncoder.
high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \
or Transformer, optional
Transformer used on categorical/string features with high cardinality
(threshold is defined by `cardinality_threshold`).
Can either be a transformer object instance
(e.g. GapEncoder), a Pipeline containing the preprocessing steps,
'drop' for dropping the columns,
'remainder' for applying `remainder`,
or 'passthrough' to return the unencoded columns.
The default transformer is ``GapEncoder(n_components=30)``.
numerical_transformer : {'drop', 'remainder', 'passthrough'} \
or Transformer, optional
Transformer used on numerical features.
Can either be a transformer object instance (e.g. StandardScaler),
a Pipeline containing the preprocessing steps,
'drop' for dropping the columns,
'remainder' for applying `remainder`,
or 'passthrough' to return the unencoded columns (default).
datetime_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
Transformer used on datetime features.
Can either be a transformer object instance (e.g. DatetimeEncoder),
a Pipeline containing the preprocessing steps,
'drop' for dropping the columns,
'remainder' for applying `remainder`,
or 'passthrough' to return the unencoded columns.
The default transformer is ``DatetimeEncoder()``.
specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or \
Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} \
or Transformer, list of str or int), optional
On top of the default column type classification (see parameters above),
this parameter allows you to manually specify transformers for
specific columns.
This is equivalent to using a ColumnTransformer for assigning the
column-specific transformers, and passing the ``TableVectorizer``
as the ``remainder``.
This parameter can take two different formats, either:
- a list of 2-tuples (transformer, column names or indices)
- a list of 3-tuples (name, transformer, column names or indices)
In the latter format, you can specify the name of the assignment.
Mixing the two is not supported.
auto_cast : bool, default=True
If set to ``True``, calling ``fit``, ``transform`` or ``fit_transform``
will call ``_auto_cast`` to convert each column to the "optimal" dtype
for scikit-learn estimators.
The main heuristics are the following:
- pandas extension dtypes conversion to numpy dtype
- datetime conversion using ``skrub.to_datetime``
- numeric conversion using ``pandas.to_numeric``
- numeric columns with missing values are converted to float so that
missing entries can be represented as np.nan
- categorical column dtypes are updated with the new entries (if any)
during transform.
remainder : {'drop', 'passthrough'} or Transformer, default='passthrough'
By default, all remaining columns that were not specified in `transformers`
will be automatically passed through. This subset of columns is concatenated
with the output of the transformers.
By specifying ``remainder='drop'``, only the specified columns
in `transformers` are transformed and combined in the output, and the
non-specified columns are dropped.
By setting `remainder` to be an estimator, the remaining
non-specified columns will use the `remainder` estimator. The
estimator must support ``fit`` and ``transform``.
Note that using this feature requires that the DataFrame columns
input at ``fit`` and ``transform`` have identical order.
sparse_threshold : float, default=0.0
If the output of the different transformers contains sparse matrices,
these will be stacked as a sparse matrix if the overall density is
lower than this value. Use ``sparse_threshold=0`` to always return dense.
When the transformed output consists of all dense data, the stacked
result will be dense, and this keyword will be ignored.
Note that with the default encoders, the output will never be sparse.
n_jobs : int, default=None
Number of jobs to run in parallel. This number of jobs will be dispatched to
the underlying transformers, if those support parallelization and they do not
set specifically ``n_jobs``.
``None`` (the default) means 1 unless in a :func:`joblib.parallel_config`
context. ``-1`` means using all processors.
transformer_weights : dict, default=None
Multiplicative weights for features per transformer. The output of the
transformer is multiplied by these weights. Keys are transformer names,
values the weights.
verbose : bool, default=False
If True, the time elapsed while fitting each transformer will be
printed as it is completed.
verbose_feature_names_out : bool, default=False
If True, :meth:`TableVectorizer.get_feature_names_out` will prefix
all feature names with the name of the transformer that generated that
feature.
If False, :meth:`TableVectorizer.get_feature_names_out` will not
prefix any feature names and will error if feature names are not
unique.
Attributes
----------
transformers_ : list of 3-tuples (str, Transformer or str, list of str)
The collection of fitted transformers as tuples of
(name, fitted_transformer, column). `fitted_transformer` can be an
estimator, 'drop', or 'passthrough'. In case there were no columns
selected, this will be an unfitted transformer.
If there are remaining columns, the final element is a tuple of the
form:
('remainder', transformer, remaining_columns) corresponding to the
`remainder` parameter. If there are remaining columns, then
``len(transformers_)==len(transformers)+1``, otherwise
``len(transformers_)==len(transformers)``.
inferred_column_types_ : dict mapping of str to dtype
A mapping from column names to their inferred dtypes.
See Also
--------
GapEncoder :
Encodes dirty categories (strings) by constructing latent topics with \
continuous encoding.
MinHashEncoder :
Encode string columns as a numeric array with the minhash method.
SimilarityEncoder :
Encode string columns as a numeric array with n-gram string similarity.
Notes
-----
The column order of the input data is not guaranteed to be the same
as the output data (returned by TableVectorizer.transform).
This is due to the way the underlying ColumnTransformer works.
However, the output column order will always be the same for different
calls to ``TableVectorizer.transform`` on the same fitted TableVectorizer instance.
For example, if input data has columns ['name', 'job', 'year'], then output
columns might be shuffled, e.g. ['job', 'year', 'name'], but every call
to ``TableVectorizer.transform`` on this instance will return this order.
Examples
--------
Fit a TableVectorizer on an example dataset:
>>> from skrub.datasets import fetch_employee_salaries
>>> ds = fetch_employee_salaries()
>>> ds.X.head(3)
gender department ... date_first_hired year_first_hired
0 F POL ... 09/22/1986 1986
1 M POL ... 09/12/1988 1988
2 F HHS ... 11/19/1989 1989
[3 rows x 8 columns]
>>> tv = TableVectorizer()
>>> tv.fit(ds.X)
TableVectorizer()
Now, we can inspect the transformers assigned to each column:
>>> tv.transformers_
[('numeric', 'passthrough', ['year_first_hired']), \
('datetime', DatetimeEncoder(), ['date_first_hired']), \
('low_cardinality', OneHotEncoder(drop='if_binary', handle_unknown='ignore', \
sparse_output=False), \
['gender', 'department', 'department_name', 'assignment_category']), \
('high_cardinality', GapEncoder(n_components=30), \
['division', 'employee_position_title'])]
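Columns can also be assigned to a chosen transformer explicitly through
``specific_transformers``, with either 2-tuples or named 3-tuples; a
sketch (not fitted on the data above):
>>> from skrub import DatetimeEncoder
>>> tv = TableVectorizer(
...     specific_transformers=[("dates", DatetimeEncoder(), ["date_first_hired"])],
... )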
"""
def __init__(
self,
*,
cardinality_threshold=40,
low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER,
high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER,
numerical_transformer="passthrough",
datetime_transformer=DATETIME_TRANSFORMER,
specific_transformers=None,
auto_cast=True,
remainder="passthrough",
sparse_threshold=0.0,
n_jobs=None,
transformer_weights=None,
verbose=False,
verbose_feature_names_out=False,
):
self.cardinality_threshold = cardinality_threshold
self.low_cardinality_transformer = clone_if_default(
low_cardinality_transformer, LOW_CARDINALITY_TRANSFORMER
)
self.high_cardinality_transformer = clone_if_default(
high_cardinality_transformer, HIGH_CARDINALITY_TRANSFORMER
)
self.datetime_transformer = clone_if_default(
datetime_transformer, DATETIME_TRANSFORMER
)
self.numerical_transformer = numerical_transformer
self.specific_transformers = specific_transformers
self.auto_cast = auto_cast
# Parameters forwarded to `ColumnTransformer`
self.remainder = remainder
self.sparse_threshold = sparse_threshold
self.n_jobs = n_jobs
self.transformer_weights = transformer_weights
self.verbose = verbose
self.verbose_feature_names_out = verbose_feature_names_out
def _clone_transformers(self):
"""
For each of the different transformers that can be passed,
create the corresponding variable name with a trailing underscore,
which is the value that will be used in `transformers`.
We clone the instances to avoid altering them.
See the clone function docstring.
Note: invalid values (e.g. typos in the string options) are not detected
here; they are caught later, in ColumnTransformer.fit_transform.
"""
for transformer_name in [
"high_cardinality_transformer",
"low_cardinality_transformer",
"datetime_transformer",
"numerical_transformer",
]:
transformer = _clone_during_fit(
getattr(self, transformer_name),
remainder=self.remainder,
n_jobs=self.n_jobs,
)
setattr(self, f"{transformer_name}_", transformer)
self.specific_transformers_ = _check_specific_transformers(
self.specific_transformers,
self.n_jobs,
)
def _auto_cast(self, X, reset=True):
"""Convert each column of a dataframe to the "optimal" dtype \
for scikit-learn estimators.
The main heuristics are the following:
- pandas extension dtypes conversion to numpy dtype
- datetime conversion using ``skrub.to_datetime``
- numeric conversion using ``pandas.to_numeric``
- numeric columns with missing values are converted to float so that
missing entries can be represented as np.nan
- categorical column dtypes are updated with the new entries (if any)
during transform.
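For example, a column of strings such as ``["1", "2"]`` is cast to an
integer dtype, and ``["2021-01-02", "2021-03-04"]`` is parsed as
``datetime64``.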
Parameters
----------
X : :obj:`~pandas.DataFrame` of shape (n_samples, n_features)
The data to be transformed.
reset : bool, default=True
If set to ``True`` (during fit), creates ``inferred_column_types_``,
the mapping between columns of the training dataframe and their types.
If set to ``False`` (during transform), updates ``inferred_column_types_``
for the categorical columns with the new categories seen during transform.
Returns
-------
X : :obj:`~pandas.DataFrame`
The same :obj:`~pandas.DataFrame`, with its columns cast.
"""
if self.auto_cast:
for col in X.columns:
X[col] = _replace_missing_indicators(X[col])
# Some numerical dtypes like Int64 or Float64 only support
# pd.NA, so they must be converted to np.float64 before imputing
# with np.nan.
if is_numeric_dtype(X[col]) and X[col].isna().any():
X[col] = X[col].astype(np.float64)
# Cast pandas dtypes to numpy dtypes for earlier versions of sklearn.
# Categorical dtypes don't need to be cast.
# Note that 'is_category_dtype' is deprecated.
if (
is_extension_array_dtype(X[col])
and not isinstance(X[col].dtype, CategoricalDtype)
and not is_datetime64_any_dtype(X[col])
):
dtype = X[col].dtype.type
X[col] = X[col].astype(dtype)
# When converting from string to object dtype, <NA> values become '<NA>',
# so we need to replace false missing values once more.
X[col] = _replace_missing_indicators(X[col])
# For object dtype columns, convert to string to avoid mixed types.
if is_object_dtype(X[col]):
mask = X[col].notnull()
X.loc[mask, col] = X.loc[mask, col].astype(str)
if reset:
X = to_datetime(X)
X = _to_numeric(X)
self.inferred_column_types_ = X.dtypes.to_dict()
else:
category_columns = X.select_dtypes("category").columns
for col in category_columns:
dtype = self.inferred_column_types_[col]
dtype = _union_category(X[col], dtype)
self.inferred_column_types_[col] = dtype
X[col] = X[col].astype(dtype)
# Enforce dtypes conversion using the dtypes seen during fit.
# As this behavior is more aggressive than skrub's to_datetime
# or _to_numeric and only makes sense for the TableVectorizer, we
# define it here rather than within these two functions.
# See: https://github.com/skrub-data/skrub/issues/837
for column, dtype in self.inferred_column_types_.items():
if is_numeric_dtype(dtype):
X[column] = pd.to_numeric(X[column], errors="coerce")
elif is_datetime64_any_dtype(dtype):
X[column] = pd.to_datetime(X[column], errors="coerce")
else:
X[column] = X[column].astype(dtype, errors="ignore")
return X
def _check_X(self, X):
if sparse.isspmatrix(X):
raise TypeError(
"A sparse matrix was passed, but dense data is required. Use "
"X.toarray() to convert to a dense numpy array."
)
if not isinstance(X, pd.DataFrame):
# Check the dimension of X before creating a dataframe, which always
# has `ndim == 2`.
# Unfortunately, we need to call `asarray` before calling `ndim`,
# in case the container implements `__array_function__`.
X_array = np.asarray(X)
if X_array.ndim == 0:
raise ValueError(
f"Expected 2D array, got scalar array instead:\narray={X}.\n"
"Reshape your data either using array.reshape(-1, 1) if "
"your data has a single feature or array.reshape(1, -1) "
"if it contains a single sample."
)
if X_array.ndim == 1:
raise ValueError(
f"Expected 2D array, got 1D array instead:\narray={X}.\n"
"Reshape your data either using array.reshape(-1, 1) if "
"your data has a single feature or array.reshape(1, -1) "
"if it contains a single sample."
)
feature_names = getattr(self, "feature_names_in_", None)
X = pd.DataFrame(X_array, columns=feature_names)
else:
# Create a copy to avoid altering the original data.
X = X.copy()
# Check for duplicate column names.
duplicate_columns = {k for k, v in Counter(X.columns).items() if v > 1}
if len(duplicate_columns) > 0:
raise AssertionError(
f"Duplicate column names in the dataframe: {duplicate_columns}"
)
# Check Pandas sparse arrays
sparse_cols = [
col for col in X.columns if isinstance(X[col].dtype, pd.SparseDtype)
]
if len(sparse_cols) > 0:
raise TypeError(
f"Columns {sparse_cols!r} are sparse Pandas series, but dense "
"data is required. Use df[col].sparse.to_dense() to convert "
"a series from sparse to dense."
)
if X.shape[0] < 1:
raise ValueError(
f"Found array with {X.shape[0]} sample(s) (shape={X.shape}) while a"
" minimum of 1 is required."
)
if X.shape[1] < 1:
raise ValueError(
f"Found array with {X.shape[1]} feature(s) (shape={X.shape}) while"
" a minimum of 1 is required."
)
return X
def fit(self, X, y=None):
"""Fit all transformers using X.
Parameters
----------
X : dataframe of shape (n_samples, n_features)
Input data, of which specified subsets are used to fit the
transformers.
y : array-like of shape (n_samples, ...), default=None
Targets for supervised learning.
Returns
-------
self : TableVectorizer
This estimator.
"""
# We use fit_transform to make sure sparse_output_ is set (it requires
# the transformed data), so the output type is consistent in predict.
self.fit_transform(X, y=y)
return self
def fit_transform(self, X, y=None):
"""Fit all transformers, transform the data, and concatenate the results.
In practice, it:
- Converts features to their best possible types for scikit-learn estimators
if ``auto_cast=True`` (see ``auto_cast`` docstring).
- Classifies columns based on their data types and matches them to the
corresponding dtype-specific transformers.
- Uses a scikit-learn ColumnTransformer to run fit_transform on all transformers.
Parameters
----------
X : dataframe of shape (n_samples, n_features)
Input data, of which specified subsets are used to fit the
transformers.
y : array-like of shape (n_samples,), optional
Targets for supervised learning.
Returns
-------
{array-like, sparse matrix} of shape (n_samples, sum_n_components)
Hstack of results of transformers. sum_n_components is the
sum of n_components (output dimension) over transformers. If
any result is a sparse matrix, everything will be converted to
sparse matrices.
"""
self._clone_transformers()
self._check_feature_names(X, reset=True)
X = self._check_X(X)
self._check_n_features(X, reset=True)
X = self._auto_cast(X, reset=True)
# Filter ``X`` to keep only the columns that are not specified
# explicitly by the user.
X_filtered = X.drop(
columns=[
col
for (_, _, columns) in self.specific_transformers_
for col in columns
]
)
# Select columns by dtype
numeric_columns = X_filtered.select_dtypes(include="number").columns.to_list()
categorical_columns = X_filtered.select_dtypes(
include=["string", "object", "category"]
).columns.to_list()
datetime_columns = X_filtered.select_dtypes(
include=["datetime", "datetimetz"]
).columns.to_list()
# Classify categorical columns by cardinality
low_cardinality_columns, high_cardinality_columns = [], []
for col in categorical_columns:
if X[col].nunique() < self.cardinality_threshold:
low_cardinality_columns.append(col)
else:
high_cardinality_columns.append(col)
all_transformers = [
("numeric", self.numerical_transformer_, numeric_columns),
("datetime", self.datetime_transformer_, datetime_columns),
(
"low_cardinality",
self.low_cardinality_transformer_,
low_cardinality_columns,
),
(
"high_cardinality",
self.high_cardinality_transformer_,
high_cardinality_columns,
),
*self.specific_transformers_,
]
# Filter this list, keeping only transformers with:
# - at least one column
# - a valid encoder or string (filter out if None)
transformers = []
for name, transformer, columns in all_transformers:
if len(columns) > 0 and transformer is not None:
transformers.append((name, transformer, columns))
if self.verbose:
print(f"[TableVectorizer] Assigned transformers: {transformers}")
self._column_transformer = ColumnTransformer(
transformers=transformers,
remainder=self.remainder,
sparse_threshold=self.sparse_threshold,
n_jobs=1, # we don't parallelize the outer loop
transformer_weights=self.transformer_weights,
verbose=self.verbose,
verbose_feature_names_out=self.verbose_feature_names_out,
)
X_enc = self._column_transformer.fit_transform(X, y=y)
return X_enc
def transform(self, X):
"""Transform ``X`` by applying the fitted transformers on the columns.
Parameters
----------
X : dataframe of shape (n_samples, n_features)
The data to be transformed.
Returns
-------
{array-like, sparse matrix} of shape (n_samples, sum_n_components)
Hstack of results of transformers. sum_n_components is the
sum of n_components (output dimension) over transformers. If
any result is a sparse matrix, everything will be converted to
sparse matrices.
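For instance, with the fitted instance from the class docstring example,
``tv.transform(ds.X)`` returns one encoded row per input row.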
"""
check_is_fitted(self, attributes=["_column_transformer"])
X = self._check_X(X)
X = self._auto_cast(X, reset=False)
return self._column_transformer.transform(X)
def get_feature_names_out(self, input_features=None):
"""Return clean feature names.
Feature names are formatted like:
"<column_name>_<value>" if encoded by OneHotEncoder or alike,
(e.g. "job_title_Police officer"), or "<column_name>" otherwise.
Parameters
----------
input_features : None
Unused, only here for compatibility.
Returns
-------
feature_names : ndarray of str
Feature names.
"""
return self._column_transformer.get_feature_names_out(input_features)
@property
def transformers_(self):
"""Transformers applied to the different columns."""
# For the "remainder" columns, the `ColumnTransformer` `transformers_`
# attribute contains column indices instead of column names,
# so we convert the values to the appropriate column names
# if there are fewer than 20 columns in the remainder.
transformers = []
for name, transformer, columns in self._column_transformer.transformers_:
# TODO: potentially remove when
# https://github.com/scikit-learn/scikit-learn/issues/27533 is resolved.
if name == "remainder" and len(columns) < 20:
columns = self.feature_names_in_[columns].tolist()
transformers.append((name, transformer, columns))
return transformers
@property
def named_transformers_(self) -> Bunch:
"""Map transformer names to transformer objects.
Read-only attribute to access any transformer by given name.
Keys are transformer names and values are the fitted transformer
objects.
"""
return self._column_transformer.named_transformers_
@property
def sparse_output_(self) -> bool:
"""Whether the output of ``transform`` is sparse or dense.
Boolean flag indicating whether the output of ``transform`` is a
sparse matrix or a dense numpy array, which depends on the output
of the individual transformers and the `sparse_threshold` keyword.
"""
return self._column_transformer.sparse_output_
@property
def output_indices_(self) -> dict[str, slice]:
"""Map the transformer names to their input indices.
A dictionary from each transformer name to a slice, where the slice
corresponds to indices in the transformed output. This is useful to
inspect which transformer is responsible for which transformed
feature(s).
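For example, ``X_enc[:, tv.output_indices_["datetime"]]`` selects the
features produced by the datetime transformer, ``tv`` and ``X_enc``
being a hypothetical fitted instance and its transformed output.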
"""
return self._column_transformer.output_indices_
def _more_tags(self) -> dict:
"""
Used internally by sklearn to ease the estimator checks.
"""
return {
"X_types": ["2darray", "string"],
"allow_nan": [True],
"_xfail_checks": {
"check_complex_data": "Passthrough complex columns as-is.",
},
}