Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+2] Change default of ColumnTransformer remainder from passthrough to drop #11603

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/modules/compose.rst
Expand Up @@ -413,7 +413,7 @@ variable, but apply a :class:`feature_extraction.text.CountVectorizer
<sklearn.feature_extraction.text.CountVectorizer>` to the ``'title'`` column.
As we might use multiple feature extraction methods on the same column, we give
each transformer a unique name, say ``'city_category'`` and ``'title_bow'``.
We can ignore the remaining rating columns by setting ``remainder='drop'``::
By default, the remaining rating columns are ignored (``remainder='drop'``)::

>>> from sklearn.compose import ColumnTransformer
>>> from sklearn.feature_extraction.text import CountVectorizer
Expand Down Expand Up @@ -495,7 +495,7 @@ above example would be::
... ('city', CountVectorizer(analyzer=lambda x: [x])),
... ('title', CountVectorizer()))
>>> column_trans # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
ColumnTransformer(n_jobs=1, remainder='passthrough', transformer_weights=None,
ColumnTransformer(n_jobs=1, remainder='drop', transformer_weights=None,
transformers=[('countvectorizer-1', ...)

.. topic:: Examples:
Expand Down
23 changes: 11 additions & 12 deletions sklearn/compose/_column_transformer.py
Expand Up @@ -71,14 +71,14 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
A callable is passed the input data `X` and can return any of the
above.

remainder : {'passthrough', 'drop'} or estimator, default 'passthrough'
By default, all remaining columns that were not specified in
`transformers` will be automatically passed through (default of
``'passthrough'``). This subset of columns is concatenated with the
output of the transformers.
By using ``remainder='drop'``, only the specified columns in
`transformers` are transformed and combined in the output, and the
non-specified columns are dropped.
remainder : {'passthrough', 'drop'} or estimator, default 'drop'
By default, only the specified columns in `transformers` are
transformed and combined in the output, and the non-specified
columns are dropped (default of ``'drop'``).
By specifying ``remainder='passthrough'``, all remaining columns that
were not specified in `transformers` will be automatically passed
through. This subset of columns is concatenated with the output of
the transformers.
By setting ``remainder`` to be an estimator, the remaining
non-specified columns will be transformed by the ``remainder``
estimator. The estimator must support `fit` and `transform`.
Expand Down Expand Up @@ -141,7 +141,7 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):

"""

def __init__(self, transformers, remainder='passthrough', n_jobs=1,
def __init__(self, transformers, remainder='drop', n_jobs=1,
transformer_weights=None):
self.transformers = transformers
self.remainder = remainder
Expand Down Expand Up @@ -658,8 +658,7 @@ def make_column_transformer(*transformers, **kwargs):
... (['numerical_column'], StandardScaler()),
... (['categorical_column'], OneHotEncoder()))
... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
ColumnTransformer(n_jobs=1, remainder='passthrough',
transformer_weights=None,
ColumnTransformer(n_jobs=1, remainder='drop', transformer_weights=None,
transformers=[('standardscaler',
StandardScaler(...),
['numerical_column']),
Expand All @@ -669,7 +668,7 @@ def make_column_transformer(*transformers, **kwargs):

"""
n_jobs = kwargs.pop('n_jobs', 1)
remainder = kwargs.pop('remainder', 'passthrough')
remainder = kwargs.pop('remainder', 'drop')
if kwargs:
raise TypeError('Unknown keyword arguments: "{}"'
.format(list(kwargs.keys())[0]))
Expand Down
31 changes: 17 additions & 14 deletions sklearn/compose/tests/test_column_transformer.py
Expand Up @@ -405,7 +405,7 @@ def test_column_transformer_get_set_params():
('trans2', StandardScaler(), [1])])

exp = {'n_jobs': 1,
'remainder': 'passthrough',
'remainder': 'drop',
'trans1': ct.transformers[0][1],
'trans1__copy': True,
'trans1__with_mean': True,
Expand All @@ -424,7 +424,7 @@ def test_column_transformer_get_set_params():

ct.set_params(trans1='passthrough')
exp = {'n_jobs': 1,
'remainder': 'passthrough',
'remainder': 'drop',
'trans1': 'passthrough',
'trans2': ct.transformers[1][1],
'trans2__copy': True,
Expand Down Expand Up @@ -492,7 +492,8 @@ def test_column_transformer_get_feature_names():
NotImplementedError, 'get_feature_names is not yet supported',
ct.get_feature_names)

ct = ColumnTransformer([('trans', DictVectorizer(), 0)])
ct = ColumnTransformer([('trans', DictVectorizer(), 0)],
remainder='passthrough')
ct.fit(X)
assert_raise_message(
NotImplementedError, 'get_feature_names is not yet supported',
Expand Down Expand Up @@ -552,23 +553,22 @@ def test_column_transformer_remainder():
X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
X_res_both = X_array

# default passthrough
ct = ColumnTransformer([('trans', Trans(), [0])])
assert_array_equal(ct.fit_transform(X_array), X_res_both)
assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
# default drop
ct = ColumnTransformer([('trans1', Trans(), [0])])
assert_array_equal(ct.fit_transform(X_array), X_res_first)
assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
assert len(ct.transformers_) == 2
assert ct.transformers_[-1][0] == 'remainder'
assert ct.transformers_[-1][1] == 'passthrough'
assert ct.transformers_[-1][1] == 'drop'
assert_array_equal(ct.transformers_[-1][2], [1])

# specify to drop remaining columns
ct = ColumnTransformer([('trans1', Trans(), [0])],
remainder='drop')
assert_array_equal(ct.fit_transform(X_array), X_res_first)
assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
# specify passthrough
ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough')
assert_array_equal(ct.fit_transform(X_array), X_res_both)
assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
assert len(ct.transformers_) == 2
assert ct.transformers_[-1][0] == 'remainder'
assert ct.transformers_[-1][1] == 'drop'
assert ct.transformers_[-1][1] == 'passthrough'
assert_array_equal(ct.transformers_[-1][2], [1])

# column order is not preserved (passed-through columns are added to the end)
Expand Down Expand Up @@ -602,6 +602,9 @@ def test_column_transformer_remainder():
"remainder keyword needs to be one of \'drop\', \'passthrough\', "
"or estimator.", ct.fit_transform, X_array)

# check default for make_column_transformer
ct = make_column_transformer(([0], Trans()))
assert ct.remainder == 'drop'

@pytest.mark.parametrize("key", [[0], np.array([0]), slice(0, 1),
np.array([True, False])])
Expand Down