diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index fba40e25a9e7e..9c1084e393e8d 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -53,6 +53,12 @@ Changelog :pr:`20880` by :user:`Guillaume Lemaitre ` and :user:`András Simon `. +:mod:`sklearn.pipeline` +....................... + +- |Enhancement| Added support for "passthrough" in :class:`FeatureUnion`. + Setting a transformer to "passthrough" will pass the features unchanged. + :pr:`20860` by :user:`Shubhraneel Pal `. Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 9d4997686612b..e2f9b0f0950ec 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -17,6 +17,7 @@ from joblib import Parallel from .base import clone, TransformerMixin +from .preprocessing import FunctionTransformer from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import available_if from .utils import ( @@ -853,8 +854,9 @@ class FeatureUnion(TransformerMixin, _BaseComposition): Parameters of the transformers may be set using its name and the parameter name separated by a '__'. A transformer may be replaced entirely by - setting the parameter with its name to another transformer, - or removed by setting to 'drop'. + setting the parameter with its name to another transformer, removed by + setting to 'drop' or disabled by setting to 'passthrough' (features are + passed without transformation). Read more in the :ref:`User Guide `. @@ -862,12 +864,14 @@ class FeatureUnion(TransformerMixin, _BaseComposition): Parameters ---------- - transformer_list : list of tuple - List of tuple containing `(str, transformer)`. The first element - of the tuple is name affected to the transformer while the - second element is a scikit-learn transformer instance. - The transformer instance can also be `"drop"` for it to be - ignored. 
+ transformer_list : list of (str, transformer) tuples + List of transformer objects to be applied to the data. The first + half of each tuple is the name of the transformer. The transformer can + be 'drop' for it to be ignored or can be 'passthrough' for features to + be passed unchanged. + + .. versionadded:: 1.1 + Added the option `"passthrough"`. .. versionchanged:: 0.22 Deprecated `None` as a transformer in favor of 'drop'. @@ -977,7 +981,7 @@ def _validate_transformers(self): # validate estimators for t in transformers: - if t == "drop": + if t in ("drop", "passthrough"): continue if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr( t, "transform" @@ -1004,12 +1008,15 @@ def _iter(self): Generate (name, trans, weight) tuples excluding None and 'drop' transformers. """ + get_weight = (self.transformer_weights or {}).get - return ( - (name, trans, get_weight(name)) - for name, trans in self.transformer_list - if trans != "drop" - ) + + for name, trans in self.transformer_list: + if trans == "drop": + continue + if trans == "passthrough": + trans = FunctionTransformer() + yield (name, trans, get_weight(name)) @deprecated( "get_feature_names is deprecated in 1.0 and will be removed " diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 445bd9064b959..fa01b6e834b11 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1004,6 +1004,60 @@ def test_set_feature_union_step_drop(get_names): assert not record +def test_set_feature_union_passthrough(): + """Check the behaviour of setting a transformer to `"passthrough"`.""" + mult2 = Mult(2) + mult3 = Mult(3) + X = np.asarray([[1]]) + + ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) + assert_array_equal([[2, 3]], ft.fit(X).transform(X)) + assert_array_equal([[2, 3]], ft.fit_transform(X)) + + ft.set_params(m2="passthrough") + assert_array_equal([[1, 3]], ft.fit(X).transform(X)) + assert_array_equal([[1, 3]], ft.fit_transform(X)) + + 
ft.set_params(m3="passthrough") + assert_array_equal([[1, 1]], ft.fit(X).transform(X)) + assert_array_equal([[1, 1]], ft.fit_transform(X)) + + # check we can change back + ft.set_params(m3=mult3) + assert_array_equal([[1, 3]], ft.fit(X).transform(X)) + assert_array_equal([[1, 3]], ft.fit_transform(X)) + + # Check 'passthrough' step at construction time + ft = FeatureUnion([("m2", "passthrough"), ("m3", mult3)]) + assert_array_equal([[1, 3]], ft.fit(X).transform(X)) + assert_array_equal([[1, 3]], ft.fit_transform(X)) + + X = iris.data + columns = X.shape[1] + pca = PCA(n_components=2, svd_solver="randomized", random_state=0) + + ft = FeatureUnion([("passthrough", "passthrough"), ("pca", pca)]) + assert_array_equal(X, ft.fit(X).transform(X)[:, :columns]) + assert_array_equal(X, ft.fit_transform(X)[:, :columns]) + + ft.set_params(pca="passthrough") + X_ft = ft.fit(X).transform(X) + assert_array_equal(X_ft, np.hstack([X, X])) + X_ft = ft.fit_transform(X) + assert_array_equal(X_ft, np.hstack([X, X])) + + ft.set_params(passthrough=pca) + assert_array_equal(X, ft.fit(X).transform(X)[:, -columns:]) + assert_array_equal(X, ft.fit_transform(X)[:, -columns:]) + + ft = FeatureUnion( + [("passthrough", "passthrough"), ("pca", pca)], + transformer_weights={"passthrough": 2}, + ) + assert_array_equal(X * 2, ft.fit(X).transform(X)[:, :columns]) + assert_array_equal(X * 2, ft.fit_transform(X)[:, :columns]) + + def test_step_name_validation(): error_message_1 = r"Estimator names must not contain __: got \['a__q'\]" error_message_2 = r"Names provided are not unique: \['a', 'a'\]"