
[MRG] MAINT add base class for voting and stacking #15084

Merged 12 commits on Oct 5, 2019
5 changes: 5 additions & 0 deletions doc/whats_new/v0.22.rst
@@ -236,6 +236,11 @@ Changelog
:user:`Matt Hancock <notmatthancock>` and
:pr:`5963` by :user:`Pablo Duboue <DrDub>`.

- |Fix| Enforce the underlying estimators to be of the same type as the
  voting or stacking estimator. We introduced a new base class to raise
  a consistent error message in the Stacking and Voting estimators.
  :pr:`15084` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.feature_extraction`
.................................

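A minimal sketch of the behavior this changelog entry describes, assuming this PR's branch (the message text is taken from the diff below and may differ in later releases):

from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingClassifier

X, y = load_iris(return_X_y=True)
# A regressor inside a voting classifier now fails with the shared message:
# ValueError: The estimator LinearRegression should be a classifier.
eclf = VotingClassifier(estimators=[('lr', LinearRegression())])
eclf.fit(X, y)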
86 changes: 3 additions & 83 deletions sklearn/ensemble/_stacking.py
@@ -15,6 +15,7 @@
from ..base import MetaEstimatorMixin

from .base import _parallel_fit_estimator
from .base import _BaseHeterogeneousEnsemble

from ..linear_model import LogisticRegression
from ..linear_model import RidgeCV
@@ -32,80 +33,26 @@
from ..utils.validation import column_or_1d


class _BaseStacking(TransformerMixin, MetaEstimatorMixin, _BaseComposition,
class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble,
metaclass=ABCMeta):
"""Base class for stacking method."""
_required_parameters = ['estimators']

@abstractmethod
def __init__(self, estimators, final_estimator=None, cv=None,
stack_method='auto', n_jobs=None, verbose=0):
self.estimators = estimators
super().__init__(estimators=estimators)
self.final_estimator = final_estimator
self.cv = cv
self.stack_method = stack_method
self.n_jobs = n_jobs
self.verbose = verbose

@abstractmethod
def _validate_estimators(self):
if self.estimators is None or len(self.estimators) == 0:
raise ValueError(
"Invalid 'estimators' attribute, 'estimators' should be a list"
" of (string, estimator) tuples."
)
names, estimators = zip(*self.estimators)
self._validate_names(names)
return names, estimators

def _clone_final_estimator(self, default):
if self.final_estimator is not None:
self.final_estimator_ = clone(self.final_estimator)
else:
self.final_estimator_ = clone(default)

def set_params(self, **params):
"""Set the parameters for the stacking estimator.

Valid parameter keys can be listed with `get_params()`.

Parameters
----------
params : keyword arguments
Specific parameters using e.g.
`set_params(parameter_name=new_value)`. In addition, to setting the
parameters of the stacking estimator, the individual estimator of
the stacking estimators can also be set, or can be removed by
setting them to 'drop'.

Examples
--------
In this example, the RandomForestClassifier is removed.

>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.ensemble import StackingClassifier
>>> clf1 = LogisticRegression()
>>> clf2 = RandomForestClassifier()
>>> eclf = StackingClassifier(estimators=[('lr', clf1), ('rf', clf2)])
>>> eclf.set_params(rf='drop')
StackingClassifier(estimators=[('lr', LogisticRegression()),
('rf', 'drop')])
"""
super()._set_params('estimators', **params)
return self

def get_params(self, deep=True):
"""Get the parameters of the stacking estimator.

Parameters
----------
deep : bool
Setting it to True gets the various classifiers and the parameters
of the classifiers as well.
"""
return super()._get_params('estimators', deep=deep)

def _concatenate_predictions(self, predictions):
"""Concatenate the predictions of each first layer learner.

@@ -172,13 +119,6 @@ def fit(self, X, y, sample_weight=None):
names, all_estimators = self._validate_estimators()
self._validate_final_estimator()

has_estimator = any(est != 'drop' for est in all_estimators)
if not has_estimator:
raise ValueError(
"All estimators are dropped. At least one is required "
"to be an estimator."
)

stack_method = [self.stack_method] * len(all_estimators)

# Fit the base estimators on the whole training data. Those
@@ -416,16 +356,6 @@ def __init__(self, estimators, final_estimator=None, cv=None,
verbose=verbose
)

def _validate_estimators(self):
names, estimators = super()._validate_estimators()
for est in estimators:
if est != 'drop' and not is_classifier(est):
raise ValueError(
"The estimator {} should be a classifier."
.format(est.__class__.__name__)
)
return names, estimators

def _validate_final_estimator(self):
self._clone_final_estimator(default=LogisticRegression())
if not is_classifier(self.final_estimator_):
@@ -651,16 +581,6 @@ def __init__(self, estimators, final_estimator=None, cv=None, n_jobs=None,
verbose=verbose
)

def _validate_estimators(self):
names, estimators = super()._validate_estimators()
for est in estimators:
if est != 'drop' and not is_regressor(est):
raise ValueError(
"The estimator {} should be a regressor."
.format(est.__class__.__name__)
)
return names, estimators

def _validate_final_estimator(self):
self._clone_final_estimator(default=RidgeCV())
if not is_regressor(self.final_estimator_):
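As the two `_validate_final_estimator` hunks above show, the final estimator still defaults to LogisticRegression for classification and RidgeCV for regression; a short sketch, assuming this branch:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

X, y = load_iris(return_X_y=True)
# No final_estimator given: _clone_final_estimator clones the default.
stack = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=10))])
stack.fit(X, y)
print(type(stack.final_estimator_).__name__)  # LogisticRegression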
81 changes: 79 additions & 2 deletions sklearn/ensemble/base.py
@@ -5,16 +5,20 @@
# Authors: Gilles Louppe
# License: BSD 3 clause

import numpy as np
from abc import ABCMeta, abstractmethod
import numbers

import numpy as np

from joblib import effective_n_jobs

from ..base import clone
from ..base import is_classifier, is_regressor
from ..base import BaseEstimator
from ..base import MetaEstimatorMixin
from ..utils import Bunch
from ..utils import check_random_state
from abc import ABCMeta, abstractmethod
from ..utils.metaestimators import _BaseComposition

MAX_RAND_SEED = np.iinfo(np.int32).max

@@ -178,3 +182,76 @@ def _partition_estimators(n_estimators, n_jobs):
starts = np.cumsum(n_estimators_per_job)

return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()


class _BaseHeterogeneousEnsemble(MetaEstimatorMixin, _BaseComposition,
metaclass=ABCMeta):
"""Base class for ensemble learners based on heterogeneous estimators."""
_required_parameters = ['estimators']

@property
def named_estimators(self):
return Bunch(**dict(self.estimators))

@abstractmethod
def __init__(self, estimators):
self.estimators = estimators

def _validate_estimators(self):
if self.estimators is None or len(self.estimators) == 0:
raise AttributeError(
"Invalid 'estimators' attribute, 'estimators' should be a list"
" of (string, estimator) tuples."
)
names, estimators = zip(*self.estimators)
# defined by MetaEstimatorMixin
self._validate_names(names)

has_estimator = any(est not in (None, 'drop') for est in estimators)
if not has_estimator:
raise ValueError(
"All estimators are dropped. At least one is required "
"to be an estimator."
)

is_estimator_type = (is_classifier if is_classifier(self)
else is_regressor)

for est in estimators:
if est not in (None, 'drop') and not is_estimator_type(est):
raise ValueError(
"The estimator {} should be a {}."
.format(
est.__class__.__name__, is_estimator_type.__name__[3:]
)
)

return names, estimators

def set_params(self, **params):
"""Set the parameters of an estimator from the ensemble.

Valid parameter keys can be listed with `get_params()`.

Parameters
----------
params : keyword arguments
Specific parameters using e.g.
`set_params(parameter_name=new_value)`. In addition to setting the
parameters of the ensemble estimator, the individual estimators of
the ensemble can also be set, or removed by setting them to 'drop'.
"""
super()._set_params('estimators', **params)
return self

def get_params(self, deep=True):
"""Get the parameters of an estimator from the ensemble.

Parameters
----------
deep : bool
Setting it to True gets the various estimators and their
parameters as well.
"""
return super()._get_params('estimators', deep=deep)
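The `set_params`/`get_params` pair above keeps the 'drop' workflow that the deleted stacking docstring illustrated; a sketch, assuming this branch:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

eclf = StackingClassifier(estimators=[('lr', LogisticRegression()),
                                      ('rf', RandomForestClassifier())])
# Individual estimators can be replaced (or removed with 'drop') by name.
eclf.set_params(rf='drop')
print(eclf.get_params()['rf'])  # 'drop'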
8 changes: 4 additions & 4 deletions sklearn/ensemble/tests/test_stacking.py
@@ -223,10 +223,10 @@ def fit(self, X, y):
"y, params, type_err, msg_err",
[(y_iris,
{'estimators': None},
ValueError, "Invalid 'estimators' attribute,"),
AttributeError, "Invalid 'estimators' attribute,"),
(y_iris,
{'estimators': []},
ValueError, "Invalid 'estimators' attribute,"),
AttributeError, "Invalid 'estimators' attribute,"),
(y_iris,
{'estimators': [('lr', LinearRegression()),
('svm', LinearSVC(max_iter=5e4))]},
@@ -265,10 +265,10 @@ def test_stacking_classifier_error(y, params, type_err, msg_err):
"y, params, type_err, msg_err",
[(y_diabetes,
{'estimators': None},
ValueError, "Invalid 'estimators' attribute,"),
AttributeError, "Invalid 'estimators' attribute,"),
(y_diabetes,
{'estimators': []},
ValueError, "Invalid 'estimators' attribute,"),
AttributeError, "Invalid 'estimators' attribute,"),
(y_diabetes,
{'estimators': [('lr', LogisticRegression()), ('svm', LinearSVR())]},
ValueError, 'should be a regressor'),
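The updated expectation in these tests, as a standalone sketch (assuming this branch and pytest):

import pytest
from sklearn.datasets import load_iris
from sklearn.ensemble import StackingClassifier

X, y = load_iris(return_X_y=True)
# estimators=None is now reported as an AttributeError, not a ValueError.
with pytest.raises(AttributeError, match="Invalid 'estimators' attribute"):
    StackingClassifier(estimators=None).fit(X, y)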
6 changes: 3 additions & 3 deletions sklearn/ensemble/tests/test_voting.py
@@ -37,8 +37,8 @@

def test_estimator_init():
eclf = VotingClassifier(estimators=[])
msg = ('Invalid `estimators` attribute, `estimators` should be'
' a list of (string, estimator) tuples')
msg = ("Invalid 'estimators' attribute, 'estimators' should be"
" a list of (string, estimator) tuples.")
assert_raise_message(AttributeError, msg, eclf.fit, X, y)

clf = LogisticRegression(random_state=1)
@@ -417,7 +417,7 @@ def test_set_estimator_none(drop):
eclf2.set_params(voting='soft').fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
msg = 'All estimators are None or "drop". At least one is required!'
msg = 'All estimators are dropped. At least one is required'
assert_raise_message(
ValueError, msg, eclf2.set_params(lr=drop, rf=drop, nb=drop).fit, X, y)
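
The all-dropped path exercised just above, sketched end to end (assuming this branch):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

X, y = load_iris(return_X_y=True)
eclf = VotingClassifier(estimators=[('lr', LogisticRegression())])
eclf.set_params(lr='drop')
# Raises ValueError: All estimators are dropped. At least one is required
# to be an estimator.
eclf.fit(X, y)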
