
[MRG] API Changes default strategy to prior in DummyClassifier #15382

Merged
5 changes: 5 additions & 0 deletions doc/whats_new/v0.22.rst
@@ -179,6 +179,11 @@ Changelog
 :mod:`sklearn.dummy`
 ....................

+- |API| The default value of the `strategy` parameter in
+  :class:`dummy.DummyClassifier` will change from `'stratified'` in version
+  0.22 to `'prior'` in 0.24. A FutureWarning is raised when the default value
+  is used. :pr:`15382` by `Thomas Fan`_.
+
 - |Fix| :class:`dummy.DummyClassifier` now handles checking the existence
   of the provided constant in multioutput cases.
   :pr:`14908` by :user:`Martina G. Vilas <martinagvilas>`.
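For illustration, a minimal sketch of the behavior this changelog entry describes, assuming a scikit-learn build that includes this patch:

```python
import warnings

from sklearn.dummy import DummyClassifier

X, y = [[0], [1], [1]], [0, 1, 1]

# Relying on the default strategy now triggers a FutureWarning at fit time.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    DummyClassifier().fit(X, y)
assert caught[0].category is FutureWarning

# Passing the strategy explicitly is forward-compatible and stays silent.
DummyClassifier(strategy="prior").fit(X, y)
```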
50 changes: 31 additions & 19 deletions sklearn/dummy.py
@@ -45,6 +45,10 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
           the user. This is useful for metrics that evaluate a non-majority
           class

+        .. versionchanged:: 0.22
+            The default value of `strategy` will change from "stratified" in
+            version 0.22 to "prior" in version 0.24.
+
         .. versionadded:: 0.17
            Dummy Classifier now supports prior fitting strategy using
            parameter *prior*.

Member commented on the versionchanged directive (thread resolved by TomDLT): Not sure I understand what you want to say. We could either just say that the default will change in 0.24, or also mention that a warning is raised starting from 0.22.
@@ -92,7 +96,7 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
     0.75
     """

-    def __init__(self, strategy="stratified", random_state=None,
+    def __init__(self, strategy="warn", random_state=None,
                  constant=None):
         self.strategy = strategy
         self.random_state = random_state
@@ -118,11 +122,19 @@ def fit(self, X, y, sample_weight=None):
"""
allowed_strategies = ("most_frequent", "stratified", "uniform",
"constant", "prior")
if self.strategy not in allowed_strategies:

# TODO: Remove in 0.24
if self.strategy == "warn":
warnings.warn("The default value of strategy will change from "
"stratified to prior in 0.24.", FutureWarning)
self._strategy = "stratified"
elif self.strategy not in allowed_strategies:
raise ValueError("Unknown strategy type: %s, expected one of %s."
% (self.strategy, allowed_strategies))
else:
self._strategy = self.strategy

if self.strategy == "uniform" and sp.issparse(y):
if self._strategy == "uniform" and sp.issparse(y):
y = y.toarray()
warnings.warn('A local copy of the target data has been converted '
'to a numpy array. Predicting on sparse target data '
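From the user's side, this shim keeps the public `strategy` parameter untouched (so `get_params` and `clone` keep working) and resolves the effective value into the private `_strategy` attribute during `fit`. A minimal sketch, assuming a build with this patch applied:

```python
import warnings

from sklearn.dummy import DummyClassifier

clf = DummyClassifier()  # strategy is left at the "warn" sentinel
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    clf.fit([[0], [1]], [0, 1])

print(clf.strategy)   # 'warn'       -> public parameter is unchanged
print(clf._strategy)  # 'stratified' -> effective strategy used by fit/predict
```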
@@ -143,7 +155,7 @@ def fit(self, X, y, sample_weight=None):

         check_consistent_length(X, y, sample_weight)

-        if self.strategy == "constant":
+        if self._strategy == "constant":
             if self.constant is None:
                 raise ValueError("Constant target value has to be specified "
                                  "when the constant strategy is used.")
@@ -157,7 +169,7 @@ def fit(self, X, y, sample_weight=None):
          self.n_classes_,
          self.class_prior_) = class_distribution(y, sample_weight)

-        if self.strategy == "constant":
+        if self._strategy == "constant":
             for k in range(self.n_outputs_):
                 if not any(constant[k][0] == c for c in self.classes_[k]):
                     # Checking in case of constant strategy if the constant
@@ -206,43 +218,43 @@ def predict(self, X):
             class_prior_ = [class_prior_]
             constant = [constant]
         # Compute probability only once
-        if self.strategy == "stratified":
+        if self._strategy == "stratified":
             proba = self.predict_proba(X)
             if self.n_outputs_ == 1:
                 proba = [proba]

         if self.sparse_output_:
             class_prob = None
-            if self.strategy in ("most_frequent", "prior"):
+            if self._strategy in ("most_frequent", "prior"):
                 classes_ = [np.array([cp.argmax()]) for cp in class_prior_]

-            elif self.strategy == "stratified":
+            elif self._strategy == "stratified":
                 class_prob = class_prior_

-            elif self.strategy == "uniform":
+            elif self._strategy == "uniform":
                 raise ValueError("Sparse target prediction is not "
                                  "supported with the uniform strategy")

-            elif self.strategy == "constant":
+            elif self._strategy == "constant":
                 classes_ = [np.array([c]) for c in constant]

             y = _random_choice_csc(n_samples, classes_, class_prob,
                                    self.random_state)
         else:
-            if self.strategy in ("most_frequent", "prior"):
+            if self._strategy in ("most_frequent", "prior"):
                 y = np.tile([classes_[k][class_prior_[k].argmax()] for
                              k in range(self.n_outputs_)], [n_samples, 1])

-            elif self.strategy == "stratified":
+            elif self._strategy == "stratified":
                 y = np.vstack([classes_[k][proba[k].argmax(axis=1)] for
                                k in range(self.n_outputs_)]).T

-            elif self.strategy == "uniform":
+            elif self._strategy == "uniform":
                 ret = [classes_[k][rs.randint(n_classes_[k], size=n_samples)]
                        for k in range(self.n_outputs_)]
                 y = np.vstack(ret).T

-            elif self.strategy == "constant":
+            elif self._strategy == "constant":
                 y = np.tile(self.constant, (n_samples, 1))

         if self.n_outputs_ == 1:
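As a reminder of what the dense branches above compute, a small sketch with two of the deterministic strategies (output values assume the tiny dataset shown):

```python
import numpy as np

from sklearn.dummy import DummyClassifier

X = np.zeros((5, 1))
y = [1, 1, 1, 0, 0]

# "most_frequent" tiles the majority class over all samples.
print(DummyClassifier(strategy="most_frequent").fit(X, y).predict(X))
# -> [1 1 1 1 1]

# "constant" tiles the user-provided class instead.
print(DummyClassifier(strategy="constant", constant=0).fit(X, y).predict(X))
# -> [0 0 0 0 0]
```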
@@ -286,22 +298,22 @@ def predict_proba(self, X):

P = []
for k in range(self.n_outputs_):
if self.strategy == "most_frequent":
if self._strategy == "most_frequent":
ind = class_prior_[k].argmax()
out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
out[:, ind] = 1.0
elif self.strategy == "prior":
elif self._strategy == "prior":
out = np.ones((n_samples, 1)) * class_prior_[k]

elif self.strategy == "stratified":
elif self._strategy == "stratified":
out = rs.multinomial(1, class_prior_[k], size=n_samples)
out = out.astype(np.float64)

elif self.strategy == "uniform":
elif self._strategy == "uniform":
out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)
out /= n_classes_[k]

elif self.strategy == "constant":
elif self._strategy == "constant":
ind = np.where(classes_[k] == constant[k])
out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
out[:, ind] = 1.0
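The `prior` branch simply broadcasts the empirical class distribution to every row, which is also why `predict` under `prior` returns the majority class; a minimal sketch:

```python
import numpy as np

from sklearn.dummy import DummyClassifier

X = np.zeros((6, 1))
y = [0, 0, 0, 0, 1, 1]  # empirical class prior is [2/3, 1/3]

clf = DummyClassifier(strategy="prior").fit(X, y)
print(clf.predict_proba(X[:2]))
# -> [[0.6667 0.3333]
#     [0.6667 0.3333]]
print(clf.predict(X[:2]))  # argmax of the prior, i.e. the majority class
# -> [0 0]
```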
2 changes: 1 addition & 1 deletion sklearn/ensemble/tests/test_bagging.py
@@ -63,7 +63,7 @@ def test_classification():
"bootstrap_features": [True, False]})

for base_estimator in [None,
DummyClassifier(),
DummyClassifier(strategy='stratified'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should rather explicitly ignore the warnings in the test and tag for 0.24.

Setting the strategy looks like an intentional thing that has to do with the test, when in reality it's just about avoiding warnings

Perceptron(),
DecisionTreeClassifier(),
KNeighborsClassifier(),
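A sketch of the alternative the reviewer suggests, silencing the deprecation with pytest's warning filter instead of changing the estimator's parameters (the decorator placement is illustrative, not part of this diff):

```python
import pytest

# TODO: remove this filter in 0.24, once the new default is in place.
@pytest.mark.filterwarnings(
    "ignore:The default value of strategy:FutureWarning")
def test_classification():
    ...  # test body unchanged; DummyClassifier() keeps its default
```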
12 changes: 7 additions & 5 deletions sklearn/ensemble/tests/test_gradient_boosting.py
@@ -1299,9 +1299,11 @@ def _make_multiclass():

 @pytest.mark.parametrize(
     "gb, dataset_maker, init_estimator",
-    [(GradientBoostingClassifier, make_classification, DummyClassifier),
-     (GradientBoostingClassifier, _make_multiclass, DummyClassifier),
-     (GradientBoostingRegressor, make_regression, DummyRegressor)],
+    [(GradientBoostingClassifier, make_classification,
+      DummyClassifier(strategy="stratified")),
+     (GradientBoostingClassifier, _make_multiclass,
+      DummyClassifier(strategy="stratified")),
+     (GradientBoostingRegressor, make_regression, DummyRegressor())],
     ids=["binary classification", "multiclass classification", "regression"])
 def test_gradient_boosting_with_init(gb, dataset_maker, init_estimator):
     # Check that GradientBoostingRegressor works when init is a sklearn

Member commented: same here

Member commented: (and other places)
@@ -1313,11 +1315,11 @@ def test_gradient_boosting_with_init(gb, dataset_maker, init_estimator):
     sample_weight = np.random.RandomState(42).rand(100)

     # init supports sample weights
-    init_est = init_estimator()
+    init_est = clone(init_estimator)
     gb(init=init_est).fit(X, y, sample_weight=sample_weight)

     # init does not support sample weights
-    init_est = NoSampleWeightWrapper(init_estimator())
+    init_est = NoSampleWeightWrapper(clone(init_estimator))
     gb(init=init_est).fit(X, y)  # ok no sample weights
     with pytest.raises(ValueError,
                        match="estimator.*does not support sample weights"):

Member commented on the clone(init_estimator) line: why is this needed?
2 changes: 1 addition & 1 deletion sklearn/ensemble/tests/test_stacking.py
@@ -215,7 +215,7 @@ def predict(self, X):

 class NoWeightClassifier(BaseEstimator, ClassifierMixin):
     def fit(self, X, y):
-        self.clf = DummyClassifier()
+        self.clf = DummyClassifier(strategy='stratified')
         return self.clf.fit(X, y)


3 changes: 2 additions & 1 deletion sklearn/ensemble/tests/test_weight_boosting.py
@@ -501,7 +501,8 @@ def test_multidimensional_X():
@pytest.mark.parametrize("algorithm", ['SAMME', 'SAMME.R'])
def test_adaboostclassifier_without_sample_weight(algorithm):
X, y = iris.data, iris.target
base_estimator = NoSampleWeightWrapper(DummyClassifier())
base_estimator = NoSampleWeightWrapper(
DummyClassifier(strategy='stratified'))
clf = AdaBoostClassifier(
base_estimator=base_estimator, algorithm=algorithm
)
5 changes: 3 additions & 2 deletions sklearn/inspection/tests/test_partial_dependence.py
@@ -374,7 +374,8 @@ def test_warning_recursion_non_constant_init():
     # make sure that passing a non-constant init parameter to a GBDT and using
     # recursion method yields a warning.

-    gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
+    gbc = GradientBoostingClassifier(
+        init=DummyClassifier(strategy="stratified"), random_state=0)
     gbc.fit(X, y)

     with pytest.warns(
@@ -419,7 +420,7 @@ def test_partial_dependence_pipeline():
     iris = load_iris()

     scaler = StandardScaler()
-    clf = DummyClassifier(random_state=42)
+    clf = DummyClassifier(strategy="stratified", random_state=42)
     pipe = make_pipeline(scaler, clf)

     clf.fit(scaler.fit_transform(iris.data), iris.target)
14 changes: 12 additions & 2 deletions sklearn/tests/test_dummy.py
@@ -567,7 +567,7 @@ def test_classification_sample_weight():
     y = [0, 1, 0]
     sample_weight = [0.1, 1., 0.1]

-    clf = DummyClassifier().fit(X, y, sample_weight)
+    clf = DummyClassifier(strategy="stratified").fit(X, y, sample_weight)
     assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1. / 1.2])


@@ -687,7 +687,7 @@ def test_dummy_classifier_on_3D_array():
     y = [2, 2, 2]
     y_expected = [2, 2, 2]
     y_proba_expected = [[1], [1], [1]]
-    cls = DummyClassifier()
+    cls = DummyClassifier(strategy="stratified")
     cls.fit(X, y)
     y_pred = cls.predict(X)
     y_pred_proba = cls.predict_proba(X)
@@ -764,3 +764,13 @@ def test_outputs_2d_deprecation(Dummy):
     with pytest.warns(DeprecationWarning,
                       match="will be removed in version 0.24"):
         Dummy().fit(X, y).outputs_2d_
+
+
+# TODO: Remove in 0.24
+def test_strategy_stratified_deprecated_for_prior():
+    X, y = [[1, 2]], [0]
+
+    msg = ("The default value of strategy will change from "
+           "stratified to prior in 0.24")
+    with pytest.warns(FutureWarning, match=msg):
+        DummyClassifier().fit(X, y)
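A complementary check one could add (hypothetical, not part of this PR) is to promote FutureWarning to an error and verify that an explicit strategy stays silent:

```python
import pytest

from sklearn.dummy import DummyClassifier


# Hypothetical companion test: fails if any FutureWarning is raised.
@pytest.mark.filterwarnings("error::FutureWarning")
def test_no_warning_with_explicit_strategy():
    X, y = [[1, 2]], [0]
    DummyClassifier(strategy="prior").fit(X, y)
```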