From 12121821e0ae9cd3e10ecd0193968555df105a92 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 25 Apr 2020 17:04:02 +0200 Subject: [PATCH 01/16] ENH add d2_tweedie_score as a metric/scorer --- sklearn/metrics/__init__.py | 2 + sklearn/metrics/_regression.py | 103 +++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 8bcb047ec8161..924ea6c86446a 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -68,6 +68,7 @@ from ._regression import mean_tweedie_deviance from ._regression import mean_poisson_deviance from ._regression import mean_gamma_deviance +from ._regression import d2_tweedie_score from ._scorer import check_scoring @@ -101,6 +102,7 @@ 'confusion_matrix', 'consensus_score', 'coverage_error', + 'd2_tweedie_score', 'dcg_score', 'davies_bouldin_score', 'euclidean_distances', diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 6026a5293806a..f92b5c1579b3e 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -792,3 +792,106 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): return mean_tweedie_deviance( y_true, y_pred, sample_weight=sample_weight, power=2 ) + + +def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): + """D^2 regression score function, percentage of Tweedie deviance explained. + + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a D^2 score of 0.0. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), optional + Sample weights. 
+ + power : float, default=0 + Tweedie power parameter. Either power <= 0 or power >= 1. + + The higher `p` the less weight is given to extreme + deviations between true and predicted targets. + + - power < 0: Extreme stable distribution. Requires: y_pred > 0. + - power = 0 : Normal distribution, output corresponds to + mean_squared_error. y_true and y_pred can be any real numbers. + - power = 1 : Poisson distribution. Requires: y_true >= 0 and + y_pred > 0. + - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 + and y_pred > 0. + - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. + - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0 + and y_pred > 0. + - otherwise : Positive stable distribution. Requires: y_true > 0 + and y_pred > 0. + + Returns + ------- + z : float or ndarray of floats + The D^2 score. + + Notes + ----- + This is not a symmetric function. + + Like R^2, D^2 score may be negative (it need not actually be the square of + a quantity D). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J. + Wainwright. "Statistical Learning with Sparsity: The Lasso and + Generalizations." (2015). https://trevorhastie.github.io + + Examples + -------- + >>> from sklearn.metrics import d2_tweedie_score + >>> y_true = [0.5, 1, 2.5, 7] + >>> y_pred = [1, 1, 5, 3.5] + >>> d2_tweedie_score(y_true, y_pred) + 0.285... + >>> d2_tweedie_score(y_true, y_pred, power=1) + 0.487... + >>> d2_tweedie_score(y_true, y_pred, power=2) + 0.630... 
+ >>> d2_tweedie_score(y_true, y_true, power=2) + 1.0 + """ + y_type, y_true, y_pred, _ = _check_reg_targets( + y_true, y_pred, None, dtype=[np.float64, np.float32]) + if y_type == 'continuous-multioutput': + raise ValueError("Multioutput not supported in d2_tweedie_score") + check_consistent_length(y_true, y_pred, sample_weight) + + # TODO: Do we need this? + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float('nan') + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + sample_weight = sample_weight[:, np.newaxis] + + dist = TweedieDistribution(power=power) + + dev = dist.unit_deviance(y_true, y_pred, check_input=True) + numerator = np.average(dev, weights=sample_weight) + + y_avg = np.average(y_true, weights=sample_weight) + dev = dist.unit_deviance(y_true, y_avg, check_input=True) + denominator = np.average(dev, weights=sample_weight) + + return 1 - numerator / denominator From e0a23765598d5d02044f88a63b3cfab822a80326 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 25 Apr 2020 17:04:16 +0200 Subject: [PATCH 02/16] TST add tests for d2_tweedie_score --- sklearn/metrics/tests/test_common.py | 6 +++++- sklearn/metrics/tests/test_regression.py | 22 +++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 7301d21a35f39..2f122b6aaf075 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -29,6 +29,7 @@ from sklearn.metrics import cohen_kappa_score from sklearn.metrics import confusion_matrix from sklearn.metrics import coverage_error +from sklearn.metrics import d2_tweedie_score from sklearn.metrics import explained_variance_score from sklearn.metrics import f1_score from sklearn.metrics import fbeta_score @@ -105,6 +106,7 @@ "mean_gamma_deviance": mean_gamma_deviance, 
"mean_compound_poisson_deviance": partial(mean_tweedie_deviance, power=1.4), + "d2_tweedie_score": partial(d2_tweedie_score, power=1.4), } CLASSIFICATION_METRICS = { @@ -472,7 +474,8 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "macro_f0.5_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", "log_loss", "hinge_loss", "mean_gamma_deviance", "mean_poisson_deviance", - "mean_compound_poisson_deviance" + "mean_compound_poisson_deviance", + "d2_tweedie_score", } @@ -488,6 +491,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "mean_poisson_deviance", "mean_gamma_deviance", "mean_compound_poisson_deviance", + "d2_tweedie_score", } diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 06c44b2b6f59e..708d3a505d122 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -1,6 +1,7 @@ import numpy as np from numpy.testing import assert_allclose +from scipy.special import factorial, xlogy from itertools import product import pytest @@ -16,6 +17,7 @@ from sklearn.metrics import max_error from sklearn.metrics import r2_score from sklearn.metrics import mean_tweedie_deviance +from sklearn.metrics import d2_tweedie_score from sklearn.metrics._regression import _check_reg_targets @@ -37,6 +39,8 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) 
assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=0), mean_squared_error(y_true, y_pred)) + assert_almost_equal(d2_tweedie_score(y_true, y_pred, power=0), + r2_score(y_true, y_pred)) # Tweedie deviance needs positive y_pred, except for p=0, # p>=2 needs positive y_true @@ -55,6 +59,14 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3), np.sum(1 / y_true) / (4 * n)) + dev_mean = 2 * np.mean(xlogy(y_true, 2 * y_true / (n + 1))) + assert_almost_equal(d2_tweedie_score(y_true, y_pred, power=1), + 1 - (n + 1) * (1 - np.log(2)) / dev_mean) + + dev_mean = 2 * np.log((n + 1) / 2) - 2/n * np.log(factorial(n)) + assert_almost_equal(d2_tweedie_score(y_true, y_pred, power=2), + 1 - (2 * np.log(2) - 1) / dev_mean) + def test_mean_squared_error_multioutput_raw_value_squared(): # non-regression test for @@ -124,6 +136,7 @@ def test_regression_metrics_at_limits(): with pytest.raises(ValueError, match="can only be used on strictly positive y_pred."): mean_tweedie_deviance([0.], [0.], power=power) + d2_tweedie_score([0.], [0.], power=power) assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) msg = "only be used on non-negative y and strictly positive y_pred." @@ -136,12 +149,16 @@ def test_regression_metrics_at_limits(): msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) + d2_tweedie_score([0.], [0.], power=power) + power = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) + d2_tweedie_score([0.], [0.], power=power) + power = 3. 
assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) @@ -149,10 +166,13 @@ def test_regression_metrics_at_limits(): msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) + d2_tweedie_score([0.], [0.], power=power) + power = 0.5 with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): - mean_tweedie_deviance([0.], [0.], power=0.5) + mean_tweedie_deviance([0.], [0.], power=power) + d2_tweedie_score([0.], [0.], power=power) def test__check_reg_targets(): From 980f89fbe4b3c6fe9e92a2a2d7641dbe06aabbf0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 25 Apr 2020 17:04:28 +0200 Subject: [PATCH 03/16] DOC add d2_tweedie_score to user guide and API --- doc/modules/classes.rst | 3 ++- doc/modules/model_evaluation.rst | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3d9924638b69b..3871859b77850 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -900,7 +900,7 @@ Miscellaneous manifold.smacof manifold.spectral_embedding manifold.trustworthiness - + .. _metrics_ref: @@ -985,6 +985,7 @@ details. metrics.mean_poisson_deviance metrics.mean_gamma_deviance metrics.mean_tweedie_deviance + metrics.d2_tweedie_score Multilabel ranking metrics -------------------------- diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index e1b7ae34f1647..8b50ae0160565 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2122,6 +2122,24 @@ the difference in errors decreases. Finally, by setting, ``power=2``:: we would get identical errors. The deviance when ``power=2`` is thus only sensitive to relative errors. +.. 
_d2_tweedie_score: + +D² score, the coefficient of determination +------------------------------------------- + +The :func:`d2_tweedie_score` function computes the percentage of deviance +explained. It is a generalization of R², where the squared error is replaced by +the Tweedie deviance. D², also known as McFadden's likelihood ratio index, is +calculated as + +.. math:: + + D^2(y, \hat{y}) = 1 - \frac{\text{D}(y, \hat{y})}{\text{D}(y, \bar{y})} \,. + +The argument `power` defines the Tweedie power as for +:func:`mean_tweedie_deviance`. Note that for `power=0`, +`:func:`d2_tweedie_score` equals :func:`r2_score` (for single targets). + .. _clustering_metrics: Clustering metrics From 008f51d3673a84c1c34f58b835df8e5635dc2717 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 25 Apr 2020 18:27:15 +0200 Subject: [PATCH 04/16] DOC add d2_tweedie_score to user guide and API --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 8b50ae0160565..b543c060adde1 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2138,7 +2138,7 @@ calculated as The argument `power` defines the Tweedie power as for :func:`mean_tweedie_deviance`. Note that for `power=0`, -`:func:`d2_tweedie_score` equals :func:`r2_score` (for single targets). +:func:`d2_tweedie_score` equals :func:`r2_score` (for single targets). .. 
_clustering_metrics: From 8e317788fc58096b154da74f3e3080cc944af0b5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 22 Aug 2021 18:42:38 +0200 Subject: [PATCH 05/16] address some review comments --- sklearn/metrics/_regression.py | 19 +++++++------------ sklearn/metrics/tests/test_regression.py | 16 +++++++--------- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 17f2849450e42..3721899d1cf89 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -640,8 +640,8 @@ def r2_score(y_true, y_pred, *, sample_weight=None, Unlike most other scores, R^2 score may be negative (it need not actually be the square of a quantity R). - This metric is not well-defined for single samples and will return a NaN - value if n_samples is less than two. + This metric is not well-defined for single samples and will raise a ValueError if + n_samples is less than two. References ---------- @@ -678,9 +678,7 @@ def r2_score(y_true, y_pred, *, sample_weight=None, check_consistent_length(y_true, y_pred, sample_weight) if _num_samples(y_pred) < 2: - msg = "R^2 score is not well-defined with less than two samples." - warnings.warn(msg, UndefinedMetricWarning) - return float('nan') + raise ValueError("R^2 score is not well-defined with less than two samples.") if sample_weight is not None: sample_weight = column_or_1d(sample_weight) @@ -927,7 +925,7 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): - power < 0: Extreme stable distribution. Requires: y_pred > 0. - power = 0 : Normal distribution, output corresponds to - mean_squared_error. y_true and y_pred can be any real numbers. + r2_score. y_true and y_pred can be any real numbers. - power = 1 : Poisson distribution. Requires: y_true >= 0 and y_pred > 0. - 1 < p < 2 : Compound Poisson distribution. 
Requires: y_true >= 0 @@ -950,8 +948,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): Like R^2, D^2 score may be negative (it need not actually be the square of a quantity D). - This metric is not well-defined for single samples and will return a NaN - value if n_samples is less than two. + This metric is not well-defined for single samples and will raise a ValueError if + n_samples is less than two. References ---------- @@ -979,11 +977,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): raise ValueError("Multioutput not supported in d2_tweedie_score") check_consistent_length(y_true, y_pred, sample_weight) - # TODO: Do we need this? if _num_samples(y_pred) < 2: - msg = "D^2 score is not well-defined with less than two samples." - warnings.warn(msg, UndefinedMetricWarning) - return float('nan') + raise ValueError("D^2 score is not well-defined with less than two samples.") if sample_weight is not None: sample_weight = column_or_1d(sample_weight) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 31f2df601eb5f..5c391d47986c7 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -307,16 +307,14 @@ def test_regression_custom_weights(): assert_almost_equal(msle, msle2, decimal=2) -@pytest.mark.parametrize('metric', [r2_score]) +@pytest.mark.parametrize('metric', [r2_score, d2_tweedie_score]) def test_regression_single_sample(metric): - y_true = [0] - y_pred = [1] - warning_msg = 'not well-defined with less than two samples.' - - # Trigger the warning - with pytest.warns(UndefinedMetricWarning, match=warning_msg): - score = metric(y_true, y_pred) - assert np.isnan(score) + y_true = [1] + y_pred = [2] + msg = "not well-defined with less than two samples." 
+ + with pytest.raises(ValueError, match=msg): + metric(y_true, y_pred) def test_tweedie_deviance_continuity(): From f02bde41fbadd5cf8988c7489c2683822a149175 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 22 Aug 2021 19:15:25 +0200 Subject: [PATCH 06/16] CLN nicer tests --- sklearn/metrics/_regression.py | 4 +--- sklearn/metrics/tests/test_regression.py | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 3721899d1cf89..887b38148ca29 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -24,7 +24,6 @@ # License: BSD 3 clause import numpy as np -import warnings from .._loss.glm_distribution import TweedieDistribution from ..utils.validation import (check_array, check_consistent_length, @@ -33,7 +32,6 @@ from ..utils.validation import _deprecate_positional_args from ..utils.validation import _check_sample_weight from ..utils.stats import _weighted_percentile -from ..exceptions import UndefinedMetricWarning __ALL__ = [ @@ -973,7 +971,7 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): """ y_type, y_true, y_pred, _ = _check_reg_targets( y_true, y_pred, None, dtype=[np.float64, np.float32]) - if y_type == 'continuous-multioutput': + if y_type == "continuous-multioutput": raise ValueError("Multioutput not supported in d2_tweedie_score") check_consistent_length(y_true, y_pred, sample_weight) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 5c391d47986c7..d78ebfe44fbfe 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -22,8 +22,6 @@ from sklearn.metrics._regression import _check_reg_targets -from ...exceptions import UndefinedMetricWarning - def test_regression_metrics(n_samples=50): y_true = np.arange(n_samples) @@ -142,15 +140,20 @@ def test_regression_metrics_at_limits(): power = -1.2 
assert_allclose(mean_tweedie_deviance([0], [1.], power=power), 2 / (2 - power), rtol=1e-3) - with pytest.raises(ValueError, - match="can only be used on strictly positive y_pred."): + msg = "can only be used on strictly positive y_pred." + with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) + with pytest.raises(ValueError, match=msg): d2_tweedie_score([0.], [0.], power=power) + assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) + power = 1.0 msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], power=1.0) + mean_tweedie_deviance([0.], [0.], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.], [0.], power=power) power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), @@ -158,6 +161,7 @@ def test_regression_metrics_at_limits(): msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) + with pytest.raises(ValueError, match=msg): d2_tweedie_score([0.], [0.], power=power) power = 2. @@ -166,21 +170,22 @@ def test_regression_metrics_at_limits(): msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) + with pytest.raises(ValueError, match=msg): d2_tweedie_score([0.], [0.], power=power) power = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y and y_pred." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) + with pytest.raises(ValueError, match=msg): d2_tweedie_score([0.], [0.], power=power) power = 0.5 - with pytest.raises(ValueError, - match="is only defined for power<=0 and power>=1"): + with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): mean_tweedie_deviance([0.], [0.], power=power) + with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): d2_tweedie_score([0.], [0.], power=power) From d1ef272112644c7ab229f86de6b6c9de3939aeee Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 22 Aug 2021 19:25:59 +0200 Subject: [PATCH 07/16] DOC add versionadded --- sklearn/metrics/_regression.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 1b718a68fde6b..e6a28c2355423 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -994,6 +994,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): Read more in the :ref:`User Guide `. + .. versionadded:: 1.0 + Parameters ---------- y_true : array-like of shape (n_samples,) From cd8a8a8dc6b3b953aa32759f7c389bcdbd517870 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 22 Aug 2021 19:33:44 +0200 Subject: [PATCH 08/16] DOC improve docstring and user guide --- doc/modules/model_evaluation.rst | 6 ++++++ sklearn/metrics/_regression.py | 7 +++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7fcea220b153e..acbfe0fd603e6 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2371,6 +2371,12 @@ calculated as The argument `power` defines the Tweedie power as for :func:`mean_tweedie_deviance`. Note that for `power=0`, :func:`d2_tweedie_score` equals :func:`r2_score` (for single targets). 
+ +Like R², the best possible score is 1.0 and it can be negative (because the +model can be arbitrarily worse). A model that always predicts a constant +value for the expected value of y, disregarding the input features, would +get a D^2 score of 0.0. + .. _pinball_loss: Pinball loss diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index e6a28c2355423..3f6afbf61599c 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -987,10 +987,9 @@ def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): """D^2 regression score function, percentage of Tweedie deviance explained. - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). A constant model that always - predicts the expected value of y, disregarding the input features, - would get a D^2 score of 0.0. + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always predicts a constant value for the expected + value of y, disregarding the input features, would get a D^2 score of 0.0. Read more in the :ref:`User Guide `. From 420eaf8364e942384a352dc1fc33675b5d94c430 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 22 Aug 2021 19:39:55 +0200 Subject: [PATCH 09/16] DOC add code snippet with make_scorer --- doc/modules/model_evaluation.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index acbfe0fd603e6..695d1c3109041 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2368,7 +2368,7 @@ calculated as D^2(y, \hat{y}) = 1 - \frac{\text{D}(y, \hat{y})}{\text{D}(y, \bar{y})} \,. -The argument `power` defines the Tweedie power as for +The argument ``power`` defines the Tweedie power as for :func:`mean_tweedie_deviance`. 
Note that for `power=0`, :func:`d2_tweedie_score` equals :func:`r2_score` (for single targets). @@ -2377,6 +2377,11 @@ model can be arbitrarily worse). A model that always predicts a constant value for the expected value of y, disregarding the input features, would get a D^2 score of 0.0. +A scorer object with a specific choice of ``power`` can be built by:: + + >>> from sklearn.metrics import make_scorer + >>> d2_tweedie_score_15 = make_scorer(d2_tweedie_score, power=1.5) + .. _pinball_loss: Pinball loss @@ -2409,7 +2414,7 @@ Here is a small example of usage of the :func:`mean_pinball_loss` function:: >>> mean_pinball_loss(y_true, y_true, alpha=0.9) 0.0 -It is possible to build a scorer object with a specific choice of alpha:: +It is possible to build a scorer object with a specific choice of ``alpha``:: >>> from sklearn.metrics import make_scorer >>> mean_pinball_loss_95p = make_scorer(mean_pinball_loss, alpha=0.95) From eb97867db304f7fd30173d107bbc83b9342993df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 23 Aug 2021 08:15:50 +0200 Subject: [PATCH 10/16] DOC add whatsnew entry --- doc/whats_new/v1.0.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 7d8175a3b5046..205eacdc91443 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -602,6 +602,12 @@ Changelog quantile regression. :pr:`19415` by :user:`Xavier Dupré ` and :user:`Oliver Grisel `. +- |Feature| :func:`metrics.d2_tweedie_score` calculates the D^2 regression + score for Tweedie deviances with power parameter ``power``. This is a + generalization of the `r2_score` and can be interpreted as percentage of + Tweedie deviance explained. + :pr:`17036` by :user:`Christian Lorentzen `. + - |Feature| :func:`metrics.mean_squared_log_error` now supports `squared=False`. :pr:`20326` by :user:`Uttam kumar `. @@ -683,7 +689,7 @@ Changelog ............................. 
- |Fix| :class:`neural_network.MLPClassifier` and - :class:`neural_network.MLPRegressor` now correct supports continued training + :class:`neural_network.MLPRegressor` now correctly support continued training when loading from a pickled file. :pr:`19631` by `Thomas Fan`_. :mod:`sklearn.pipeline` From 4b9609223adbb265d742214c57ecab11a881e53e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 4 Sep 2021 12:59:01 +0200 Subject: [PATCH 11/16] Revert to return float(nan) This reverts commit 8e317788fc58096b154da74f3e3080cc944af0b5. --- sklearn/metrics/_regression.py | 23 ++++++++++++------- sklearn/metrics/tests/test_regression.py | 28 ++++++++++++++---------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 3f6afbf61599c..ceb7a84a57c4c 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -24,9 +24,12 @@ # Uttam kumar # License: BSD 3 clause +import warnings + import numpy as np from .._loss.glm_distribution import TweedieDistribution +from ..exceptions import UndefinedMetricWarning from ..utils.validation import check_array, check_consistent_length, _num_samples from ..utils.validation import column_or_1d from ..utils.validation import _check_sample_weight @@ -735,8 +738,8 @@ def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average Unlike most other scores, :math:`R^2` score may be negative (it need not actually be the square of a quantity R). - This metric is not well-defined for single samples and will raise a ValueError if - n_samples is less than two. + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. 
References ---------- @@ -774,7 +777,9 @@ def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average check_consistent_length(y_true, y_pred, sample_weight) if _num_samples(y_pred) < 2: - raise ValueError("R^2 score is not well-defined with less than two samples.") + msg = "R^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") if sample_weight is not None: sample_weight = column_or_1d(sample_weight) @@ -1013,8 +1018,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): deviations between true and predicted targets. - power < 0: Extreme stable distribution. Requires: y_pred > 0. - - power = 0 : Normal distribution, output corresponds to - r2_score. y_true and y_pred can be any real numbers. + - power = 0 : Normal distribution, output corresponds to r2_score. + y_true and y_pred can be any real numbers. - power = 1 : Poisson distribution. Requires: y_true >= 0 and y_pred > 0. - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 @@ -1037,8 +1042,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): Like R^2, D^2 score may be negative (it need not actually be the square of a quantity D). - This metric is not well-defined for single samples and will raise a ValueError if - n_samples is less than two. + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. References ---------- @@ -1068,7 +1073,9 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): check_consistent_length(y_true, y_pred, sample_weight) if _num_samples(y_pred) < 2: - raise ValueError("D^2 score is not well-defined with less than two samples.") + msg = "D^2 score is not well-defined with less than two samples." 
+ warnings.warn(msg, UndefinedMetricWarning) + return float("nan") if sample_weight is not None: sample_weight = column_or_1d(sample_weight) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index bb4f9b5c02b87..f94f06676872f 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -26,6 +26,8 @@ from sklearn.metrics._regression import _check_reg_targets +from sklearn.exceptions import UndefinedMetricWarning + def test_regression_metrics(n_samples=50): y_true = np.arange(n_samples) @@ -145,23 +147,23 @@ def test_regression_metrics_at_limits(): assert_almost_equal(max_error([0.0], [0.0]), 0.0) assert_almost_equal(explained_variance_score([0.0], [0.0]), 1.0) assert_almost_equal(r2_score([0.0, 1], [0.0, 1]), 1.0) - err_msg = ( + msg = ( "Mean Squared Logarithmic Error cannot be used when targets " "contain negative values." ) - with pytest.raises(ValueError, match=err_msg): + with pytest.raises(ValueError, match=msg): mean_squared_log_error([-1.0], [-1.0]) - err_msg = ( + msg = ( "Mean Squared Logarithmic Error cannot be used when targets " "contain negative values." ) - with pytest.raises(ValueError, match=err_msg): + with pytest.raises(ValueError, match=msg): mean_squared_log_error([1.0, 2.0, 3.0], [1.0, -2.0, 3.0]) - err_msg = ( + msg = ( "Mean Squared Logarithmic Error cannot be used when targets " "contain negative values." ) - with pytest.raises(ValueError, match=err_msg): + with pytest.raises(ValueError, match=msg): mean_squared_log_error([1.0, -2.0, 3.0], [1.0, 2.0, 3.0]) # Tweedie deviance error @@ -350,12 +352,14 @@ def test_regression_custom_weights(): @pytest.mark.parametrize("metric", [r2_score, d2_tweedie_score]) def test_regression_single_sample(metric): - y_true = [1] - y_pred = [2] - msg = "not well-defined with less than two samples." 
- - with pytest.raises(ValueError, match=msg): - metric(y_true, y_pred) + y_true = [0] + y_pred = [1] + warning_msg = "not well-defined with less than two samples." + + # Trigger the warning + with pytest.warns(UndefinedMetricWarning, match=warning_msg): + score = metric(y_true, y_pred) + assert np.isnan(score) def test_tweedie_deviance_continuity(): From 300b610fed4596a332b5391405f23a5f5fa672b5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 4 Sep 2021 12:59:57 +0200 Subject: [PATCH 12/16] TST fix tests --- sklearn/metrics/tests/test_regression.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index f94f06676872f..b66ce18ec8da4 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -175,7 +175,7 @@ def test_regression_metrics_at_limits(): with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.0], [0.0], power=power) with pytest.raises(ValueError, match=msg): - d2_tweedie_score([0.0], [0.0], power=power) + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) assert_almost_equal(mean_tweedie_deviance([0.0], [0.0], power=0), 0.0, 2) @@ -184,7 +184,7 @@ def test_regression_metrics_at_limits(): with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.0], [0.0], power=power) with pytest.raises(ValueError, match=msg): - d2_tweedie_score([0.0], [0.0], power=power) + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) power = 1.5 assert_allclose(mean_tweedie_deviance([0.0], [1.0], power=power), 2 / (2 - power)) @@ -192,7 +192,7 @@ def test_regression_metrics_at_limits(): with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.0], [0.0], power=power) with pytest.raises(ValueError, match=msg): - d2_tweedie_score([0.0], [0.0], power=power) + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) power = 2.0 assert_allclose(mean_tweedie_deviance([1.0], [1.0], 
power=power), 0.00, atol=1e-8) @@ -200,7 +200,7 @@ def test_regression_metrics_at_limits(): with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.0], [0.0], power=power) with pytest.raises(ValueError, match=msg): - d2_tweedie_score([0.0], [0.0], power=power) + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) power = 3.0 assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8) @@ -208,13 +208,13 @@ def test_regression_metrics_at_limits(): with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.0], [0.0], power=power) with pytest.raises(ValueError, match=msg): - d2_tweedie_score([0.0], [0.0], power=power) + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) power = 0.5 with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): mean_tweedie_deviance([0.0], [0.0], power=power) with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): - d2_tweedie_score([0.0], [0.0], power=power) + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) def test__check_reg_targets(): From 8bafea43ea69cb0e95888a75abb58e7ba0cbfa07 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 4 Sep 2021 13:02:05 +0200 Subject: [PATCH 13/16] MNT kwargs only --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index ceb7a84a57c4c..b69b4dd5ef96e 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -989,7 +989,7 @@ def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): return mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=2) -def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0): +def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0): """D^2 regression score function, percentage of Tweedie deviance explained. 
Best possible score is 1.0 and it can be negative (because the model can be From ebf3e5914cd1024ad790d15a49940d5ca6d80c64 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 4 Sep 2021 13:10:28 +0200 Subject: [PATCH 14/16] DOC fix statement about constant predictions --- doc/modules/model_evaluation.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 695d1c3109041..f1a9388344512 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2373,9 +2373,9 @@ The argument ``power`` defines the Tweedie power as for :func:`d2_tweedie_score` equals :func:`r2_score` (for single targets). Like R², the best possible score is 1.0 and it can be negative (because the -model can be arbitrarily worse). A model that always predicts a constant -value for the expected value of y, disregarding the input features, would -get a D^2 score of 0.0. +model can be arbitrarily worse). A constant model that always predicts the +expected value of y, disregarding the input features, would get a D² score +of 0.0. A scorer object with a specific choice of ``power`` can be built by:: From 454871c239e655b97f0fbf6badc83d7b36af576e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 4 Sep 2021 22:37:21 +0200 Subject: [PATCH 15/16] DOC more precise statement of zero D2 score --- sklearn/metrics/_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index b69b4dd5ef96e..ed9da69b1261c 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -993,8 +993,8 @@ def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0): """D^2 regression score function, percentage of Tweedie deviance explained. Best possible score is 1.0 and it can be negative (because the model can be - arbitrarily worse). 
A model that always predicts a constant value for the expected - value of y, disregarding the input features, would get a D^2 score of 0.0. + arbitrarily worse). A model that always uses the empirical mean of `y_true` as + constant prediction, disregarding the input features, gets a D^2 score of 0.0. Read more in the :ref:`User Guide `. From 5b95be4091d233d1372207d32bd16a492704c4ef Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 4 Sep 2021 22:40:11 +0200 Subject: [PATCH 16/16] DOC import d2_score in user guide --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index f1a9388344512..f5f447e118a8e 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2379,7 +2379,7 @@ of 0.0. A scorer object with a specific choice of ``power`` can be built by:: - >>> from sklearn.metrics import make_scorer + >>> from sklearn.metrics import d2_tweedie_score, make_scorer >>> d2_tweedie_score_15 = make_scorer(d2_tweedie_score, power=1.5) .. _pinball_loss: