From 12121821e0ae9cd3e10ecd0193968555df105a92 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@googlemail.com>
Date: Sat, 25 Apr 2020 17:04:02 +0200
Subject: [PATCH 01/16] ENH add d2_tweedie_score as a metric/scorer

---
 sklearn/metrics/__init__.py    |   2 +
 sklearn/metrics/_regression.py | 103 +++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 8bcb047ec8161..924ea6c86446a 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -68,6 +68,7 @@
 from ._regression import mean_tweedie_deviance
 from ._regression import mean_poisson_deviance
 from ._regression import mean_gamma_deviance
+from ._regression import d2_tweedie_score
 
 
 from ._scorer import check_scoring
@@ -101,6 +102,7 @@
     'confusion_matrix',
     'consensus_score',
     'coverage_error',
+    'd2_tweedie_score',
     'dcg_score',
     'davies_bouldin_score',
     'euclidean_distances',
diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 6026a5293806a..f92b5c1579b3e 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -792,3 +792,106 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None):
     return mean_tweedie_deviance(
         y_true, y_pred, sample_weight=sample_weight, power=2
     )
+
+
+def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
+    """D^2 regression score function, percentage of Tweedie deviance explained.
+
+    Best possible score is 1.0 and it can be negative (because the
+    model can be arbitrarily worse). A constant model that always
+    predicts the expected value of y, disregarding the input features,
+    would get a D^2 score of 0.0.
+
+    Read more in the :ref:`User Guide <d2_tweedie_score>`.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        Ground truth (correct) target values.
+
+    y_pred : array-like of shape (n_samples,)
+        Estimated target values.
+
+    sample_weight : array-like of shape (n_samples,), optional
+        Sample weights.
+
+    power : float, default=0
+        Tweedie power parameter. Either power <= 0 or power >= 1.
+
+        The higher `p` the less weight is given to extreme
+        deviations between true and predicted targets.
+
+        - power < 0: Extreme stable distribution. Requires: y_pred > 0.
+        - power = 0 : Normal distribution, output corresponds to
+          mean_squared_error. y_true and y_pred can be any real numbers.
+        - power = 1 : Poisson distribution. Requires: y_true >= 0 and
+          y_pred > 0.
+        - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0
+          and y_pred > 0.
+        - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.
+        - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0
+          and y_pred > 0.
+        - otherwise : Positive stable distribution. Requires: y_true > 0
+          and y_pred > 0.
+
+    Returns
+    -------
+    z : float or ndarray of floats
+        The D^2 score.
+
+    Notes
+    -----
+    This is not a symmetric function.
+
+    Like R^2, D^2 score may be negative (it need not actually be the square of
+    a quantity D).
+
+    This metric is not well-defined for single samples and will return a NaN
+    value if n_samples is less than two.
+
+    References
+    ----------
+    .. [1] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J.
+           Wainwright. "Statistical Learning with Sparsity: The Lasso and
+           Generalizations." (2015). https://trevorhastie.github.io
+
+    Examples
+    --------
+    >>> from sklearn.metrics import d2_tweedie_score
+    >>> y_true = [0.5, 1, 2.5, 7]
+    >>> y_pred = [1, 1, 5, 3.5]
+    >>> d2_tweedie_score(y_true, y_pred)
+    0.285...
+    >>> d2_tweedie_score(y_true, y_pred, power=1)
+    0.487...
+    >>> d2_tweedie_score(y_true, y_pred, power=2)
+    0.630...
+    >>> d2_tweedie_score(y_true, y_true, power=2)
+    1.0
+    """
+    y_type, y_true, y_pred, _ = _check_reg_targets(
+        y_true, y_pred, None, dtype=[np.float64, np.float32])
+    if y_type == 'continuous-multioutput':
+        raise ValueError("Multioutput not supported in d2_tweedie_score")
+    check_consistent_length(y_true, y_pred, sample_weight)
+
+    # TODO: Do we need this?
+    if _num_samples(y_pred) < 2:
+        msg = "D^2 score is not well-defined with less than two samples."
+        warnings.warn(msg, UndefinedMetricWarning)
+        return float('nan')
+
+    if sample_weight is not None:
+        sample_weight = column_or_1d(sample_weight)
+        sample_weight = sample_weight[:, np.newaxis]
+
+    dist = TweedieDistribution(power=power)
+
+    dev = dist.unit_deviance(y_true, y_pred, check_input=True)
+    numerator = np.average(dev, weights=sample_weight)
+
+    y_avg = np.average(y_true, weights=sample_weight)
+    dev = dist.unit_deviance(y_true, y_avg, check_input=True)
+    denominator = np.average(dev, weights=sample_weight)
+
+    return 1 - numerator / denominator

From e0a23765598d5d02044f88a63b3cfab822a80326 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@googlemail.com>
Date: Sat, 25 Apr 2020 17:04:16 +0200
Subject: [PATCH 02/16] TST add tests for d2_tweedie_score

---
 sklearn/metrics/tests/test_common.py     |  6 +++++-
 sklearn/metrics/tests/test_regression.py | 22 +++++++++++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index 7301d21a35f39..2f122b6aaf075 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -29,6 +29,7 @@
 from sklearn.metrics import cohen_kappa_score
 from sklearn.metrics import confusion_matrix
 from sklearn.metrics import coverage_error
+from sklearn.metrics import d2_tweedie_score
 from sklearn.metrics import explained_variance_score
 from sklearn.metrics import f1_score
 from sklearn.metrics import fbeta_score
@@ -105,6 +106,7 @@
     "mean_gamma_deviance": mean_gamma_deviance,
     "mean_compound_poisson_deviance":
     partial(mean_tweedie_deviance, power=1.4),
+    "d2_tweedie_score": partial(d2_tweedie_score, power=1.4),
 }
 
 CLASSIFICATION_METRICS = {
@@ -472,7 +474,8 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs):
     "macro_f0.5_score", "macro_f2_score", "macro_precision_score",
     "macro_recall_score", "log_loss", "hinge_loss",
     "mean_gamma_deviance", "mean_poisson_deviance",
-    "mean_compound_poisson_deviance"
+    "mean_compound_poisson_deviance",
+    "d2_tweedie_score",
 }
 
 
@@ -488,6 +491,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs):
     "mean_poisson_deviance",
     "mean_gamma_deviance",
     "mean_compound_poisson_deviance",
+    "d2_tweedie_score",
 }
 
 
diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py
index 06c44b2b6f59e..708d3a505d122 100644
--- a/sklearn/metrics/tests/test_regression.py
+++ b/sklearn/metrics/tests/test_regression.py
@@ -1,6 +1,7 @@
 
 import numpy as np
 from numpy.testing import assert_allclose
+from scipy.special import factorial, xlogy
 from itertools import product
 import pytest
 
@@ -16,6 +17,7 @@
 from sklearn.metrics import max_error
 from sklearn.metrics import r2_score
 from sklearn.metrics import mean_tweedie_deviance
+from sklearn.metrics import d2_tweedie_score
 
 from sklearn.metrics._regression import _check_reg_targets
 
@@ -37,6 +39,8 @@ def test_regression_metrics(n_samples=50):
     assert_almost_equal(explained_variance_score(y_true, y_pred), 1.)
     assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=0),
                         mean_squared_error(y_true, y_pred))
+    assert_almost_equal(d2_tweedie_score(y_true, y_pred, power=0),
+                        r2_score(y_true, y_pred))
 
     # Tweedie deviance needs positive y_pred, except for p=0,
     # p>=2 needs positive y_true
@@ -55,6 +59,14 @@ def test_regression_metrics(n_samples=50):
     assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3),
                         np.sum(1 / y_true) / (4 * n))
 
+    dev_mean = 2 * np.mean(xlogy(y_true, 2 * y_true / (n + 1)))
+    assert_almost_equal(d2_tweedie_score(y_true, y_pred, power=1),
+                        1 - (n + 1) * (1 - np.log(2)) / dev_mean)
+
+    dev_mean = 2 * np.log((n + 1) / 2) - 2/n * np.log(factorial(n))
+    assert_almost_equal(d2_tweedie_score(y_true, y_pred, power=2),
+                        1 - (2 * np.log(2) - 1) / dev_mean)
+
 
 def test_mean_squared_error_multioutput_raw_value_squared():
     # non-regression test for
@@ -124,6 +136,7 @@ def test_regression_metrics_at_limits():
     with pytest.raises(ValueError,
                        match="can only be used on strictly positive y_pred."):
         mean_tweedie_deviance([0.], [0.], power=power)
+        d2_tweedie_score([0.], [0.], power=power)
     assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2)
 
     msg = "only be used on non-negative y and strictly positive y_pred."
@@ -136,12 +149,16 @@ def test_regression_metrics_at_limits():
     msg = "only be used on non-negative y and strictly positive y_pred."
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.], [0.], power=power)
+        d2_tweedie_score([0.], [0.], power=power)
+
     power = 2.
     assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00,
                     atol=1e-8)
     msg = "can only be used on strictly positive y and y_pred."
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.], [0.], power=power)
+        d2_tweedie_score([0.], [0.], power=power)
+
     power = 3.
     assert_allclose(mean_tweedie_deviance([1.], [1.], power=power),
                     0.00, atol=1e-8)
@@ -149,10 +166,13 @@ def test_regression_metrics_at_limits():
     msg = "can only be used on strictly positive y and y_pred."
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.], [0.], power=power)
+        d2_tweedie_score([0.], [0.], power=power)
 
+    power = 0.5
     with pytest.raises(ValueError,
                        match="is only defined for power<=0 and power>=1"):
-        mean_tweedie_deviance([0.], [0.], power=0.5)
+        mean_tweedie_deviance([0.], [0.], power=power)
+        d2_tweedie_score([0.], [0.], power=power)
 
 
 def test__check_reg_targets():

From 980f89fbe4b3c6fe9e92a2a2d7641dbe06aabbf0 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@googlemail.com>
Date: Sat, 25 Apr 2020 17:04:28 +0200
Subject: [PATCH 03/16] DOC add d2_tweedie_score to user guide and API

---
 doc/modules/classes.rst          |  3 ++-
 doc/modules/model_evaluation.rst | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 3d9924638b69b..3871859b77850 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -900,7 +900,7 @@ Miscellaneous
     manifold.smacof
     manifold.spectral_embedding
     manifold.trustworthiness
-	
+
 
 .. _metrics_ref:
 
@@ -985,6 +985,7 @@ details.
    metrics.mean_poisson_deviance
    metrics.mean_gamma_deviance
    metrics.mean_tweedie_deviance
+   metrics.d2_tweedie_score
 
 Multilabel ranking metrics
 --------------------------
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index e1b7ae34f1647..8b50ae0160565 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2122,6 +2122,24 @@ the difference in errors decreases. Finally, by setting, ``power=2``::
 we would get identical errors. The deviance when ``power=2`` is thus only
 sensitive to relative errors.
 
+.. _d2_tweedie_score:
+
+D² score, the coefficient of determination
+-------------------------------------------
+
+The :func:`d2_tweedie_score` function computes the percentage of deviance
+explained. It is a generalization of R², where the squared error is replaced by
+the Tweedie deviance. D², also known as McFadden's likelihood ratio index, is
+calculated as
+
+.. math::
+
+  D^2(y, \hat{y}) = 1 - \frac{\text{D}(y, \hat{y})}{\text{D}(y, \bar{y})} \,.
+
+The argument `power` defines the Tweedie power as for
+:func:`mean_tweedie_deviance`. Note that for `power=0`,
+`:func:`d2_tweedie_score` equals :func:`r2_score` (for single targets).
+
 .. _clustering_metrics:
 
 Clustering metrics

From 008f51d3673a84c1c34f58b835df8e5635dc2717 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@googlemail.com>
Date: Sat, 25 Apr 2020 18:27:15 +0200
Subject: [PATCH 04/16] DOC add d2_tweedie_score to user guide and API

---
 doc/modules/model_evaluation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 8b50ae0160565..b543c060adde1 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2138,7 +2138,7 @@ calculated as
 
 The argument `power` defines the Tweedie power as for
 :func:`mean_tweedie_deviance`. Note that for `power=0`,
-`:func:`d2_tweedie_score` equals :func:`r2_score` (for single targets).
+:func:`d2_tweedie_score` equals :func:`r2_score` (for single targets).
 
 .. _clustering_metrics:
 

From 8e317788fc58096b154da74f3e3080cc944af0b5 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sun, 22 Aug 2021 18:42:38 +0200
Subject: [PATCH 05/16] address some review comments

---
 sklearn/metrics/_regression.py           | 19 +++++++------------
 sklearn/metrics/tests/test_regression.py | 16 +++++++---------
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 17f2849450e42..3721899d1cf89 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -640,8 +640,8 @@ def r2_score(y_true, y_pred, *, sample_weight=None,
     Unlike most other scores, R^2 score may be negative (it need not actually
     be the square of a quantity R).
 
-    This metric is not well-defined for single samples and will return a NaN
-    value if n_samples is less than two.
+    This metric is not well-defined for single samples and will raise a ValueError if
+    n_samples is less than two.
 
     References
     ----------
@@ -678,9 +678,7 @@ def r2_score(y_true, y_pred, *, sample_weight=None,
     check_consistent_length(y_true, y_pred, sample_weight)
 
     if _num_samples(y_pred) < 2:
-        msg = "R^2 score is not well-defined with less than two samples."
-        warnings.warn(msg, UndefinedMetricWarning)
-        return float('nan')
+        raise ValueError("R^2 score is not well-defined with less than two samples.")
 
     if sample_weight is not None:
         sample_weight = column_or_1d(sample_weight)
@@ -927,7 +925,7 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
 
         - power < 0: Extreme stable distribution. Requires: y_pred > 0.
         - power = 0 : Normal distribution, output corresponds to
-          mean_squared_error. y_true and y_pred can be any real numbers.
+          r2_score. y_true and y_pred can be any real numbers.
         - power = 1 : Poisson distribution. Requires: y_true >= 0 and
           y_pred > 0.
         - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0
@@ -950,8 +948,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
     Like R^2, D^2 score may be negative (it need not actually be the square of
     a quantity D).
 
-    This metric is not well-defined for single samples and will return a NaN
-    value if n_samples is less than two.
+    This metric is not well-defined for single samples and will raise a ValueError if
+    n_samples is less than two.
 
     References
     ----------
@@ -979,11 +977,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
         raise ValueError("Multioutput not supported in d2_tweedie_score")
     check_consistent_length(y_true, y_pred, sample_weight)
 
-    # TODO: Do we need this?
     if _num_samples(y_pred) < 2:
-        msg = "D^2 score is not well-defined with less than two samples."
-        warnings.warn(msg, UndefinedMetricWarning)
-        return float('nan')
+        raise ValueError("D^2 score is not well-defined with less than two samples.")
 
     if sample_weight is not None:
         sample_weight = column_or_1d(sample_weight)
diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py
index 31f2df601eb5f..5c391d47986c7 100644
--- a/sklearn/metrics/tests/test_regression.py
+++ b/sklearn/metrics/tests/test_regression.py
@@ -307,16 +307,14 @@ def test_regression_custom_weights():
     assert_almost_equal(msle, msle2, decimal=2)
 
 
-@pytest.mark.parametrize('metric', [r2_score])
+@pytest.mark.parametrize('metric', [r2_score, d2_tweedie_score])
 def test_regression_single_sample(metric):
-    y_true = [0]
-    y_pred = [1]
-    warning_msg = 'not well-defined with less than two samples.'
-
-    # Trigger the warning
-    with pytest.warns(UndefinedMetricWarning, match=warning_msg):
-        score = metric(y_true, y_pred)
-        assert np.isnan(score)
+    y_true = [1]
+    y_pred = [2]
+    msg = "not well-defined with less than two samples."
+
+    with pytest.raises(ValueError, match=msg):
+        metric(y_true, y_pred)
 
 
 def test_tweedie_deviance_continuity():

From f02bde41fbadd5cf8988c7489c2683822a149175 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sun, 22 Aug 2021 19:15:25 +0200
Subject: [PATCH 06/16] CLN nicer tests

---
 sklearn/metrics/_regression.py           |  4 +---
 sklearn/metrics/tests/test_regression.py | 21 +++++++++++++--------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 3721899d1cf89..887b38148ca29 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -24,7 +24,6 @@
 # License: BSD 3 clause
 
 import numpy as np
-import warnings
 
 from .._loss.glm_distribution import TweedieDistribution
 from ..utils.validation import (check_array, check_consistent_length,
@@ -33,7 +32,6 @@
 from ..utils.validation import _deprecate_positional_args
 from ..utils.validation import _check_sample_weight
 from ..utils.stats import _weighted_percentile
-from ..exceptions import UndefinedMetricWarning
 
 
 __ALL__ = [
@@ -973,7 +971,7 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
     """
     y_type, y_true, y_pred, _ = _check_reg_targets(
         y_true, y_pred, None, dtype=[np.float64, np.float32])
-    if y_type == 'continuous-multioutput':
+    if y_type == "continuous-multioutput":
         raise ValueError("Multioutput not supported in d2_tweedie_score")
     check_consistent_length(y_true, y_pred, sample_weight)
 
diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py
index 5c391d47986c7..d78ebfe44fbfe 100644
--- a/sklearn/metrics/tests/test_regression.py
+++ b/sklearn/metrics/tests/test_regression.py
@@ -22,8 +22,6 @@
 
 from sklearn.metrics._regression import _check_reg_targets
 
-from ...exceptions import UndefinedMetricWarning
-
 
 def test_regression_metrics(n_samples=50):
     y_true = np.arange(n_samples)
@@ -142,15 +140,20 @@ def test_regression_metrics_at_limits():
     power = -1.2
     assert_allclose(mean_tweedie_deviance([0], [1.], power=power),
                     2 / (2 - power), rtol=1e-3)
-    with pytest.raises(ValueError,
-                       match="can only be used on strictly positive y_pred."):
+    msg = "can only be used on strictly positive y_pred."
+    with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.], [0.], power=power)
+    with pytest.raises(ValueError, match=msg):
         d2_tweedie_score([0.], [0.], power=power)
+
     assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2)
 
+    power = 1.0
     msg = "only be used on non-negative y and strictly positive y_pred."
     with pytest.raises(ValueError, match=msg):
-        mean_tweedie_deviance([0.], [0.], power=1.0)
+        mean_tweedie_deviance([0.], [0.], power=power)
+    with pytest.raises(ValueError, match=msg):
+        d2_tweedie_score([0.], [0.], power=power)
 
     power = 1.5
     assert_allclose(mean_tweedie_deviance([0.], [1.], power=power),
@@ -158,6 +161,7 @@ def test_regression_metrics_at_limits():
     msg = "only be used on non-negative y and strictly positive y_pred."
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.], [0.], power=power)
+    with pytest.raises(ValueError, match=msg):
         d2_tweedie_score([0.], [0.], power=power)
 
     power = 2.
@@ -166,21 +170,22 @@ def test_regression_metrics_at_limits():
     msg = "can only be used on strictly positive y and y_pred."
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.], [0.], power=power)
+    with pytest.raises(ValueError, match=msg):
         d2_tweedie_score([0.], [0.], power=power)
 
     power = 3.
     assert_allclose(mean_tweedie_deviance([1.], [1.], power=power),
                     0.00, atol=1e-8)
-
     msg = "can only be used on strictly positive y and y_pred."
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.], [0.], power=power)
+    with pytest.raises(ValueError, match=msg):
         d2_tweedie_score([0.], [0.], power=power)
 
     power = 0.5
-    with pytest.raises(ValueError,
-                       match="is only defined for power<=0 and power>=1"):
+    with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"):
         mean_tweedie_deviance([0.], [0.], power=power)
+    with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"):
         d2_tweedie_score([0.], [0.], power=power)
 
 

From d1ef272112644c7ab229f86de6b6c9de3939aeee Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sun, 22 Aug 2021 19:25:59 +0200
Subject: [PATCH 07/16] DOC add versionadded

---
 sklearn/metrics/_regression.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 1b718a68fde6b..e6a28c2355423 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -994,6 +994,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
 
     Read more in the :ref:`User Guide <d2_tweedie_score>`.
 
+    .. versionadded:: 1.0
+
     Parameters
     ----------
     y_true : array-like of shape (n_samples,)

From cd8a8a8dc6b3b953aa32759f7c389bcdbd517870 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sun, 22 Aug 2021 19:33:44 +0200
Subject: [PATCH 08/16] DOC improve docstring and user guide

---
 doc/modules/model_evaluation.rst | 6 ++++++
 sklearn/metrics/_regression.py   | 7 +++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 7fcea220b153e..acbfe0fd603e6 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2371,6 +2371,12 @@ calculated as
 The argument `power` defines the Tweedie power as for
 :func:`mean_tweedie_deviance`. Note that for `power=0`,
 :func:`d2_tweedie_score` equals :func:`r2_score` (for single targets).
+
+Like R², the best possible score is 1.0 and it can be negative (because the
+model can be arbitrarily worse). A model that always predicts a constant
+value for the expected value of y, disregarding the input features, would
+get a D^2 score of 0.0.
+
 .. _pinball_loss:
 
 Pinball loss
diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index e6a28c2355423..3f6afbf61599c 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -987,10 +987,9 @@ def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None):
 def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
     """D^2 regression score function, percentage of Tweedie deviance explained.
 
-    Best possible score is 1.0 and it can be negative (because the
-    model can be arbitrarily worse). A constant model that always
-    predicts the expected value of y, disregarding the input features,
-    would get a D^2 score of 0.0.
+    Best possible score is 1.0 and it can be negative (because the model can be
+    arbitrarily worse). A model that always predicts a constant value for the expected
+    value of y, disregarding the input features, would get a D^2 score of 0.0.
 
     Read more in the :ref:`User Guide <d2_tweedie_score>`.
 

From 420eaf8364e942384a352dc1fc33675b5d94c430 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sun, 22 Aug 2021 19:39:55 +0200
Subject: [PATCH 09/16] DOC add code snippet with make_scorer

---
 doc/modules/model_evaluation.rst | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index acbfe0fd603e6..695d1c3109041 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2368,7 +2368,7 @@ calculated as
 
   D^2(y, \hat{y}) = 1 - \frac{\text{D}(y, \hat{y})}{\text{D}(y, \bar{y})} \,.
 
-The argument `power` defines the Tweedie power as for
+The argument ``power`` defines the Tweedie power as for
 :func:`mean_tweedie_deviance`. Note that for `power=0`,
 :func:`d2_tweedie_score` equals :func:`r2_score` (for single targets).
 
@@ -2377,6 +2377,11 @@ model can be arbitrarily worse). A model that always predicts a constant
 value for the expected value of y, disregarding the input features, would
 get a D^2 score of 0.0.
 
+A scorer object with a specific choice of ``power`` can be built by::
+
+  >>> from sklearn.metrics import make_scorer
+  >>> d2_tweedie_score_15 = make_scorer(d2_tweedie_score, power=1.5)
+
 .. _pinball_loss:
 
 Pinball loss
@@ -2409,7 +2414,7 @@ Here is a small example of usage of the :func:`mean_pinball_loss` function::
   >>> mean_pinball_loss(y_true, y_true, alpha=0.9)
   0.0
 
-It is possible to build a scorer object with a specific choice of alpha::
+It is possible to build a scorer object with a specific choice of ``alpha``::
 
   >>> from sklearn.metrics import make_scorer
   >>> mean_pinball_loss_95p = make_scorer(mean_pinball_loss, alpha=0.95)

From eb97867db304f7fd30173d107bbc83b9342993df Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Mon, 23 Aug 2021 08:15:50 +0200
Subject: [PATCH 10/16] DOC add whatsnew entry

---
 doc/whats_new/v1.0.rst | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 7d8175a3b5046..205eacdc91443 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -602,6 +602,12 @@ Changelog
   quantile regression. :pr:`19415` by :user:`Xavier Dupré <sdpython>`
   and :user:`Oliver Grisel <ogrisel>`.
 
+- |Feature| :func:`metrics.d2_tweedie_score` calculates the D^2 regression
+  score for Tweedie deviances with power parameter ``power``. This is a
+  generalization of the `r2_score` and can be interpreted as percentage of
+  Tweedie deviance explained.
+  :pr:`17036` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 - |Feature|  :func:`metrics.mean_squared_log_error` now supports
   `squared=False`.
   :pr:`20326` by :user:`Uttam kumar <helper-uttam>`.
@@ -683,7 +689,7 @@ Changelog
 .............................
 
 - |Fix| :class:`neural_network.MLPClassifier` and
-  :class:`neural_network.MLPRegressor` now correct supports continued training
+  :class:`neural_network.MLPRegressor` now correctly support continued training
   when loading from a pickled file. :pr:`19631` by `Thomas Fan`_.
 
 :mod:`sklearn.pipeline`

From 4b9609223adbb265d742214c57ecab11a881e53e Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sat, 4 Sep 2021 12:59:01 +0200
Subject: [PATCH 11/16] Revert to return float(nan)

This reverts commit 8e317788fc58096b154da74f3e3080cc944af0b5.
---
 sklearn/metrics/_regression.py           | 23 ++++++++++++-------
 sklearn/metrics/tests/test_regression.py | 28 ++++++++++++++----------
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 3f6afbf61599c..ceb7a84a57c4c 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -24,9 +24,12 @@
 #          Uttam kumar <bajiraouttamsinha@gmail.com>
 # License: BSD 3 clause
 
+import warnings
+
 import numpy as np
 
 from .._loss.glm_distribution import TweedieDistribution
+from ..exceptions import UndefinedMetricWarning
 from ..utils.validation import check_array, check_consistent_length, _num_samples
 from ..utils.validation import column_or_1d
 from ..utils.validation import _check_sample_weight
@@ -735,8 +738,8 @@ def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average
     Unlike most other scores, :math:`R^2` score may be negative (it need not
     actually be the square of a quantity R).
 
-    This metric is not well-defined for single samples and will raise a ValueError if
-    n_samples is less than two.
+    This metric is not well-defined for single samples and will return a NaN
+    value if n_samples is less than two.
 
     References
     ----------
@@ -774,7 +777,9 @@ def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average
     check_consistent_length(y_true, y_pred, sample_weight)
 
     if _num_samples(y_pred) < 2:
-        raise ValueError("R^2 score is not well-defined with less than two samples.")
+        msg = "R^2 score is not well-defined with less than two samples."
+        warnings.warn(msg, UndefinedMetricWarning)
+        return float("nan")
 
     if sample_weight is not None:
         sample_weight = column_or_1d(sample_weight)
@@ -1013,8 +1018,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
         deviations between true and predicted targets.
 
         - power < 0: Extreme stable distribution. Requires: y_pred > 0.
-        - power = 0 : Normal distribution, output corresponds to
-          r2_score. y_true and y_pred can be any real numbers.
+        - power = 0 : Normal distribution, output corresponds to r2_score.
+          y_true and y_pred can be any real numbers.
         - power = 1 : Poisson distribution. Requires: y_true >= 0 and
           y_pred > 0.
         - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0
@@ -1037,8 +1042,8 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
     Like R^2, D^2 score may be negative (it need not actually be the square of
     a quantity D).
 
-    This metric is not well-defined for single samples and will raise a ValueError if
-    n_samples is less than two.
+    This metric is not well-defined for single samples and will return a NaN
+    value if n_samples is less than two.
 
     References
     ----------
@@ -1068,7 +1073,9 @@ def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
     check_consistent_length(y_true, y_pred, sample_weight)
 
     if _num_samples(y_pred) < 2:
-        raise ValueError("D^2 score is not well-defined with less than two samples.")
+        msg = "D^2 score is not well-defined with less than two samples."
+        warnings.warn(msg, UndefinedMetricWarning)
+        return float("nan")
 
     if sample_weight is not None:
         sample_weight = column_or_1d(sample_weight)
diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py
index bb4f9b5c02b87..f94f06676872f 100644
--- a/sklearn/metrics/tests/test_regression.py
+++ b/sklearn/metrics/tests/test_regression.py
@@ -26,6 +26,8 @@
 
 from sklearn.metrics._regression import _check_reg_targets
 
+from sklearn.exceptions import UndefinedMetricWarning
+
 
 def test_regression_metrics(n_samples=50):
     y_true = np.arange(n_samples)
@@ -145,23 +147,23 @@ def test_regression_metrics_at_limits():
     assert_almost_equal(max_error([0.0], [0.0]), 0.0)
     assert_almost_equal(explained_variance_score([0.0], [0.0]), 1.0)
     assert_almost_equal(r2_score([0.0, 1], [0.0, 1]), 1.0)
-    err_msg = (
+    msg = (
         "Mean Squared Logarithmic Error cannot be used when targets "
         "contain negative values."
     )
-    with pytest.raises(ValueError, match=err_msg):
+    with pytest.raises(ValueError, match=msg):
         mean_squared_log_error([-1.0], [-1.0])
-    err_msg = (
+    msg = (
         "Mean Squared Logarithmic Error cannot be used when targets "
         "contain negative values."
     )
-    with pytest.raises(ValueError, match=err_msg):
+    with pytest.raises(ValueError, match=msg):
         mean_squared_log_error([1.0, 2.0, 3.0], [1.0, -2.0, 3.0])
-    err_msg = (
+    msg = (
         "Mean Squared Logarithmic Error cannot be used when targets "
         "contain negative values."
     )
-    with pytest.raises(ValueError, match=err_msg):
+    with pytest.raises(ValueError, match=msg):
         mean_squared_log_error([1.0, -2.0, 3.0], [1.0, 2.0, 3.0])
 
     # Tweedie deviance error
@@ -350,12 +352,14 @@ def test_regression_custom_weights():
 
 @pytest.mark.parametrize("metric", [r2_score, d2_tweedie_score])
 def test_regression_single_sample(metric):
-    y_true = [1]
-    y_pred = [2]
-    msg = "not well-defined with less than two samples."
-
-    with pytest.raises(ValueError, match=msg):
-        metric(y_true, y_pred)
+    y_true = [0]
+    y_pred = [1]
+    warning_msg = "not well-defined with less than two samples."
+
+    # Trigger the warning
+    with pytest.warns(UndefinedMetricWarning, match=warning_msg):
+        score = metric(y_true, y_pred)
+        assert np.isnan(score)
 
 
 def test_tweedie_deviance_continuity():

From 300b610fed4596a332b5391405f23a5f5fa672b5 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sat, 4 Sep 2021 12:59:57 +0200
Subject: [PATCH 12/16] TST fix tests

---
 sklearn/metrics/tests/test_regression.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py
index f94f06676872f..b66ce18ec8da4 100644
--- a/sklearn/metrics/tests/test_regression.py
+++ b/sklearn/metrics/tests/test_regression.py
@@ -175,7 +175,7 @@ def test_regression_metrics_at_limits():
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.0], [0.0], power=power)
     with pytest.raises(ValueError, match=msg):
-        d2_tweedie_score([0.0], [0.0], power=power)
+        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)
 
     assert_almost_equal(mean_tweedie_deviance([0.0], [0.0], power=0), 0.0, 2)
 
@@ -184,7 +184,7 @@ def test_regression_metrics_at_limits():
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.0], [0.0], power=power)
     with pytest.raises(ValueError, match=msg):
-        d2_tweedie_score([0.0], [0.0], power=power)
+        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)
 
     power = 1.5
     assert_allclose(mean_tweedie_deviance([0.0], [1.0], power=power), 2 / (2 - power))
@@ -192,7 +192,7 @@ def test_regression_metrics_at_limits():
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.0], [0.0], power=power)
     with pytest.raises(ValueError, match=msg):
-        d2_tweedie_score([0.0], [0.0], power=power)
+        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)
 
     power = 2.0
     assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8)
@@ -200,7 +200,7 @@ def test_regression_metrics_at_limits():
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.0], [0.0], power=power)
     with pytest.raises(ValueError, match=msg):
-        d2_tweedie_score([0.0], [0.0], power=power)
+        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)
 
     power = 3.0
     assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8)
@@ -208,13 +208,13 @@ def test_regression_metrics_at_limits():
     with pytest.raises(ValueError, match=msg):
         mean_tweedie_deviance([0.0], [0.0], power=power)
     with pytest.raises(ValueError, match=msg):
-        d2_tweedie_score([0.0], [0.0], power=power)
+        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)
 
     power = 0.5
     with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"):
         mean_tweedie_deviance([0.0], [0.0], power=power)
     with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"):
-        d2_tweedie_score([0.0], [0.0], power=power)
+        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)
 
 
 def test__check_reg_targets():

From 8bafea43ea69cb0e95888a75abb58e7ba0cbfa07 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sat, 4 Sep 2021 13:02:05 +0200
Subject: [PATCH 13/16] MNT kwargs only

---
 sklearn/metrics/_regression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index ceb7a84a57c4c..b69b4dd5ef96e 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -989,7 +989,7 @@ def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None):
     return mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=2)
 
 
-def d2_tweedie_score(y_true, y_pred, sample_weight=None, power=0):
+def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0):
     """D^2 regression score function, percentage of Tweedie deviance explained.
 
     Best possible score is 1.0 and it can be negative (because the model can be

From ebf3e5914cd1024ad790d15a49940d5ca6d80c64 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sat, 4 Sep 2021 13:10:28 +0200
Subject: [PATCH 14/16] DOC fix statement about constant predictions

---
 doc/modules/model_evaluation.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 695d1c3109041..f1a9388344512 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2373,9 +2373,9 @@ The argument ``power`` defines the Tweedie power as for
 :func:`d2_tweedie_score` equals :func:`r2_score` (for single targets).
 
 Like R², the best possible score is 1.0 and it can be negative (because the
-model can be arbitrarily worse). A model that always predicts a constant
-value for the expected value of y, disregarding the input features, would
-get a D^2 score of 0.0.
+model can be arbitrarily worse). A constant model that always predicts the
+expected value of y, disregarding the input features, would get a D² score
+of 0.0.
 
 A scorer object with a specific choice of ``power`` can be built by::
 

From 454871c239e655b97f0fbf6badc83d7b36af576e Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sat, 4 Sep 2021 22:37:21 +0200
Subject: [PATCH 15/16] DOC more precise statement of zero D2 score

---
 sklearn/metrics/_regression.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index b69b4dd5ef96e..ed9da69b1261c 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -993,8 +993,8 @@ def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0):
     """D^2 regression score function, percentage of Tweedie deviance explained.
 
     Best possible score is 1.0 and it can be negative (because the model can be
-    arbitrarily worse). A model that always predicts a constant value for the expected
-    value of y, disregarding the input features, would get a D^2 score of 0.0.
+    arbitrarily worse). A model that always uses the empirical mean of `y_true` as
+    constant prediction, disregarding the input features, gets a D^2 score of 0.0.
 
     Read more in the :ref:`User Guide <d2_tweedie_score>`.
 

From 5b95be4091d233d1372207d32bd16a492704c4ef Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Sat, 4 Sep 2021 22:40:11 +0200
Subject: [PATCH 16/16] DOC import d2_score in user guide

---
 doc/modules/model_evaluation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index f1a9388344512..f5f447e118a8e 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2379,7 +2379,7 @@ of 0.0.
 
 A scorer object with a specific choice of ``power`` can be built by::
 
-  >>> from sklearn.metrics import make_scorer
+  >>> from sklearn.metrics import d2_tweedie_score, make_scorer
   >>> d2_tweedie_score_15 = make_scorer(d2_tweedie_score, power=1.5)
 
 .. _pinball_loss: