From b44dd9d617a6174ef02f6744b9286b0922659273 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 31 Mar 2023 17:14:44 +0200 Subject: [PATCH 001/194] MAINT refactor scorer using _get_response_values --- sklearn/metrics/_scorer.py | 94 +++++++-------------- sklearn/metrics/tests/test_score_objects.py | 3 +- sklearn/utils/_response.py | 10 +-- sklearn/utils/tests/test_response.py | 20 ----- 4 files changed, 38 insertions(+), 89 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 9df1b482bdeb3..f10e398debcd6 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -18,9 +18,10 @@ # Arnaud Joly # License: Simplified BSD +from collections import Counter from collections.abc import Iterable +from inspect import signature from functools import partial -from collections import Counter from traceback import format_exc import numpy as np @@ -65,19 +66,21 @@ from ..utils.multiclass import type_of_target from ..base import is_regressor +from ..utils._response import _get_response_values from ..utils._param_validation import validate_params -def _cached_call(cache, estimator, method, *args, **kwargs): +def _cached_call(cache, estimator, *args, **kwargs): """Call estimator with method and args and kwargs.""" if cache is None: - return getattr(estimator, method)(*args, **kwargs) + return _get_response_values(estimator, *args, **kwargs) + response_method = kwargs["response_method"] try: - return cache[method] + return cache[response_method] except KeyError: - result = getattr(estimator, method)(*args, **kwargs) - cache[method] = result + result = _get_response_values(estimator, *args, **kwargs) + cache[response_method] = result return result @@ -163,40 +166,15 @@ def __init__(self, score_func, sign, kwargs): self._score_func = score_func self._sign = sign - @staticmethod - def _check_pos_label(pos_label, classes): - if pos_label not in list(classes): - raise ValueError(f"pos_label={pos_label} is not a valid label: {classes}") - - def _select_proba_binary(self, y_pred, classes): - """Select the column of the positive label in `y_pred` when - probabilities are provided. - - Parameters - ---------- - y_pred : ndarray of shape (n_samples, n_classes) - The prediction given by `predict_proba`. - - classes : ndarray of shape (n_classes,) - The class labels for the estimator. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Probability predictions of the positive class. - """ - if y_pred.shape[1] == 2: - pos_label = self._kwargs.get("pos_label", classes[1]) - self._check_pos_label(pos_label, classes) - col_idx = np.flatnonzero(classes == pos_label)[0] - return y_pred[:, col_idx] - - err_msg = ( - f"Got predict_proba of shape {y_pred.shape}, but need " - f"classifier with two classes for {self._score_func.__name__} " - "scoring" - ) - raise ValueError(err_msg) + def _get_pos_label(self): + score_func_params = signature(self._score_func).parameters + if "pos_label" in self._kwargs: + pos_label = self._kwargs["pos_label"] + elif "pos_label" in score_func_params: + pos_label = score_func_params["pos_label"].default + else: + pos_label = None + return pos_label def __repr__(self): kwargs_string = "".join( @@ -274,7 +252,7 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): Score function applied to prediction of estimator on X. 
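The `_get_pos_label` helper added above resolves the positive label in three steps: an explicit `pos_label` stored in the scorer's kwargs wins, otherwise the default of the score function itself is used, and `None` is returned for metrics that take no `pos_label` at all. A standalone sketch of that logic (the free function below is illustrative, not part of the patch):

    from inspect import signature

    from sklearn.metrics import accuracy_score, f1_score

    def get_pos_label(score_func, kwargs):
        # Mirrors the _BaseScorer._get_pos_label logic introduced in this patch.
        if "pos_label" in kwargs:
            return kwargs["pos_label"]
        score_func_params = signature(score_func).parameters
        if "pos_label" in score_func_params:
            return score_func_params["pos_label"].default
        return None

    print(get_pos_label(f1_score, {}))                # 1, f1_score's own default
    print(get_pos_label(f1_score, {"pos_label": 0}))  # 0, the explicit kwarg wins
    print(get_pos_label(accuracy_score, {}))          # None, no pos_label parameter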
""" - y_pred = method_caller(estimator, "predict", X) + y_pred, _ = method_caller(estimator, X, response_method="predict") if sample_weight is not None: return self._sign * self._score_func( y_true, y_pred, sample_weight=sample_weight, **self._kwargs @@ -312,14 +290,9 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): score : float Score function applied to prediction of estimator on X. """ - - y_type = type_of_target(y) - y_pred = method_caller(clf, "predict_proba", X) - if y_type == "binary" and y_pred.shape[1] <= 2: - # `y_type` could be equal to "binary" even in a multi-class - # problem: (when only 2 class are given to `y_true` during scoring) - # Thus, we need to check for the shape of `y_pred`. - y_pred = self._select_proba_binary(y_pred, clf.classes_) + y_pred, _ = method_caller( + clf, X, response_method="predict_proba", pos_label=self._get_pos_label() + ) if sample_weight is not None: return self._sign * self._score_func( y, y_pred, sample_weight=sample_weight, **self._kwargs @@ -368,28 +341,23 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): raise ValueError("{0} format is not supported".format(y_type)) if is_regressor(clf): - y_pred = method_caller(clf, "predict", X) + y_pred, _ = method_caller(clf, X, response_method="predict") else: + pos_label = self._get_pos_label() try: - y_pred = method_caller(clf, "decision_function", X) + y_pred, _ = method_caller( + clf, X, response_method="decision_function", pos_label=pos_label + ) if isinstance(y_pred, list): # For multi-output multi-class estimator y_pred = np.vstack([p for p in y_pred]).T - elif y_type == "binary" and "pos_label" in self._kwargs: - self._check_pos_label(self._kwargs["pos_label"], clf.classes_) - if self._kwargs["pos_label"] == clf.classes_[0]: - # The implicit positive class of the binary classifier - # does not match `pos_label`: we need to invert the - # predictions - y_pred *= -1 except (NotImplementedError, AttributeError): - y_pred = method_caller(clf, "predict_proba", X) - - if y_type == "binary": - y_pred = self._select_proba_binary(y_pred, clf.classes_) - elif isinstance(y_pred, list): + y_pred, _ = method_caller( + clf, X, response_method="predict_proba", pos_label=pos_label + ) + if isinstance(y_pred, list): y_pred = np.vstack([p[:, -1] for p in y_pred]).T if sample_weight is not None: diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index d39db7fc894c4..b0186c42f8921 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -770,6 +770,7 @@ def test_multimetric_scorer_calls_method_once( X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0]) mock_est = Mock() + mock_est._estimator_type = "classifier" fit_func = Mock(return_value=mock_est) predict_func = Mock(return_value=y) @@ -972,7 +973,7 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name): n_classes=3, n_informative=3, n_samples=20, random_state=0 ) lr = Perceptron().fit(X, y) - msg = "'Perceptron' object has no attribute 'predict_proba'" + msg = "Perceptron has none of the following attributes: predict_proba." 
with pytest.raises(AttributeError, match=msg): scorer(lr, X, y) diff --git a/sklearn/utils/_response.py b/sklearn/utils/_response.py index 50b9409c8276d..e11a67cca6d56 100644 --- a/sklearn/utils/_response.py +++ b/sklearn/utils/_response.py @@ -78,11 +78,11 @@ def _get_response_values( target_type = "binary" if len(classes) <= 2 else "multiclass" - if target_type == "multiclass" and prediction_method.__name__ != "predict": - raise ValueError( - "With a multiclass estimator, the response method should be " - f"predict, got {prediction_method.__name__} instead." - ) + # if target_type == "multiclass" and prediction_method.__name__ != "predict": + # raise ValueError( + # "With a multiclass estimator, the response method should be " + # f"predict, got {prediction_method.__name__} instead." + # ) if pos_label is not None and pos_label not in classes.tolist(): raise ValueError( diff --git a/sklearn/utils/tests/test_response.py b/sklearn/utils/tests/test_response.py index 0e2ce5fe5f038..9d6d90ddd94ae 100644 --- a/sklearn/utils/tests/test_response.py +++ b/sklearn/utils/tests/test_response.py @@ -6,7 +6,6 @@ LinearRegression, LogisticRegression, ) -from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils._mocking import _MockEstimatorOnOffPrediction from sklearn.utils._testing import assert_allclose, assert_array_equal @@ -29,25 +28,6 @@ def test_get_response_values_regressor_error(response_method): _get_response_values(my_estimator, X, response_method=response_method) -@pytest.mark.parametrize( - "estimator, response_method", - [ - (DecisionTreeClassifier(), "predict_proba"), - (SVC(), "decision_function"), - ], -) -def test_get_response_values_error_multiclass_classifier(estimator, response_method): - """Check that we raise an error with multiclass classifier and requesting - response values different from `predict`.""" - X, y = make_classification( - n_samples=10, n_clusters_per_class=1, n_classes=3, random_state=0 - ) - classifier = estimator.fit(X, y) - err_msg = "With a multiclass estimator, the response method should be predict" - with pytest.raises(ValueError, match=err_msg): - _get_response_values(classifier, X, response_method=response_method) - - def test_get_response_values_regressor(): """Check the behaviour of `_get_response_values` with regressor.""" X, y = make_regression(n_samples=10, random_state=0) From 516f62f887fc67101ec2c6edb43dfecb39fc8e5a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 1 Apr 2023 15:51:30 +0200 Subject: [PATCH 002/194] Add __name__ for method of Mock --- sklearn/metrics/tests/test_score_objects.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index b0186c42f8921..a2d47e1b81a2a 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -771,13 +771,17 @@ def test_multimetric_scorer_calls_method_once( mock_est = Mock() mock_est._estimator_type = "classifier" - fit_func = Mock(return_value=mock_est) - predict_func = Mock(return_value=y) + fit_func = Mock(return_value=mock_est, name="fit") + fit_func.__name__ = "fit" + predict_func = Mock(return_value=y, name="predict") + predict_func.__name__ = "predict" pos_proba = np.random.rand(X.shape[0]) proba = np.c_[1 - pos_proba, pos_proba] - predict_proba_func = Mock(return_value=proba) - decision_function_func = Mock(return_value=pos_proba) + predict_proba_func = 
Mock(return_value=proba, name="predict_proba") + predict_proba_func.__name__ = "predict_proba" + decision_function_func = Mock(return_value=pos_proba, name="decision_function") + decision_function_func.__name__ = "decision_function" mock_est.fit = fit_func mock_est.predict = predict_func From d2fbee0235a3380b6eb49df31bbf9e8d210eb4d5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 1 Apr 2023 15:52:34 +0200 Subject: [PATCH 003/194] remove multiclass issue --- sklearn/utils/_response.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/utils/_response.py b/sklearn/utils/_response.py index e11a67cca6d56..e92b167b241de 100644 --- a/sklearn/utils/_response.py +++ b/sklearn/utils/_response.py @@ -19,9 +19,6 @@ def _get_response_values( The response values are predictions, one scalar value for each sample in X that depends on the specific choice of `response_method`. - This helper only accepts multiclass classifiers with the `predict` response - method. - If `estimator` is a binary classifier, also return the label for the effective positive class. @@ -75,15 +72,8 @@ def _get_response_values( if is_classifier(estimator): prediction_method = _check_response_method(estimator, response_method) classes = estimator.classes_ - target_type = "binary" if len(classes) <= 2 else "multiclass" - # if target_type == "multiclass" and prediction_method.__name__ != "predict": - # raise ValueError( - # "With a multiclass estimator, the response method should be " - # f"predict, got {prediction_method.__name__} instead." - # ) - if pos_label is not None and pos_label not in classes.tolist(): raise ValueError( f"pos_label={pos_label} is not a valid label: It should be " From 29e5e876b2744d229e5448ee1246fcf19a4ec99f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 3 Apr 2023 10:22:47 +0200 Subject: [PATCH 004/194] make response_method a mandatory arg --- sklearn/metrics/_scorer.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index f10e398debcd6..bd93d1bd80e3c 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -70,16 +70,18 @@ from ..utils._param_validation import validate_params -def _cached_call(cache, estimator, *args, **kwargs): +def _cached_call(cache, estimator, response_method, *args, **kwargs): """Call estimator with method and args and kwargs.""" if cache is None: - return _get_response_values(estimator, *args, **kwargs) - - response_method = kwargs["response_method"] + return _get_response_values( + estimator, *args, response_method=response_method, **kwargs + ) try: return cache[response_method] except KeyError: - result = _get_response_values(estimator, *args, **kwargs) + result = _get_response_values( + estimator, *args, response_method=response_method, **kwargs + ) cache[response_method] = result return result @@ -252,7 +254,7 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): Score function applied to prediction of estimator on X. """ - y_pred, _ = method_caller(estimator, X, response_method="predict") + y_pred, _ = method_caller(estimator, "predict", X) if sample_weight is not None: return self._sign * self._score_func( y_true, y_pred, sample_weight=sample_weight, **self._kwargs @@ -291,7 +293,7 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): Score function applied to prediction of estimator on X. 
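The `_cached_call` helper reworked above keys its cache on the response method name, so a multimetric scorer ends up calling e.g. `predict_proba` only once per estimator and input. A self-contained sketch of that contract, with a plain `getattr` standing in for `_get_response_values`:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    def cached_call(cache, estimator, response_method, X):
        # One cache entry per response method; cache=None disables caching.
        if cache is not None and response_method in cache:
            return cache[response_method]
        result = getattr(estimator, response_method)(X)
        if cache is not None:
            cache[response_method] = result
        return result

    X, y = make_classification(random_state=0)
    clf = LogisticRegression().fit(X, y)
    cache = {}
    first = cached_call(cache, clf, "predict_proba", X)
    second = cached_call(cache, clf, "predict_proba", X)  # served from the cache
    assert first is second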
""" y_pred, _ = method_caller( - clf, X, response_method="predict_proba", pos_label=self._get_pos_label() + clf, "predict_proba", X, pos_label=self._get_pos_label() ) if sample_weight is not None: return self._sign * self._score_func( @@ -341,12 +343,12 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): raise ValueError("{0} format is not supported".format(y_type)) if is_regressor(clf): - y_pred, _ = method_caller(clf, X, response_method="predict") + y_pred, _ = method_caller(clf, "predict", X) else: pos_label = self._get_pos_label() try: y_pred, _ = method_caller( - clf, X, response_method="decision_function", pos_label=pos_label + clf, "decision_function", X, pos_label=pos_label ) if isinstance(y_pred, list): @@ -354,9 +356,7 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): y_pred = np.vstack([p for p in y_pred]).T except (NotImplementedError, AttributeError): - y_pred, _ = method_caller( - clf, X, response_method="predict_proba", pos_label=pos_label - ) + y_pred, _ = method_caller(clf, "predict_proba", X, pos_label=pos_label) if isinstance(y_pred, list): y_pred = np.vstack([p[:, -1] for p in y_pred]).T From b645ade807103ad1c40dd4572250f9e5c4399a0d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 3 Apr 2023 16:42:39 +0200 Subject: [PATCH 005/194] Update sklearn/metrics/_scorer.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/metrics/_scorer.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index bd93d1bd80e3c..85eb50058542c 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -171,12 +171,10 @@ def __init__(self, score_func, sign, kwargs): def _get_pos_label(self): score_func_params = signature(self._score_func).parameters if "pos_label" in self._kwargs: - pos_label = self._kwargs["pos_label"] - elif "pos_label" in score_func_params: - pos_label = score_func_params["pos_label"].default - else: - pos_label = None - return pos_label + return self._kwargs["pos_label"] + if "pos_label" in score_func_params: + return score_func_params["pos_label"].default + return None def __repr__(self): kwargs_string = "".join( From 3397c5603e80e3be42ad5c57c6f971f8959de288 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 3 Apr 2023 16:45:20 +0200 Subject: [PATCH 006/194] apply jeremie comments --- sklearn/metrics/_scorer.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 85eb50058542c..ab83be055ebe5 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -72,18 +72,17 @@ def _cached_call(cache, estimator, response_method, *args, **kwargs): """Call estimator with method and args and kwargs.""" - if cache is None: - return _get_response_values( - estimator, *args, response_method=response_method, **kwargs - ) - try: + if cache is not None and response_method in cache: return cache[response_method] - except KeyError: - result = _get_response_values( - estimator, *args, response_method=response_method, **kwargs - ) + + result, _ = _get_response_values( + estimator, *args, response_method=response_method, **kwargs + ) + + if cache is not None: cache[response_method] = result - return result + + return result class _MultimetricScorer: @@ -252,7 +251,7 @@ def _score(self, 
method_caller, estimator, X, y_true, sample_weight=None): Score function applied to prediction of estimator on X. """ - y_pred, _ = method_caller(estimator, "predict", X) + y_pred = method_caller(estimator, "predict", X) if sample_weight is not None: return self._sign * self._score_func( y_true, y_pred, sample_weight=sample_weight, **self._kwargs @@ -290,9 +289,7 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): score : float Score function applied to prediction of estimator on X. """ - y_pred, _ = method_caller( - clf, "predict_proba", X, pos_label=self._get_pos_label() - ) + y_pred = method_caller(clf, "predict_proba", X, pos_label=self._get_pos_label()) if sample_weight is not None: return self._sign * self._score_func( y, y_pred, sample_weight=sample_weight, **self._kwargs @@ -341,20 +338,18 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): raise ValueError("{0} format is not supported".format(y_type)) if is_regressor(clf): - y_pred, _ = method_caller(clf, "predict", X) + y_pred = method_caller(clf, "predict", X) else: pos_label = self._get_pos_label() try: - y_pred, _ = method_caller( - clf, "decision_function", X, pos_label=pos_label - ) + y_pred = method_caller(clf, "decision_function", X, pos_label=pos_label) if isinstance(y_pred, list): # For multi-output multi-class estimator y_pred = np.vstack([p for p in y_pred]).T except (NotImplementedError, AttributeError): - y_pred, _ = method_caller(clf, "predict_proba", X, pos_label=pos_label) + y_pred = method_caller(clf, "predict_proba", X, pos_label=pos_label) if isinstance(y_pred, list): y_pred = np.vstack([p[:, -1] for p in y_pred]).T From e871558bd4ec8953183aacb957bf590c8a97429d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 4 Apr 2023 20:12:24 +0200 Subject: [PATCH 007/194] iter --- sklearn/metrics/_scorer.py | 60 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index f10e398debcd6..41ba9ba3a22dd 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -371,6 +371,66 @@ def _factory_args(self): return ", needs_threshold=True" +class _ContinuousScorer(_BaseScorer): + def __init__(self, score_func, sign, response_method, kwargs): + super().__init__(score_func=score_func, sign=sign, kwargs=kwargs) + self.response_method = response_method + + def _score(self, method_caller, estimator, X, y_true, sample_weight=None): + """Evaluate predicted target values for X relative to y_true. + Parameters + ---------- + method_caller : callable + Returns predictions given an estimator, method name, and other + arguments, potentially caching results. + estimator : object + Trained estimator to use for scoring. Must have a predict_proba + method; the output of that is used to compute the score. + X : {array-like, sparse matrix} + Test data that will be fed to estimator.predict. + y_true : array-like + Gold standard target values for X. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + Returns + ------- + score : float + Score function applied to prediction of estimator on X. 
+ """ + response_method = _check_classifier_response_method( + estimator=estimator, response_method=self.response_method + ) + y_score = response_method(X) + if response_method.__name__ == "decision_function": + y_score = self._check_decision_function(y_score, estimator.classes_) + else: + y_score = self._select_proba( + y_score, estimator.classes_, support_multi_class=False + ) + + # `np.unique` returned sorted array, thus no need to sort values + potential_thresholds = np.unique(y_score) + score_thresholds = [] + for th in potential_thresholds: + y_score_thresholded = estimator.classes_[(y_score >= th).astype(int)] + if sample_weight is not None: + score_thresholds.append( + self._sign + * self._score_func( + y_true, + y_score_thresholded, + sample_weight=sample_weight, + **self._kwargs, + ) + ) + else: + score_thresholds.append( + self._sign + * self._score_func(y_true, y_score_thresholded, **self._kwargs) + ) + return np.array(potential_thresholds), np.array(score_thresholds) + + @validate_params( { "scoring": [str, callable, None], From 74614e889605e9fdbf5ba501c4c0a31e6c7ac4a9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Apr 2023 14:00:53 +0200 Subject: [PATCH 008/194] FEA add CutOffClassifier to post-tune prediction threshold --- sklearn/metrics/_scorer.py | 61 +++++++++------------ sklearn/metrics/tests/test_score_objects.py | 41 ++++++++++++++ 2 files changed, 67 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 9b49301cfba86..2d49e274359d8 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -365,63 +365,54 @@ def _factory_args(self): class _ContinuousScorer(_BaseScorer): + """ "Scorer taking a continuous response and output a score for each threshold.""" + def __init__(self, score_func, sign, response_method, kwargs): super().__init__(score_func=score_func, sign=sign, kwargs=kwargs) self.response_method = response_method def _score(self, method_caller, estimator, X, y_true, sample_weight=None): """Evaluate predicted target values for X relative to y_true. + Parameters ---------- - method_caller : callable - Returns predictions given an estimator, method name, and other - arguments, potentially caching results. estimator : object - Trained estimator to use for scoring. Must have a predict_proba - method; the output of that is used to compute the score. - X : {array-like, sparse matrix} + Trained estimator to use for scoring. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) Test data that will be fed to estimator.predict. - y_true : array-like + + y_true : array-like of shape (n_samples,) Gold standard target values for X. + sample_weight : array-like of shape (n_samples,), default=None Sample weights. + Returns ------- score : float Score function applied to prediction of estimator on X. 
""" - response_method = _check_classifier_response_method( - estimator=estimator, response_method=self.response_method - ) - y_score = response_method(X) - if response_method.__name__ == "decision_function": - y_score = self._check_decision_function(y_score, estimator.classes_) + y_score = method_caller(estimator, self.response_method, X) + + if sample_weight is not None: + score_func = partial(self._score_func, sample_weight=sample_weight) else: - y_score = self._select_proba( - y_score, estimator.classes_, support_multi_class=False - ) + score_func = self._score_func - # `np.unique` returned sorted array, thus no need to sort values potential_thresholds = np.unique(y_score) - score_thresholds = [] - for th in potential_thresholds: - y_score_thresholded = estimator.classes_[(y_score >= th).astype(int)] - if sample_weight is not None: - score_thresholds.append( - self._sign - * self._score_func( - y_true, - y_score_thresholded, - sample_weight=sample_weight, - **self._kwargs, - ) + score_thresholds = np.array( + [ + self._sign + * score_func( + y_true, + estimator.classes_[(y_score >= th).astype(int)], + **self._kwargs, ) - else: - score_thresholds.append( - self._sign - * self._score_func(y_true, y_score_thresholded, **self._kwargs) - ) - return np.array(potential_thresholds), np.array(score_thresholds) + for th in potential_thresholds + ] + ) + return potential_thresholds, score_thresholds @validate_params( diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index a2d47e1b81a2a..09f5f5a296bc2 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -40,6 +40,7 @@ _passthrough_scorer, _MultimetricScorer, _check_multimetric_scoring, + _ContinuousScorer, ) from sklearn.metrics import make_scorer, get_scorer, SCORERS, get_scorer_names from sklearn.neighbors import KNeighborsClassifier @@ -1203,3 +1204,43 @@ def test_scorer_no_op_multiclass_select_proba(): labels=lr.classes_, ) scorer(lr, X_test, y_test) + + +def test_continuous_scorer(): + """Check the behaviour of the `_ContinuousScorer` class.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression().fit(X, y) + scorer = _ContinuousScorer( + balanced_accuracy_score, sign=1, response_method="predict_proba", kwargs={} + ) + thresholds, scores = scorer(estimator, X, y) + + assert thresholds.shape == scores.shape + # check that the thresholds are probability with extreme values close to 0 and 1 + assert 0 <= thresholds.min() <= 0.01 + assert 0.99 <= thresholds.max() <= 1 + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0.5 <= scores.min() <= 1 + + # check that passing kwargs to the scorer works + scorer = _ContinuousScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={"adjusted": True}, + ) + thresholds, scores = scorer(estimator, X, y) + + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0 <= scores.min() <= 0.5 + + # check that we can inverse the sign of the score when dealing with `neg_*` scorer + scorer = _ContinuousScorer( + balanced_accuracy_score, + sign=-1, + response_method="predict_proba", + kwargs={"adjusted": True}, + ) + thresholds, scores = scorer(estimator, X, y) + + assert all(scores <= 0) From 27713af037b7de4b117af81a1e7a4f1c5f6ccaf1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Apr 2023 14:06:14 +0200 Subject: [PATCH 009/194] DOC add changelog entry --- doc/whats_new/v1.3.rst | 9 
+++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 4fede62e61b34..458df4a7d3ed4 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -284,8 +284,8 @@ Changelog estimators consistent with the rest of estimators. :pr:`25697` by :user:`John Pangas `. -- |Enhancement| The `n_iter_` attribute has been included in - :class:`linear_model.ARDRegression` to expose the actual number of iterations +- |Enhancement| The `n_iter_` attribute has been included in + :class:`linear_model.ARDRegression` to expose the actual number of iterations required to reach the stopping criterion. :pr:`25697` by :user:`John Pangas `. @@ -353,6 +353,11 @@ Changelog `return_indices` to return the train-test indices of each cv split. :pr:`25659` by :user:`Guillaume Lemaitre `. +- |MajorFeature| :class:`model_selection.CutOffClassifier` calibrates decision threshold + function of a binary classifier by maximizing a classification metric through + cross-validatin. + :pr:`26120` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.naive_bayes` .......................... From ed1d9b30fa5c36848e9726c20e121d073afd0847 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Apr 2023 16:27:30 +0200 Subject: [PATCH 010/194] refresh implementation --- sklearn/model_selection/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 76dc02e625408..d55d981e5a76c 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -1,5 +1,7 @@ import typing +from ._prediction import CutOffClassifier + from ._split import BaseCrossValidator from ._split import BaseShuffleSplit from ._split import KFold @@ -65,6 +67,7 @@ "StratifiedKFold", "StratifiedGroupKFold", "StratifiedShuffleSplit", + "CutOffClassifier", "check_cv", "cross_val_predict", "cross_val_score", From 8410317b8db8b5f9888a9e5b885ebe7f12b7933f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Apr 2023 16:27:47 +0200 Subject: [PATCH 011/194] add files --- sklearn/model_selection/_prediction.py | 171 ++++++++++++++++++ .../model_selection/tests/test_prediction.py | 12 ++ 2 files changed, 183 insertions(+) create mode 100644 sklearn/model_selection/_prediction.py create mode 100644 sklearn/model_selection/tests/test_prediction.py diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py new file mode 100644 index 0000000000000..bb2752c92daa0 --- /dev/null +++ b/sklearn/model_selection/_prediction.py @@ -0,0 +1,171 @@ +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, clone +from ..metrics import check_scoring, get_scorer_names, make_scorer, roc_curve +from ..metrics._scorer import _ContinuousScorer +from ..utils import _safe_indexing +from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils._response import _get_response_values_binary +from ..utils.multiclass import type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _check_sample_weight, _num_samples + +from ._split import check_cv + + +def _fit_and_score(classifier, X, y, train_idx, val_idx, scorer, score_method): + if train_idx is not None: + X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) + y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) + classifier.fit(X_train, y_train) + 
else: # prefit estimator, only a validation set is provided + X_val, y_val = X, y + + if score_method == {"tnr", "tpr"}: + fpr, tpr, potential_thresholds = scorer(classifier, X_val, y_val) + if score_method == "tnr": + return potential_thresholds[::-1], (1 - fpr)[::-1] + return potential_thresholds, tpr + return scorer(classifier, X_val, y_val) + + +class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + def __init__( + self, + estimator, + *, + objective_metric="balanced_accuracy", + objective_value=None, + response_method="auto", + n_thresholds=1_000, + cv=None, + random_state=None, + n_jobs=None, + ): + self.estimator = estimator + self.objective_metric = objective_metric + self.objective_value = objective_value + self.response_method = response_method + self.n_thresholds = n_thresholds + self.cv = cv + self.random_state = random_state + self.n_jobs = n_jobs + + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "objective_metric": [ + StrOptions(set(get_scorer_names()) | {"tpr", "fpr"}), + callable, + ], + "objective_value": [Real, None], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + "n_thresholds": [Interval(Integral, 1, None, closed="left")], + "cv": ["cv_object", StrOptions({"prefit"})], + "random_state": ["random_state"], + "n_jobs": [Integral, None], + } + + def fit(self, X, y, sample_weight=None, **fit_params): + self._validate_params() + + y_type = type_of_target(y, input_name="y") + if y_type != "binary": + raise ValueError( + f"Only binary classification is supported. Got {y_type} instead." + ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + cv = self.cv if self.cv == "prefit" else check_cv(self.cv, y, classifier=True) + + if self.response_method == "auto": + self._response_method = ["predict_proba", "decision_function"] + else: + self._response_method = self.response_method + + if self.objective_metric in {"tpr", "fpr"}: + if self.objective_value is None: + raise ValueError( + "When `objective_metric` is 'tpr' or 'fpr', `objective_value` must " + "be provided. Got None instead." 
+ ) + objective_value = self.objective_value + else: + objective_value = "highest" + + self.estimator_ = clone(self.estimator).fit(X, y, sample_weight, **fit_params) + + if cv == "prefit": + classifier = self.estimator + split = ([None, range(_num_samples(X))],) + else: + classifier = clone(self.estimator) + split = cv.split(X, y) + + if self.objective_metric in {"tpr", "fpr"}: + self._scorer = make_scorer(roc_curve, needs_threshold=True) + else: + scoring = check_scoring(classifier, scoring=self.objective_metric) + # transform a binary metric into a curve metric for all possible decision + # thresholds + self._scorer = _ContinuousScorer( + score_func=scoring._score_func, + sign=scoring._sign, + response_method=self._response_method, + kwargs=scoring._kwargs, + ) + + thresholds, scores = zip( + *Parallel(n_jobs=self.n_jobs)( + delayed(_fit_and_score)( + classifier, + X, + y, + train_idx, + val_idx, + self._scorer, + self.objective_metric, + ) + for train_idx, val_idx in split + ) + ) + + min_threshold = np.min([th.min() for th in thresholds]) + max_threshold = np.max([th.max() for th in thresholds]) + ascending = thresholds[0].argmin() == 0 + start = min_threshold if ascending else max_threshold + stop = max_threshold if ascending else min_threshold + thresholds_interpolated = np.linspace(start, stop, num=self.n_thresholds) + mean_score = np.mean( + [ + np.interp(thresholds_interpolated, th, sc) + for th, sc in zip(thresholds, scores) + ], + axis=0, + ) + if objective_value == "highest": + best_idx = mean_score.argmax() + else: + best_idx = np.searchsorted(mean_score, objective_value) + self.decision_threshold_ = thresholds_interpolated[best_idx] + + return self + + @property + def classes_(self): + """Classes labels.""" + return self.estimator_.classes_ + + def predict(self, X): + pos_label = self._scorer._get_pos_label() + y_score, _ = _get_response_values_binary( + self.estimator_, X, self._response_method, pos_label=pos_label + ) + y_pred = (y_score >= self.decision_threshold_).astype(int) + return self.classes_[y_pred] diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py new file mode 100644 index 0000000000000..b67b1e8ff3e13 --- /dev/null +++ b/sklearn/model_selection/tests/test_prediction.py @@ -0,0 +1,12 @@ +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression + +from sklearn.model_selection import CutOffClassifier + + +def test_xxx(): + X, y = make_classification(random_state=0) + clf = CutOffClassifier(LogisticRegression()) + clf.fit(X, y) + + assert clf.predict(X) is not None From c7d1fe4a88b2e20c8ba8d34d42a247d96416b284 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Apr 2023 16:35:23 +0200 Subject: [PATCH 012/194] remove random state for the moment --- sklearn/model_selection/_prediction.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index bb2752c92daa0..9e5579d7ebd47 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -41,7 +41,6 @@ def __init__( response_method="auto", n_thresholds=1_000, cv=None, - random_state=None, n_jobs=None, ): self.estimator = estimator @@ -50,7 +49,6 @@ def __init__( self.response_method = response_method self.n_thresholds = n_thresholds self.cv = cv - self.random_state = random_state self.n_jobs = n_jobs _parameter_constraints: dict = { @@ -66,7 +64,6 @@ def __init__( "response_method": 
[StrOptions({"auto", "predict_proba", "decision_function"})], "n_thresholds": [Interval(Integral, 1, None, closed="left")], "cv": ["cv_object", StrOptions({"prefit"})], - "random_state": ["random_state"], "n_jobs": [Integral, None], } From c9d7a221243ec22365ad3ba0dacbd05d1df41af3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Apr 2023 20:54:02 +0200 Subject: [PATCH 013/194] TST make sure to pass the common test --- sklearn/model_selection/_prediction.py | 135 +++++++++++++++++++++++-- 1 file changed, 128 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 9e5579d7ebd47..2652a8b976cfc 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -1,3 +1,4 @@ +from inspect import signature from numbers import Integral, Real import numpy as np @@ -10,28 +11,137 @@ from ..utils._response import _get_response_values_binary from ..utils.multiclass import type_of_target from ..utils.parallel import Parallel, delayed -from ..utils.validation import _check_sample_weight, _num_samples +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_is_fitted, + indexable, +) from ._split import check_cv -def _fit_and_score(classifier, X, y, train_idx, val_idx, scorer, score_method): +def _fit_and_score( + classifier, X, y, sample_weight, train_idx, val_idx, scorer, score_method +): + fit_parameters = signature(classifier.fit).parameters + supports_sw = "sample_weight" in fit_parameters + if train_idx is not None: X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) - classifier.fit(X_train, y_train) + if sample_weight is not None: + sw_train, sw_val = ( + _safe_indexing(sample_weight, train_idx), + _safe_indexing(sample_weight, val_idx), + ) + else: + sw_train, sw_val = None, None + if supports_sw: + classifier.fit(X_train, y_train, sample_weight=sw_train) + else: + classifier.fit(X_train, y_train) else: # prefit estimator, only a validation set is provided - X_val, y_val = X, y + X_val, y_val, sw_val = X, y, sample_weight if score_method == {"tnr", "tpr"}: - fpr, tpr, potential_thresholds = scorer(classifier, X_val, y_val) + fpr, tpr, potential_thresholds = scorer( + classifier, X_val, y_val, sample_weight=sw_val + ) if score_method == "tnr": return potential_thresholds[::-1], (1 - fpr)[::-1] return potential_thresholds, tpr - return scorer(classifier, X_val, y_val) + return scorer(classifier, X_val, y_val, sample_weight=sw_val) class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Decision threshold calibration for binary classification. + + Estimator that calibrates the decision threshold (cutoff point) that is + used for prediction. The methods for picking cutoff points make use of + traditional binary classification evaluation statistics such as the true + positive and true negative rates or any metrics accepting true labels and + the output of a scoring function from a scikit-learn estimator. + + Parameters + ---------- + estimator : estimator object + The classifier, fitted or not fitted, from which we want to optimize + the decision threshold used during `predict`. + + objective_metric : {"tpr", "tnr"}, str or callable, \ + default="balanced_accuracy" + The objective metric to be optimized. 
Can be one of: + + * a string associated to a scoring function (see model evaluation + documentation); + * a scorer callable object / function with the signature + `metric(estimator, X, y)`; + * `"tpr"`: find the decision threshold for a true positive ratio (TPR) + of `objective_value`; + * `"tnr"`: find the decision threshold for a true negative ratio (TNR) + of `objective_value`. + + objective_value : float, default=None + The value associated with the `objective_metric` metric for which we + want to find the decision threshold when `objective_metric` is equal to + `"tpr"` or `"tnr"`. + + response_method : {"auto", "decision_function", "predict_proba"}, \ + default="auto" + Methods by the classifier `base_estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + n_thresholds : int, default=1000 + The number of decision threshold to use when discretizing the output + of the classifier `method`. + + cv : int, float, cross-validation generator, iterable or "prefit", \ + default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train classifier. Possible inputs for cv are: + + * None, to use the default 5-fold stratified K-fold cross validation; + * An integer number, to specify the number of folds in a stratified + k-fold; + * A float number, to specify a single shuffle split. The floating + number should be in (0, 1) and represent the size of the validation + set; + * An object to be used as a cross-validation generator; + * An iterable yielding train, test splits; + * "prefit", to bypass the cross-validation. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + n_jobs : int, default=None + The number of jobs to run in parallel all `estimators` `fit`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See Glossary for more details. + + Attributes + ---------- + decision_threshold_ : float + The new decision threshold. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + """ + def __init__( self, estimator, @@ -69,11 +179,12 @@ def __init__( def fit(self, X, y, sample_weight=None, **fit_params): self._validate_params() + X, y = indexable(X, y) y_type = type_of_target(y, input_name="y") if y_type != "binary": raise ValueError( - f"Only binary classification is supported. Got {y_type} instead." + f"Only binary classification is supported. 
Unknown label type: {y_type}" ) if sample_weight is not None: @@ -124,6 +235,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): classifier, X, y, + sample_weight, train_idx, val_idx, self._scorer, @@ -152,6 +264,11 @@ def fit(self, X, y, sample_weight=None, **fit_params): best_idx = np.searchsorted(mean_score, objective_value) self.decision_threshold_ = thresholds_interpolated[best_idx] + if hasattr(self.estimator_, "n_features_in_"): + self.n_features_in_ = self.estimator_.n_features_in_ + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + return self @property @@ -160,9 +277,13 @@ def classes_(self): return self.estimator_.classes_ def predict(self, X): + check_is_fitted(self, "estimator_") pos_label = self._scorer._get_pos_label() y_score, _ = _get_response_values_binary( self.estimator_, X, self._response_method, pos_label=pos_label ) y_pred = (y_score >= self.decision_threshold_).astype(int) return self.classes_[y_pred] + + def _more_tags(self): + return {"binary_only": True} From 9981f3a04bb6f81131c6d700083786b9594dfc95 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Apr 2023 20:58:04 +0200 Subject: [PATCH 014/194] TST metaestimator sample_weight --- sklearn/model_selection/_prediction.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 2652a8b976cfc..37c65a087c188 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -207,7 +207,14 @@ def fit(self, X, y, sample_weight=None, **fit_params): else: objective_value = "highest" - self.estimator_ = clone(self.estimator).fit(X, y, sample_weight, **fit_params) + fit_parameters = signature(self.estimator.fit).parameters + supports_sw = "sample_weight" in fit_parameters + if sample_weight is not None and supports_sw: + self.estimator_ = clone(self.estimator).fit( + X, y, sample_weight, **fit_params + ) + else: + self.estimator_ = clone(self.estimator).fit(X, y, **fit_params) if cv == "prefit": classifier = self.estimator From b9c9d5ef22274b1e0a24529589e912be77546708 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Apr 2023 22:41:30 +0200 Subject: [PATCH 015/194] API add prediction functions --- sklearn/model_selection/_prediction.py | 124 ++++++++++++++++++++++--- 1 file changed, 111 insertions(+), 13 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 37c65a087c188..509295c1204e5 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -9,6 +9,7 @@ from ..utils import _safe_indexing from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils._response import _get_response_values_binary +from ..utils.metaestimators import available_if from ..utils.multiclass import type_of_target from ..utils.parallel import Parallel, delayed from ..utils.validation import ( @@ -21,6 +22,19 @@ from ._split import check_cv +def _estimator_has(attr): + """Check if we can delegate a method to the underlying estimator. + + First, we check the first fitted estimator if available, otherwise we + check the unfitted estimator. 
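The `_estimator_has` helper above is meant to feed `available_if`, so that `predict_proba`, `predict_log_proba` and `decision_function` are only exposed when the wrapped classifier provides them. A minimal sketch of that pattern (the `Wrapper` class is purely illustrative):

    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.utils.metaestimators import available_if

    def _estimator_has(attr):
        # Prefer the fitted estimator when available, the unfitted one otherwise.
        return lambda self: hasattr(getattr(self, "estimator_", self.estimator), attr)

    class Wrapper:
        def __init__(self, estimator):
            self.estimator = estimator

        @available_if(_estimator_has("predict_proba"))
        def predict_proba(self, X):
            return self.estimator.predict_proba(X)

    print(hasattr(Wrapper(LogisticRegression()), "predict_proba"))  # True
    print(hasattr(Wrapper(SVC()), "predict_proba"))  # False unless probability=True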
+ """ + return lambda self: ( + hasattr(self.estimator_, attr) + if hasattr(self, "estimator_") + else hasattr(self.estimator, attr) + ) + + def _fit_and_score( classifier, X, y, sample_weight, train_idx, val_idx, scorer, score_method ): @@ -57,16 +71,10 @@ def _fit_and_score( class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Decision threshold calibration for binary classification. - Estimator that calibrates the decision threshold (cutoff point) that is - used for prediction. The methods for picking cutoff points make use of - traditional binary classification evaluation statistics such as the true - positive and true negative rates or any metrics accepting true labels and - the output of a scoring function from a scikit-learn estimator. - Parameters ---------- - estimator : estimator object - The classifier, fitted or not fitted, from which we want to optimize + estimator : estimator instance + The classifier, fitted or not fitted, for which we want to optimize the decision threshold used during `predict`. objective_metric : {"tpr", "tnr"}, str or callable, \ @@ -104,8 +112,8 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): cv : int, float, cross-validation generator, iterable or "prefit", \ default=None - Determines the cross-validation splitting strategy used in - `cross_val_predict` to train classifier. Possible inputs for cv are: + Determines the cross-validation splitting strategy to train classifier. + Possible inputs for cv are: * None, to use the default 5-fold stratified K-fold cross validation; * An integer number, to specify the number of folds in a stratified @@ -121,9 +129,11 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): cross-validation strategies that can be used here. n_jobs : int, default=None - The number of jobs to run in parallel all `estimators` `fit`. - `None` means 1 unless in a `joblib.parallel_backend` context. -1 means - using all processors. See Glossary for more details. + The number of jobs to run in parallel. When `cv` represents a + cross-validation strategy, the fitting and scoring on each data split + is done in parallel. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. Attributes ---------- @@ -178,6 +188,28 @@ def __init__( } def fit(self, X, y, sample_weight=None, **fit_params): + """Fit the calibrated model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If `None`, then samples are equally weighted. + + **fit_params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ self._validate_params() X, y = indexable(X, y) @@ -284,6 +316,18 @@ def classes_(self): return self.estimator_.classes_ def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + C : ndarray of shape (n_samples,) + The predicted class. 
+ """ check_is_fitted(self, "estimator_") pos_label = self._scorer._get_pos_label() y_score, _ = _get_response_values_binary( @@ -292,5 +336,59 @@ def predict(self, X): y_pred = (y_score >= self.decision_threshold_).astype(int) return self.classes_[y_pred] + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + check_is_fitted(self, "estimator_") + return self.estimator_.predict_proba(X) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict logarithm class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + log_probabilities : ndarray of shape (n_samples, n_classes) + The logarithm class probabilities of the input samples. + """ + check_is_fitted(self, "estimator_") + return self.estimator_.predict_log_proba(X) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Decision function for samples in `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,) + The decision function computed the fitted estimator. 
+ """ + check_is_fitted(self, "estimator_") + return self.estimator_.decision_function(X) + def _more_tags(self): return {"binary_only": True} From 588f1c485fa58a47f036cc9d3e0973ebf3e58e32 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 17 Apr 2023 16:20:12 +0200 Subject: [PATCH 016/194] TST bypass the test for classification --- sklearn/model_selection/_prediction.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 509295c1204e5..14ac4ee4f2a33 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -391,4 +391,9 @@ def decision_function(self, X): return self.estimator_.decision_function(X) def _more_tags(self): - return {"binary_only": True} + return { + "binary_only": True, + "_xfail_checks": { + "check_classifiers_train": "Threshold at probability 0.5 does not hold" + }, + } From 243d1735f6ab9e206a262d49bfbcca9d3b83d72d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 17 Apr 2023 18:03:08 +0200 Subject: [PATCH 017/194] iter before another bug --- sklearn/model_selection/_prediction.py | 71 +++++---- .../model_selection/tests/test_prediction.py | 145 +++++++++++++++++- 2 files changed, 182 insertions(+), 34 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 14ac4ee4f2a33..714ce27499060 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -58,7 +58,7 @@ def _fit_and_score( else: # prefit estimator, only a validation set is provided X_val, y_val, sw_val = X, y, sample_weight - if score_method == {"tnr", "tpr"}: + if score_method in {"tnr", "tpr"}: fpr, tpr, potential_thresholds = scorer( classifier, X_val, y_val, sample_weight=sw_val ) @@ -77,8 +77,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): The classifier, fitted or not fitted, for which we want to optimize the decision threshold used during `predict`. - objective_metric : {"tpr", "tnr"}, str or callable, \ - default="balanced_accuracy" + objective_metric : {"tpr", "tnr"}, str or callable, default="balanced_accuracy" The objective metric to be optimized. Can be one of: * a string associated to a scoring function (see model evaluation @@ -95,8 +94,14 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): want to find the decision threshold when `objective_metric` is equal to `"tpr"` or `"tnr"`. - response_method : {"auto", "decision_function", "predict_proba"}, \ - default="auto" + pos_label : int, float, bool or str, default=None + The label of the positive class. Used with `objective_metric="tpr"` or + `"tnr"`. When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`, + `pos_label` is set to 1, otherwise an error will be raised. When using a + scorer, `pos_label` can be passed as a keyword argument to + :func:`~sklearn.metrics.make_scorer`. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" Methods by the classifier `base_estimator` corresponding to the decision function for which we want to find a threshold. It can be: @@ -110,8 +115,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): The number of decision threshold to use when discretizing the output of the classifier `method`. 
- cv : int, float, cross-validation generator, iterable or "prefit", \ - default=None + cv : int, float, cross-validation generator, iterable or "prefit", default=None Determines the cross-validation splitting strategy to train classifier. Possible inputs for cv are: @@ -152,12 +156,30 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): underlying estimator exposes such an attribute when fit. """ + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "objective_metric": [ + StrOptions(set(get_scorer_names()) | {"tpr", "tnr"}), + callable, + ], + "objective_value": [Real, None], + "pos_label": [Real, str, "boolean", None], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + "n_thresholds": [Interval(Integral, 1, None, closed="left")], + "cv": ["cv_object", StrOptions({"prefit"})], + "n_jobs": [Integral, None], + } + def __init__( self, estimator, *, objective_metric="balanced_accuracy", objective_value=None, + pos_label=None, response_method="auto", n_thresholds=1_000, cv=None, @@ -166,29 +188,14 @@ def __init__( self.estimator = estimator self.objective_metric = objective_metric self.objective_value = objective_value + self.pos_label = pos_label self.response_method = response_method self.n_thresholds = n_thresholds self.cv = cv self.n_jobs = n_jobs - _parameter_constraints: dict = { - "estimator": [ - HasMethods(["fit", "predict_proba"]), - HasMethods(["fit", "decision_function"]), - ], - "objective_metric": [ - StrOptions(set(get_scorer_names()) | {"tpr", "fpr"}), - callable, - ], - "objective_value": [Real, None], - "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], - "n_thresholds": [Interval(Integral, 1, None, closed="left")], - "cv": ["cv_object", StrOptions({"prefit"})], - "n_jobs": [Integral, None], - } - def fit(self, X, y, sample_weight=None, **fit_params): - """Fit the calibrated model. + """Fit the classifier and post-tune the decision threshold. Parameters ---------- @@ -229,10 +236,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): else: self._response_method = self.response_method - if self.objective_metric in {"tpr", "fpr"}: + if self.objective_metric in {"tpr", "tnr"}: if self.objective_value is None: raise ValueError( - "When `objective_metric` is 'tpr' or 'fpr', `objective_value` must " + "When `objective_metric` is 'tpr' or 'tnr', `objective_value` must " "be provided. Got None instead." 
) objective_value = self.objective_value @@ -255,8 +262,15 @@ def fit(self, X, y, sample_weight=None, **fit_params): classifier = clone(self.estimator) split = cv.split(X, y) - if self.objective_metric in {"tpr", "fpr"}: - self._scorer = make_scorer(roc_curve, needs_threshold=True) + if self.objective_metric in {"tpr", "tnr"}: + if ( + self._response_method == "predict_proba" + or self._response_method[0] == "predict_proba" + ): + params_scorer = {"needs_proba": True, "pos_label": self.pos_label} + else: + params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} + self._scorer = make_scorer(roc_curve, **params_scorer) else: scoring = check_scoring(classifier, scoring=self.objective_metric) # transform a binary metric into a curve metric for all possible decision @@ -283,6 +297,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): for train_idx, val_idx in split ) ) + # print(thresholds, scores) min_threshold = np.min([th.min() for th in thresholds]) max_threshold = np.max([th.max() for th in thresholds]) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index b67b1e8ff3e13..8f68b952830cc 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -1,12 +1,145 @@ -from sklearn.datasets import make_classification +import numpy as np +import pytest + +from sklearn.datasets import load_breast_cancer, make_classification +from sklearn.ensemble import GradientBoostingClassifier from sklearn.linear_model import LogisticRegression +from sklearn.metrics import balanced_accuracy_score, fbeta_score, f1_score, make_scorer +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.model_selection import CutOffClassifier -def test_xxx(): - X, y = make_classification(random_state=0) - clf = CutOffClassifier(LogisticRegression()) - clf.fit(X, y) +def test_cutoffclassifier_no_binary(): + """Check that we raise an informative error message for non-binary problem.""" + X, y = make_classification(n_classes=3, n_clusters_per_class=1) + err_msg = "Only binary classification is supported." + with pytest.raises(ValueError, match=err_msg): + CutOffClassifier(LogisticRegression()).fit(X, y) + + +@pytest.mark.parametrize( + "estimator", + [LogisticRegression(), SVC(), GradientBoostingClassifier(n_estimators=4)], +) +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "decision_function"] +) +def test_cutoffclassifier_estimator_response_methods(estimator, response_method): + """Check that `CutOffClassifier` exposes the same response methods as the + underlying estimator. 
+ """ + X, y = make_classification(n_samples=100, random_state=0) + + model = CutOffClassifier(estimator) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + model.fit(X, y) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + if hasattr(model, response_method): + y_pred_cutoff = getattr(model, response_method)(X) + y_pred_underlying_estimator = getattr(model.estimator_, response_method)(X) + + assert_allclose(y_pred_cutoff, y_pred_underlying_estimator) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_cutoffclassifier_with_objective_value(response_method): + """Check that `CutOffClassifier` is optimizing a given objective metric.""" + X, y = load_breast_cancer(return_X_y=True) + # remove feature to degrade performances + X = X[:, :5] + + # make the problem completely imbalanced such that the balanced accuracy is low + indices_pos = np.flatnonzero(y == 1) + indices_pos = indices_pos[: indices_pos.size // 50] + indices_neg = np.flatnonzero(y == 0) + + X = np.vstack([X[indices_neg], X[indices_pos]]) + y = np.hstack([y[indices_neg], y[indices_pos]]) + + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model = CutOffClassifier( + estimator=lr, + objective_metric="balanced_accuracy", + response_method=response_method, + ) + score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) + score_baseline = balanced_accuracy_score(y, lr.predict(X)) + assert score_optimized > score_baseline + + +def test_cutoffclassifier_limit_tpr_tnr(): + """Check that an objective value of 0 give opposite predictions with objective + metrics `tpr` and `tnr`. + """ + X, y = load_breast_cancer(return_X_y=True) + estimator = make_pipeline(StandardScaler(), LogisticRegression()) + clf = CutOffClassifier( + estimator=estimator, objective_metric="tpr", objective_value=0 + ) + y_pred_tpr = clf.fit(X, y).predict(X) + clf.set_params(objective_metric="tnr") + y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int) + assert np.mean(y_pred_tnr == y_pred_tpr) > 0.98 + + +def test_cutoffclassifier_metric_with_parameter(): + """Check that we can pass a metric with a parameter in addition check that + `f_beta with beta=1` is equivalent to `f1`. + """ + X, y = load_breast_cancer(return_X_y=True) + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model_fbeta = CutOffClassifier( + estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1) + ).fit(X, y) + model_f1 = CutOffClassifier( + estimator=lr, objective_metric=make_scorer(f1_score) + ).fit(X, y) + + assert model_fbeta.decision_threshold_ == pytest.approx( + model_f1.decision_threshold_ + ) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +@pytest.mark.parametrize( + "metric", + [ + "tpr", + "tnr", + make_scorer(balanced_accuracy_score), + make_scorer(f1_score, pos_label="cancer"), + ], +) +def test_cutoffclassifier_with_string_targets(response_method, metric): + """Check that targets represented by str are properly managed. + Also, check with several metrics to be sure that `pos_label` is properly + dispatched. + """ + X, y = load_breast_cancer(return_X_y=True) + # Encode numeric targets by meaningful strings. We purposely designed the class + # names such that the `pos_label` is the first alphabetically sorted class and thus + # encoded as 0. 
+ classes = np.array(["healthy", "cancer"], dtype=object) + y = classes[y] + model = CutOffClassifier( + estimator=make_pipeline(StandardScaler(), LogisticRegression()), + objective_metric=metric, + objective_value=0.5, + pos_label="cancer", + response_method=response_method, + ).fit(X, y) + assert_array_equal(model.classes_, np.sort(classes)) + y_pred = model.predict(X[[0], :]) + assert y_pred.item(0) in classes - assert clf.predict(X) is not None + print(model.decision_threshold_) From 883e929317ad5e382c5b755d56a274b391011da8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 18 Apr 2023 18:26:33 +0200 Subject: [PATCH 018/194] iter --- sklearn/model_selection/_prediction.py | 57 ++++++++++++++++--- .../model_selection/tests/test_prediction.py | 5 +- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 714ce27499060..06feb6b259496 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -38,6 +38,39 @@ def _estimator_has(attr): def _fit_and_score( classifier, X, y, sample_weight, train_idx, val_idx, scorer, score_method ): + """Fit a classifier and compute the scores for different decision thresholds. + + Parameters + ---------- + classifier : estimator instance + The classifier to fit and used for scoring. If `classifier` is already fitted, + it will be used as is. + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The entire dataset. + y : array-like of shape (n_samples,) + The entire target vector. + sample_weight : array-like of shape (n_samples,) + Some optional associated sample weights. + train_idx : ndarray of shape (n_train_samples,) or None + The indices of the training set. If `None`, `classifier` is expected to be + already fitted. + val_idx : ndarray of shape (n_val_samples,) + The indices of the validation set used to score `classifier`. + scorer : scorer instance + The scorer taking `classifier` and the validation set as input and outputting + decision thresholds and scores. + score_method : str or callable + The scoring method to use. Used to detect `tpr` and `tnr` since they are not + an usual scikit-learn scorer and need to be handled differently. + + Returns + ------- + thresholds : ndarray of shape (n_thresholds,) + The decision thresholds used to compute the scores. They are returned in + increasing order. + scores : ndarray of shape (n_thresholds,) + The scores computed for each decision threshold. + """ fit_parameters = signature(classifier.fit).parameters supports_sw = "sample_weight" in fit_parameters @@ -64,7 +97,7 @@ def _fit_and_score( ) if score_method == "tnr": return potential_thresholds[::-1], (1 - fpr)[::-1] - return potential_thresholds, tpr + return potential_thresholds[::-1], tpr[::-1] return scorer(classifier, X_val, y_val, sample_weight=sw_val) @@ -144,6 +177,9 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): decision_threshold_ : float The new decision threshold. + objective_score_ : float + The score of the objective metric associated with the decision threshold found. + classes_ : ndarray of shape (n_classes,) The class labels. 
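The attributes documented above are easiest to read off a small end-to-end run. A minimal usage sketch, assuming the API as it stands at this point in the series (with the defaults: balanced accuracy maximized over an internal 5-fold cross-validation):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import CutOffClassifier

    X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
    model = CutOffClassifier(LogisticRegression()).fit(X, y)
    print(model.decision_threshold_)  # tuned cut-off used by `predict`
    print(model.objective_score_)     # mean balanced accuracy at that cut-off
    print(model.classes_)             # [0 1]

Internally, each cross-validation split contributes one (threshold, score) curve; the hunk below reworks how those per-split curves are combined into a single decision threshold.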
@@ -297,14 +333,13 @@ def fit(self, X, y, sample_weight=None, **fit_params): for train_idx, val_idx in split ) ) - # print(thresholds, scores) min_threshold = np.min([th.min() for th in thresholds]) max_threshold = np.max([th.max() for th in thresholds]) - ascending = thresholds[0].argmin() == 0 - start = min_threshold if ascending else max_threshold - stop = max_threshold if ascending else min_threshold - thresholds_interpolated = np.linspace(start, stop, num=self.n_thresholds) + thresholds_interpolated = np.linspace( + min_threshold, max_threshold, num=self.n_thresholds + ) + mean_score = np.mean( [ np.interp(thresholds_interpolated, th, sc) @@ -312,10 +347,18 @@ def fit(self, X, y, sample_weight=None, **fit_params): ], axis=0, ) + mean_score_argsort = np.argsort(mean_score) + mean_score, thresholds_interpolated = ( + mean_score[mean_score_argsort], + thresholds_interpolated[mean_score_argsort], + ) + if objective_value == "highest": - best_idx = mean_score.argmax() + best_idx = mean_score.size - 1 else: best_idx = np.searchsorted(mean_score, objective_value) + + self.objective_score_ = mean_score[best_idx] self.decision_threshold_ = thresholds_interpolated[best_idx] if hasattr(self.estimator_, "n_features_in_"): diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 8f68b952830cc..dc383659aba5d 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -134,12 +134,13 @@ def test_cutoffclassifier_with_string_targets(response_method, metric): model = CutOffClassifier( estimator=make_pipeline(StandardScaler(), LogisticRegression()), objective_metric=metric, - objective_value=0.5, + objective_value=0.9, pos_label="cancer", response_method=response_method, + n_thresholds=10, ).fit(X, y) assert_array_equal(model.classes_, np.sort(classes)) y_pred = model.predict(X[[0], :]) assert y_pred.item(0) in classes - print(model.decision_threshold_) + # print(model.decision_threshold_, model.objective_score_) From 69333edcc57874bc36cd1d8971a743e76faf1497 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Apr 2023 15:04:36 +0200 Subject: [PATCH 019/194] TST add test for _fit_and_score --- sklearn/model_selection/_prediction.py | 26 ++- .../model_selection/tests/test_prediction.py | 185 +++++++++++++++++- 2 files changed, 199 insertions(+), 12 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 06feb6b259496..c479e6190aa4b 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -55,7 +55,8 @@ def _fit_and_score( The indices of the training set. If `None`, `classifier` is expected to be already fitted. val_idx : ndarray of shape (n_val_samples,) - The indices of the validation set used to score `classifier`. + The indices of the validation set used to score `classifier`. If `train_idx`, + the entire set will be used. scorer : scorer instance The scorer taking `classifier` and the validation set as input and outputting decision thresholds and scores. @@ -67,7 +68,7 @@ def _fit_and_score( ------- thresholds : ndarray of shape (n_thresholds,) The decision thresholds used to compute the scores. They are returned in - increasing order. + ascending order. scores : ndarray of shape (n_thresholds,) The scores computed for each decision threshold. 
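Each call to `_fit_and_score` therefore hands back one (thresholds, scores) curve per split, with thresholds in ascending order; `fit` then interpolates every curve onto a shared grid before averaging. A small NumPy sketch of that merging step, with toy numbers rather than the actual code:

    import numpy as np

    # two splits, each with its own threshold grid
    th1, sc1 = np.array([0.1, 0.5, 0.9]), np.array([0.2, 0.8, 0.4])
    th2, sc2 = np.array([0.2, 0.6, 0.8]), np.array([0.3, 0.7, 0.5])

    grid = np.linspace(min(th1.min(), th2.min()), max(th1.max(), th2.max()), num=5)
    mean_score = np.mean([np.interp(grid, th1, sc1), np.interp(grid, th2, sc2)], axis=0)
    print(grid)        # shared grid spanning all observed thresholds
    print(mean_score)  # per-threshold score averaged over splits

`np.interp` requires ascending sample points, which is why the ascending-order guarantee in the docstring above matters.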
""" @@ -90,6 +91,7 @@ def _fit_and_score( classifier.fit(X_train, y_train) else: # prefit estimator, only a validation set is provided X_val, y_val, sw_val = X, y, sample_weight + check_is_fitted(classifier, "classes_") if score_method in {"tnr", "tpr"}: fpr, tpr, potential_thresholds = scorer( @@ -334,6 +336,8 @@ def fit(self, X, y, sample_weight=None, **fit_params): ) ) + # thresholds are sorted in ascending order which is necessary for the + # interpolation of the score below min_threshold = np.min([th.min() for th in thresholds]) max_threshold = np.max([th.max() for th in thresholds]) thresholds_interpolated = np.linspace( @@ -347,15 +351,17 @@ def fit(self, X, y, sample_weight=None, **fit_params): ], axis=0, ) - mean_score_argsort = np.argsort(mean_score) - mean_score, thresholds_interpolated = ( - mean_score[mean_score_argsort], - thresholds_interpolated[mean_score_argsort], - ) - if objective_value == "highest": - best_idx = mean_score.size - 1 - else: + if objective_value == "highest": # find best score + # we don't need to sort the scores and directly take the maximum + best_idx = mean_score.argmax() + else: # seeking for a specific objective value + # we need to sort the scores before applying `np.searchsorted` + mean_score_argsort = np.argsort(mean_score) + mean_score, thresholds_interpolated = ( + mean_score[mean_score_argsort], + thresholds_interpolated[mean_score_argsort], + ) best_idx = np.searchsorted(mean_score, objective_value) self.objective_score_ = mean_score[best_idx] diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index dc383659aba5d..37aec59f03621 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -1,16 +1,197 @@ import numpy as np import pytest -from sklearn.datasets import load_breast_cancer, make_classification +from sklearn.datasets import load_breast_cancer, load_iris, make_classification from sklearn.ensemble import GradientBoostingClassifier +from sklearn.exceptions import NotFittedError from sklearn.linear_model import LogisticRegression -from sklearn.metrics import balanced_accuracy_score, fbeta_score, f1_score, make_scorer +from sklearn.metrics import ( + balanced_accuracy_score, + fbeta_score, + f1_score, + make_scorer, + roc_curve, +) +from sklearn.metrics._scorer import _ContinuousScorer from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.model_selection import CutOffClassifier +from sklearn.model_selection._prediction import _fit_and_score + + +@pytest.mark.parametrize( + "scorer, score_method", + [ + ( + _ContinuousScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={}, + ), + "balanced_accuracy", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "tpr", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "tnr", + ), + ], +) +def test_fit_and_score_scorers(scorer, score_method): + """Check that `_fit_and_score` returns thresholds in ascending order for the + different accepted scorers.""" + X, y = make_classification(n_samples=100, random_state=0) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + classifier = LogisticRegression() + + thresholds, scores = _fit_and_score( + classifier, + X, + y, + sample_weight=None, + train_idx=train_idx, + val_idx=val_idx, + 
scorer=scorer, + score_method=score_method, + ) + + assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) + assert np.logical_and(scores >= 0, scores <= 1).all() + + +@pytest.mark.parametrize( + "scorer, score_method, expected_score", + [ + ( + _ContinuousScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={}, + ), + "balanced_accuracy", + [0.5, 1.0], + ), + ( + make_scorer(roc_curve, needs_proba=True), + "tpr", + [1.0, 1.0, 0.0], + ), + ( + make_scorer(roc_curve, needs_proba=True), + "tnr", + [0.0, 1.0, 1.0], + ), + ], +) +def test_fit_and_score_prefit(scorer, score_method, expected_score): + """Check the behaviour with a prefit classifier.""" + X, y = make_classification(n_samples=100, random_state=0) + + # `train_idx is None` to indicate that the classifier is prefit + train_idx, val_idx = None, np.arange(50, 100) + classifier = DecisionTreeClassifier(random_state=0) + + with pytest.raises(NotFittedError): + _fit_and_score( + classifier, + X, + y, + sample_weight=None, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) + + classifier.fit(X, y) + # make sure that the classifier memorized the full dataset such that + # we get perfect predictions and thus match the expected score + assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) + + thresholds, scores = _fit_and_score( + classifier, + X, + y, + sample_weight=None, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) + assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) + assert_allclose(scores, expected_score) + + +@pytest.mark.parametrize( + "scorer, score_method", + [ + ( + _ContinuousScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={}, + ), + "balanced_accuracy", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "tpr", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "tnr", + ), + ], +) +def test_fit_and_score_sample_weight(scorer, score_method): + """Check that we dispatch the sample-weight to fit and score the classifier.""" + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] # only 2 classes + + # create a dataset and repeat twice the sample of class #0 + X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) + # create a sample weight vector that is equivalent to the repeated dataset + sample_weight = np.ones_like(y_repeated) + sample_weight[:50] *= 2 + + classifier = LogisticRegression() + train_repeated_idx = np.arange(X_repeated.shape[0]) + val_repeated_idx = np.arange(X_repeated.shape[0]) + thresholds_repeated, scores_repeated = _fit_and_score( + classifier, + X_repeated, + y_repeated, + sample_weight=None, + train_idx=train_repeated_idx, + val_idx=val_repeated_idx, + scorer=scorer, + score_method=score_method, + ) + + train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) + thresholds, scores = _fit_and_score( + classifier, + X, + y, + sample_weight=sample_weight, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) + + assert_allclose(thresholds_repeated, thresholds) + assert_allclose(scores_repeated, scores) def test_cutoffclassifier_no_binary(): From 8616da1ee3a14ebaa92c6822c28b900042461c9e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Apr 2023 16:11:26 +0200 Subject: [PATCH 020/194] iter --- sklearn/model_selection/_prediction.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff 
--git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index c479e6190aa4b..37df0c3756ff6 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -45,21 +45,28 @@ def _fit_and_score( classifier : estimator instance The classifier to fit and used for scoring. If `classifier` is already fitted, it will be used as is. + X : {array-like, sparse matrix} of shape (n_samples, n_features) The entire dataset. + y : array-like of shape (n_samples,) The entire target vector. + sample_weight : array-like of shape (n_samples,) Some optional associated sample weights. + train_idx : ndarray of shape (n_train_samples,) or None The indices of the training set. If `None`, `classifier` is expected to be already fitted. + val_idx : ndarray of shape (n_val_samples,) The indices of the validation set used to score `classifier`. If `train_idx`, the entire set will be used. + scorer : scorer instance The scorer taking `classifier` and the validation set as input and outputting decision thresholds and scores. + score_method : str or callable The scoring method to use. Used to detect `tpr` and `tnr` since they are not an usual scikit-learn scorer and need to be handled differently. @@ -69,6 +76,7 @@ def _fit_and_score( thresholds : ndarray of shape (n_thresholds,) The decision thresholds used to compute the scores. They are returned in ascending order. + scores : ndarray of shape (n_thresholds,) The scores computed for each decision threshold. """ From 99a10b3ff67b61f39d04616dfcbf713dfc875103 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Apr 2023 17:14:07 +0200 Subject: [PATCH 021/194] integrate refit --- sklearn/model_selection/_prediction.py | 58 ++++++++++++++++--- .../model_selection/tests/test_prediction.py | 23 +++++++- 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 37df0c3756ff6..b34fd0a372718 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -7,7 +7,7 @@ from ..metrics import check_scoring, get_scorer_names, make_scorer, roc_curve from ..metrics._scorer import _ContinuousScorer from ..utils import _safe_indexing -from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._response import _get_response_values_binary from ..utils.metaestimators import available_if from ..utils.multiclass import type_of_target @@ -19,7 +19,7 @@ indexable, ) -from ._split import check_cv +from ._split import check_cv, StratifiedShuffleSplit def _estimator_has(attr): @@ -175,6 +175,15 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. + refit : "auto" or bool, default="auto" + Whether or not to refit the classifier on the entire training set once + the decision threshold has been found. By default, `refit="auto"` is + equivalent to `refit=False` when `cv` is a float number using a single + shuffle split or `cv="prefit"` otherwise `refit=True` in all other + cases. Note that forcing `refit=False` on cross-validation having more + than a single split will raise an error. Similarly, `refit=True` in + conjunction with `cv="prefit"` will raise an error. + n_jobs : int, default=None The number of jobs to run in parallel. 
When `cv` represents a cross-validation strategy, the fitting and scoring on each data split @@ -182,6 +191,10 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. + random_state : int, RandomState instance or None, default=None + Controls the randomness of cross-validation when `cv` is a float. + See :term:`Glossary `. + Attributes ---------- decision_threshold_ : float @@ -215,8 +228,14 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): "pos_label": [Real, str, "boolean", None], "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], "n_thresholds": [Interval(Integral, 1, None, closed="left")], - "cv": ["cv_object", StrOptions({"prefit"})], + "cv": [ + "cv_object", + StrOptions({"prefit"}), + Interval(RealNotInt, 0.0, 1.0, closed="right"), + ], + "refit": ["boolean", StrOptions({"auto"})], "n_jobs": [Integral, None], + "random_state": ["random_state"], } def __init__( @@ -229,7 +248,9 @@ def __init__( response_method="auto", n_thresholds=1_000, cv=None, + refit="auto", n_jobs=None, + random_state=None, ): self.estimator = estimator self.objective_metric = objective_metric @@ -238,7 +259,9 @@ def __init__( self.response_method = response_method self.n_thresholds = n_thresholds self.cv = cv + self.refit = refit self.n_jobs = n_jobs + self.random_state = random_state def fit(self, X, y, sample_weight=None, **fit_params): """Fit the classifier and post-tune the decision threshold. @@ -275,7 +298,20 @@ def fit(self, X, y, sample_weight=None, **fit_params): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) - cv = self.cv if self.cv == "prefit" else check_cv(self.cv, y, classifier=True) + if isinstance(self.cv, Real) and 0 < self.cv <= 1: + cv = StratifiedShuffleSplit( + n_splits=1, test_size=self.cv, random_state=self.random_state + ) + refit = False if self.refit == "auto" else self.refit + elif self.cv == "prefit": + if self.refit is True: + raise ValueError("When cv='prefit', refit cannot be True.") + cv, refit = self.cv, False + else: + cv = check_cv(self.cv, y=y, classifier=True) + if self.refit is False: + raise ValueError("When cv has several folds, refit cannot be False.") + refit = True if self.response_method == "auto": self._response_method = ["predict_proba", "decision_function"] @@ -294,12 +330,16 @@ def fit(self, X, y, sample_weight=None, **fit_params): fit_parameters = signature(self.estimator.fit).parameters supports_sw = "sample_weight" in fit_parameters - if sample_weight is not None and supports_sw: - self.estimator_ = clone(self.estimator).fit( - X, y, sample_weight, **fit_params - ) + + if refit: + if sample_weight is not None and supports_sw: + self.estimator_ = clone(self.estimator).fit( + X, y, sample_weight, **fit_params + ) + else: + self.estimator_ = clone(self.estimator).fit(X, y, **fit_params) else: - self.estimator_ = clone(self.estimator).fit(X, y, **fit_params) + self.estimator_ = self.estimator if cv == "prefit": classifier = self.estimator diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 37aec59f03621..1653c98f5e5f7 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -202,6 +202,25 @@ def test_cutoffclassifier_no_binary(): CutOffClassifier(LogisticRegression()).fit(X, y) +@pytest.mark.parametrize( + 
"params, err_msg", + [ + ({"cv": "prefit", "refit": True}, "When cv='prefit', refit cannot be True."), + ( + {"cv": 10, "refit": False}, + "When cv has several folds, refit cannot be False.", + ), + ], +) +def test_cutoffclassifier_conflit_cv_refit(params, err_msg): + """Check that we raise an informative error message when `cv` and `refit` + cannot be used together. + """ + X, y = make_classification(n_samples=100, random_state=0) + with pytest.raises(ValueError, match=err_msg): + CutOffClassifier(LogisticRegression(), **params).fit(X, y) + + @pytest.mark.parametrize( "estimator", [LogisticRegression(), SVC(), GradientBoostingClassifier(n_estimators=4)], @@ -318,10 +337,8 @@ def test_cutoffclassifier_with_string_targets(response_method, metric): objective_value=0.9, pos_label="cancer", response_method=response_method, - n_thresholds=10, + n_thresholds=100, ).fit(X, y) assert_array_equal(model.classes_, np.sort(classes)) y_pred = model.predict(X[[0], :]) assert y_pred.item(0) in classes - - # print(model.decision_threshold_, model.objective_score_) From 0f6dce27bec7aff722e7e27cbad6de84fd9143cc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Apr 2023 19:22:02 +0200 Subject: [PATCH 022/194] TST more test --- sklearn/model_selection/_prediction.py | 55 +++++++++++++------ .../model_selection/tests/test_prediction.py | 45 +++++++++++++-- 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index b34fd0a372718..2512f4f92d6af 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -4,6 +4,7 @@ import numpy as np from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, clone +from ..exceptions import NotFittedError from ..metrics import check_scoring, get_scorer_names, make_scorer, roc_curve from ..metrics._scorer import _ContinuousScorer from ..utils import _safe_indexing @@ -306,12 +307,21 @@ def fit(self, X, y, sample_weight=None, **fit_params): elif self.cv == "prefit": if self.refit is True: raise ValueError("When cv='prefit', refit cannot be True.") + try: + check_is_fitted(self.estimator, "classes_") + except NotFittedError as exc: + raise NotFittedError( + """When cv='prefit', `estimator` must be fitted.""" + ) from exc cv, refit = self.cv, False else: cv = check_cv(self.cv, y=y, classifier=True) - if self.refit is False: + if self.refit is False and cv.get_n_splits() > 1: raise ValueError("When cv has several folds, refit cannot be False.") - refit = True + if self.refit == "auto": + refit = True if cv.get_n_splits() == 1 else False + else: + refit = self.refit if self.response_method == "auto": self._response_method = ["predict_proba", "decision_function"] @@ -331,22 +341,35 @@ def fit(self, X, y, sample_weight=None, **fit_params): fit_parameters = signature(self.estimator.fit).parameters supports_sw = "sample_weight" in fit_parameters - if refit: - if sample_weight is not None and supports_sw: - self.estimator_ = clone(self.estimator).fit( - X, y, sample_weight, **fit_params - ) - else: - self.estimator_ = clone(self.estimator).fit(X, y, **fit_params) - else: - self.estimator_ = self.estimator - + # in the following block, we: + # - define the final classifier `self.estimator_` and train it if necessary + # - define `classifier` to be used to post-tune the decision threshold + # - define `split` to be used to fit/score `classifier` if cv == "prefit": - classifier = self.estimator - split = ([None, range(_num_samples(X))],) 
+ self.estimator_ = self.estimator + classifier = self.estimator_ + splits = ([None, range(_num_samples(X))],) else: + self.estimator_ = clone(self.estimator) classifier = clone(self.estimator) - split = cv.split(X, y) + splits = cv.split(X, y) + + if refit: + # train on the whole dataset + X_train, y_train, sw_train = X, y, sample_weight + else: + # single split cross-validation + train_idx, _ = next(cv.split(X, y)) + X_train = _safe_indexing(X, train_idx) + y_train = _safe_indexing(y, train_idx) + if sample_weight is not None: + sw_train = _safe_indexing(sample_weight, train_idx) + else: + sw_train = None + if sw_train is not None and supports_sw: + self.estimator_.fit(X_train, y_train, sample_weight=sw_train) + else: + self.estimator_.fit(X_train, y_train) if self.objective_metric in {"tpr", "tnr"}: if ( @@ -380,7 +403,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): self._scorer, self.objective_metric, ) - for train_idx, val_idx in split + for train_idx, val_idx in splits ) ) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 1653c98f5e5f7..6fdfc76076903 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -203,21 +203,31 @@ def test_cutoffclassifier_no_binary(): @pytest.mark.parametrize( - "params, err_msg", + "params, err_type, err_msg", [ - ({"cv": "prefit", "refit": True}, "When cv='prefit', refit cannot be True."), + ( + {"cv": "prefit", "refit": True}, + ValueError, + "When cv='prefit', refit cannot be True.", + ), ( {"cv": 10, "refit": False}, + ValueError, "When cv has several folds, refit cannot be False.", ), + ( + {"cv": "prefit", "refit": False}, + NotFittedError, + "`estimator` must be fitted.", + ), ], ) -def test_cutoffclassifier_conflit_cv_refit(params, err_msg): +def test_cutoffclassifier_conflit_cv_refit(params, err_type, err_msg): """Check that we raise an informative error message when `cv` and `refit` cannot be used together. 
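The invalid combinations exercised here have a valid counterpart worth keeping in mind: with an already-fitted estimator, `cv="prefit"` reuses it as-is and only tunes the decision threshold. A minimal sketch, assuming the behaviour implemented at this stage of the series:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import CutOffClassifier

    X, y = make_classification(n_samples=200, random_state=0)
    base = LogisticRegression().fit(X, y)          # fitted once, outside the meta-estimator
    tuned = CutOffClassifier(base, cv="prefit").fit(X, y)
    assert tuned.estimator_ is base                # no clone, no refit: only the threshold is tuned
    print(tuned.decision_threshold_)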
""" X, y = make_classification(n_samples=100, random_state=0) - with pytest.raises(ValueError, match=err_msg): + with pytest.raises(err_type, match=err_msg): CutOffClassifier(LogisticRegression(), **params).fit(X, y) @@ -340,5 +350,28 @@ def test_cutoffclassifier_with_string_targets(response_method, metric): n_thresholds=100, ).fit(X, y) assert_array_equal(model.classes_, np.sort(classes)) - y_pred = model.predict(X[[0], :]) - assert y_pred.item(0) in classes + y_pred = model.predict(X) + assert_array_equal(np.sort(np.unique(y_pred)), np.sort(classes)) + + +def test_cutoffclassifier_refit(): + """Check the behaviour of the `refit` parameter.""" + X, y = make_classification(n_samples=100, random_state=0) + + # check that `estimator_` if fitted on the full dataset when `refit=True` + estimator = LogisticRegression() + model = CutOffClassifier(estimator, refit=True).fit(X, y) + + assert model.estimator_ is not estimator + estimator.fit(X, y) + assert_allclose(model.estimator_.coef_, estimator.coef_) + assert_allclose(model.estimator_.intercept_, estimator.intercept_) + model.predict(X) + + # check that `estimator_` was not altered when `refit=False` + estimator = LogisticRegression().fit(X, y) + coef = estimator.coef_.copy() + model = CutOffClassifier(estimator, cv="prefit", refit=False).fit(X, y) + + assert model.estimator_ is estimator + assert_allclose(model.estimator_.coef_, coef) From d6fb9f799150f04a24913efb181a222a75914880 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Apr 2023 19:35:12 +0200 Subject: [PATCH 023/194] TST more test with sample_weight --- .../model_selection/tests/test_prediction.py | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 6fdfc76076903..fe3530e765ab0 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -354,24 +354,50 @@ def test_cutoffclassifier_with_string_targets(response_method, metric): assert_array_equal(np.sort(np.unique(y_pred)), np.sort(classes)) -def test_cutoffclassifier_refit(): +@pytest.mark.parametrize("with_sample_weight", [True, False]) +def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): """Check the behaviour of the `refit` parameter.""" + rng = np.random.RandomState(global_random_seed) X, y = make_classification(n_samples=100, random_state=0) + if with_sample_weight: + sample_weight = rng.randn(X.shape[0]) + else: + sample_weight = None # check that `estimator_` if fitted on the full dataset when `refit=True` estimator = LogisticRegression() - model = CutOffClassifier(estimator, refit=True).fit(X, y) + model = CutOffClassifier(estimator, refit=True).fit( + X, y, sample_weight=sample_weight + ) assert model.estimator_ is not estimator - estimator.fit(X, y) + estimator.fit(X, y, sample_weight=sample_weight) assert_allclose(model.estimator_.coef_, estimator.coef_) assert_allclose(model.estimator_.intercept_, estimator.intercept_) - model.predict(X) - # check that `estimator_` was not altered when `refit=False` - estimator = LogisticRegression().fit(X, y) + # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` + estimator = LogisticRegression().fit(X, y, sample_weight=sample_weight) coef = estimator.coef_.copy() - model = CutOffClassifier(estimator, cv="prefit", refit=False).fit(X, y) + model = CutOffClassifier(estimator, cv="prefit", refit=False).fit( + X, y, 
sample_weight=sample_weight + ) assert model.estimator_ is estimator assert_allclose(model.estimator_.coef_, coef) + + # check that we train `estimator_` on the training split of a given cross-validation + estimator = LogisticRegression() + cv = [ + (np.arange(50), np.arange(50, 100)), + ] # single split + model = CutOffClassifier(estimator, cv=cv, refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is not estimator + if with_sample_weight: + sw_train = sample_weight[cv[0][0]] + else: + sw_train = None + estimator.fit(X[cv[0][0]], y[cv[0][0]], sample_weight=sw_train) + assert_allclose(model.estimator_.coef_, estimator.coef_) From 7ff3d0d95b24038d372fafc90d81ebaa6669c9db Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Apr 2023 19:47:26 +0200 Subject: [PATCH 024/194] BUG fit_params split --- doc/whats_new/v1.3.rst | 2 +- sklearn/model_selection/_prediction.py | 29 +++++++++++++++---- .../model_selection/tests/test_prediction.py | 11 +++++++ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 458df4a7d3ed4..3c3d159a6eb65 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -355,7 +355,7 @@ Changelog - |MajorFeature| :class:`model_selection.CutOffClassifier` calibrates decision threshold function of a binary classifier by maximizing a classification metric through - cross-validatin. + cross-validation. :pr:`26120` by :user:`Guillaume Lemaitre `. :mod:`sklearn.naive_bayes` diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 2512f4f92d6af..e22f2f1ec11ea 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -14,6 +14,7 @@ from ..utils.multiclass import type_of_target from ..utils.parallel import Parallel, delayed from ..utils.validation import ( + _check_fit_params, _check_sample_weight, _num_samples, check_is_fitted, @@ -37,7 +38,15 @@ def _estimator_has(attr): def _fit_and_score( - classifier, X, y, sample_weight, train_idx, val_idx, scorer, score_method + classifier, + X, + y, + sample_weight, + fit_params, + train_idx, + val_idx, + scorer, + score_method, ): """Fit a classifier and compute the scores for different decision thresholds. @@ -56,6 +65,9 @@ def _fit_and_score( sample_weight : array-like of shape (n_samples,) Some optional associated sample weights. + fit_params : dict + Parameters to pass to the `fit` method of the underlying classifier. + train_idx : ndarray of shape (n_train_samples,) or None The indices of the training set. If `None`, `classifier` is expected to be already fitted. 
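The hunk below threads `fit_params` through `_check_fit_params` so that per-sample keyword arguments passed to `fit` follow the training indices of each split, while scalars and other options pass through unchanged (a follow-up patch in the series adjusts the exact reference array handed to the helper). Conceptually, the slicing amounts to the following rough sketch, where the keyword name is hypothetical:

    import numpy as np

    fit_params = {"per_sample_kwarg": np.arange(10)}  # hypothetical per-sample fit parameter
    train_idx = np.array([0, 2, 4])

    # roughly what `_check_fit_params(X, fit_params, indices=train_idx)` yields for
    # values aligned with the samples
    fit_params_train = {key: np.asarray(value)[train_idx] for key, value in fit_params.items()}
    print(fit_params_train["per_sample_kwarg"])  # [0 2 4]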
@@ -94,10 +106,11 @@ def _fit_and_score( ) else: sw_train, sw_val = None, None + fit_params_train = _check_fit_params(X_train, fit_params, train_idx) if supports_sw: - classifier.fit(X_train, y_train, sample_weight=sw_train) + classifier.fit(X_train, y_train, sample_weight=sw_train, **fit_params_train) else: - classifier.fit(X_train, y_train) + classifier.fit(X_train, y_train, **fit_params_train) else: # prefit estimator, only a validation set is provided X_val, y_val, sw_val = X, y, sample_weight check_is_fitted(classifier, "classes_") @@ -357,6 +370,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): if refit: # train on the whole dataset X_train, y_train, sw_train = X, y, sample_weight + fit_params_train = _check_fit_params(X, fit_params, indices=None) else: # single split cross-validation train_idx, _ = next(cv.split(X, y)) @@ -366,10 +380,14 @@ def fit(self, X, y, sample_weight=None, **fit_params): sw_train = _safe_indexing(sample_weight, train_idx) else: sw_train = None + fit_params_train = _check_fit_params(X, fit_params, indices=train_idx) + if sw_train is not None and supports_sw: - self.estimator_.fit(X_train, y_train, sample_weight=sw_train) + self.estimator_.fit( + X_train, y_train, sample_weight=sw_train, **fit_params_train + ) else: - self.estimator_.fit(X_train, y_train) + self.estimator_.fit(X_train, y_train, **fit_params_train) if self.objective_metric in {"tpr", "tnr"}: if ( @@ -398,6 +416,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): X, y, sample_weight, + fit_params, train_idx, val_idx, self._scorer, diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index fe3530e765ab0..d7a5d7a8cbb8c 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -57,6 +57,7 @@ def test_fit_and_score_scorers(scorer, score_method): X, y, sample_weight=None, + fit_params={}, train_idx=train_idx, val_idx=val_idx, scorer=scorer, @@ -106,6 +107,7 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): X, y, sample_weight=None, + fit_params={}, train_idx=train_idx, val_idx=val_idx, scorer=scorer, @@ -122,6 +124,7 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): X, y, sample_weight=None, + fit_params={}, train_idx=train_idx, val_idx=val_idx, scorer=scorer, @@ -172,6 +175,7 @@ def test_fit_and_score_sample_weight(scorer, score_method): X_repeated, y_repeated, sample_weight=None, + fit_params={}, train_idx=train_repeated_idx, val_idx=val_repeated_idx, scorer=scorer, @@ -184,6 +188,7 @@ def test_fit_and_score_sample_weight(scorer, score_method): X, y, sample_weight=sample_weight, + fit_params={}, train_idx=train_idx, val_idx=val_idx, scorer=scorer, @@ -194,6 +199,9 @@ def test_fit_and_score_sample_weight(scorer, score_method): assert_allclose(scores_repeated, scores) +# TODO: add a test for `fit_params` for the `_fit_and_score` function + + def test_cutoffclassifier_no_binary(): """Check that we raise an informative error message for non-binary problem.""" X, y = make_classification(n_classes=3, n_clusters_per_class=1) @@ -401,3 +409,6 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): sw_train = None estimator.fit(X[cv[0][0]], y[cv[0][0]], sample_weight=sw_train) assert_allclose(model.estimator_.coef_, estimator.coef_) + + +# TODO: add a test to check that `fit_params` is dispatched properly From 6985ae9b2a3f2aa6ee0fd4a2facc25f0c394aa4e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: 
Wed, 19 Apr 2023 22:46:43 +0200 Subject: [PATCH 025/194] TST add test for fit_params --- sklearn/model_selection/_prediction.py | 2 +- .../model_selection/tests/test_prediction.py | 69 ++++++++++++++++++- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index e22f2f1ec11ea..db1152097efe7 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -106,7 +106,7 @@ def _fit_and_score( ) else: sw_train, sw_val = None, None - fit_params_train = _check_fit_params(X_train, fit_params, train_idx) + fit_params_train = _check_fit_params(X, fit_params, indices=train_idx) if supports_sw: classifier.fit(X_train, y_train, sample_weight=sw_train, **fit_params_train) else: diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index d7a5d7a8cbb8c..d8bbc1069a496 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -17,7 +17,12 @@ from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier -from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils._mocking import CheckingClassifier +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) from sklearn.model_selection import CutOffClassifier from sklearn.model_selection._prediction import _fit_and_score @@ -199,7 +204,51 @@ def test_fit_and_score_sample_weight(scorer, score_method): assert_allclose(scores_repeated, scores) -# TODO: add a test for `fit_params` for the `_fit_and_score` function +@pytest.mark.parametrize( + "scorer, score_method", + [ + ( + _ContinuousScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={}, + ), + "balanced_accuracy", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "tpr", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "tnr", + ), + ], +) +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +def test_fit_and_score_fit_params(scorer, score_method, fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"]) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + + _fit_and_score( + classifier, + X, + y, + sample_weight=None, + fit_params=fit_params, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) def test_cutoffclassifier_no_binary(): @@ -411,4 +460,18 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): assert_allclose(model.estimator_.coef_, estimator.coef_) -# TODO: add a test to check that `fit_params` is dispatched properly +@pytest.mark.parametrize("objective_metric", ["tpr", "tnr", "balanced_accuracy"]) +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = 
CheckingClassifier(expected_fit_params=["a", "b"]) + model = CutOffClassifier( + classifier, objective_metric=objective_metric, objective_value=0.5 + ) + model.fit(X, y, **fit_params) From 239793ab3314eb016107f35992a4dc901b64d70c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Apr 2023 10:00:06 +0200 Subject: [PATCH 026/194] TST check underlying response method for TNR/TPR --- sklearn/model_selection/_prediction.py | 2 + .../model_selection/tests/test_prediction.py | 44 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index db1152097efe7..e61f0e1ed420f 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -394,6 +394,8 @@ def fit(self, X, y, sample_weight=None, **fit_params): self._response_method == "predict_proba" or self._response_method[0] == "predict_proba" ): + # `needs_proba=True` will first try to use `predict_proba` and then + # `decision_function` params_scorer = {"needs_proba": True, "pos_label": self.pos_label} else: params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index d8bbc1069a496..b0efdc5e3b265 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -475,3 +475,47 @@ def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): classifier, objective_metric=objective_metric, objective_value=0.5 ) model.fit(X, y, **fit_params) + + +@pytest.mark.parametrize( + "objective_metric, objective_value", [("tpr", 0.5), ("tnr", 0.5)] +) +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_cutoffclassifier_response_method_scorer_tnr_tpr( + objective_metric, objective_value, response_method, global_random_seed +): + """Check that we use the proper scorer and forwarding the requested response method + for `tnr` and `tpr`. + """ + X, y = make_classification(n_samples=100, random_state=global_random_seed) + classifier = LogisticRegression() + + model = CutOffClassifier( + classifier, + objective_metric=objective_metric, + objective_value=objective_value, + response_method=response_method, + ) + model.fit(X, y) + + # Note that optimizing TPR will increase the decision threshold while optimizing + # TNR will decrease it. We therefore use the centered threshold (i.e. 0.5 for + # probabilities and 0.0 for decision function) to check that the decision threshold + # is properly set. + if response_method in ("auto", "predict_proba"): + # "auto" will fall back in priority on `predict_proba` if `estimator` + # supports it. + # we expect the decision threshold to be in [0, 1] + if objective_metric == "tpr": + assert 0.5 < model.decision_threshold_ < 1 + else: # "tnr" + assert 0 < model.decision_threshold_ < 0.5 + else: # "decision_function" + # we expect the decision function to be centered in 0.0 and to be larger than + # -1 and 1. 
+ if objective_metric == "tpr": + assert 0 < model.decision_threshold_ < 20 + else: # "tnr" + assert -20 < model.decision_threshold_ < 0 From 92083edd085709f05ac30a0367879a491a306f04 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Apr 2023 11:40:03 +0200 Subject: [PATCH 027/194] FEA add the possibility to provide a dict --- sklearn/model_selection/_prediction.py | 72 ++++++++++++++++--- .../model_selection/tests/test_prediction.py | 34 +++++++++ 2 files changed, 97 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index e61f0e1ed420f..0e365f6761e46 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -1,3 +1,4 @@ +from collections.abc import MutableMapping from inspect import signature from numbers import Integral, Real @@ -5,7 +6,13 @@ from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, clone from ..exceptions import NotFittedError -from ..metrics import check_scoring, get_scorer_names, make_scorer, roc_curve +from ..metrics import ( + check_scoring, + confusion_matrix, + get_scorer_names, + make_scorer, + roc_curve, +) from ..metrics._scorer import _ContinuousScorer from ..utils import _safe_indexing from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions @@ -16,6 +23,7 @@ from ..utils.validation import ( _check_fit_params, _check_sample_weight, + _check_pos_label_consistency, _num_samples, check_is_fitted, indexable, @@ -115,7 +123,7 @@ def _fit_and_score( X_val, y_val, sw_val = X, y, sample_weight check_is_fitted(classifier, "classes_") - if score_method in {"tnr", "tpr"}: + if isinstance(score_method, str) and score_method in {"tnr", "tpr"}: fpr, tpr, potential_thresholds = scorer( classifier, X_val, y_val, sample_weight=sw_val ) @@ -134,17 +142,20 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): The classifier, fitted or not fitted, for which we want to optimize the decision threshold used during `predict`. - objective_metric : {"tpr", "tnr"}, str or callable, default="balanced_accuracy" + objective_metric : {"tpr", "tnr"}, str, dict or callable, \ + default="balanced_accuracy" The objective metric to be optimized. Can be one of: * a string associated to a scoring function (see model evaluation documentation); - * a scorer callable object / function with the signature - `metric(estimator, X, y)`; + * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`; * `"tpr"`: find the decision threshold for a true positive ratio (TPR) of `objective_value`; * `"tnr"`: find the decision threshold for a true negative ratio (TNR) of `objective_value`. + * a dictionary representing a cost-matrix. The keys of the dictionary + should be: `("tp", "fp", "tn", "fn")`. The values of the dictionary + corresponds to the cost/gain. objective_value : float, default=None The value associated with the `objective_metric` metric for which we @@ -152,8 +163,9 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): `"tpr"` or `"tnr"`. pos_label : int, float, bool or str, default=None - The label of the positive class. Used with `objective_metric="tpr"` or - `"tnr"`. When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`, + The label of the positive class. Used when `objective_metric` is `"tpr"`, + `"tnr"`, or a dictionary representing a cost-matrix. 
+ When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. When using a scorer, `pos_label` can be passed as a keyword argument to :func:`~sklearn.metrics.make_scorer`. @@ -237,6 +249,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): "objective_metric": [ StrOptions(set(get_scorer_names()) | {"tpr", "tnr"}), callable, + MutableMapping, ], "objective_value": [Real, None], "pos_label": [Real, str, "boolean", None], @@ -341,7 +354,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): else: self._response_method = self.response_method - if self.objective_metric in {"tpr", "tnr"}: + if isinstance(self.objective_metric, str) and self.objective_metric in { + "tpr", + "tnr", + }: if self.objective_value is None: raise ValueError( "When `objective_metric` is 'tpr' or 'tnr', `objective_value` must " @@ -389,7 +405,45 @@ def fit(self, X, y, sample_weight=None, **fit_params): else: self.estimator_.fit(X_train, y_train, **fit_params_train) - if self.objective_metric in {"tpr", "tnr"}: + if isinstance(self.objective_metric, MutableMapping): + keys = set(self.objective_metric.keys()) + if not keys == {"tp", "tn", "fp", "fn"}: + raise ValueError( + "Invalid keys in `objective_metric`. Valid keys are " + f"'tp', 'tn', 'fp', and 'fn'. Got {keys} instead." + ) + pos_label = _check_pos_label_consistency(self.pos_label, y) + + def cost_score_func(y_true, y_pred, **kwargs): + tp_cost, tn_cost, fp_cost, fn_cost = ( + kwargs["tp"], + kwargs["tn"], + kwargs["fp"], + kwargs["fn"], + ) + cost_matrix = np.array([[tn_cost, fp_cost], [fn_cost, tp_cost]]) + + sample_weight = kwargs.get("sample_weight", None) + cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + + pos_label, classes = kwargs["pos_label"], np.unique(y_true) + pos_label_idx = np.searchsorted(classes, pos_label) + if pos_label_idx == 0: + # reorder the confusion matrix to be aligned with the cost-matrix + cm = cm[::-1, ::-1] + + return (cost_matrix * cm).sum() + + self._scorer = _ContinuousScorer( + score_func=cost_score_func, + sign=1, + response_method=self._response_method, + kwargs={ + **self.objective_metric, + "pos_label": pos_label, + }, + ) + elif self.objective_metric in {"tpr", "tnr"}: if ( self._response_method == "predict_proba" or self._response_method[0] == "predict_proba" diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index b0efdc5e3b265..26d1ad9b9ee85 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -519,3 +519,37 @@ def test_cutoffclassifier_response_method_scorer_tnr_tpr( assert 0 < model.decision_threshold_ < 20 else: # "tnr" assert -20 < model.decision_threshold_ < 0 + + +def test_cutoffclassifier_custom_objective_metric(global_random_seed): + """Check that we can pass a custom objective metric.""" + X, y = make_classification(n_samples=500, random_state=global_random_seed) + classifier = LogisticRegression() + + # we need to set a small number of thresholds to avoid ties and picking a too low + # threshold. + n_thresholds = 5 + + # affect a high gain to true negative and force the classifier to mainly + # predict the negative class. 
+ cost_matrix = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} + model = CutOffClassifier( + classifier, objective_metric=cost_matrix, n_thresholds=n_thresholds + ) + model.fit(X, y) + + assert model.decision_threshold_ > 0.99 + assert np.mean(model.predict(X) == 0) > 0.95 + + # use the true positive now + cost_matrix = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} + model = CutOffClassifier( + classifier, objective_metric=cost_matrix, n_thresholds=n_thresholds + ) + model.fit(X, y) + + assert model.decision_threshold_ < 0.01 + assert np.mean(model.predict(X) == 1) > 0.95 + + +# TODO: add a test for interaction with pos_label and string labels and the cost_matrix From 55d084499de7e8fc47fec270a387c5a34b22cca2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Apr 2023 11:57:29 +0200 Subject: [PATCH 028/194] TST check string and pos_label interation for cost-matrix --- .../model_selection/tests/test_prediction.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 26d1ad9b9ee85..c0a6b1e37110f 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -385,6 +385,7 @@ def test_cutoffclassifier_metric_with_parameter(): "tnr", make_scorer(balanced_accuracy_score), make_scorer(f1_score, pos_label="cancer"), + {"tp": 1, "tn": 1, "fp": 1, "fn": 1}, ], ) def test_cutoffclassifier_with_string_targets(response_method, metric): @@ -521,7 +522,7 @@ def test_cutoffclassifier_response_method_scorer_tnr_tpr( assert -20 < model.decision_threshold_ < 0 -def test_cutoffclassifier_custom_objective_metric(global_random_seed): +def test_cutoffclassifier_objective_metric_dict(global_random_seed): """Check that we can pass a custom objective metric.""" X, y = make_classification(n_samples=500, random_state=global_random_seed) classifier = LogisticRegression() @@ -539,7 +540,7 @@ def test_cutoffclassifier_custom_objective_metric(global_random_seed): model.fit(X, y) assert model.decision_threshold_ > 0.99 - assert np.mean(model.predict(X) == 0) > 0.95 + assert np.mean(model.predict(X) == 0) > 0.9 # use the true positive now cost_matrix = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} @@ -549,7 +550,21 @@ def test_cutoffclassifier_custom_objective_metric(global_random_seed): model.fit(X, y) assert model.decision_threshold_ < 0.01 - assert np.mean(model.predict(X) == 1) > 0.95 + assert np.mean(model.predict(X) == 1) > 0.9 + + # flipping the `pos_label` to zero should flip as well the decision threshold + pos_label = 0 + model = CutOffClassifier( + classifier, + objective_metric=cost_matrix, + n_thresholds=n_thresholds, + pos_label=pos_label, + ) + model.fit(X, y) + + assert model.decision_threshold_ > 0.99 + assert np.mean(model.predict(X) == 0) > 0.9 -# TODO: add a test for interaction with pos_label and string labels and the cost_matrix +# TODO: add a test to check that we pass sample_weight when computing the confusion +# matrix From 7dfc4a6479a6fff0767ea0f9dd49b9ed4ab50891 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Apr 2023 15:44:28 +0200 Subject: [PATCH 029/194] TST add sample_weight test for cost-matrix --- sklearn/model_selection/_prediction.py | 4 +++ .../model_selection/tests/test_prediction.py | 30 ++++++++++++++++--- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 0e365f6761e46..3a6eb441dd20d 100644 
--- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -25,6 +25,7 @@ _check_sample_weight, _check_pos_label_consistency, _num_samples, + check_consistent_length, check_is_fitted, indexable, ) @@ -101,6 +102,9 @@ def _fit_and_score( scores : ndarray of shape (n_thresholds,) The scores computed for each decision threshold. """ + arrays = (X, y) if sample_weight is None else (X, y, sample_weight) + check_consistent_length(*arrays) + fit_parameters = signature(classifier.fit).parameters supports_sw = "sample_weight" in fit_parameters diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index c0a6b1e37110f..abf44bc1b03c1 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -169,7 +169,7 @@ def test_fit_and_score_sample_weight(scorer, score_method): # create a dataset and repeat twice the sample of class #0 X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) # create a sample weight vector that is equivalent to the repeated dataset - sample_weight = np.ones_like(y_repeated) + sample_weight = np.ones_like(y) sample_weight[:50] *= 2 classifier = LogisticRegression() @@ -279,7 +279,7 @@ def test_cutoffclassifier_no_binary(): ), ], ) -def test_cutoffclassifier_conflit_cv_refit(params, err_type, err_msg): +def test_cutoffclassifier_conflict_cv_refit(params, err_type, err_msg): """Check that we raise an informative error message when `cv` and `refit` cannot be used together. """ @@ -566,5 +566,27 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): assert np.mean(model.predict(X) == 0) > 0.9 -# TODO: add a test to check that we pass sample_weight when computing the confusion -# matrix +def test_cutoffclassifier_sample_weight_cost_matrix(): + """Check that we dispatch the `sample_weight` to the scorer when computing the + confusion matrix.""" + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] # only 2 classes + + # create a dataset and repeat twice the sample of class #0 + X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) + # create a sample weight vector that is equivalent to the repeated dataset + sample_weight = np.ones_like(y) + sample_weight[:50] *= 2 + + # we use a prefit classifier to simplify the test + cv = "prefit" + estimator = LogisticRegression().fit(X, y) + cost_matrix = {"tp": 1, "tn": 1, "fp": 1, "fn": 1} + + model_repeat = CutOffClassifier(estimator, cv=cv, objective_metric=cost_matrix) + model_repeat.fit(X_repeated, y_repeated, sample_weight=None) + + model_sw = CutOffClassifier(estimator, cv=cv, objective_metric=cost_matrix) + model_sw.fit(X, y, sample_weight=sample_weight) + + assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) From 729c9a8d0ae233dd4cea7dff4c7145bc7b413977 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 21 Apr 2023 00:21:28 +0200 Subject: [PATCH 030/194] iter --- sklearn/model_selection/_prediction.py | 128 ++++++++++-------- .../model_selection/tests/test_prediction.py | 94 +++++++------ 2 files changed, 121 insertions(+), 101 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 3a6eb441dd20d..3808f9bdf3be7 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -127,13 +127,14 @@ def _fit_and_score( X_val, y_val, sw_val = X, y, sample_weight check_is_fitted(classifier, 
"classes_") - if isinstance(score_method, str) and score_method in {"tnr", "tpr"}: + if isinstance(score_method, str) and score_method in { + "max_tpr_at_tnr_constraint", + "max_tnr_at_tpr_constraint", + }: fpr, tpr, potential_thresholds = scorer( classifier, X_val, y_val, sample_weight=sw_val ) - if score_method == "tnr": - return potential_thresholds[::-1], (1 - fpr)[::-1] - return potential_thresholds[::-1], tpr[::-1] + return potential_thresholds[::-1], (tpr[::-1], (1 - fpr)[::-1]) return scorer(classifier, X_val, y_val, sample_weight=sw_val) @@ -146,29 +147,30 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): The classifier, fitted or not fitted, for which we want to optimize the decision threshold used during `predict`. - objective_metric : {"tpr", "tnr"}, str, dict or callable, \ - default="balanced_accuracy" + objective_metric : {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"}, \ + str, dict or callable, default="balanced_accuracy" The objective metric to be optimized. Can be one of: * a string associated to a scoring function (see model evaluation documentation); * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`; - * `"tpr"`: find the decision threshold for a true positive ratio (TPR) - of `objective_value`; - * `"tnr"`: find the decision threshold for a true negative ratio (TNR) - of `objective_value`. - * a dictionary representing a cost-matrix. The keys of the dictionary - should be: `("tp", "fp", "tn", "fn")`. The values of the dictionary - corresponds to the cost/gain. - - objective_value : float, default=None + * `"max_tnr_at_tpr_constraint"`: find the decision threshold for a true + positive ratio (TPR) of `constraint_value`; + * `"max_tpr_at_tnr_constraint"`: find the decision threshold for a true + negative ratio (TNR) of `constraint_value`. + * a dictionary to be used as cost-sensitive matrix. The keys of the + dictionary should be: `("tp", "fp", "tn", "fn")`. The values of the + dictionary corresponds costs (negative values) and gains (positive + values). + + constraint_value : float, default=None The value associated with the `objective_metric` metric for which we want to find the decision threshold when `objective_metric` is equal to - `"tpr"` or `"tnr"`. + `"max_tnr_at_tpr_constraint"` or `"max_tpr_at_tnr_constraint"`. pos_label : int, float, bool or str, default=None - The label of the positive class. Used when `objective_metric` is `"tpr"`, - `"tnr"`, or a dictionary representing a cost-matrix. + The label of the positive class. Used when `objective_metric` is + `"max_tnr_at_tpr_constraint"`"`, `"max_tpr_at_tnr_constraint"`, or a dictionary. When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. 
When using a scorer, `pos_label` can be passed as a keyword argument to @@ -251,11 +253,14 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): HasMethods(["fit", "decision_function"]), ], "objective_metric": [ - StrOptions(set(get_scorer_names()) | {"tpr", "tnr"}), + StrOptions( + set(get_scorer_names()) + | {"max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"} + ), callable, MutableMapping, ], - "objective_value": [Real, None], + "constraint_value": [Real, None], "pos_label": [Real, str, "boolean", None], "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], "n_thresholds": [Interval(Integral, 1, None, closed="left")], @@ -274,7 +279,7 @@ def __init__( estimator, *, objective_metric="balanced_accuracy", - objective_value=None, + constraint_value=None, pos_label=None, response_method="auto", n_thresholds=1_000, @@ -285,7 +290,7 @@ def __init__( ): self.estimator = estimator self.objective_metric = objective_metric - self.objective_value = objective_value + self.constraint_value = constraint_value self.pos_label = pos_label self.response_method = response_method self.n_thresholds = n_thresholds @@ -359,17 +364,18 @@ def fit(self, X, y, sample_weight=None, **fit_params): self._response_method = self.response_method if isinstance(self.objective_metric, str) and self.objective_metric in { - "tpr", - "tnr", + "max_tpr_at_tnr_constraint", + "max_tnr_at_tpr_constraint", }: - if self.objective_value is None: + if self.constraint_value is None: raise ValueError( - "When `objective_metric` is 'tpr' or 'tnr', `objective_value` must " - "be provided. Got None instead." + "When `objective_metric` is 'max_tpr_at_tnr_constraint' or " + "'max_tnr_at_tpr_constraint', `constraint_value` must be provided. " + "Got None instead." 
) - objective_value = self.objective_value + constraint_value = self.constraint_value else: - objective_value = "highest" + constraint_value = "highest" fit_parameters = signature(self.estimator.fit).parameters supports_sw = "sample_weight" in fit_parameters @@ -418,14 +424,13 @@ def fit(self, X, y, sample_weight=None, **fit_params): ) pos_label = _check_pos_label_consistency(self.pos_label, y) - def cost_score_func(y_true, y_pred, **kwargs): - tp_cost, tn_cost, fp_cost, fn_cost = ( - kwargs["tp"], - kwargs["tn"], - kwargs["fp"], - kwargs["fn"], + def cost_sensitive_score_func(y_true, y_pred, **kwargs): + costs_and_gain = np.array( + [ + [kwargs["tn"], kwargs["fp"]], + [kwargs["fn"], kwargs["tp"]], + ] ) - cost_matrix = np.array([[tn_cost, fp_cost], [fn_cost, tp_cost]]) sample_weight = kwargs.get("sample_weight", None) cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) @@ -436,10 +441,10 @@ def cost_score_func(y_true, y_pred, **kwargs): # reorder the confusion matrix to be aligned with the cost-matrix cm = cm[::-1, ::-1] - return (cost_matrix * cm).sum() + return (costs_and_gain * cm).sum() self._scorer = _ContinuousScorer( - score_func=cost_score_func, + score_func=cost_sensitive_score_func, sign=1, response_method=self._response_method, kwargs={ @@ -447,7 +452,10 @@ def cost_score_func(y_true, y_pred, **kwargs): "pos_label": pos_label, }, ) - elif self.objective_metric in {"tpr", "tnr"}: + elif self.objective_metric in { + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", + }: if ( self._response_method == "predict_proba" or self._response_method[0] == "predict_proba" @@ -494,28 +502,34 @@ def cost_score_func(y_true, y_pred, **kwargs): min_threshold, max_threshold, num=self.n_thresholds ) - mean_score = np.mean( - [ - np.interp(thresholds_interpolated, th, sc) - for th, sc in zip(thresholds, scores) - ], - axis=0, - ) + def _mean_interpolated_score(thresholds, scores): + return np.mean( + [ + np.interp(thresholds_interpolated, th, sc) + for th, sc in zip(thresholds, scores) + ], + axis=0, + ) - if objective_value == "highest": # find best score + if constraint_value == "highest": # find best score # we don't need to sort the scores and directly take the maximum + mean_score = _mean_interpolated_score(thresholds, scores) best_idx = mean_score.argmax() - else: # seeking for a specific objective value - # we need to sort the scores before applying `np.searchsorted` - mean_score_argsort = np.argsort(mean_score) - mean_score, thresholds_interpolated = ( - mean_score[mean_score_argsort], - thresholds_interpolated[mean_score_argsort], - ) - best_idx = np.searchsorted(mean_score, objective_value) + self.objective_score_ = mean_score[best_idx] + self.decision_threshold_ = thresholds_interpolated[best_idx] + else: + tpr, tnr = zip(*scores) + mean_tpr = _mean_interpolated_score(thresholds, tpr) + mean_tnr = _mean_interpolated_score(thresholds, tnr) - self.objective_score_ = mean_score[best_idx] - self.decision_threshold_ = thresholds_interpolated[best_idx] + if self.objective_metric == "max_tpr_at_tnr_constraint": + mask = mean_tnr >= constraint_value + best_idx = mean_tpr[mask].argmax() + else: + mask = mean_tpr >= constraint_value + best_idx = mean_tnr[mask].argmax() + self.objective_score_ = (mean_tpr[mask][best_idx], mean_tnr[mask][best_idx]) + self.decision_threshold_ = thresholds_interpolated[mask][best_idx] if hasattr(self.estimator_, "n_features_in_"): self.n_features_in_ = self.estimator_.n_features_in_ diff --git a/sklearn/model_selection/tests/test_prediction.py 
b/sklearn/model_selection/tests/test_prediction.py index abf44bc1b03c1..9cdc2c184c59c 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -42,11 +42,11 @@ ), ( make_scorer(roc_curve, needs_proba=True), - "tpr", + "max_tnr_at_tpr_constraint", ), ( make_scorer(roc_curve, needs_proba=True), - "tnr", + "max_tpr_at_tnr_constraint", ), ], ) @@ -88,12 +88,12 @@ def test_fit_and_score_scorers(scorer, score_method): ), ( make_scorer(roc_curve, needs_proba=True), - "tpr", + "max_tnr_at_tpr_constraint", [1.0, 1.0, 0.0], ), ( make_scorer(roc_curve, needs_proba=True), - "tnr", + "max_tpr_at_tnr_constraint", [0.0, 1.0, 1.0], ), ], @@ -153,11 +153,11 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): ), ( make_scorer(roc_curve, needs_proba=True), - "tpr", + "max_tnr_at_tpr_constraint", ), ( make_scorer(roc_curve, needs_proba=True), - "tnr", + "max_tpr_at_tnr_constraint", ), ], ) @@ -218,11 +218,11 @@ def test_fit_and_score_sample_weight(scorer, score_method): ), ( make_scorer(roc_curve, needs_proba=True), - "tpr", + "max_tnr_at_tpr_constraint", ), ( make_scorer(roc_curve, needs_proba=True), - "tnr", + "max_tpr_at_tnr_constraint", ), ], ) @@ -317,7 +317,7 @@ def test_cutoffclassifier_estimator_response_methods(estimator, response_method) @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) -def test_cutoffclassifier_with_objective_value(response_method): +def test_cutoffclassifier_with_constraint_value(response_method): """Check that `CutOffClassifier` is optimizing a given objective metric.""" X, y = load_breast_cancer(return_X_y=True) # remove feature to degrade performances @@ -342,19 +342,21 @@ def test_cutoffclassifier_with_objective_value(response_method): assert score_optimized > score_baseline -def test_cutoffclassifier_limit_tpr_tnr(): - """Check that an objective value of 0 give opposite predictions with objective - metrics `tpr` and `tnr`. - """ - X, y = load_breast_cancer(return_X_y=True) - estimator = make_pipeline(StandardScaler(), LogisticRegression()) - clf = CutOffClassifier( - estimator=estimator, objective_metric="tpr", objective_value=0 - ) - y_pred_tpr = clf.fit(X, y).predict(X) - clf.set_params(objective_metric="tnr") - y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int) - assert np.mean(y_pred_tnr == y_pred_tpr) > 0.98 +# def test_cutoffclassifier_limit_tpr_tnr(): +# """Check that an objective value of 0 give opposite predictions with objective +# metrics `max_tnr_at_tpr_constraint` and `max_tpr_at_tnr_constraint`. 
+# """ +# X, y = load_breast_cancer(return_X_y=True) +# estimator = make_pipeline(StandardScaler(), LogisticRegression()) +# clf = CutOffClassifier( +# estimator=estimator, +# objective_metric="max_tnr_at_tpr_constraint", +# constraint_value=0, +# ) +# y_pred_tpr = clf.fit(X, y).predict(X) +# clf.set_params(objective_metric="max_tpr_at_tnr_constraint") +# y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int) +# assert np.mean(y_pred_tnr == y_pred_tpr) > 0.98 def test_cutoffclassifier_metric_with_parameter(): @@ -381,8 +383,8 @@ def test_cutoffclassifier_metric_with_parameter(): @pytest.mark.parametrize( "metric", [ - "tpr", - "tnr", + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", make_scorer(balanced_accuracy_score), make_scorer(f1_score, pos_label="cancer"), {"tp": 1, "tn": 1, "fp": 1, "fn": 1}, @@ -402,7 +404,7 @@ def test_cutoffclassifier_with_string_targets(response_method, metric): model = CutOffClassifier( estimator=make_pipeline(StandardScaler(), LogisticRegression()), objective_metric=metric, - objective_value=0.9, + constraint_value=0.9, pos_label="cancer", response_method=response_method, n_thresholds=100, @@ -461,7 +463,10 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): assert_allclose(model.estimator_.coef_, estimator.coef_) -@pytest.mark.parametrize("objective_metric", ["tpr", "tnr", "balanced_accuracy"]) +@pytest.mark.parametrize( + "objective_metric", + ["max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint", "balanced_accuracy"], +) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): """Check that we pass `fit_params` to the classifier when calling `fit`.""" @@ -473,22 +478,23 @@ def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): classifier = CheckingClassifier(expected_fit_params=["a", "b"]) model = CutOffClassifier( - classifier, objective_metric=objective_metric, objective_value=0.5 + classifier, objective_metric=objective_metric, constraint_value=0.5 ) model.fit(X, y, **fit_params) @pytest.mark.parametrize( - "objective_metric, objective_value", [("tpr", 0.5), ("tnr", 0.5)] + "objective_metric, constraint_value", + [("max_tnr_at_tpr_constraint", 0.5), ("max_tpr_at_tnr_constraint", 0.5)], ) @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) def test_cutoffclassifier_response_method_scorer_tnr_tpr( - objective_metric, objective_value, response_method, global_random_seed + objective_metric, constraint_value, response_method, global_random_seed ): """Check that we use the proper scorer and forwarding the requested response method - for `tnr` and `tpr`. + for `max_tpr_at_tnr_constraint` and `max_tnr_at_tpr_constraint`. """ X, y = make_classification(n_samples=100, random_state=global_random_seed) classifier = LogisticRegression() @@ -496,7 +502,7 @@ def test_cutoffclassifier_response_method_scorer_tnr_tpr( model = CutOffClassifier( classifier, objective_metric=objective_metric, - objective_value=objective_value, + constraint_value=constraint_value, response_method=response_method, ) model.fit(X, y) @@ -509,16 +515,16 @@ def test_cutoffclassifier_response_method_scorer_tnr_tpr( # "auto" will fall back in priority on `predict_proba` if `estimator` # supports it. 
# we expect the decision threshold to be in [0, 1] - if objective_metric == "tpr": + if objective_metric == "max_tnr_at_tpr_constraint": assert 0.5 < model.decision_threshold_ < 1 - else: # "tnr" + else: # "max_tpr_at_tnr_constraint" assert 0 < model.decision_threshold_ < 0.5 else: # "decision_function" # we expect the decision function to be centered in 0.0 and to be larger than # -1 and 1. - if objective_metric == "tpr": + if objective_metric == "max_tnr_at_tpr_constraint": assert 0 < model.decision_threshold_ < 20 - else: # "tnr" + else: # "max_tpr_at_tnr_constraint" assert -20 < model.decision_threshold_ < 0 @@ -533,9 +539,9 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): # affect a high gain to true negative and force the classifier to mainly # predict the negative class. - cost_matrix = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} + costs_and_again = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} model = CutOffClassifier( - classifier, objective_metric=cost_matrix, n_thresholds=n_thresholds + classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds ) model.fit(X, y) @@ -543,9 +549,9 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): assert np.mean(model.predict(X) == 0) > 0.9 # use the true positive now - cost_matrix = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} + costs_and_again = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} model = CutOffClassifier( - classifier, objective_metric=cost_matrix, n_thresholds=n_thresholds + classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds ) model.fit(X, y) @@ -556,7 +562,7 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): pos_label = 0 model = CutOffClassifier( classifier, - objective_metric=cost_matrix, + objective_metric=costs_and_again, n_thresholds=n_thresholds, pos_label=pos_label, ) @@ -566,7 +572,7 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): assert np.mean(model.predict(X) == 0) > 0.9 -def test_cutoffclassifier_sample_weight_cost_matrix(): +def test_cutoffclassifier_sample_weight_costs_and_again(): """Check that we dispatch the `sample_weight` to the scorer when computing the confusion matrix.""" X, y = load_iris(return_X_y=True) @@ -581,12 +587,12 @@ def test_cutoffclassifier_sample_weight_cost_matrix(): # we use a prefit classifier to simplify the test cv = "prefit" estimator = LogisticRegression().fit(X, y) - cost_matrix = {"tp": 1, "tn": 1, "fp": 1, "fn": 1} + costs_and_again = {"tp": 1, "tn": 1, "fp": 1, "fn": 1} - model_repeat = CutOffClassifier(estimator, cv=cv, objective_metric=cost_matrix) + model_repeat = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) model_repeat.fit(X_repeated, y_repeated, sample_weight=None) - model_sw = CutOffClassifier(estimator, cv=cv, objective_metric=cost_matrix) + model_sw = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) model_sw.fit(X, y, sample_weight=sample_weight) assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) From 146b1702d0009f6977459320db50e9b0f881479c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 24 Apr 2023 11:39:49 +0200 Subject: [PATCH 031/194] change strategy for finding max --- sklearn/model_selection/_prediction.py | 43 ++++++++++++---- .../model_selection/tests/test_prediction.py | 50 +++++++++++-------- 2 files changed, 64 insertions(+), 29 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 3808f9bdf3be7..cd343973374f0 100644 --- 
a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -99,8 +99,11 @@ def _fit_and_score( The decision thresholds used to compute the scores. They are returned in ascending order. - scores : ndarray of shape (n_thresholds,) - The scores computed for each decision threshold. + scores : ndarray of shape (n_thresholds,) or tuple os such arrays + The scores computed for each decision threshold. When `score_method` is + `"max_tpr_at_tnr_constraint"` or `"max_tnr_at_tpr_constraint"`, `scores` is a + tuple of two arrays, the first one containing the true positive rates and the + second one containing the true negative rates. """ arrays = (X, y) if sample_weight is None else (X, y, sample_weight) check_consistent_length(*arrays) @@ -456,12 +459,18 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): "max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint", }: - if ( - self._response_method == "predict_proba" - or self._response_method[0] == "predict_proba" + if self._response_method == "predict_proba": + params_scorer = {"needs_proba": True, "pos_label": self.pos_label} + elif ( + isinstance(self._response_method, list) + and self._response_method[0] == "predict_proba" + and hasattr(classifier, "predict_proba") ): - # `needs_proba=True` will first try to use `predict_proba` and then - # `decision_function` + # TODO: this is due to a limitation in `make_scorer`: ideally, we should + # be able to pass a list of response methods to `make_scorer` and give + # priority to `predict_proba` other `decision_function`. + # Here, we manually check if the classifier provide `predict_proba` to + # use `needs_proba` instead and ensure that no error will be raised. params_scorer = {"needs_proba": True, "pos_label": self.pos_label} else: params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} @@ -494,10 +503,26 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): ) ) + # we add/subtract an arbitrary value to the min/max thresholds to ensure that + # we get the case where `y_pred` will be all zeros and all ones. 
+ if hasattr(classifier, "predict_proba") and ( + self._response_method == "predict_proba" + or ( + isinstance(self._response_method, list) + and self._response_method[0] == "predict_proba" + and isinstance(self._scorer, _ContinuousScorer) + ) + ): + # `predict_proba` was used to compute scores + min_threshold = 0.0 - np.finfo(np.float64).eps + max_threshold = 1.0 + np.finfo(np.float64).eps + else: + # `decision_function` was used to compute scores + min_threshold = np.min([th.min() for th in thresholds]) - 1.0 + max_threshold = np.max([th.max() for th in thresholds]) + 1.0 + # thresholds are sorted in ascending order which is necessary for the # interpolation of the score below - min_threshold = np.min([th.min() for th in thresholds]) - max_threshold = np.max([th.max() for th in thresholds]) thresholds_interpolated = np.linspace( min_threshold, max_threshold, num=self.n_thresholds ) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 9cdc2c184c59c..e8cf563f49eba 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -69,8 +69,15 @@ def test_fit_and_score_scorers(scorer, score_method): score_method=score_method, ) - assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) - assert np.logical_and(scores >= 0, scores <= 1).all() + if score_method in {"max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"}: + assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) + assert isinstance(scores, tuple) and len(scores) == 2 + for sc in scores: + assert np.logical_and(sc >= 0, sc <= 1).all() + else: + assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) + assert isinstance(scores, np.ndarray) + assert np.logical_and(scores >= 0, scores <= 1).all() @pytest.mark.parametrize( @@ -89,12 +96,12 @@ def test_fit_and_score_scorers(scorer, score_method): ( make_scorer(roc_curve, needs_proba=True), "max_tnr_at_tpr_constraint", - [1.0, 1.0, 0.0], + [[1.0, 1.0, 0.0], [0.0, 1.0, 1.0]], ), ( make_scorer(roc_curve, needs_proba=True), "max_tpr_at_tnr_constraint", - [0.0, 1.0, 1.0], + [[1.0, 1.0, 0.0], [0.0, 1.0, 1.0]], ), ], ) @@ -342,21 +349,21 @@ def test_cutoffclassifier_with_constraint_value(response_method): assert score_optimized > score_baseline -# def test_cutoffclassifier_limit_tpr_tnr(): -# """Check that an objective value of 0 give opposite predictions with objective -# metrics `max_tnr_at_tpr_constraint` and `max_tpr_at_tnr_constraint`. -# """ -# X, y = load_breast_cancer(return_X_y=True) -# estimator = make_pipeline(StandardScaler(), LogisticRegression()) -# clf = CutOffClassifier( -# estimator=estimator, -# objective_metric="max_tnr_at_tpr_constraint", -# constraint_value=0, -# ) -# y_pred_tpr = clf.fit(X, y).predict(X) -# clf.set_params(objective_metric="max_tpr_at_tnr_constraint") -# y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int) -# assert np.mean(y_pred_tnr == y_pred_tpr) > 0.98 +def test_cutoffclassifier_limit_tpr_tnr(): + """Check that an objective value of 0 give opposite predictions with objective + metrics `max_tnr_at_tpr_constraint` and `max_tpr_at_tnr_constraint`. 
+ """ + X, y = load_breast_cancer(return_X_y=True) + estimator = make_pipeline(StandardScaler(), LogisticRegression()) + clf = CutOffClassifier( + estimator=estimator, + objective_metric="max_tnr_at_tpr_constraint", + constraint_value=0, + ) + y_pred_tpr = clf.fit(X, y).predict(X) + clf.set_params(objective_metric="max_tpr_at_tnr_constraint") + y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int) + assert np.mean(y_pred_tnr == y_pred_tpr) == pytest.approx(1.0) def test_cutoffclassifier_metric_with_parameter(): @@ -387,7 +394,10 @@ def test_cutoffclassifier_metric_with_parameter(): "max_tpr_at_tnr_constraint", make_scorer(balanced_accuracy_score), make_scorer(f1_score, pos_label="cancer"), - {"tp": 1, "tn": 1, "fp": 1, "fn": 1}, + # penalize false negative since we have an imbalanced dataset and the + # accuracy would not be a good metric to optimize the decision + # threshold + {"tp": 1, "tn": 1, "fp": 1, "fn": -10}, ], ) def test_cutoffclassifier_with_string_targets(response_method, metric): From 03b1f7f17b961f26794420d5ec4560d05ae56f3f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 24 Apr 2023 12:31:10 +0200 Subject: [PATCH 032/194] iter --- sklearn/model_selection/_prediction.py | 115 ++++++++++++------ .../model_selection/tests/test_prediction.py | 7 +- 2 files changed, 86 insertions(+), 36 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index cd343973374f0..1b95923c98c7d 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -11,6 +11,7 @@ confusion_matrix, get_scorer_names, make_scorer, + precision_recall_curve, roc_curve, ) from ..metrics._scorer import _ContinuousScorer @@ -90,8 +91,8 @@ def _fit_and_score( decision thresholds and scores. score_method : str or callable - The scoring method to use. Used to detect `tpr` and `tnr` since they are not - an usual scikit-learn scorer and need to be handled differently. + The scoring method to use. Used to detect if we compute TPR/TNR or precision/ + recall. Returns ------- @@ -100,10 +101,8 @@ def _fit_and_score( ascending order. scores : ndarray of shape (n_thresholds,) or tuple os such arrays - The scores computed for each decision threshold. When `score_method` is - `"max_tpr_at_tnr_constraint"` or `"max_tnr_at_tpr_constraint"`, `scores` is a - tuple of two arrays, the first one containing the true positive rates and the - second one containing the true negative rates. + The scores computed for each decision threshold. When TPR/TNR or precision/ + recall are computed, `scores` is a tuple of two arrays. 
""" arrays = (X, y) if sample_weight is None else (X, y, sample_weight) check_consistent_length(*arrays) @@ -130,14 +129,23 @@ def _fit_and_score( X_val, y_val, sw_val = X, y, sample_weight check_is_fitted(classifier, "classes_") - if isinstance(score_method, str) and score_method in { - "max_tpr_at_tnr_constraint", - "max_tnr_at_tpr_constraint", - }: - fpr, tpr, potential_thresholds = scorer( - classifier, X_val, y_val, sample_weight=sw_val - ) - return potential_thresholds[::-1], (tpr[::-1], (1 - fpr)[::-1]) + if isinstance(score_method, str): + if score_method in {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"}: + fpr, tpr, potential_thresholds = scorer( + classifier, X_val, y_val, sample_weight=sw_val + ) + # thresholds are in decreasing order + return potential_thresholds[::-1], ((1 - fpr)[::-1], tpr[::-1]) + elif score_method in { + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + }: + precision, recall, potential_thresholds = scorer( + classifier, X_val, y_val, sample_weight=sw_val + ) + # thresholds are in increasing order, we also have one missing threshold + # TODO: check what to do with the missing threshold or additional scores. + return potential_thresholds, (precision[:-1], recall[:-1]) return scorer(classifier, X_val, y_val, sample_weight=sw_val) @@ -150,8 +158,9 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): The classifier, fitted or not fitted, for which we want to optimize the decision threshold used during `predict`. - objective_metric : {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"}, \ - str, dict or callable, default="balanced_accuracy" + objective_metric : {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint", \ + "max_precision_at_recall_constraint, "max_recall_at_precision_constraint"} \ + , str, dict or callable, default="balanced_accuracy" The objective metric to be optimized. Can be one of: * a string associated to a scoring function (see model evaluation @@ -161,6 +170,10 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): positive ratio (TPR) of `constraint_value`; * `"max_tpr_at_tnr_constraint"`: find the decision threshold for a true negative ratio (TNR) of `constraint_value`. + * `"max_precision_at_recall_constraint"`: find the decision threshold for a + recall of `constraint_value`; + * `"max_recall_at_precision_constraint"`: find the decision threshold for a + precision of `constraint_value`. * a dictionary to be used as cost-sensitive matrix. The keys of the dictionary should be: `("tp", "fp", "tn", "fn")`. The values of the dictionary corresponds costs (negative values) and gains (positive @@ -168,8 +181,10 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): constraint_value : float, default=None The value associated with the `objective_metric` metric for which we - want to find the decision threshold when `objective_metric` is equal to - `"max_tnr_at_tpr_constraint"` or `"max_tpr_at_tnr_constraint"`. + want to find the decision threshold when `objective_metric` is equal one of + `"max_tnr_at_tpr_constraint"`, `"max_tpr_at_tnr_constraint"`, + `"max_precision_at_recall_constraint"`, or + `"max_recall_at_precision_constraint". pos_label : int, float, bool or str, default=None The label of the positive class. 
Used when `objective_metric` is @@ -258,7 +273,12 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): "objective_metric": [ StrOptions( set(get_scorer_names()) - | {"max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"} + | { + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + } ), callable, MutableMapping, @@ -369,12 +389,15 @@ def fit(self, X, y, sample_weight=None, **fit_params): if isinstance(self.objective_metric, str) and self.objective_metric in { "max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", }: if self.constraint_value is None: raise ValueError( - "When `objective_metric` is 'max_tpr_at_tnr_constraint' or " - "'max_tnr_at_tpr_constraint', `constraint_value` must be provided. " - "Got None instead." + "When `objective_metric` is 'max_tpr_at_tnr_constraint', " + "'max_tnr_at_tpr_constraint', 'max_precision_at_recall_constraint'," + " or 'max_recall_at_precision_constraint', `constraint_value` must " + "be provided. Got None instead." ) constraint_value = self.constraint_value else: @@ -458,6 +481,8 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): elif self.objective_metric in { "max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", }: if self._response_method == "predict_proba": params_scorer = {"needs_proba": True, "pos_label": self.pos_label} @@ -474,7 +499,12 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): params_scorer = {"needs_proba": True, "pos_label": self.pos_label} else: params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} - self._scorer = make_scorer(roc_curve, **params_scorer) + + if "tpr" in self.objective_metric: # tpr/tnr + score_func = roc_curve + else: # precision/recall + score_func = precision_recall_curve + self._scorer = make_scorer(score_func, **params_scorer) else: scoring = check_scoring(classifier, scoring=self.objective_metric) # transform a binary metric into a curve metric for all possible decision @@ -510,7 +540,6 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): or ( isinstance(self._response_method, list) and self._response_method[0] == "predict_proba" - and isinstance(self._scorer, _ContinuousScorer) ) ): # `predict_proba` was used to compute scores @@ -543,18 +572,36 @@ def _mean_interpolated_score(thresholds, scores): self.objective_score_ = mean_score[best_idx] self.decision_threshold_ = thresholds_interpolated[best_idx] else: - tpr, tnr = zip(*scores) - mean_tpr = _mean_interpolated_score(thresholds, tpr) - mean_tnr = _mean_interpolated_score(thresholds, tnr) + if "tpr" in self.objective_metric: # tpr/tnr + mean_tnr, mean_tpr = [ + _mean_interpolated_score(thresholds, sc) for sc in zip(*scores) + ] + else: # precision/recall + mean_precision, mean_recall = [ + _mean_interpolated_score(thresholds, sc) for sc in zip(*scores) + ] + + def _get_best_idx(constrained_score, maximized_score): + indices = np.arange(len(constrained_score)) + mask = constrained_score >= constraint_value + mask_idx = maximized_score[mask].argmax() + return indices[mask][mask_idx] if self.objective_metric == "max_tpr_at_tnr_constraint": - mask = mean_tnr >= constraint_value - best_idx = mean_tpr[mask].argmax() - else: - mask = mean_tpr >= constraint_value - best_idx = mean_tnr[mask].argmax() - self.objective_score_ = 
(mean_tpr[mask][best_idx], mean_tnr[mask][best_idx]) - self.decision_threshold_ = thresholds_interpolated[mask][best_idx] + constrained_score, maximized_score = mean_tnr, mean_tpr + elif self.objective_metric == "max_tnr_at_tpr_constraint": + constrained_score, maximized_score = mean_tpr, mean_tnr + elif self.objective_metric == "max_precision_at_recall_constraint": + constrained_score, maximized_score = mean_recall, mean_precision + else: # max_recall_at_precision_constraint + constrained_score, maximized_score = mean_precision, mean_recall + + best_idx = _get_best_idx(constrained_score, maximized_score) + self.objective_score_ = ( + constrained_score[best_idx], + maximized_score[best_idx], + ) + self.decision_threshold_ = thresholds_interpolated[best_idx] if hasattr(self.estimator_, "n_features_in_"): self.n_features_in_ = self.estimator_.n_features_in_ diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index e8cf563f49eba..6612bc77da26a 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -96,12 +96,12 @@ def test_fit_and_score_scorers(scorer, score_method): ( make_scorer(roc_curve, needs_proba=True), "max_tnr_at_tpr_constraint", - [[1.0, 1.0, 0.0], [0.0, 1.0, 1.0]], + [[0.0, 1.0, 1.0], [1.0, 1.0, 0.0]], ), ( make_scorer(roc_curve, needs_proba=True), "max_tpr_at_tnr_constraint", - [[1.0, 1.0, 0.0], [0.0, 1.0, 1.0]], + [[0.0, 1.0, 1.0], [1.0, 1.0, 0.0]], ), ], ) @@ -606,3 +606,6 @@ def test_cutoffclassifier_sample_weight_costs_and_again(): model_sw.fit(X, y, sample_weight=sample_weight) assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) + + +# TODO: add a test for the precision/recall case From 8a09a5ffcf4802a1751b87b97ef3a5064b4e2f1a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 24 Apr 2023 18:14:03 +0200 Subject: [PATCH 033/194] add some test for precision-recall --- sklearn/model_selection/_prediction.py | 23 ++-- .../model_selection/tests/test_prediction.py | 108 +++++++++++++----- 2 files changed, 96 insertions(+), 35 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 1b95923c98c7d..c1ccd769fcfc7 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -143,8 +143,9 @@ def _fit_and_score( precision, recall, potential_thresholds = scorer( classifier, X_val, y_val, sample_weight=sw_val ) - # thresholds are in increasing order, we also have one missing threshold - # TODO: check what to do with the missing threshold or additional scores. + # thresholds are in increasing order + # the last element of the precision and recall is not associated with any + # threshold and should be discarded return potential_thresholds, (precision[:-1], recall[:-1]) return scorer(classifier, X_val, y_val, sample_weight=sw_val) @@ -250,8 +251,13 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): decision_threshold_ : float The new decision threshold. - objective_score_ : float + objective_score_ : float or tuple of float The score of the objective metric associated with the decision threshold found. 
+ When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, + `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, + `"max_recall_at_precision_constraint"`, it will corresponds to a tuple of + two float values: the first one is the score of the metric which is constrained + and the second one is the score of the maximized metric. classes_ : ndarray of shape (n_classes,) The class labels. @@ -305,7 +311,7 @@ def __init__( constraint_value=None, pos_label=None, response_method="auto", - n_thresholds=1_000, + n_thresholds=100, cv=None, refit="auto", n_jobs=None, @@ -543,12 +549,12 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): ) ): # `predict_proba` was used to compute scores - min_threshold = 0.0 - np.finfo(np.float64).eps - max_threshold = 1.0 + np.finfo(np.float64).eps + min_threshold = 0.0 + max_threshold = 1.0 else: # `decision_function` was used to compute scores - min_threshold = np.min([th.min() for th in thresholds]) - 1.0 - max_threshold = np.max([th.max() for th in thresholds]) + 1.0 + min_threshold = np.min([th.min() for th in thresholds]) + max_threshold = np.max([th.max() for th in thresholds]) # thresholds are sorted in ascending order which is necessary for the # interpolation of the score below @@ -582,6 +588,7 @@ def _mean_interpolated_score(thresholds, scores): ] def _get_best_idx(constrained_score, maximized_score): + """Find the index of the best score constrained by another score.""" indices = np.arange(len(constrained_score)) mask = constrained_score >= constraint_value mask_idx = maximized_score[mask].argmax() diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 6612bc77da26a..cbf564394fa91 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -10,6 +10,7 @@ fbeta_score, f1_score, make_scorer, + precision_recall_curve, roc_curve, ) from sklearn.metrics._scorer import _ContinuousScorer @@ -48,6 +49,14 @@ make_scorer(roc_curve, needs_proba=True), "max_tpr_at_tnr_constraint", ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_precision_at_recall_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_recall_at_precision_constraint", + ), ], ) def test_fit_and_score_scorers(scorer, score_method): @@ -69,7 +78,7 @@ def test_fit_and_score_scorers(scorer, score_method): score_method=score_method, ) - if score_method in {"max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"}: + if score_method.startswith("max_"): assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) assert isinstance(scores, tuple) and len(scores) == 2 for sc in scores: @@ -103,6 +112,16 @@ def test_fit_and_score_scorers(scorer, score_method): "max_tpr_at_tnr_constraint", [[0.0, 1.0, 1.0], [1.0, 1.0, 0.0]], ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_precision_at_recall_constraint", + [[0.5, 1.0], [1.0, 1.0]], + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_recall_at_precision_constraint", + [[0.5, 1.0], [1.0, 1.0]], + ), ], ) def test_fit_and_score_prefit(scorer, score_method, expected_score): @@ -166,6 +185,14 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): make_scorer(roc_curve, needs_proba=True), "max_tpr_at_tnr_constraint", ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_precision_at_recall_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + 
"max_recall_at_precision_constraint", + ), ], ) def test_fit_and_score_sample_weight(scorer, score_method): @@ -231,6 +258,14 @@ def test_fit_and_score_sample_weight(scorer, score_method): make_scorer(roc_curve, needs_proba=True), "max_tpr_at_tnr_constraint", ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_precision_at_recall_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_recall_at_precision_constraint", + ), ], ) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) @@ -349,21 +384,28 @@ def test_cutoffclassifier_with_constraint_value(response_method): assert score_optimized > score_baseline -def test_cutoffclassifier_limit_tpr_tnr(): - """Check that an objective value of 0 give opposite predictions with objective - metrics `max_tnr_at_tpr_constraint` and `max_tpr_at_tnr_constraint`. +@pytest.mark.parametrize( + "metrics", + [ + ("max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"), + ("max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"), + ], +) +def test_cutoffclassifier_limit_metric_tradeoff(metrics): + """Check that an objective value of 0 give opposite predictions with tnr/tpr and + precision/recall. """ X, y = load_breast_cancer(return_X_y=True) estimator = make_pipeline(StandardScaler(), LogisticRegression()) - clf = CutOffClassifier( + model = CutOffClassifier( estimator=estimator, - objective_metric="max_tnr_at_tpr_constraint", + objective_metric=metrics[0], constraint_value=0, ) - y_pred_tpr = clf.fit(X, y).predict(X) - clf.set_params(objective_metric="max_tpr_at_tnr_constraint") - y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int) - assert np.mean(y_pred_tnr == y_pred_tpr) == pytest.approx(1.0) + y_pred_1 = model.fit(X, y).predict(X) + model.set_params(objective_metric=metrics[1]) + y_pred_2 = (~model.fit(X, y).predict(X).astype(bool)).astype(int) + assert np.mean(y_pred_1 == y_pred_2) == pytest.approx(1.0) def test_cutoffclassifier_metric_with_parameter(): @@ -392,6 +434,8 @@ def test_cutoffclassifier_metric_with_parameter(): [ "max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", make_scorer(balanced_accuracy_score), make_scorer(f1_score, pos_label="cancer"), # penalize false negative since we have an imbalanced dataset and the @@ -475,7 +519,13 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): @pytest.mark.parametrize( "objective_metric", - ["max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint", "balanced_accuracy"], + [ + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + "balanced_accuracy", + ], ) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): @@ -495,16 +545,21 @@ def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): @pytest.mark.parametrize( "objective_metric, constraint_value", - [("max_tnr_at_tpr_constraint", 0.5), ("max_tpr_at_tnr_constraint", 0.5)], + [ + ("max_tnr_at_tpr_constraint", 0.5), + ("max_tpr_at_tnr_constraint", 0.5), + ("max_precision_at_recall_constraint", 0.5), + ("max_recall_at_precision_constraint", 0.5), + ], ) @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) -def test_cutoffclassifier_response_method_scorer_tnr_tpr( +def test_cutoffclassifier_response_method_scorer_with_constraint_metric( 
objective_metric, constraint_value, response_method, global_random_seed ): """Check that we use the proper scorer and forwarding the requested response method - for `max_tpr_at_tnr_constraint` and `max_tnr_at_tpr_constraint`. + for TNR/TPR and precision/recall metrics. """ X, y = make_classification(n_samples=100, random_state=global_random_seed) classifier = LogisticRegression() @@ -517,24 +572,26 @@ def test_cutoffclassifier_response_method_scorer_tnr_tpr( ) model.fit(X, y) - # Note that optimizing TPR will increase the decision threshold while optimizing - # TNR will decrease it. We therefore use the centered threshold (i.e. 0.5 for - # probabilities and 0.0 for decision function) to check that the decision threshold - # is properly set. if response_method in ("auto", "predict_proba"): # "auto" will fall back in priority on `predict_proba` if `estimator` # supports it. # we expect the decision threshold to be in [0, 1] - if objective_metric == "max_tnr_at_tpr_constraint": - assert 0.5 < model.decision_threshold_ < 1 - else: # "max_tpr_at_tnr_constraint" - assert 0 < model.decision_threshold_ < 0.5 + if objective_metric in ( + "max_tnr_at_tpr_constraint", + "max_precision_at_recall_constraint", + ): + assert 0.5 <= model.decision_threshold_ <= 1 + else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" + assert 0 <= model.decision_threshold_ <= 0.5 else: # "decision_function" # we expect the decision function to be centered in 0.0 and to be larger than # -1 and 1. - if objective_metric == "max_tnr_at_tpr_constraint": + if objective_metric in ( + "max_tnr_at_tpr_constraint", + "max_precision_at_recall_constraint", + ): assert 0 < model.decision_threshold_ < 20 - else: # "max_tpr_at_tnr_constraint" + else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" assert -20 < model.decision_threshold_ < 0 @@ -606,6 +663,3 @@ def test_cutoffclassifier_sample_weight_costs_and_again(): model_sw.fit(X, y, sample_weight=sample_weight) assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) - - -# TODO: add a test for the precision/recall case From d56f57f3bc64d0299e3c42b8129a3382783a7a75 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 24 Apr 2023 19:30:29 +0200 Subject: [PATCH 034/194] TST add invariance zeros weight --- sklearn/model_selection/_prediction.py | 14 ++++----- .../model_selection/tests/test_prediction.py | 30 +++++++++++++++++++ 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index c1ccd769fcfc7..ccbcc9d203a45 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -251,7 +251,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): decision_threshold_ : float The new decision threshold. - objective_score_ : float or tuple of float + objective_score_ : float or tuple of floats The score of the objective metric associated with the decision threshold found. When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, @@ -539,8 +539,6 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): ) ) - # we add/subtract an arbitrary value to the min/max thresholds to ensure that - # we get the case where `y_pred` will be all zeros and all ones. 
if hasattr(classifier, "predict_proba") and ( self._response_method == "predict_proba" or ( @@ -556,8 +554,6 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): min_threshold = np.min([th.min() for th in thresholds]) max_threshold = np.max([th.max() for th in thresholds]) - # thresholds are sorted in ascending order which is necessary for the - # interpolation of the score below thresholds_interpolated = np.linspace( min_threshold, max_threshold, num=self.n_thresholds ) @@ -572,7 +568,6 @@ def _mean_interpolated_score(thresholds, scores): ) if constraint_value == "highest": # find best score - # we don't need to sort the scores and directly take the maximum mean_score = _mean_interpolated_score(thresholds, scores) best_idx = mean_score.argmax() self.objective_score_ = mean_score[best_idx] @@ -701,6 +696,11 @@ def _more_tags(self): return { "binary_only": True, "_xfail_checks": { - "check_classifiers_train": "Threshold at probability 0.5 does not hold" + "check_classifiers_train": "Threshold at probability 0.5 does not hold", + "check_sample_weights_invariance": ( + "Due to the cross-validation and sample ordering, removing a sample" + " is not strictly equal to putting is weight to zero. Specific unit" + " tests are added for CutOffClassifier specifically." + ), }, } diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index cbf564394fa91..17e77bfcfa5e4 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from sklearn.base import clone from sklearn.datasets import load_breast_cancer, load_iris, make_classification from sklearn.ensemble import GradientBoostingClassifier from sklearn.exceptions import NotFittedError @@ -663,3 +664,32 @@ def test_cutoffclassifier_sample_weight_costs_and_again(): model_sw.fit(X, y, sample_weight=sample_weight) assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) + + +def test_cutoffclassifier_cv_zeros_sample_weights_equivalence(): + """Check that passing removing some sample from the dataset `X` is + equivalent to passing a `sample_weight` with a factor 0.""" + X, y = load_iris(return_X_y=True) + # Scale the data to avoid any convergence issue + X = StandardScaler().fit_transform(X) + # Only use 2 classes and select samples such that 2-fold cross-validation + # split will lead to an equivalence with a `sample_weight` of 0 + X = np.vstack((X[:40], X[50:90])) + y = np.hstack((y[:40], y[50:90])) + sample_weight = np.zeros_like(y) + sample_weight[::2] = 1 + + estimator = LogisticRegression() + model_without_weights = CutOffClassifier(estimator, cv=2) + model_with_weights = clone(model_without_weights) + + model_with_weights.fit(X, y, sample_weight=sample_weight) + model_without_weights.fit(X[::2], y[::2]) + + assert_allclose( + model_with_weights.estimator_.coef_, model_without_weights.estimator_.coef_ + ) + + y_pred_with_weights = model_with_weights.predict_proba(X) + y_pred_without_weights = model_without_weights.predict_proba(X) + assert_allclose(y_pred_with_weights, y_pred_without_weights) From cf164c5021703b6112afc09e96562417f20ce4a3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 24 Apr 2023 19:32:29 +0200 Subject: [PATCH 035/194] DOC fix default n_thresholds --- sklearn/model_selection/_prediction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_prediction.py 
b/sklearn/model_selection/_prediction.py index ccbcc9d203a45..f42e41914a104 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -205,7 +205,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): If the method is not implemented by the classifier, it will raise an error. - n_thresholds : int, default=1000 + n_thresholds : int, default=100 The number of decision threshold to use when discretizing the output of the classifier `method`. From c943f5e7bccd7bd28356cfc7d2cabd213d5460dd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 25 Apr 2023 11:05:02 +0200 Subject: [PATCH 036/194] DOC add a small example --- setup.cfg | 2 +- sklearn/model_selection/_prediction.py | 64 +++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 19f2bebeb7280..ba69be9fa9978 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,7 +14,7 @@ doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS testpaths = sklearn addopts = --doctest-modules - --disable-pytest-warnings + # --disable-pytest-warnings --color=yes # Activate the plugin explicitly to ensure that the seed is reported # correctly on the CI when running `pytest --pyargs sklearn` from the diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index f42e41914a104..82a45665626e2 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -151,7 +151,17 @@ def _fit_and_score( class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): - """Decision threshold calibration for binary classification. + """Decision threshold tuning for binary classification. + + This estimator post-tunes the decision threshold (cut-off point) that is + used for converting probabilities (i.e. output of `predict_proba`) or + decision function (i.e. output of `decision_function`) into a predicted + class. The tuning is done by maximizing a binary metric, potentially + constrained by a another metric. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.3 Parameters ---------- @@ -248,6 +258,9 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): Attributes ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + decision_threshold_ : float The new decision threshold. @@ -269,6 +282,55 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Only defined if the underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.metrics import classification_report + >>> from sklearn.model_selection import CutOffClassifier, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... 
) + >>> classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train) + >>> print(classification_report(y_test, classifier.predict(X_test))) + precision recall f1-score support + + 0 0.94 0.99 0.96 224 + 1 0.80 0.46 0.59 26 + + accuracy 0.93 250 + macro avg 0.87 0.72 0.77 250 + weighted avg 0.93 0.93 0.92 250 + + >>> classifier_tuned = CutOffClassifier( + ... classifier, objective_metric="max_precision_at_recall_constraint", + ... constraint_value=0.7, + ... ).fit(X_train, y_train) + >>> print( + ... f"Cut-off point found at {classifier_tuned.decision_threshold_:.3f} for a " + ... f"recall of {classifier_tuned.objective_score_[0]:.3f} and a precision of " + ... f"{classifier_tuned.objective_score_[1]:.3f}." + ... ) + Cut-off point found at 0.384 for a recall of 0.703 and a precision of 0.714. + >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) + precision recall f1-score support + + 0 0.95 0.97 0.96 224 + 1 0.71 0.58 0.64 26 + + accuracy 0.93 250 + macro avg 0.83 0.78 0.80 250 + weighted avg 0.93 0.93 0.93 250 + """ _parameter_constraints: dict = { From bf1462bac4c2bc2092522ae4aa967ab48d98df07 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 25 Apr 2023 11:25:27 +0200 Subject: [PATCH 037/194] iter --- doc/model_selection.rst | 1 + doc/modules/classes.rst | 10 ++++++++++ doc/modules/prediction.rst | 9 +++++++++ sklearn/model_selection/_prediction.py | 2 +- 4 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 doc/modules/prediction.rst diff --git a/doc/model_selection.rst b/doc/model_selection.rst index 25cd2b655ccc5..3a46315e65156 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -14,5 +14,6 @@ Model selection and evaluation modules/cross_validation modules/grid_search + modules/prediction modules/model_evaluation modules/learning_curve diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index dc13c292cb94d..e8e4a3671a542 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1218,6 +1218,16 @@ Hyper-parameter optimizers model_selection.RandomizedSearchCV model_selection.HalvingRandomSearchCV +Model post-fit tuning +--------------------- + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + model_selection.CutOffClassifier Model validation ---------------- diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst new file mode 100644 index 0000000000000..5aeeac16270cb --- /dev/null +++ b/doc/modules/prediction.rst @@ -0,0 +1,9 @@ +.. currentmodule:: sklearn.model_selection + +.. _cutoffclassifier: + +======================================================== +Tuning cut-off decision threshold for classes prediction +======================================================== + +Classifiers are predictive models: they use statistical learning to predict outcomes. diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 82a45665626e2..85f1927408af8 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -320,7 +320,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): ... f"recall of {classifier_tuned.objective_score_[0]:.3f} and a precision of " ... f"{classifier_tuned.objective_score_[1]:.3f}." ... ) - Cut-off point found at 0.384 for a recall of 0.703 and a precision of 0.714. + Cut-off point found at 0.3.. for a recall of 0.7.. and a precision of 0.7... 
>>> print(classification_report(y_test, classifier_tuned.predict(X_test))) precision recall f1-score support From fa89431425c9a6b3b2d5b4220dfa6261b7277e28 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 25 Apr 2023 11:46:08 +0200 Subject: [PATCH 038/194] iter --- doc/modules/prediction.rst | 21 ++++++++++++++++++++- sklearn/model_selection/_prediction.py | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 5aeeac16270cb..04b7f2da0c653 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -6,4 +6,23 @@ Tuning cut-off decision threshold for classes prediction ======================================================== -Classifiers are predictive models: they use statistical learning to predict outcomes. +Classifiers are predictive models: they use statistical learning to predict +outcomes. The outcomes of a classifier takes two forms: a "soft" score for each +sample in relation to each class, and a "hard" categorical prediction (i.e. +class label). Soft predictions are obtained using :term:`predict_proba` or +:term:`decision_function` while hard predictions are obtained using +:term:`predict`. + +In scikit-learn, there is a connection between soft and hard prediction. In the +case of a binary classification, hard predictions are obtained by associating +the positive class with probability value greater than 0.5 (obtained with +:term:`predict_proba`) or decision function value greater than 0 (obtained with +:term:`decision_function`). Similar rules apply a defined for other +classification problems. + +While these approaches are reasonable as default behaviors, they might not be +adapted to certain use cases. For instance, in a medical context, it might be +better to predict the positive class for a lower probability than 0.5 to not +miss any patient with a disease. However, it will come at the cost of having +more false positive predictions. In some use cases, one would like to define +the "hard" score based on a "business" metric instead of a statistical metric. diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 85f1927408af8..7cd2f87f46add 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -320,7 +320,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): ... f"recall of {classifier_tuned.objective_score_[0]:.3f} and a precision of " ... f"{classifier_tuned.objective_score_[1]:.3f}." ... ) - Cut-off point found at 0.3.. for a recall of 0.7.. and a precision of 0.7... + Cut-off point found at 0.3... for a recall of 0.7... and a precision of 0.7... 
>>> print(classification_report(y_test, classifier_tuned.predict(X_test))) precision recall f1-score support From 862519da6108973ab08609f72d0ea84edc325c9c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 25 Apr 2023 21:00:45 +0200 Subject: [PATCH 039/194] bug fixes everywhere --- doc/modules/prediction.rst | 43 ++++++++++++++++--- sklearn/metrics/_scorer.py | 13 +++++- sklearn/metrics/tests/test_score_objects.py | 3 ++ sklearn/model_selection/_prediction.py | 12 +++++- .../model_selection/tests/test_prediction.py | 19 ++++---- 5 files changed, 71 insertions(+), 19 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 04b7f2da0c653..c534757709f7f 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -17,12 +17,41 @@ In scikit-learn, there is a connection between soft and hard prediction. In the case of a binary classification, hard predictions are obtained by associating the positive class with probability value greater than 0.5 (obtained with :term:`predict_proba`) or decision function value greater than 0 (obtained with -:term:`decision_function`). Similar rules apply a defined for other -classification problems. +:term:`decision_function`). + + >>> from sklearn.datasets import make_classification + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = make_classification(random_state=0) + >>> classifier = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y) + >>> classifier.predict_proba(X[:4]) + array([[0.94 , 0.06 ], + [0.94 , 0.06 ], + [0.04..., 0.95...], + [0.04..., 0.95...]]) + >>> classifier.predict(X[:4]) + array([0, 0, 1, 1]) + + +Similar rules apply for other classification problems. While these approaches are reasonable as default behaviors, they might not be -adapted to certain use cases. For instance, in a medical context, it might be -better to predict the positive class for a lower probability than 0.5 to not -miss any patient with a disease. However, it will come at the cost of having -more false positive predictions. In some use cases, one would like to define -the "hard" score based on a "business" metric instead of a statistical metric. +adapted to some cases. The context and nature of the use case will define the +expected behavior of the classifier and thus the strategy to convert soft +predictions into hard predictions. We will illustrate this point with an +example. + +Let's imagine the deployment of a predictive model helping medical doctors to +detect cancers. In a setting where this model would be a tool to discard +obvious cases, doctors might be interested to have a high recall (all cancers +cases should be tagged as such) to not miss any patient with a cancer. However, +it will be at the cost of having more false positive predictions (i.e. lower +precision). Thus, in terms of decision threshold, it would be better to +classify a patient having a cancer for a lower probability than 0.5. + +Post-tuning of the decision threshold +===================================== + +One solution to address the problem stated in the introduction is to tune the +decision threshold of the classifier once this model has been trained. The +:class:`CutOffClassifier` allows to tune this threshold using an internal +cross-validation. 
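To make the narrative above concrete, here is a minimal usage sketch. It assumes only the `CutOffClassifier` API introduced in this patch series; the synthetic dataset and the `LogisticRegression` base estimator are arbitrary choices for illustration, not part of the original document.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import CutOffClassifier

# Imbalanced toy problem where the default 0.5 cut-off is unlikely to be optimal.
X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)

# The decision threshold is tuned through an internal cross-validation so that the
# requested objective metric is maximized instead of relying on the default cut-off.
tuned_classifier = CutOffClassifier(
    LogisticRegression(), objective_metric="balanced_accuracy"
).fit(X, y)
print(tuned_classifier.decision_threshold_)  # cut-off applied by `predict`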
diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index b00a1703af217..5f3a3a85e1b70 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -393,7 +393,16 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): score : float Score function applied to prediction of estimator on X. """ - y_score = method_caller(estimator, self.response_method, X) + pos_label = self._get_pos_label() + # TODO: this part is also repeated in the predict of `CutOffClassifier` + # We should refactor this + if pos_label is None: + map_pred_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(estimator.classes_ == pos_label)[0] + neg_label_idx = np.flatnonzero(estimator.classes_ != pos_label)[0] + map_pred_to_label = np.array([neg_label_idx, pos_label_idx]) + y_score = method_caller(estimator, self.response_method, X, pos_label=pos_label) if sample_weight is not None: score_func = partial(self._score_func, sample_weight=sample_weight) @@ -406,7 +415,7 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): self._sign * score_func( y_true, - estimator.classes_[(y_score >= th).astype(int)], + estimator.classes_[map_pred_to_label[(y_score >= th).astype(int)]], **self._kwargs, ) for th in potential_thresholds diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index c55f53e923e81..9b6ac15a65504 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -1246,3 +1246,6 @@ def test_continuous_scorer(): thresholds, scores = scorer(estimator, X, y) assert all(scores <= 0) + + +# TODO: add more test involving `pos_label` in the continuous scorer diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 7cd2f87f46add..c96066d62be93 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -694,11 +694,19 @@ def predict(self, X): """ check_is_fitted(self, "estimator_") pos_label = self._scorer._get_pos_label() + # TODO: this part is also repeated in the predict of `_ContinuousScorer` + # We should refactor this + if pos_label is None: + map_pred_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(self.classes_ == pos_label)[0] + neg_label_idx = np.flatnonzero(self.classes_ != pos_label)[0] + map_pred_to_label = np.array([neg_label_idx, pos_label_idx]) y_score, _ = _get_response_values_binary( self.estimator_, X, self._response_method, pos_label=pos_label ) - y_pred = (y_score >= self.decision_threshold_).astype(int) - return self.classes_[y_pred] + y_pred_pos_label = (y_score >= self.decision_threshold_).astype(int) + return self.classes_[map_pred_to_label[y_pred_pos_label]] @available_if(_estimator_has("predict_proba")) def predict_proba(self, X): diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 17e77bfcfa5e4..9aa0e1fab4dcd 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -439,10 +439,7 @@ def test_cutoffclassifier_metric_with_parameter(): "max_recall_at_precision_constraint", make_scorer(balanced_accuracy_score), make_scorer(f1_score, pos_label="cancer"), - # penalize false negative since we have an imbalanced dataset and the - # accuracy would not be a good metric to optimize the decision - # threshold - {"tp": 1, "tn": 1, "fp": 1, "fn": -10}, + {"tp": 5, "tn": 1, "fp": -1, "fn": -1}, ], ) 
def test_cutoffclassifier_with_string_targets(response_method, metric): @@ -454,7 +451,7 @@ def test_cutoffclassifier_with_string_targets(response_method, metric): # Encode numeric targets by meaningful strings. We purposely designed the class # names such that the `pos_label` is the first alphabetically sorted class and thus # encoded as 0. - classes = np.array(["healthy", "cancer"], dtype=object) + classes = np.array(["cancer", "healthy"], dtype=object) y = classes[y] model = CutOffClassifier( estimator=make_pipeline(StandardScaler(), LogisticRegression()), @@ -626,7 +623,8 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): assert model.decision_threshold_ < 0.01 assert np.mean(model.predict(X) == 1) > 0.9 - # flipping the `pos_label` to zero should flip as well the decision threshold + # flipping the `pos_label` to zero should force the classifier to always predict 0 + # and thus have a low threshold pos_label = 0 model = CutOffClassifier( classifier, @@ -636,7 +634,7 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): ) model.fit(X, y) - assert model.decision_threshold_ > 0.99 + assert model.decision_threshold_ < 0.01 assert np.mean(model.predict(X) == 0) > 0.9 @@ -655,7 +653,7 @@ def test_cutoffclassifier_sample_weight_costs_and_again(): # we use a prefit classifier to simplify the test cv = "prefit" estimator = LogisticRegression().fit(X, y) - costs_and_again = {"tp": 1, "tn": 1, "fp": 1, "fn": 1} + costs_and_again = {"tp": 1, "tn": 1, "fp": -1, "fn": -1} model_repeat = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) model_repeat.fit(X_repeated, y_repeated, sample_weight=None) @@ -693,3 +691,8 @@ def test_cutoffclassifier_cv_zeros_sample_weights_equivalence(): y_pred_with_weights = model_with_weights.predict_proba(X) y_pred_without_weights = model_without_weights.predict_proba(X) assert_allclose(y_pred_with_weights, y_pred_without_weights) + + +# TODO write non-regression test when pos_label corresponds to idx #0 +# Before we did not think to potentially remap the the output of the comparison +# to the original labels. From aa520daef45111ded25252ff6795a47930cd9f1b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 25 Apr 2023 21:42:15 +0200 Subject: [PATCH 040/194] iter --- sklearn/metrics/_scorer.py | 44 ++++++++++++++------------ sklearn/model_selection/_prediction.py | 13 ++------ 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 5f3a3a85e1b70..7fbebb37546e7 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -221,6 +221,20 @@ def _factory_args(self): """Return non-default make_scorer arguments for repr.""" return "" + def _from_scores_to_class_labels(self, y_score, threshold, classes): + """Threshold `y_score` and return the associated class labels.""" + pos_label = self._get_pos_label() + if pos_label is None: + map_thresholded_score_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(classes == pos_label)[0] + neg_label_idx = np.flatnonzero(classes != pos_label)[0] + map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) + + return classes[ + map_thresholded_score_to_label[(y_score >= threshold).astype(int)] + ] + class _PredictScorer(_BaseScorer): def _score(self, method_caller, estimator, X, y_true, sample_weight=None): @@ -394,14 +408,6 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): Score function applied to prediction of estimator on X. 
""" pos_label = self._get_pos_label() - # TODO: this part is also repeated in the predict of `CutOffClassifier` - # We should refactor this - if pos_label is None: - map_pred_to_label = np.array([0, 1]) - else: - pos_label_idx = np.flatnonzero(estimator.classes_ == pos_label)[0] - neg_label_idx = np.flatnonzero(estimator.classes_ != pos_label)[0] - map_pred_to_label = np.array([neg_label_idx, pos_label_idx]) y_score = method_caller(estimator, self.response_method, X, pos_label=pos_label) if sample_weight is not None: @@ -410,18 +416,16 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): score_func = self._score_func potential_thresholds = np.unique(y_score) - score_thresholds = np.array( - [ - self._sign - * score_func( - y_true, - estimator.classes_[map_pred_to_label[(y_score >= th).astype(int)]], - **self._kwargs, - ) - for th in potential_thresholds - ] - ) - return potential_thresholds, score_thresholds + score_thresholds = [ + self._sign + * score_func( + y_true, + self._from_scores_to_class_labels(y_score, th, estimator.classes_), + **self._kwargs, + ) + for th in potential_thresholds + ] + return potential_thresholds, np.array(score_thresholds) @validate_params( diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index c96066d62be93..753c286bf99d6 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -694,19 +694,12 @@ def predict(self, X): """ check_is_fitted(self, "estimator_") pos_label = self._scorer._get_pos_label() - # TODO: this part is also repeated in the predict of `_ContinuousScorer` - # We should refactor this - if pos_label is None: - map_pred_to_label = np.array([0, 1]) - else: - pos_label_idx = np.flatnonzero(self.classes_ == pos_label)[0] - neg_label_idx = np.flatnonzero(self.classes_ != pos_label)[0] - map_pred_to_label = np.array([neg_label_idx, pos_label_idx]) y_score, _ = _get_response_values_binary( self.estimator_, X, self._response_method, pos_label=pos_label ) - y_pred_pos_label = (y_score >= self.decision_threshold_).astype(int) - return self.classes_[map_pred_to_label[y_pred_pos_label]] + return self._scorer._from_scores_to_class_labels( + y_score, self.decision_threshold_, self.classes_ + ) @available_if(_estimator_has("predict_proba")) def predict_proba(self, X): From 5403cf6ecf68e9e7b91c1b86c982e1ae0cfb4fb7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Apr 2023 16:06:24 +0200 Subject: [PATCH 041/194] Do not allow for single threshold --- sklearn/metrics/tests/test_score_objects.py | 30 ++++++++++++++++++- sklearn/model_selection/_prediction.py | 6 ++++ .../model_selection/tests/test_prediction.py | 12 ++++++++ sklearn/utils/_mocking.py | 20 ++++++++----- sklearn/utils/tests/test_mocking.py | 12 ++------ 5 files changed, 62 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 9b6ac15a65504..cb9182ea551e6 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -56,6 +56,7 @@ from sklearn.model_selection import train_test_split, cross_val_score from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsRestClassifier +from sklearn.dummy import DummyClassifier REGRESSION_SCORERS = [ @@ -1248,4 +1249,31 @@ def test_continuous_scorer(): assert all(scores <= 0) -# TODO: add more test involving `pos_label` in the continuous scorer +def 
test_continuous_scorer_pos_label(): + """Check that we propagate properly the `pos_label` parameter to the scorer.""" + X, _ = make_classification(n_samples=100, random_state=0) + y = np.hstack([np.ones(75), np.zeros(25)]) + + estimator = DummyClassifier(strategy="constant", constant=1).fit(X, y) + + # By setting `pos_label=1`, we force the scorer to use the probability p(c=1) that + # is always 100% for the dummy classifier predicting only 1. + scorer = _ContinuousScorer( + precision_score, + sign=1, + response_method="predict_proba", + kwargs={"pos_label": 1}, + ) + thresholds, scores = scorer(estimator, X, y) + print(thresholds, scores) + + # By setting `pos_label=0`, we force the scorer to use the probability p(c=0) that + # is always 0% for the dummy classifier predicting only 1. + scorer = _ContinuousScorer( + precision_score, + sign=1, + response_method="predict_proba", + kwargs={"pos_label": 0}, + ) + thresholds, scores = scorer(estimator, X, y) + print(thresholds, scores) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 753c286bf99d6..19d99a300706b 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -601,6 +601,12 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): ) ) + if any(len(th) == 1 for th in thresholds): + raise ValueError( + "The provided estimator makes constant predictions. Therefore, it is " + "impossible to optimize the decision threshold." + ) + if hasattr(classifier, "predict_proba") and ( self._response_method == "predict_proba" or ( diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 9aa0e1fab4dcd..3f82919c73795 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -3,6 +3,7 @@ from sklearn.base import clone from sklearn.datasets import load_breast_cancer, load_iris, make_classification +from sklearn.dummy import DummyClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.exceptions import NotFittedError from sklearn.linear_model import LogisticRegression @@ -473,6 +474,7 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): X, y = make_classification(n_samples=100, random_state=0) if with_sample_weight: sample_weight = rng.randn(X.shape[0]) + sample_weight = np.abs(sample_weight, out=sample_weight) else: sample_weight = None @@ -693,6 +695,16 @@ def test_cutoffclassifier_cv_zeros_sample_weights_equivalence(): assert_allclose(y_pred_with_weights, y_pred_without_weights) +def test_cutoffclassifier_error_constant_learner(): + """Check that we raise an error message when providing an estimator that predicts + only a single class.""" + X, y = make_classification(random_state=0) + estimator = DummyClassifier(strategy="constant", constant=1) + err_msg = "The provided estimator makes constant predictions." + with pytest.raises(ValueError, match=err_msg): + CutOffClassifier(estimator).fit(X, y) + + # TODO write non-regression test when pos_label corresponds to idx #0 # Before we did not think to potentially remap the the output of the comparison # to the original labels. 
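A small sketch of the safeguard added above, mirroring `test_cutoffclassifier_error_constant_learner`: a classifier that always predicts the same class exposes a single candidate threshold, so there is nothing to tune and `CutOffClassifier` now raises an explicit error.

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import CutOffClassifier

X, y = make_classification(random_state=0)
constant_classifier = DummyClassifier(strategy="constant", constant=1)
try:
    # Constant probabilities lead to a single unique threshold during tuning.
    CutOffClassifier(constant_classifier).fit(X, y)
except ValueError as exc:
    print(exc)  # "The provided estimator makes constant predictions. ..."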
diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index 688bfb68ed484..7feb85568ae07 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -3,7 +3,7 @@ from ..base import BaseEstimator, ClassifierMixin from .metaestimators import available_if from .validation import _check_sample_weight, _num_samples, check_array -from .validation import check_is_fitted +from .validation import check_is_fitted, check_random_state class ArraySlicingWrapper: @@ -131,6 +131,7 @@ def __init__( foo_param=0, expected_sample_weight=None, expected_fit_params=None, + random_state=None, ): self.check_y = check_y self.check_y_params = check_y_params @@ -140,6 +141,7 @@ def __init__( self.foo_param = foo_param self.expected_sample_weight = expected_sample_weight self.expected_fit_params = expected_fit_params + self.random_state = random_state def _check_X_y(self, X, y=None, should_be_fitted=True): """Validate X and y and make extra check. @@ -239,7 +241,8 @@ def predict(self, X): """ if self.methods_to_check == "all" or "predict" in self.methods_to_check: X, y = self._check_X_y(X) - return self.classes_[np.zeros(_num_samples(X), dtype=int)] + rng = check_random_state(self.random_state) + return rng.choice(self.classes_, size=_num_samples(X)) def predict_proba(self, X): """Predict probabilities for each class. @@ -259,8 +262,10 @@ def predict_proba(self, X): """ if self.methods_to_check == "all" or "predict_proba" in self.methods_to_check: X, y = self._check_X_y(X) - proba = np.zeros((_num_samples(X), len(self.classes_))) - proba[:, 0] = 1 + rng = check_random_state(self.random_state) + proba = rng.randn(_num_samples(X), len(self.classes_)) + proba = np.abs(proba, out=proba) + proba /= np.sum(proba, axis=1)[:, np.newaxis] return proba def decision_function(self, X): @@ -282,14 +287,13 @@ def decision_function(self, X): or "decision_function" in self.methods_to_check ): X, y = self._check_X_y(X) + rng = check_random_state(self.random_state) if len(self.classes_) == 2: # for binary classifier, the confidence score is related to # classes_[1] and therefore should be null. - return np.zeros(_num_samples(X)) + return rng.randn(_num_samples(X)) else: - decision = np.zeros((_num_samples(X), len(self.classes_))) - decision[:, 0] = 1 - return decision + return rng.randn(_num_samples(X), len(self.classes_)) def score(self, X=None, Y=None): """Fake score. 
diff --git a/sklearn/utils/tests/test_mocking.py b/sklearn/utils/tests/test_mocking.py index 718c62d5cc83b..b24dd13a3bbd7 100644 --- a/sklearn/utils/tests/test_mocking.py +++ b/sklearn/utils/tests/test_mocking.py @@ -3,7 +3,6 @@ from scipy import sparse from numpy.testing import assert_array_equal -from numpy.testing import assert_allclose from sklearn.datasets import load_iris from sklearn.utils import check_array @@ -93,7 +92,7 @@ def test_checking_classifier(iris, input_type): assert clf.n_features_in_ == 4 y_pred = clf.predict(X) - assert_array_equal(y_pred, np.zeros(y_pred.size, dtype=int)) + assert all(pred in clf.classes_ for pred in y_pred) assert clf.score(X) == pytest.approx(0) clf.set_params(foo_param=10) @@ -101,13 +100,10 @@ def test_checking_classifier(iris, input_type): y_proba = clf.predict_proba(X) assert y_proba.shape == (150, 3) - assert_allclose(y_proba[:, 0], 1) - assert_allclose(y_proba[:, 1:], 0) + assert np.logical_and(y_proba >= 0, y_proba <= 1).all() y_decision = clf.decision_function(X) assert y_decision.shape == (150, 3) - assert_allclose(y_decision[:, 0], 1) - assert_allclose(y_decision[:, 1:], 0) # check the shape in case of binary classification first_2_classes = np.logical_or(y == 0, y == 1) @@ -117,12 +113,10 @@ def test_checking_classifier(iris, input_type): y_proba = clf.predict_proba(X) assert y_proba.shape == (100, 2) - assert_allclose(y_proba[:, 0], 1) - assert_allclose(y_proba[:, 1], 0) + assert np.logical_and(y_proba >= 0, y_proba <= 1).all() y_decision = clf.decision_function(X) assert y_decision.shape == (100,) - assert_allclose(y_decision, 0) def test_checking_classifier_with_params(iris): From cd377430d4b9cbb3351c5620da8f1db2cdadedf6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Apr 2023 16:07:06 +0200 Subject: [PATCH 042/194] TST add random state checkingclassifier --- sklearn/model_selection/tests/test_prediction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 3f82919c73795..3ee1b50ae5f01 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -279,7 +279,7 @@ def test_fit_and_score_fit_params(scorer, score_method, fit_params_type): "b": _convert_container(y, fit_params_type), } - classifier = CheckingClassifier(expected_fit_params=["a", "b"]) + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) train_idx, val_idx = np.arange(50), np.arange(50, 100) _fit_and_score( @@ -536,7 +536,7 @@ def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): "b": _convert_container(y, fit_params_type), } - classifier = CheckingClassifier(expected_fit_params=["a", "b"]) + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) model = CutOffClassifier( classifier, objective_metric=objective_metric, constraint_value=0.5 ) From e7d07af7c3c1b96717c6d6be1122d89e3475aac5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Apr 2023 17:17:18 +0200 Subject: [PATCH 043/194] TST more test for _ContinuousScorer --- sklearn/metrics/tests/test_score_objects.py | 40 ++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index cb9182ea551e6..bcde2643e2e62 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -56,7 
+56,6 @@ from sklearn.model_selection import train_test_split, cross_val_score from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsRestClassifier -from sklearn.dummy import DummyClassifier REGRESSION_SCORERS = [ @@ -1249,31 +1248,40 @@ def test_continuous_scorer(): assert all(scores <= 0) -def test_continuous_scorer_pos_label(): +def test_continuous_scorer_pos_label(global_random_seed): """Check that we propagate properly the `pos_label` parameter to the scorer.""" - X, _ = make_classification(n_samples=100, random_state=0) - y = np.hstack([np.ones(75), np.zeros(25)]) - - estimator = DummyClassifier(strategy="constant", constant=1).fit(X, y) + n_samples = 30 + X, y = make_classification( + n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed + ) + estimator = LogisticRegression().fit(X, y) - # By setting `pos_label=1`, we force the scorer to use the probability p(c=1) that - # is always 100% for the dummy classifier predicting only 1. scorer = _ContinuousScorer( - precision_score, + recall_score, sign=1, response_method="predict_proba", kwargs={"pos_label": 1}, ) - thresholds, scores = scorer(estimator, X, y) - print(thresholds, scores) + thresholds_pos_label_1, scores_pos_label_1 = scorer(estimator, X, y) - # By setting `pos_label=0`, we force the scorer to use the probability p(c=0) that - # is always 0% for the dummy classifier predicting only 1. scorer = _ContinuousScorer( - precision_score, + recall_score, sign=1, response_method="predict_proba", kwargs={"pos_label": 0}, ) - thresholds, scores = scorer(estimator, X, y) - print(thresholds, scores) + thresholds_pos_label_0, scores_pos_label_0 = scorer(estimator, X, y) + + # If `pos_label` is not forwarded to the scorer, the thresholds will be equal. + # Make sure that this is not the case. + # assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() + # Since we have an imbalanced problem, the thresholds should represent higher + # probabilities level when `pos_label=0` than with `pos_label=1`. + assert np.sum(thresholds_pos_label_1 < 0.15) > 2 / 3 * n_samples + assert np.sum(thresholds_pos_label_0 > 0.85) > 2 / 3 * n_samples + + # The recall cannot be negative and `pos_label=1` should have a higher recall + # since there is less samples to be considered. + assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() + assert scores_pos_label_0.max() == pytest.approx(1.0) + assert scores_pos_label_1.max() == pytest.approx(1.0) From bc20a474c8eed15a4b8d8372fa502ed11a28f0c3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Apr 2023 21:55:08 +0200 Subject: [PATCH 044/194] TST add test for pos_label --- .../model_selection/tests/test_prediction.py | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 3ee1b50ae5f01..89782181ad575 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -13,6 +13,8 @@ f1_score, make_scorer, precision_recall_curve, + precision_score, + recall_score, roc_curve, ) from sklearn.metrics._scorer import _ContinuousScorer @@ -705,6 +707,39 @@ def test_cutoffclassifier_error_constant_learner(): CutOffClassifier(estimator).fit(X, y) -# TODO write non-regression test when pos_label corresponds to idx #0 -# Before we did not think to potentially remap the the output of the comparison -# to the original labels. 
+@pytest.mark.parametrize( + "objective_metric", + ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_cutoffclassifier_pos_label_constraint_metric( + objective_metric, pos_label, global_random_seed +): + X, y = make_classification( + n_samples=5_000, + weights=[0.6, 0.4], + random_state=global_random_seed, + ) + + # prefit the estimator to avoid variability due to the cross-validation + estimator = LogisticRegression().fit(X, y) + + constraint_value = 0.7 + model = CutOffClassifier( + estimator, + objective_metric=objective_metric, + constraint_value=constraint_value, + cv="prefit", + pos_label=pos_label, + ).fit(X, y) + + precision = precision_score(y, model.predict(X), pos_label=pos_label) + recall = recall_score(y, model.predict(X), pos_label=pos_label) + + # due to internal interpolation, the scores will vary slightly + if objective_metric == "max_precision_at_recall_constraint": + assert recall == pytest.approx(model.objective_score_[0], abs=1e-3) + assert precision == pytest.approx(model.objective_score_[1], abs=1e-3) + else: + assert precision == pytest.approx(model.objective_score_[0], abs=1e-3) + assert recall == pytest.approx(model.objective_score_[1], abs=1e-3) From bba2f976fcd4299f0e85888048d987f249942189 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Apr 2023 22:13:17 +0200 Subject: [PATCH 045/194] TST add pos_label test for TNR/TPR --- .../model_selection/tests/test_prediction.py | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 89782181ad575..1ffd45bc4b3f3 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -9,6 +9,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.metrics import ( balanced_accuracy_score, + confusion_matrix, fbeta_score, f1_score, make_scorer, @@ -712,9 +713,11 @@ def test_cutoffclassifier_error_constant_learner(): ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], ) @pytest.mark.parametrize("pos_label", [0, 1]) -def test_cutoffclassifier_pos_label_constraint_metric( +def test_cutoffclassifier_pos_label_precision_recall( objective_metric, pos_label, global_random_seed ): + """Check that `pos_label` is dispatched correctly by checking the precision and + recall score found during the optimization and the one found at `predict` time.""" X, y = make_classification( n_samples=5_000, weights=[0.6, 0.4], @@ -743,3 +746,43 @@ def test_cutoffclassifier_pos_label_constraint_metric( else: assert precision == pytest.approx(model.objective_score_[0], abs=1e-3) assert recall == pytest.approx(model.objective_score_[1], abs=1e-3) + + +@pytest.mark.parametrize( + "objective_metric", ["max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"] +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_cutoffclassifier_pos_label_tnr_tpr(objective_metric, pos_label): + """Check that `pos_label` is dispatched correctly by checking the TNR and TPR + score found during the optimization and the one found at `predict` time.""" + X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) + + # prefit the estimator to avoid variability due to the cross-validation + estimator = LogisticRegression().fit(X, y) + + constraint_value = 0.7 + model = CutOffClassifier( + estimator, + 
objective_metric=objective_metric, + constraint_value=constraint_value, + cv="prefit", + pos_label=pos_label, + ).fit(X, y) + + def tnr_tpr_score(y_true, y_pred, pos_label=pos_label): + cm = confusion_matrix(y_true, y_pred) + if pos_label == 0: + cm = cm[::-1, ::-1] + tn, fp, fn, tp = cm.ravel() + tnr = tn / (tn + fp) + tpr = tp / (tp + fn) + return tnr, tpr + + tnr, tpr = tnr_tpr_score(y, model.predict(X), pos_label=pos_label) + # due to internal interpolation, the scores will vary slightly + if objective_metric == "max_tnr_at_tpr_constraint": + assert tpr == pytest.approx(model.objective_score_[0], abs=0.05) + assert tnr == pytest.approx(model.objective_score_[1], abs=0.05) + else: + assert tnr == pytest.approx(model.objective_score_[0], abs=0.05) + assert tpr == pytest.approx(model.objective_score_[1], abs=0.05) From f9255030698560a47a20a1f9501faf90b18076ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Apr 2023 23:17:18 +0200 Subject: [PATCH 046/194] some more --- sklearn/metrics/_scorer.py | 3 ++ sklearn/model_selection/_prediction.py | 10 +++- .../model_selection/tests/test_prediction.py | 48 +++++++++++++++---- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 7fbebb37546e7..67fadaa8c36b3 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -415,6 +415,9 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): else: score_func = self._score_func + # TODO: this is pretty slow if we have a lot of `potential_thresholds` + # We could parallelize but then we are inside a nested parallel loop where the + # external parallelism is on the CV. potential_thresholds = np.unique(y_score) score_thresholds = [ self._sign diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 19d99a300706b..e8f0eb6b5f650 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -575,13 +575,21 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): self._scorer = make_scorer(score_func, **params_scorer) else: scoring = check_scoring(classifier, scoring=self.objective_metric) + # add `pos_label` if requested by the scorer function + scorer_kwargs = {**scoring._kwargs} + signature_scoring_func = signature(scoring._score_func) + if ( + "pos_label" in signature_scoring_func.parameters + and "pos_label" not in scorer_kwargs + ): + scorer_kwargs["pos_label"] = self.pos_label # transform a binary metric into a curve metric for all possible decision # thresholds self._scorer = _ContinuousScorer( score_func=scoring._score_func, sign=scoring._sign, response_method=self._response_method, - kwargs=scoring._kwargs, + kwargs=scorer_kwargs, ) thresholds, scores = zip( diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 1ffd45bc4b3f3..96acb15fd8e68 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -713,16 +713,10 @@ def test_cutoffclassifier_error_constant_learner(): ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], ) @pytest.mark.parametrize("pos_label", [0, 1]) -def test_cutoffclassifier_pos_label_precision_recall( - objective_metric, pos_label, global_random_seed -): +def test_cutoffclassifier_pos_label_precision_recall(objective_metric, pos_label): """Check that `pos_label` is dispatched correctly by checking the precision and recall score 
found during the optimization and the one found at `predict` time.""" - X, y = make_classification( - n_samples=5_000, - weights=[0.6, 0.4], - random_state=global_random_seed, - ) + X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) # prefit the estimator to avoid variability due to the cross-validation estimator = LogisticRegression().fit(X, y) @@ -786,3 +780,41 @@ def tnr_tpr_score(y_true, y_pred, pos_label=pos_label): else: assert tnr == pytest.approx(model.objective_score_[0], abs=0.05) assert tpr == pytest.approx(model.objective_score_[1], abs=0.05) + + +@pytest.mark.parametrize( + "metric_type", + ["string", "scorer_without_pos_label", "scorer_with_pos_label"], +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_cutoffclassifier_pos_label_single_metric(pos_label, metric_type): + """Check that `pos_label` is dispatched correctly when getting a scorer linked to + a known metric. By default, the scorer in scikit-learn only have a default value + for `pos_label` which is 1. + """ + X, y = make_classification(n_samples=100, weights=[0.6, 0.4], random_state=42) + + # prefit the estimator to avoid variability due to the cross-validation + estimator = LogisticRegression().fit(X, y) + + if metric_type == "string": + objective_metric = "precision" + elif metric_type == "scorer_without_pos_label": + objective_metric = make_scorer(precision_score) + else: # metric_type == "scorer_with_pos_label" + objective_metric = make_scorer(precision_score, pos_label=pos_label) + + model = CutOffClassifier( + estimator, + objective_metric=objective_metric, + cv="prefit", + pos_label=pos_label, + n_thresholds=500, + ).fit(X, y) + + precision = precision_score(y, model.predict(X), pos_label=pos_label) + assert precision == pytest.approx(model.objective_score_, abs=1e-3) + + +# TODO: check side effect when `n_samples > n_thresholds` where optimizing a score +# could lead to making constant predictions of `~pos_label`. From d539235235aa194c31317e7990e392aa010af88c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 27 Apr 2023 10:08:09 +0200 Subject: [PATCH 047/194] avoid extrapolation --- sklearn/metrics/_scorer.py | 3 ++ sklearn/model_selection/_prediction.py | 28 ++++++++----------- .../model_selection/tests/test_prediction.py | 4 --- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 67fadaa8c36b3..80aab72ce7276 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -418,6 +418,9 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): # TODO: this is pretty slow if we have a lot of `potential_thresholds` # We could parallelize but then we are inside a nested parallel loop where the # external parallelism is on the CV. + # Another guess would be to interpolate the potential thresholds at this moment. + # Easy for the probability case but not for the decision function case since it + # is not bounded. 
potential_thresholds = np.unique(y_score) score_thresholds = [ self._sign diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index e8f0eb6b5f650..5547c4b95f165 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -582,7 +582,15 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): "pos_label" in signature_scoring_func.parameters and "pos_label" not in scorer_kwargs ): - scorer_kwargs["pos_label"] = self.pos_label + if self.pos_label is None: + # Since the provided `pos_label` is the default, we need to + # use the default value of the scoring function that can be either + # `None` or `1`. + scorer_kwargs["pos_label"] = signature_scoring_func.parameters[ + "pos_label" + ].default + else: + scorer_kwargs["pos_label"] = self.pos_label # transform a binary metric into a curve metric for all possible decision # thresholds self._scorer = _ContinuousScorer( @@ -615,21 +623,9 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): "impossible to optimize the decision threshold." ) - if hasattr(classifier, "predict_proba") and ( - self._response_method == "predict_proba" - or ( - isinstance(self._response_method, list) - and self._response_method[0] == "predict_proba" - ) - ): - # `predict_proba` was used to compute scores - min_threshold = 0.0 - max_threshold = 1.0 - else: - # `decision_function` was used to compute scores - min_threshold = np.min([th.min() for th in thresholds]) - max_threshold = np.max([th.max() for th in thresholds]) - + # find the global min and max thresholds across all folds + min_threshold = np.min([th.min() for th in thresholds]) + max_threshold = np.max([th.max() for th in thresholds]) thresholds_interpolated = np.linspace( min_threshold, max_threshold, num=self.n_thresholds ) diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 96acb15fd8e68..707f9898affae 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -814,7 +814,3 @@ def test_cutoffclassifier_pos_label_single_metric(pos_label, metric_type): precision = precision_score(y, model.predict(X), pos_label=pos_label) assert precision == pytest.approx(model.objective_score_, abs=1e-3) - - -# TODO: check side effect when `n_samples > n_thresholds` where optimizing a score -# could lead to making constant predictions of `~pos_label`. From c0acd442387e33b605a4f3723c2b8b656bd1d04e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 27 Apr 2023 16:38:15 +0200 Subject: [PATCH 048/194] FEA add all thresholds and score computed as attributes --- sklearn/model_selection/_prediction.py | 45 ++++++++++++------- .../model_selection/tests/test_prediction.py | 17 +++++++ 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 5547c4b95f165..0b24fe338d1e2 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -272,6 +272,12 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): two float values: the first one is the score of the metric which is constrained and the second one is the score of the maximized metric. + decision_thresholds_ : ndarray of shape (n_thresholds,) + All decision thresholds that were evaluated. 
+ + objective_scores_ : ndarray of shape (n_thresholds,) + The scores of the objective metric associated with the decision thresholds. + classes_ : ndarray of shape (n_classes,) The class labels. @@ -600,7 +606,7 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): kwargs=scorer_kwargs, ) - thresholds, scores = zip( + cv_thresholds, cv_scores = zip( *Parallel(n_jobs=self.n_jobs)( delayed(_fit_and_score)( classifier, @@ -617,41 +623,49 @@ def cost_sensitive_score_func(y_true, y_pred, **kwargs): ) ) - if any(len(th) == 1 for th in thresholds): + if any(len(th) == 1 for th in cv_thresholds): raise ValueError( "The provided estimator makes constant predictions. Therefore, it is " "impossible to optimize the decision threshold." ) # find the global min and max thresholds across all folds - min_threshold = np.min([th.min() for th in thresholds]) - max_threshold = np.max([th.max() for th in thresholds]) - thresholds_interpolated = np.linspace( + min_threshold = np.min([th.min() for th in cv_thresholds]) + max_threshold = np.max([th.max() for th in cv_thresholds]) + self.decision_thresholds_ = np.linspace( min_threshold, max_threshold, num=self.n_thresholds ) - def _mean_interpolated_score(thresholds, scores): + def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): return np.mean( [ - np.interp(thresholds_interpolated, th, sc) - for th, sc in zip(thresholds, scores) + np.interp(threshold_interpolated, th, sc) + for th, sc in zip(cv_thresholds, cv_scores) ], axis=0, ) if constraint_value == "highest": # find best score - mean_score = _mean_interpolated_score(thresholds, scores) - best_idx = mean_score.argmax() - self.objective_score_ = mean_score[best_idx] - self.decision_threshold_ = thresholds_interpolated[best_idx] + self.objective_scores_ = _mean_interpolated_score( + self.decision_thresholds_, cv_thresholds, cv_scores + ) + best_idx = self.objective_scores_.argmax() + self.objective_score_ = self.objective_scores_[best_idx] + self.decision_threshold_ = self.decision_thresholds_[best_idx] else: if "tpr" in self.objective_metric: # tpr/tnr mean_tnr, mean_tpr = [ - _mean_interpolated_score(thresholds, sc) for sc in zip(*scores) + _mean_interpolated_score( + self.decision_thresholds_, cv_thresholds, sc + ) + for sc in zip(*cv_scores) ] else: # precision/recall mean_precision, mean_recall = [ - _mean_interpolated_score(thresholds, sc) for sc in zip(*scores) + _mean_interpolated_score( + self.decision_thresholds_, cv_thresholds, sc + ) + for sc in zip(*cv_scores) ] def _get_best_idx(constrained_score, maximized_score): @@ -670,12 +684,13 @@ def _get_best_idx(constrained_score, maximized_score): else: # max_recall_at_precision_constraint constrained_score, maximized_score = mean_precision, mean_recall + self.objective_scores_ = (constrained_score, maximized_score) best_idx = _get_best_idx(constrained_score, maximized_score) self.objective_score_ = ( constrained_score[best_idx], maximized_score[best_idx], ) - self.decision_threshold_ = thresholds_interpolated[best_idx] + self.decision_threshold_ = self.decision_thresholds_[best_idx] if hasattr(self.estimator_, "n_features_in_"): self.n_features_in_ = self.estimator_.n_features_in_ diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index 707f9898affae..d984dc85f82b6 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -379,14 +379,18 @@ def 
test_cutoffclassifier_with_constraint_value(response_method): y = np.hstack([y[indices_neg], y[indices_pos]]) lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + n_thresholds = 100 model = CutOffClassifier( estimator=lr, objective_metric="balanced_accuracy", response_method=response_method, + n_thresholds=n_thresholds, ) score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) score_baseline = balanced_accuracy_score(y, lr.predict(X)) assert score_optimized > score_baseline + assert model.decision_thresholds_.shape == (n_thresholds,) + assert model.objective_scores_.shape == (n_thresholds,) @pytest.mark.parametrize( @@ -567,13 +571,17 @@ def test_cutoffclassifier_response_method_scorer_with_constraint_metric( X, y = make_classification(n_samples=100, random_state=global_random_seed) classifier = LogisticRegression() + n_thresholds = 100 model = CutOffClassifier( classifier, objective_metric=objective_metric, constraint_value=constraint_value, response_method=response_method, + n_thresholds=n_thresholds, ) model.fit(X, y) + assert model.decision_thresholds_.shape == (n_thresholds,) + assert all(score.shape == (n_thresholds,) for score in model.objective_scores_) if response_method in ("auto", "predict_proba"): # "auto" will fall back in priority on `predict_proba` if `estimator` @@ -615,6 +623,9 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): ) model.fit(X, y) + assert model.decision_thresholds_.shape == (n_thresholds,) + assert model.objective_scores_.shape == (n_thresholds,) + assert model.decision_threshold_ > 0.99 assert np.mean(model.predict(X) == 0) > 0.9 @@ -625,6 +636,9 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): ) model.fit(X, y) + assert model.decision_thresholds_.shape == (n_thresholds,) + assert model.objective_scores_.shape == (n_thresholds,) + assert model.decision_threshold_ < 0.01 assert np.mean(model.predict(X) == 1) > 0.9 @@ -639,6 +653,9 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): ) model.fit(X, y) + assert model.decision_thresholds_.shape == (n_thresholds,) + assert model.objective_scores_.shape == (n_thresholds,) + assert model.decision_threshold_ < 0.01 assert np.mean(model.predict(X) == 0) > 0.9 From f87baa7634fdee4b78dd90c8263eafa3b75fd7fa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 28 Apr 2023 11:47:18 +0200 Subject: [PATCH 049/194] fix docstring --- sklearn/model_selection/_prediction.py | 8 ++++---- sklearn/model_selection/tests/test_prediction.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 0b24fe338d1e2..91bdae48d886a 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -330,11 +330,11 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) precision recall f1-score support - 0 0.95 0.97 0.96 224 - 1 0.71 0.58 0.64 26 + 0 0.96 0.96 0.96 224 + 1 0.68 0.65 0.67 26 accuracy 0.93 250 - macro avg 0.83 0.78 0.80 250 + macro avg 0.82 0.81 0.81 250 weighted avg 0.93 0.93 0.93 250 """ @@ -451,7 +451,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): if self.refit is False and cv.get_n_splits() > 1: raise ValueError("When cv has several folds, refit cannot be False.") if self.refit == "auto": - refit = True if cv.get_n_splits() == 1 else False + refit = cv.get_n_splits() > 1 else: 
refit = self.refit diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index d984dc85f82b6..afbfa62401634 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -414,7 +414,7 @@ def test_cutoffclassifier_limit_metric_tradeoff(metrics): y_pred_1 = model.fit(X, y).predict(X) model.set_params(objective_metric=metrics[1]) y_pred_2 = (~model.fit(X, y).predict(X).astype(bool)).astype(int) - assert np.mean(y_pred_1 == y_pred_2) == pytest.approx(1.0) + assert np.mean(y_pred_1 == y_pred_2) > 0.98 def test_cutoffclassifier_metric_with_parameter(): From e4dac09c2c2274818de4b92ea595542cf3d7acb7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 28 Apr 2023 11:47:40 +0200 Subject: [PATCH 050/194] EXA add example of cut-off tuning --- .../model_selection/plot_cutoff_tuning.py | 657 ++++++++++++++++++ 1 file changed, 657 insertions(+) create mode 100644 examples/model_selection/plot_cutoff_tuning.py diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py new file mode 100644 index 0000000000000..1afa70d7e652c --- /dev/null +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -0,0 +1,657 @@ +""" +================================================== +Post-tuning the cut-off point of decision function +================================================== + +Once a classifier is trained, the output of the :term:`predict` method output hard +predictions corresponding to a thresholding of either the :term:`decision function` +or the :term:`predict_proba` output. For binary classifier, the default threshold is +defined as a probability score of 0.5 or a decision function value of 0.0. + +However, it happens that this default strategy is not optimized for the task at hand. +Here, we use the "Statlog" German credit dataset [1]_ to illustrate such an application. +In this dataset, the task is to predict whether a person is "good" or "bad" credit. +In addition, cost-matrix is provided where it is defined that classifying a "bad" +credit as "good" is 5 times more costly than classifying a "good" credit as "bad". + +We use the :class:`~sklearn.model_selection.CutOffClassifier` to tune the +cut-off point of the decision function to minimize the business cost provided to us. + +.. topic:: References + .. [1] "Statlog (German Credit Data) Data Set", UCI Machine Learning Repository, + https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29 +""" + +# %% +# "Statlog" German credit dataset +# ------------------------------- +# +# We fetch the German credit dataset from OpenML. +import sklearn +from sklearn.datasets import fetch_openml + +sklearn.set_config(transform_output="pandas") +german_credit = fetch_openml(data_id=31, as_frame=True, parser="pandas") +X, y = german_credit.data, german_credit.target + +# %% +# We check the feature types available in `X`. +X.info() + +# %% +X.head() + +# %% +# Many features are categorical and usually string-encoded. We need to encode +# these categories when we develop ou predictive model. Let's check the targets. +y.value_counts() + +# %% +# A first observation is that the dataset is imbalanced. We would need to be careful +# when evaluating our predictive model and use a family of metrics that are adapted +# to this setting. +# +# In addition, we observe that the target is string-encoded. Some metrics +# (e.g. 
precision and recall) require to provide the label of interest also called +# the "positive label". Here, we define that our goal is to predict whether or not +# a sample is a "bad" credit. +pos_label = "bad" + +# %% +# To carry our analysis, we split our dataset using a single stratified split. +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) + +# %% +# We are ready to design our predictive model and the associated evaluation strategy. +# +# Evaluation metrics +# ------------------ +# +# In this section, we define a set of metrics that we use later. To see +# the effect of tuning the cut-off point, we evaluate the predictive model using +# the Receiver Operating Characteristic (ROC) curve and the Precision-Recall curve. +# The values reported on these plots are therefore the true positive rate (TPR) and +# the false positive rate (FPR) for the ROC curve and the precision and recall for the +# Precision-Recall curve. +# +# From these four metrics, scikit-learn does not provide a scorer for the FPR. We +# therefore need to define a small custom function to compute it. +import numpy as np +from sklearn.metrics import confusion_matrix, make_scorer, precision_score, recall_score + + +def fpr_score(y, y_pred, **kwargs): + cm = confusion_matrix(y, y_pred) + classes = np.unique(y) + pos_label = kwargs.get("pos_label", classes[-1]) + pos_label_idx = np.searchsorted(classes, pos_label) + if pos_label_idx == 0: + cm = cm[::-1, ::-1] + tn, fp, _, _ = cm.ravel() + tnr = tn / (tn + fp) + return 1 - tnr + + +# %% +# As previously stated, the "positive label" is not defined as the value "1" and calling +# some of the metrics with this non-standard value raise an error. We need to +# provide the indication of the "positive label" to the metrics. +# +# We therefore need to define a scikit-learn scorer using +# :func:`~sklearn.metrics.make_scorer` where the information is passed. We store all +# created scorer in a dictionary. To use them, we need to pass the fitted model and +# the data and target on which we want to evaluate the predictive model. +tpr_score = recall_score # TPR and recall are the same metric +scoring = { + "precision": make_scorer(precision_score, pos_label=pos_label), + "recall": make_scorer(recall_score, pos_label=pos_label), + "fpr": make_scorer(fpr_score, pos_label=pos_label), + "tpr": make_scorer(tpr_score, pos_label=pos_label), +} + +# %% +# In addition, the original research [1]_ defines a business metric. They provide a +# cost-matrix where they define that predicting a "bad" credit as "good" is 5 times more +# costly than the opposite. We define a dictionary containing this information and a +# score function that computes the cost. 
+cost_gain_matrix = {"tp": 0, "tn": 0, "fp": -1, "fn": -5} + + +def gain_cost_score(y, y_pred, **kwargs): + cm = confusion_matrix(y, y_pred) + classes = np.unique(y) + pos_label = kwargs.get("pos_label", classes[-1]) + pos_label_idx = np.searchsorted(classes, pos_label) + if pos_label_idx == 0: + cm = cm[::-1, ::-1] + costs_and_gain = np.array( + [ + [kwargs["cost_gain_matrix"]["tn"], kwargs["cost_gain_matrix"]["fp"]], + [kwargs["cost_gain_matrix"]["fn"], kwargs["cost_gain_matrix"]["tp"]], + ] + ) + return (costs_and_gain * cm).sum() + + +scoring["cost_gain"] = make_scorer( + gain_cost_score, pos_label=pos_label, cost_gain_matrix=cost_gain_matrix +) +# %% +# Vanilla predictive model +# ------------------------ +# +# Design of the predictive model +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In this section, we design our predictive model. We use a gradient boosting classifier +# using :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. However, we need to +# encode the categorical features with numerical values and we therefore use an +# :class:`~sklearn.preprocessing.OrdinalEncoder` to do so. The numerical features are +# kept as-is. To recover the name of the categorical columns, we use the helper function +# :func:`~sklearn.compose.make_column_selector` and the fact that the categorical +# features are stored as `category` dtype. +from sklearn.compose import ColumnTransformer, make_column_selector as selector +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OrdinalEncoder + +categorical_columns = selector(dtype_include="category")(X) +numerical_columns = selector(dtype_exclude="category")(X) + +preprocessor = ColumnTransformer( + [ + ("categorical", OrdinalEncoder(), categorical_columns), + ("numerical", "passthrough", numerical_columns), + ], + verbose_feature_names_out=False, +) +model = Pipeline( + [ + ("preprocessor", preprocessor), + ( + "classifier", + HistGradientBoostingClassifier( + categorical_features=categorical_columns, random_state=0 + ), + ), + ] +) + +model.fit(X_train, y_train) + +# %% +# Evaluation of the predictive model +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We evaluate the performance of our predictive model using the ROC and Precision-Recall +# curves. +import matplotlib.pyplot as plt +from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay + +fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 6)) + +PrecisionRecallDisplay.from_estimator( + model, X_test, y_test, pos_label=pos_label, ax=axs[0], name="GBDT" +) +axs[0].plot( + scoring["recall"](model, X_test, y_test), + scoring["precision"](model, X_test, y_test), + marker="o", + markersize=10, + color="tab:blue", + label="Default cut-off point at a probability of 0.5", +) +axs[0].set_title("Precision-Recall curve") +axs[0].legend() + +RocCurveDisplay.from_estimator( + model, + X_test, + y_test, + pos_label=pos_label, + ax=axs[1], + name="GBDT", + plot_chance_level=True, +) +axs[1].plot( + scoring["fpr"](model, X_test, y_test), + scoring["tpr"](model, X_test, y_test), + marker="o", + markersize=10, + color="tab:blue", + label="Default cut-off point at a probability of 0.5", +) +axs[1].set_title("ROC curve") +axs[1].legend() +_ = fig.suptitle("Evaluation of the vanilla GBDT model") + +# %% +# We recall that these curves corresponds show the statistical performance of the +# predictive model for different cut-off points. 
For the Precision-Recall curve, the +# reported metrics are the precision and recall and for the ROC curve, the reported +# metrics are the TPR (same as recall) and FPR. +# +# Here, the different cut-off points corresponds at different level of probability +# scores ranging between 0 and 1. By default, `model.predict` uses a cut-off point at +# a probability of 0.5. Thus, the metrics for this cut-off point is reported with the +# blue dot on the curves: it corresponds to the statistical performance of the model +# when using `model.predict`. +# +# However, we recall that the original aim was to minimize the cost (or maximize the +# gain) by the business metric. We can compute the value of the business metric: +scoring["cost_gain"](model, X_test, y_test) + +# %% +# At this stage, we don't know if some other cut-off points could lead to a greater +# gain. To be able to decide, if another cut-off point is better, we would need to +# compute the cost-gain using the business metric for all possible cut-off points and +# choose the optimal one. This strategy is quite tedious to implement and the +# :class:`~sklearn.metrics.CutOffClassifier` class is here to help us. It automatically +# compute the cost-gain for all possible cut-off points and choose the optimal one. +# +# Tuning the cut-off point +# ------------------------ +# +# We use :class:`~sklearn.metrics.CutOffClassifier` to tune the cut-off point. We need +# to provide the business metric to optimize as well as the positive label. Internally, +# the optimum cut-off point is chosen such that it maximized the business metric +# via cross-validation. By default a 5-fold stratified cross-validation is used. +from sklearn.model_selection import CutOffClassifier + +model_tuned = CutOffClassifier( + estimator=model, + pos_label=pos_label, + objective_metric=cost_gain_matrix, +) +model_tuned.fit(X_train, y_train) + +# %% +# We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. +# Also we plot the cut-off points that would be used by each model. +from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay + +fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) + +linestyles = ("dashed", "dotted") +markerstyles = ("o", ">") +colors = ("tab:blue", "tab:orange") +names = ("Vanilla GBDT", "Tuned GBDT") +for idx, (est, linestyle, marker, color, name) in enumerate( + zip((model, model_tuned), linestyles, markerstyles, colors, names) +): + decision_threshold = getattr(est, "decision_threshold_", 0.5) + PrecisionRecallDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[0], + name=name, + ) +# %% +# We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. +# Also we plot the cut-off points that would be used by each model. 
+fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) + +linestyles = ("dashed", "dotted") +markerstyles = ("o", ">") +colors = ("tab:blue", "tab:orange") +names = ("Vanilla GBDT", "Tuned GBDT") +for idx, (est, linestyle, marker, color, name) in enumerate( + zip((model, model_tuned), linestyles, markerstyles, colors, names) +): + decision_threshold = getattr(est, "decision_threshold_", 0.5) + PrecisionRecallDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[0], + name=name, + ) + axs[0].plot( + scoring["recall"](est, X_test, y_test), + scoring["precision"](est, X_test, y_test), + marker, + markersize=10, + color=color, + label=f"Cut-off point at probability of {decision_threshold:.2f}", + ) + RocCurveDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[1], + name=name, + plot_chance_level=idx == 1, + ) + axs[1].plot( + scoring["fpr"](est, X_test, y_test), + scoring["tpr"](est, X_test, y_test), + marker, + markersize=10, + color=color, + label=f"Cut-off point at probability of {decision_threshold:.2f}", + ) + +axs[0].set_title("Precision-Recall curve") +axs[0].legend() +axs[1].set_title("ROC curve") +axs[1].legend() + +axs[2].plot( + model_tuned.decision_thresholds_, model_tuned.objective_scores_, color="tab:orange" +) +axs[2].plot( + model_tuned.decision_threshold_, + model_tuned.objective_score_, + "o", + markersize=10, + color="tab:orange", + label="Optimal cut-off point for the business metric", +) +axs[2].legend() +axs[2].set_xlabel("Decision threshold (probability)") +axs[2].set_ylabel("Objective score (using cost-matrix)") +axs[2].set_title("Objective score as a function of the decision threshold") + +_ = fig.suptitle("Comparison of the cut-off point for the vanilla and tuned GBDT model") + +# %% +# The first remark is that both classifiers have exactly the same ROC and +# Precision-Recall curves. It is expected because by default, the classifier is fitted +# on the same training data. In a later section, we discuss more in details the +# available options regarding model refitting and cross-validation. +# +# The second remark is that the cut-off points of the vanilla and tuned model are +# different. To understand why the tuned model has chosen this cut-off point, we can +# look at the right-hand side plot that plots the objective score that is our exactly +# the same as our business metric. We see that the optimum threshold corresponds to the +# maximum of the objective score. +# +# We can now check if choosing this cut-off point leads to a better score on the testing +# set: +scoring["cost_gain"](model_tuned, X_test, y_test) + +# %% +# We observe that the decision generalized on the testing set leading to a better +# business score. +# +# Consideration regarding model refitting and cross-validation +# ------------------------------------------------------------ +# +# In the above experiment, we use the default parameter of the +# :class:`~sklearn.model_selection.CutOffClassifier`. By default, the cut-off point is +# tuned using a 5-fold stratified cross-validation. Also, the underlying predictive +# model is refitted on the entire training data once the cut-off point is chosen. +# +# These two strategies can be changed by providing the `refit` and `cv` parameters. +# For instance, one could provide a fitted `estimator` and set `cv="prefit"`, in which +# case the cut-off point is found on the entire dataset provided at fitting time. 
+# Also, the underlying classifier is not be refitted. Here, we can try to do such +# experiment. +model.fit(X_train, y_train) +model_tuned.set_params(cv="prefit").fit(X_train, y_train) + + +# %% +# Then, we evaluate our model with the same approach as before: +fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) + +linestyles = ("dashed", "dotted") +markerstyles = ("o", ">") +colors = ("tab:blue", "tab:orange") +names = ("Vanilla GBDT", "Tuned GBDT") +for idx, (est, linestyle, marker, color, name) in enumerate( + zip((model, model_tuned), linestyles, markerstyles, colors, names) +): + decision_threshold = getattr(est, "decision_threshold_", 0.5) + PrecisionRecallDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[0], + name=name, + ) + axs[0].plot( + scoring["recall"](est, X_test, y_test), + scoring["precision"](est, X_test, y_test), + marker, + markersize=10, + color=color, + label=f"Cut-off point at probability of {decision_threshold:.2f}", + ) + RocCurveDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[1], + name=name, + plot_chance_level=idx == 1, + ) + axs[1].plot( + scoring["fpr"](est, X_test, y_test), + scoring["tpr"](est, X_test, y_test), + marker, + markersize=10, + color=color, + label=f"Cut-off point at probability of {decision_threshold:.2f}", + ) + +axs[0].set_title("Precision-Recall curve") +axs[0].legend() +axs[1].set_title("ROC curve") +axs[1].legend() + +axs[2].plot( + model_tuned.decision_thresholds_, model_tuned.objective_scores_, color="tab:orange" +) +axs[2].plot( + model_tuned.decision_threshold_, + model_tuned.objective_score_, + "o", + markersize=10, + color="tab:orange", + label="Optimal cut-off point for the business metric", +) +axs[2].legend() +axs[2].set_xlabel("Decision threshold (probability)") +axs[2].set_ylabel("Objective score (using cost-matrix)") +axs[2].set_title("Objective score as a function of the decision threshold") + +_ = fig.suptitle("Tuned GBDT model without refitting and using the entire dataset") + +# %% +# We observe the that the optimum cut-off point is different than in the previous +# experiment. If we look at the right-hand side plot, we observe that the objective +# score has large plateau with a minimum cost (around 0). This behavior is symptomatic +# of an overfitting. Because we disable cross-validation, we tuned the cut-off point on +# the same set as the model was trained on, and this is the reason for the observed +# overfitting. +# +# This option should therefore be used with caution. One needs to make sure that the +# data providing at fitting time to the +# :class:`~sklearn.model_selection.CutOffClassifier` is not the same as the data used to +# train the underlying classifier. This could happen sometimes when the idea is just +# to tune the predictive model on a completely new validation set without a costly +# complete refit. +# +# In the case that cross-validation is too costly, a potential alternative is to use +# a single train-test split by providing a floating number in range `[0, 1]` to the +# `cv` parameter. It splits the data into a training and testing set. 
Let's +# explore this option: +model_tuned.set_params(cv=0.75).fit(X_train, y_train) + +# %% +fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) + +linestyles = ("dashed", "dotted") +markerstyles = ("o", ">") +colors = ("tab:blue", "tab:orange") +names = ("Vanilla GBDT", "Tuned GBDT") +for idx, (est, linestyle, marker, color, name) in enumerate( + zip((model, model_tuned), linestyles, markerstyles, colors, names) +): + decision_threshold = getattr(est, "decision_threshold_", 0.5) + PrecisionRecallDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[0], + name=name, + ) + axs[0].plot( + scoring["recall"](est, X_test, y_test), + scoring["precision"](est, X_test, y_test), + marker, + markersize=10, + color=color, + label=f"Cut-off point at probability of {decision_threshold:.2f}", + ) + RocCurveDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[1], + name=name, + plot_chance_level=idx == 1, + ) + axs[1].plot( + scoring["fpr"](est, X_test, y_test), + scoring["tpr"](est, X_test, y_test), + marker, + markersize=10, + color=color, + label=f"Cut-off point at probability of {decision_threshold:.2f}", + ) + +axs[0].set_title("Precision-Recall curve") +axs[0].legend() +axs[1].set_title("ROC curve") +axs[1].legend() + +axs[2].plot( + model_tuned.decision_thresholds_, model_tuned.objective_scores_, color="tab:orange" +) +axs[2].plot( + model_tuned.decision_threshold_, + model_tuned.objective_score_, + "o", + markersize=10, + color="tab:orange", + label="Optimal cut-off point for the business metric", +) +axs[2].legend() +axs[2].set_xlabel("Decision threshold (probability)") +axs[2].set_ylabel("Objective score (using cost-matrix)") +axs[2].set_title("Objective score as a function of the decision threshold") + +_ = fig.suptitle("Tuned GBDT model without refitting and using the entire dataset") + +# %% +# Regarding the cut-off point, we observe that the optimum is similar to the multiple +# repeated cross-validation case. However, be aware that a single split does not account +# for the variability of the fit/predict process and thus we are enable to know if there +# is any variance in the cut-off point. The repeated cross-validation averages this +# effect. +# +# The second observation is about the ROC and Precision-Recall curves the tuned model. +# We observe that the curve is now different from the vanilla model. It is not +# surprising because we train the underlying classifier on a subset of the data provided +# at `fit` while keeping a validation set to tune the cut-off point. 
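To summarize what the :class:`~sklearn.model_selection.CutOffClassifier` automates throughout this example, the sketch below spells out the equivalent manual search. It is illustrative only, not the class implementation: it reuses `model`, `pos_label`, `cost_gain_matrix` and `gain_cost_score` defined earlier, the threshold grid and variable names are arbitrary, and out-of-fold probabilities are used so that the cut-off point is not tuned on predictions the classifier has already memorized.

import numpy as np
from sklearn.model_selection import cross_val_predict

# Out-of-fold probability estimates on the training data.
proba = cross_val_predict(model, X_train, y_train, cv=5, method="predict_proba")
# Column corresponding to the "bad" class (columns follow the sorted labels).
classes = list(np.unique(y_train))
proba_bad = proba[:, classes.index(pos_label)]

candidate_thresholds = np.linspace(0.01, 0.99, 99)
business_scores = [
    gain_cost_score(
        y_train,
        np.where(proba_bad >= threshold, pos_label, "good"),
        pos_label=pos_label,
        cost_gain_matrix=cost_gain_matrix,
    )
    for threshold in candidate_thresholds
]
best_threshold = candidate_thresholds[np.argmax(business_scores)]
print(f"Best cut-off point found by the manual scan: {best_threshold:.2f}")

Scanning a fixed grid like this is coarser than what the tuned model exposes through its `decision_thresholds_` and `objective_scores_` attributes, but the principle is the same: score every candidate cut-off point with the business metric and keep the one that maximizes it.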
From bd8659522a4538818c63c0c0055ff90d4a35a454 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 28 Apr 2023 12:21:05 +0200 Subject: [PATCH 051/194] solving the issue of unknown categories --- .../model_selection/plot_cutoff_tuning.py | 106 +----------------- setup.cfg | 2 +- 2 files changed, 6 insertions(+), 102 deletions(-) diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py index 1afa70d7e652c..eb74a810c10a3 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -162,7 +162,11 @@ def gain_cost_score(y, y_pred, **kwargs): preprocessor = ColumnTransformer( [ - ("categorical", OrdinalEncoder(), categorical_columns), + ( + "categorical", + OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), + categorical_columns, + ), ("numerical", "passthrough", numerical_columns), ], verbose_feature_names_out=False, @@ -267,106 +271,6 @@ def gain_cost_score(y, y_pred, **kwargs): ) model_tuned.fit(X_train, y_train) -# %% -# We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. -# Also we plot the cut-off points that would be used by each model. -from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay - -fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) - -linestyles = ("dashed", "dotted") -markerstyles = ("o", ">") -colors = ("tab:blue", "tab:orange") -names = ("Vanilla GBDT", "Tuned GBDT") -for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, model_tuned), linestyles, markerstyles, colors, names) -): - decision_threshold = getattr(est, "decision_threshold_", 0.5) - PrecisionRecallDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[0], - name=name, - ) -# %% -# We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. -# Also we plot the cut-off points that would be used by each model. -from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay - -fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) - -linestyles = ("dashed", "dotted") -markerstyles = ("o", ">") -colors = ("tab:blue", "tab:orange") -names = ("Vanilla GBDT", "Tuned GBDT") -for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, model_tuned), linestyles, markerstyles, colors, names) -): - decision_threshold = getattr(est, "decision_threshold_", 0.5) - PrecisionRecallDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[0], - name=name, - ) -# %% -# We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. -# Also we plot the cut-off points that would be used by each model. -from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay - -fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) - -linestyles = ("dashed", "dotted") -markerstyles = ("o", ">") -colors = ("tab:blue", "tab:orange") -names = ("Vanilla GBDT", "Tuned GBDT") -for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, model_tuned), linestyles, markerstyles, colors, names) -): - decision_threshold = getattr(est, "decision_threshold_", 0.5) - PrecisionRecallDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[0], - name=name, - ) -# %% -# We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. 
-# Also we plot the cut-off points that would be used by each model. -from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay - -fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) - -linestyles = ("dashed", "dotted") -markerstyles = ("o", ">") -colors = ("tab:blue", "tab:orange") -names = ("Vanilla GBDT", "Tuned GBDT") -for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, model_tuned), linestyles, markerstyles, colors, names) -): - decision_threshold = getattr(est, "decision_threshold_", 0.5) - PrecisionRecallDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[0], - name=name, - ) # %% # We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. # Also we plot the cut-off points that would be used by each model. diff --git a/setup.cfg b/setup.cfg index ba69be9fa9978..19f2bebeb7280 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,7 +14,7 @@ doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS testpaths = sklearn addopts = --doctest-modules - # --disable-pytest-warnings + --disable-pytest-warnings --color=yes # Activate the plugin explicitly to ensure that the seed is reported # correctly on the CI when running `pytest --pyargs sklearn` from the From 45e6e5abfd87a1a96bc56d90431a57576aa2138e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 28 Apr 2023 13:44:01 +0200 Subject: [PATCH 052/194] fix --- examples/model_selection/plot_cutoff_tuning.py | 5 +++-- sklearn/model_selection/_prediction.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py index eb74a810c10a3..f49613e2f8f88 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -18,8 +18,9 @@ cut-off point of the decision function to minimize the business cost provided to us. .. topic:: References - .. [1] "Statlog (German Credit Data) Data Set", UCI Machine Learning Repository, - https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29 + + .. [1] "Statlog (German Credit Data) Data Set", UCI Machine Learning Repository, + https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29 """ # %% diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 91bdae48d886a..21f62f8c4acd3 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -195,7 +195,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): want to find the decision threshold when `objective_metric` is equal one of `"max_tnr_at_tpr_constraint"`, `"max_tpr_at_tnr_constraint"`, `"max_precision_at_recall_constraint"`, or - `"max_recall_at_precision_constraint". + `"max_recall_at_precision_constraint"`. pos_label : int, float, bool or str, default=None The label of the positive class. 
Used when `objective_metric` is From 402a1a72bc3798197a13270e008ffae08c7fd776 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 29 Apr 2023 11:15:04 +0200 Subject: [PATCH 053/194] EXA add hyperlink in the example --- examples/model_selection/plot_cutoff_tuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py index f49613e2f8f88..0aefee2083327 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -20,7 +20,7 @@ .. topic:: References .. [1] "Statlog (German Credit Data) Data Set", UCI Machine Learning Repository, - https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29 + `Link `_. # noqa """ # %% From 6745afcc0ab477b965cf30b4a25e69f74364f783 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 29 Apr 2023 11:20:24 +0200 Subject: [PATCH 054/194] DOC add warning regarding overfitting --- .../model_selection/plot_cutoff_tuning.py | 2 ++ sklearn/model_selection/_prediction.py | 21 ++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py index 0aefee2083327..f95cb2f2345f0 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -366,6 +366,8 @@ def gain_cost_score(y, y_pred, **kwargs): # We observe that the decision generalized on the testing set leading to a better # business score. # +# .. _cutoffclassifier_no_cv: +# # Consideration regarding model refitting and cross-validation # ------------------------------------------------------------ # diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 21f62f8c4acd3..4572c38afcd13 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -223,19 +223,26 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): Determines the cross-validation splitting strategy to train classifier. Possible inputs for cv are: - * None, to use the default 5-fold stratified K-fold cross validation; - * An integer number, to specify the number of folds in a stratified - k-fold; - * A float number, to specify a single shuffle split. The floating - number should be in (0, 1) and represent the size of the validation - set; + * `None`, to use the default 5-fold stratified K-fold cross validation; + * An integer number, to specify the number of folds in a stratified k-fold; + * A float number, to specify a single shuffle split. The floating number should + be in (0, 1) and represent the size of the validation set; * An object to be used as a cross-validation generator; * An iterable yielding train, test splits; - * "prefit", to bypass the cross-validation. + * `"prefit"`, to bypass the cross-validation. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. + .. warning:: + Using `cv="prefit"` and passing the same dataset for fitting `estimator` + and tuning the cut-off point is subject to undesired overfitting. You can + refer to :ref:`cutoffclassifier_no_cv` for an example. + + This option should only be used when the set used to fit `estimator` is + different from the one used to tune the cut-off point (by calling + :meth:`CutOffClassifier.fit`). 
+ refit : "auto" or bool, default="auto" Whether or not to refit the classifier on the entire training set once the decision threshold has been found. By default, `refit="auto"` is From 4d557cc951cac4ce491f1c344ca531d5d065ac3a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 29 Apr 2023 12:01:55 +0200 Subject: [PATCH 055/194] some more doc --- doc/modules/prediction.rst | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index c534757709f7f..256c392be5d69 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -35,10 +35,9 @@ the positive class with probability value greater than 0.5 (obtained with Similar rules apply for other classification problems. While these approaches are reasonable as default behaviors, they might not be -adapted to some cases. The context and nature of the use case will define the +adapted to some cases. The context and nature of the use case define the expected behavior of the classifier and thus the strategy to convert soft -predictions into hard predictions. We will illustrate this point with an -example. +predictions into hard predictions. We illustrate this point with an example. Let's imagine the deployment of a predictive model helping medical doctors to detect cancers. In a setting where this model would be a tool to discard @@ -54,4 +53,31 @@ Post-tuning of the decision threshold One solution to address the problem stated in the introduction is to tune the decision threshold of the classifier once this model has been trained. The :class:`CutOffClassifier` allows to tune this threshold using an internal -cross-validation. +cross-validation. The optimum threshold is tuned to maximize a given metric +with or without constraints. + +The following image illustrate the tuning of the cut-off point for a gradient +boosting classifier. While the vanilla and tuned classifiers provide the same +Receiver Operating Characteristic (ROC) and Precision-Recall curves, and thus +the same :term:`predict_proba` outputs, the "hard" predictions defer because of +the tuned cut-off point. The vanilla classifier predicts the class of interest +for a probability greater than 0.5 while the tuned classifier predict the class +of interest for a very low probability (around 0.02). This cut-off point is +maximizes a utility metric defined by the business case (in this case an +insurance company). + +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cutoff_tuning_002.png + :target: ../auto_examples/model_selection/plot_cutoff_tuning.html + :align: center + +Available options to tune the cut-off point +------------------------------------------- + +Important notes regarding the internal cross-validation +------------------------------------------------------- + +Examples +-------- + +- See :ref:`sphx_glr_auto_examples_model_selection_plot_cutoff_tuning.py` example for + an example of tuning the decision threshold of a classifier. From 2c6ee7e00c994a0299dd96c29ebb47abcff1e56c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 29 Apr 2023 12:07:17 +0200 Subject: [PATCH 056/194] some more doc --- doc/modules/prediction.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 256c392be5d69..336ca6d968279 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -61,9 +61,9 @@ boosting classifier. 
While the vanilla and tuned classifiers provide the same Receiver Operating Characteristic (ROC) and Precision-Recall curves, and thus the same :term:`predict_proba` outputs, the "hard" predictions defer because of the tuned cut-off point. The vanilla classifier predicts the class of interest -for a probability greater than 0.5 while the tuned classifier predict the class -of interest for a very low probability (around 0.02). This cut-off point is -maximizes a utility metric defined by the business case (in this case an +for a probability greater than 0.5 while the tuned classifier predicts the +class of interest for a very low probability (around 0.02). This cut-off point +is maximizes a utility metric defined by the business case (in this case an insurance company). .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cutoff_tuning_002.png From 91c8222818156c0a8d3d436c05f02742ad9cdb61 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 May 2023 11:50:39 +0200 Subject: [PATCH 057/194] DOC more documentation --- doc/modules/prediction.rst | 78 ++++++++++++++++++- .../model_selection/plot_cutoff_tuning.py | 2 + sklearn/model_selection/_prediction.py | 6 +- 3 files changed, 79 insertions(+), 7 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 336ca6d968279..25e9d146d2467 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -50,10 +50,10 @@ classify a patient having a cancer for a lower probability than 0.5. Post-tuning of the decision threshold ===================================== -One solution to address the problem stated in the introduction is to tune the -decision threshold of the classifier once this model has been trained. The -:class:`CutOffClassifier` allows to tune this threshold using an internal -cross-validation. The optimum threshold is tuned to maximize a given metric +One solution to address the problem stated in the introduction is to tune the decision +threshold of the classifier once this model has been trained. The +:class:`~sklearn.model_selection.CutOffClassifier` allows to tune this threshold using +an internal cross-validation. The optimum threshold is tuned to maximize a given metric with or without constraints. The following image illustrate the tuning of the cut-off point for a gradient @@ -73,9 +73,79 @@ insurance company). Available options to tune the cut-off point ------------------------------------------- +The cut-off point can be tuned with different strategies controlled by the parameter +`objective_metric`. + +A straightforward use case is to maximize a pre-defined scikit-learn metric. These +metrics can be found by calling the function :func:`~sklearn.metrics.get_scorer_names`. +We provide an example where we maximize the balanced accuracy. + +.. note:: + + It is important to notice that these metrics comes with default parameter, notably + the label of the class of interested (i.e. `pos_label`). Thus, if this label is not + the right one for your application, you need to define a scorer and pass the right + `pos_label` (and additional parameters) using the + :func:`~sklearn.metrics.make_scorer`. You should refer to :ref:`scoring` to get all + information to define your own scoring function. For instance, we show how to pass + the information to the scorer that the label of interest is `0` when maximizing the + :func:`~sklearn.metrics.f1_score`: + + >>> from sklearn.metrics import make_scorer, f1_score + >>> X, y = make_classification( + ... 
n_samples=1_000, weights=[0.1, 0.9], random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + >>> pos_label = 0 + >>> scorer = make_scorer(f1_score, pos_label=pos_label) + >>> model = CutOffClassifier(classifier, objective_metric=scorer).fit( + ... X_train, y_train) + >>> scorer(model, X_test, y_test) + 0.82... + >>> # compare it with the internal score found by cross-validation + >>> model.objective_score_ + 0.86... + +A second strategy aims at maximizing a metric while imposing constraints on another +metric. Four pre-defined options exist, 2 that uses the Receiver Operating +Characteristic (ROC) statistic and 2 that uses the Precision-Recall statistic. + +- `"max_tpr_at_tnr_constraint"`: maximizes the True Positive Rate (TPR) such that the + True Negative Rate (TNR) is the closest to a given value. +- `"max_tnr_at_tpr_constraint"`: maximizes the TNR such that the TPR is the closest to + a given value. +- `"max_precision_at_recall_constraint"`: maximizes the precision such that the recall + is the closest to a given value. +- `"max_recall_at_precision_constraint"`: maximizes the recall such that the precision + is the closest to a given value. + +For these options, the `constraint_value` parameter needs to be defined. In addition, +you can use the `pos_label` parameter to indicate the label of the class of interest. + +The final strategy maximizes a custom utility function. This problem is also known as +cost-sensitive learning. The utility function is defined by providing dictionary +containing the cost-gain associated with the entries of the confusion matrix. The keys +are defined as `{"tn", "fp", "fn", "tp"}`. The class of interest is defined using the +`pos_label` parameter. Refer to :ref:`cost_sensitive_learning_example` for an example +depicting the use of such a utility function. + Important notes regarding the internal cross-validation ------------------------------------------------------- +By default :class:`~sklearn.model_selection.CutOffClassifier` uses a 5-fold stratified +cross-validation to tune the cut-off point. The parameter `cv` allows to control the +cross-validation strategy. It is possible to go around cross-validation by passing +`cv="prefit"` and provide an already fitted classifier. In this case, the cut-off point +is tuned on the data provided to the `fit` method. + +However, you should be extremely careful when using this option. You should never use +the same data for training the classifier and tuning the cut-off point at the risk of +overfitting. Refer to :ref:`cutoffclassifier_no_cv` that shows such overfitting. If +you are in a situation where you have limited resources, you should can consider using +a float number that will use a single split internally. + +The option `cv="prefit"` should only be used when the provided classifier was already +trained on some data and you want to tune (or re-tune) on a new validation set. + Examples -------- diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py index f95cb2f2345f0..c5f15a18a270b 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -256,6 +256,8 @@ def gain_cost_score(y, y_pred, **kwargs): # :class:`~sklearn.metrics.CutOffClassifier` class is here to help us. It automatically # compute the cost-gain for all possible cut-off points and choose the optimal one. # +# .. 
_cost_sensitive_learning_example: +# # Tuning the cut-off point # ------------------------ # diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index 4572c38afcd13..e0946e0e77569 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -271,6 +271,9 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): decision_threshold_ : float The new decision threshold. + decision_thresholds_ : ndarray of shape (n_thresholds,) + All decision thresholds that were evaluated. + objective_score_ : float or tuple of floats The score of the objective metric associated with the decision threshold found. When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, @@ -279,9 +282,6 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): two float values: the first one is the score of the metric which is constrained and the second one is the score of the maximized metric. - decision_thresholds_ : ndarray of shape (n_thresholds,) - All decision thresholds that were evaluated. - objective_scores_ : ndarray of shape (n_thresholds,) The scores of the objective metric associated with the decision thresholds. From 3d4ce8157d7af951283bf03c32cac670ca570f19 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 May 2023 12:16:38 +0200 Subject: [PATCH 058/194] fix import --- doc/modules/prediction.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 25e9d146d2467..30a867b5e0ae9 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -91,6 +91,7 @@ We provide an example where we maximize the balanced accuracy. the information to the scorer that the label of interest is `0` when maximizing the :func:`~sklearn.metrics.f1_score`: + >>> from sklearn.model_selection import train_test_split >>> from sklearn.metrics import make_scorer, f1_score >>> X, y = make_classification( ... n_samples=1_000, weights=[0.1, 0.9], random_state=0) From d7d8dac537725bf612c2b5727536b8ca92fb50d2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 May 2023 12:18:12 +0200 Subject: [PATCH 059/194] fix import --- doc/modules/prediction.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 30a867b5e0ae9..24ac762ac69ca 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -91,14 +91,15 @@ We provide an example where we maximize the balanced accuracy. the information to the scorer that the label of interest is `0` when maximizing the :func:`~sklearn.metrics.f1_score`: - >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import CutOffClassifier, train_test_split >>> from sklearn.metrics import make_scorer, f1_score >>> X, y = make_classification( ... n_samples=1_000, weights=[0.1, 0.9], random_state=0) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) >>> pos_label = 0 >>> scorer = make_scorer(f1_score, pos_label=pos_label) - >>> model = CutOffClassifier(classifier, objective_metric=scorer).fit( + >>> model = CutOffClassifier(LogisticRegression(), objective_metric=scorer).fit( ... X_train, y_train) >>> scorer(model, X_test, y_test) 0.82... 
From aa3e83da6c63c0a39158fef63de98faa52993bba Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 May 2023 14:08:46 +0200 Subject: [PATCH 060/194] iter --- doc/modules/prediction.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 24ac762ac69ca..80bed4e63ee85 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -116,9 +116,9 @@ Characteristic (ROC) statistic and 2 that uses the Precision-Recall statistic. - `"max_tnr_at_tpr_constraint"`: maximizes the TNR such that the TPR is the closest to a given value. - `"max_precision_at_recall_constraint"`: maximizes the precision such that the recall - is the closest to a given value. + is the closest to a given value. - `"max_recall_at_precision_constraint"`: maximizes the recall such that the precision - is the closest to a given value. + is the closest to a given value. For these options, the `constraint_value` parameter needs to be defined. In addition, you can use the `pos_label` parameter to indicate the label of the class of interest. From ab97d63d323c9fbc760168890d108f0f306920a0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 May 2023 14:09:17 +0200 Subject: [PATCH 061/194] fix --- doc/modules/prediction.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 80bed4e63ee85..d990c242aceee 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -109,7 +109,7 @@ We provide an example where we maximize the balanced accuracy. A second strategy aims at maximizing a metric while imposing constraints on another metric. Four pre-defined options exist, 2 that uses the Receiver Operating -Characteristic (ROC) statistic and 2 that uses the Precision-Recall statistic. +Characteristic (ROC) statistics and 2 that uses the Precision-Recall statistics. - `"max_tpr_at_tnr_constraint"`: maximizes the True Positive Rate (TPR) such that the True Negative Rate (TNR) is the closest to a given value. 
From 486a2bdcd3bbe2cb9cb2321d857c50d4880d5ad5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 4 May 2023 16:43:24 +0200 Subject: [PATCH 062/194] Update sklearn/metrics/_scorer.py Co-authored-by: Christian Lorentzen --- sklearn/metrics/_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index fda958b464e19..38c87cc4a9969 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -378,7 +378,7 @@ def _factory_args(self): class _ContinuousScorer(_BaseScorer): - """ "Scorer taking a continuous response and output a score for each threshold.""" + """Scorer taking a continuous response and output a score for each threshold.""" def __init__(self, score_func, sign, response_method, kwargs): super().__init__(score_func=score_func, sign=sign, kwargs=kwargs) From 6d4c4aaaa3144dd8a9d19a7187d3dfd01d7f82ee Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 15 May 2023 18:42:07 +0200 Subject: [PATCH 063/194] Apply suggestions from code review Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> --- doc/modules/prediction.rst | 21 ++--- .../model_selection/plot_cutoff_tuning.py | 88 ++++++++++--------- 2 files changed, 57 insertions(+), 52 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index d990c242aceee..4a5a014d38cdf 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -40,12 +40,13 @@ expected behavior of the classifier and thus the strategy to convert soft predictions into hard predictions. We illustrate this point with an example. Let's imagine the deployment of a predictive model helping medical doctors to -detect cancers. In a setting where this model would be a tool to discard -obvious cases, doctors might be interested to have a high recall (all cancers -cases should be tagged as such) to not miss any patient with a cancer. However, -it will be at the cost of having more false positive predictions (i.e. lower -precision). Thus, in terms of decision threshold, it would be better to -classify a patient having a cancer for a lower probability than 0.5. +detect cancers. In a setting where this model was a tool to discard obvious +cases and false positives don't lead to potentially harmful treatments, doctors +might be interested in having a high recall (all cancers cases should be tagged +as such) to not miss any patient with a cancer. However, that is at the cost of +having more false positive predictions (i.e. lower precision). Thus, in terms of +decision threshold, it may be better to classify a patient as having a cancer +for a probability lower than 0.5. Post-tuning of the decision threshold ===================================== @@ -59,11 +60,11 @@ with or without constraints. The following image illustrate the tuning of the cut-off point for a gradient boosting classifier. While the vanilla and tuned classifiers provide the same Receiver Operating Characteristic (ROC) and Precision-Recall curves, and thus -the same :term:`predict_proba` outputs, the "hard" predictions defer because of +the same :term:`predict_proba` outputs, the "hard" predictions differ because of the tuned cut-off point. The vanilla classifier predicts the class of interest for a probability greater than 0.5 while the tuned classifier predicts the class of interest for a very low probability (around 0.02). 
This cut-off point -is maximizes a utility metric defined by the business case (in this case an +optimizes a utility metric defined by the business case (in this case an insurance company). .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cutoff_tuning_002.png @@ -82,7 +83,7 @@ We provide an example where we maximize the balanced accuracy. .. note:: - It is important to notice that these metrics comes with default parameter, notably + It is important to notice that these metrics come with default parameters, notably the label of the class of interested (i.e. `pos_label`). Thus, if this label is not the right one for your application, you need to define a scorer and pass the right `pos_label` (and additional parameters) using the @@ -142,7 +143,7 @@ is tuned on the data provided to the `fit` method. However, you should be extremely careful when using this option. You should never use the same data for training the classifier and tuning the cut-off point at the risk of overfitting. Refer to :ref:`cutoffclassifier_no_cv` that shows such overfitting. If -you are in a situation where you have limited resources, you should can consider using +you are in a situation where you have limited resources, you should consider using a float number that will use a single split internally. The option `cv="prefit"` should only be used when the provided classifier was already diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py index c5f15a18a270b..ca8c18a3e241c 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -8,14 +8,16 @@ or the :term:`predict_proba` output. For binary classifier, the default threshold is defined as a probability score of 0.5 or a decision function value of 0.0. -However, it happens that this default strategy is not optimized for the task at hand. -Here, we use the "Statlog" German credit dataset [1]_ to illustrate such an application. -In this dataset, the task is to predict whether a person is "good" or "bad" credit. -In addition, cost-matrix is provided where it is defined that classifying a "bad" -credit as "good" is 5 times more costly than classifying a "good" credit as "bad". +However, this default strategy may not be optimal for the task at hand. +Here, we use the "Statlog" German credit dataset [1]_ to illustrate a use case. +In this dataset, the task is to predict whether a person has a "good" or "bad" credit. +In addition, a cost-matrix is provided that specifies the cost of +misclassification. Specifically, misclassifying a "bad" credit as "good" is five +times more costly than misclassifying a "good" credit as "bad". -We use the :class:`~sklearn.model_selection.CutOffClassifier` to tune the -cut-off point of the decision function to minimize the business cost provided to us. +We use the :class:`~sklearn.model_selection.CutOffClassifier` to select the +cut-off point of the decision function that minimizes the provided business +cost. .. topic:: References @@ -44,11 +46,11 @@ # %% # Many features are categorical and usually string-encoded. We need to encode -# these categories when we develop ou predictive model. Let's check the targets. +# these categories when we develop our predictive model. Let's check the targets. y.value_counts() # %% -# A first observation is that the dataset is imbalanced. We would need to be careful +# Another observation is that the dataset is imbalanced. 
We would need to be careful # when evaluating our predictive model and use a family of metrics that are adapted # to this setting. # @@ -102,8 +104,8 @@ def fpr_score(y, y_pred, **kwargs): # # We therefore need to define a scikit-learn scorer using # :func:`~sklearn.metrics.make_scorer` where the information is passed. We store all -# created scorer in a dictionary. To use them, we need to pass the fitted model and -# the data and target on which we want to evaluate the predictive model. +# the custom scorers in a dictionary. To use them, we need to pass the fitted model, +# the data and the target on which we want to evaluate the predictive model. tpr_score = recall_score # TPR and recall are the same metric scoring = { "precision": make_scorer(precision_score, pos_label=pos_label), @@ -114,7 +116,7 @@ def fpr_score(y, y_pred, **kwargs): # %% # In addition, the original research [1]_ defines a business metric. They provide a -# cost-matrix where they define that predicting a "bad" credit as "good" is 5 times more +# cost-matrix which encodes that predicting a "bad" credit as "good" is 5 times more # costly than the opposite. We define a dictionary containing this information and a # score function that computes the cost. cost_gain_matrix = {"tp": 0, "tn": 0, "fp": -1, "fn": -5} @@ -146,13 +148,13 @@ def gain_cost_score(y, y_pred, **kwargs): # Design of the predictive model # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# In this section, we design our predictive model. We use a gradient boosting classifier -# using :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. However, we need to -# encode the categorical features with numerical values and we therefore use an -# :class:`~sklearn.preprocessing.OrdinalEncoder` to do so. The numerical features are -# kept as-is. To recover the name of the categorical columns, we use the helper function -# :func:`~sklearn.compose.make_column_selector` and the fact that the categorical -# features are stored as `category` dtype. +# In this section we design our predictive model consisting of a +# :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. We encode the +# categorical features with an :class:`~sklearn.preprocessing.OrdinalEncoder` +# but the numerical features are kept as they are. To identify the categorical +# columns, we use the helper function +# :func:`~sklearn.compose.make_column_selector` and the fact that the +# categorical features are stored as `category` dtype. from sklearn.compose import ColumnTransformer, make_column_selector as selector from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.pipeline import Pipeline @@ -233,14 +235,14 @@ def gain_cost_score(y, y_pred, **kwargs): _ = fig.suptitle("Evaluation of the vanilla GBDT model") # %% -# We recall that these curves corresponds show the statistical performance of the +# We recall that these curves give insights on the statistical performance of the # predictive model for different cut-off points. For the Precision-Recall curve, the # reported metrics are the precision and recall and for the ROC curve, the reported # metrics are the TPR (same as recall) and FPR. # -# Here, the different cut-off points corresponds at different level of probability +# Here, the different cut-off points correspond to different levels of probability # scores ranging between 0 and 1. By default, `model.predict` uses a cut-off point at -# a probability of 0.5. Thus, the metrics for this cut-off point is reported with the +# a probability of 0.5. 
The metrics for such cut-off point are reported with the # blue dot on the curves: it corresponds to the statistical performance of the model # when using `model.predict`. # @@ -249,21 +251,22 @@ def gain_cost_score(y, y_pred, **kwargs): scoring["cost_gain"](model, X_test, y_test) # %% -# At this stage, we don't know if some other cut-off points could lead to a greater -# gain. To be able to decide, if another cut-off point is better, we would need to -# compute the cost-gain using the business metric for all possible cut-off points and -# choose the optimal one. This strategy is quite tedious to implement and the -# :class:`~sklearn.metrics.CutOffClassifier` class is here to help us. It automatically -# compute the cost-gain for all possible cut-off points and choose the optimal one. +# At this stage we don't know if any other cut-off can lead to a greater gain. +# To find the optimal one, we need to compute the cost-gain using the business +# metric for all possible cut-off points and choose the best. This strategy can +# be quite tedious to implement by hand, but the +# :class:`~sklearn.metrics.CutOffClassifier` class is here to help us. It +# automatically computes the cost-gain for all possible cut-off points and +# optimizes for the `objective_metric`. # # .. _cost_sensitive_learning_example: # # Tuning the cut-off point # ------------------------ # -# We use :class:`~sklearn.metrics.CutOffClassifier` to tune the cut-off point. We need +# We use :class:`~sklearn.model_selection.CutOffClassifier` to tune the cut-off point. We need # to provide the business metric to optimize as well as the positive label. Internally, -# the optimum cut-off point is chosen such that it maximized the business metric +# the optimum cut-off point is chosen such that it maximizes the business metric # via cross-validation. By default a 5-fold stratified cross-validation is used. from sklearn.model_selection import CutOffClassifier @@ -351,7 +354,7 @@ def gain_cost_score(y, y_pred, **kwargs): # %% # The first remark is that both classifiers have exactly the same ROC and # Precision-Recall curves. It is expected because by default, the classifier is fitted -# on the same training data. In a later section, we discuss more in details the +# on the same training data. In a later section, we discuss more in detail the # available options regarding model refitting and cross-validation. # # The second remark is that the cut-off points of the vanilla and tuned model are @@ -373,10 +376,11 @@ def gain_cost_score(y, y_pred, **kwargs): # Consideration regarding model refitting and cross-validation # ------------------------------------------------------------ # -# In the above experiment, we use the default parameter of the -# :class:`~sklearn.model_selection.CutOffClassifier`. By default, the cut-off point is -# tuned using a 5-fold stratified cross-validation. Also, the underlying predictive -# model is refitted on the entire training data once the cut-off point is chosen. +# In the above experiment, we use the default setting of the +# :class:`~sklearn.model_selection.CutOffClassifier`. In particular, the cut-off +# point is tuned using a 5-fold stratified cross-validation. Also, the +# underlying predictive model is refitted on the entire training data once the +# cut-off point is chosen. # # These two strategies can be changed by providing the `refit` and `cv` parameters. 
# For instance, one could provide a fitted `estimator` and set `cv="prefit"`, in which @@ -556,11 +560,11 @@ def gain_cost_score(y, y_pred, **kwargs): # %% # Regarding the cut-off point, we observe that the optimum is similar to the multiple # repeated cross-validation case. However, be aware that a single split does not account -# for the variability of the fit/predict process and thus we are enable to know if there -# is any variance in the cut-off point. The repeated cross-validation averages this -# effect. +# for the variability of the fit/predict process and thus we are unable to know if there +# is any variance in the cut-off point. The repeated cross-validation averages out +# this effect. # -# The second observation is about the ROC and Precision-Recall curves the tuned model. -# We observe that the curve is now different from the vanilla model. It is not -# surprising because we train the underlying classifier on a subset of the data provided -# at `fit` while keeping a validation set to tune the cut-off point. +# Another observation concerns the ROC and Precision-Recall curves of the tuned +# model. As expected, these curves differ from those of the vanilla model, given +# that we trained the underlying classifier on a subset of the data provided +# during fitting and reserved a validation set for tuning the cut-off point. From 1d12e1f4e1e567694d570ab938d40fb40afbf72e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 1 Jun 2023 16:18:32 +0200 Subject: [PATCH 064/194] Fix linter --- examples/model_selection/plot_cutoff_tuning.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py index ca8c18a3e241c..d9fff88512586 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -264,10 +264,11 @@ def gain_cost_score(y, y_pred, **kwargs): # Tuning the cut-off point # ------------------------ # -# We use :class:`~sklearn.model_selection.CutOffClassifier` to tune the cut-off point. We need -# to provide the business metric to optimize as well as the positive label. Internally, -# the optimum cut-off point is chosen such that it maximizes the business metric -# via cross-validation. By default a 5-fold stratified cross-validation is used. +# We use :class:`~sklearn.model_selection.CutOffClassifier` to tune the cut-off +# point. We need to provide the business metric to optimize as well as the +# positive label. Internally, the optimum cut-off point is chosen such that it +# maximizes the business metric via cross-validation. By default a 5-fold +# stratified cross-validation is used. 
from sklearn.model_selection import CutOffClassifier model_tuned = CutOffClassifier( From 7952cce26cc78e145ab4aded818e41010e5527a7 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 7 Jun 2023 11:48:16 +0500 Subject: [PATCH 065/194] Add routing to LogisticRegressionCV --- sklearn/linear_model/_logistic.py | 101 ++++++++++++++++++-- sklearn/linear_model/tests/test_logistic.py | 22 +++++ 2 files changed, 114 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index a00004ae17676..d8cea5dc3160a 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -28,14 +28,20 @@ from ..preprocessing import LabelEncoder, LabelBinarizer from ..svm._base import _fit_liblinear from ..utils import check_array, check_consistent_length, compute_class_weight -from ..utils import check_random_state +from ..utils import check_random_state, Bunch from ..utils.extmath import softmax from ..utils.extmath import row_norms from ..utils.optimize import _newton_cg, _check_optimize_result -from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import check_is_fitted, _check_sample_weight, _check_fit_params from ..utils.multiclass import check_classification_targets from ..utils.parallel import delayed, Parallel from ..utils._param_validation import StrOptions, Interval +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + process_routing, + _routing_enabled, +) from ..model_selection import check_cv from ..metrics import get_scorer @@ -587,6 +593,7 @@ def _log_reg_scoring_path( max_squared_sum=None, sample_weight=None, l1_ratio=None, + score_params=None, ): """Computes scores across logistic_regression_path @@ -698,6 +705,9 @@ def _log_reg_scoring_path( to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + Returns ------- coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) @@ -778,7 +788,9 @@ def _log_reg_scoring_path( if scoring is None: scores.append(log_reg.score(X_test, y_test)) else: - scores.append(scoring(log_reg, X_test, y_test)) + score_params = score_params or {} + score_params = _check_fit_params(X=X, fit_params=score_params, indices=test) + scores.append(scoring(log_reg, X_test, y_test, **score_params)) return coefs, Cs, np.array(scores), n_iter @@ -1742,7 +1754,7 @@ def __init__( self.random_state = random_state self.l1_ratios = l1_ratios - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, **fit_params): """Fit the model according to the given training data. Parameters @@ -1758,6 +1770,11 @@ def fit(self, X, y, sample_weight=None): Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. + **fit_params : dict + Parameters to pass to the underlying splitter and scorer. + + .. 
versionadded:: 1.3 + Returns ------- self : object @@ -1827,9 +1844,23 @@ def fit(self, X, y, sample_weight=None): else: max_squared_sum = None + if _routing_enabled(): + routed_params = process_routing( + obj=self, + method="fit", + sample_weight=sample_weight, + other_params=fit_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={}) + routed_params.scorer.score = Bunch(score=fit_params) + if sample_weight is not None: + routed_params.scorer.score["sample_weight"] = sample_weight + # init cross-validation generator cv = check_cv(self.cv, y, classifier=True) - folds = list(cv.split(X, y)) + folds = list(cv.split(X, y, **routed_params.splitter.split)) # Use the label encoded classes n_classes = len(encoded_labels) @@ -1896,6 +1927,7 @@ def fit(self, X, y, sample_weight=None): max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio, + score_params=routed_params.scorer.score, ) for label in iter_encoded_labels for train, test in folds @@ -2076,7 +2108,7 @@ def fit(self, X, y, sample_weight=None): return self - def score(self, X, y, sample_weight=None): + def score(self, X, y, sample_weight=None, **score_params): """Score using the `scoring` option on the given test data and labels. Parameters @@ -2090,15 +2122,59 @@ def score(self, X, y, sample_weight=None): sample_weight : array-like of shape (n_samples,), default=None Sample weights. + **score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + .. versionadded:: 1.3 + Returns ------- score : float Score of self.predict(X) w.r.t. y. """ - scoring = self.scoring or "accuracy" - scoring = get_scorer(scoring) + scoring = self._get_scorer() + if _routing_enabled(): + routed_params = process_routing( + obj=self, + method="score", + sample_weight=sample_weight, + other_params=score_params, + ) + else: + routed_params = Bunch(estimator=Bunch(score=score_params)) - return scoring(self, X, y, sample_weight=sample_weight) + return scoring(self, X, y, sample_weight=sample_weight, **routed_params) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.3 + + Returns + ------- + routing : MetadataRouter + A :class:`~utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + .add( + scorer=self._get_scorer(), + method_mapping=MethodMapping() + .add(callee="score", caller="score") + .add(callee="score", caller="fit"), + ) + ) + return router def _more_tags(self): return { @@ -2108,3 +2184,10 @@ def _more_tags(self): ), } } + + def _get_scorer(self): + """Get the scorer based on the scoring method specified. + The default scoring method is `accuracy`. 
+ """ + scoring = self.scoring or "accuracy" + return get_scorer(scoring) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index a470fe412ab36..bf42d89587c19 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -9,6 +9,7 @@ import pytest +from sklearn import config_context from sklearn.base import clone from sklearn.datasets import load_iris, make_classification from sklearn.metrics import log_loss @@ -2057,3 +2058,24 @@ def test_liblinear_not_stuck(): with warnings.catch_warnings(): warnings.simplefilter("error", ConvergenceWarning) clf.fit(X_prep, y) + + +def test_lr_cv_scores_differ_when_sample_weight_is_requested(): + rng = np.random.RandomState(10) + X, y = make_classification(n_samples=10, random_state=rng) + sample_weight = np.ones(len(y)) + sample_weight[: len(y) // 2] = 2 + kwargs = {"sample_weight": sample_weight} + + with config_context(enable_metadata_routing=True): + scorer1 = get_scorer("accuracy") + lr_cv1 = LogisticRegressionCV(scoring=scorer1) + lr_cv1.fit(X, y, **kwargs) + + scorer2 = get_scorer("accuracy") + scorer2.set_score_request(sample_weight=True) + lr_cv2 = LogisticRegressionCV(scoring=scorer2) + lr_cv2.fit(X, y, **kwargs) + + with pytest.raises(AssertionError): + assert_almost_equal(lr_cv1.scores_[1], lr_cv2.scores_[1]) From 66ad513506eb52707a5afb1bfd45d95c310482a0 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 7 Jun 2023 17:04:42 +0500 Subject: [PATCH 066/194] Add a test with enable_metadata_routing=False and fix an issue in score method --- sklearn/linear_model/_logistic.py | 4 ++-- sklearn/linear_model/tests/test_logistic.py | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index d8cea5dc3160a..0e20a8d8fd778 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1854,7 +1854,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): else: routed_params = Bunch() routed_params.splitter = Bunch(split={}) - routed_params.scorer.score = Bunch(score=fit_params) + routed_params.scorer = Bunch(score=fit_params) if sample_weight is not None: routed_params.scorer.score["sample_weight"] = sample_weight @@ -2141,7 +2141,7 @@ def score(self, X, y, sample_weight=None, **score_params): other_params=score_params, ) else: - routed_params = Bunch(estimator=Bunch(score=score_params)) + routed_params = {} return scoring(self, X, y, sample_weight=sample_weight, **routed_params) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index bf42d89587c19..2684cd7ca8308 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -2079,3 +2079,24 @@ def test_lr_cv_scores_differ_when_sample_weight_is_requested(): with pytest.raises(AssertionError): assert_almost_equal(lr_cv1.scores_[1], lr_cv2.scores_[1]) + + +def test_lr_cv_scores_without_enabling_metadata_routing(): + rng = np.random.RandomState(10) + X, y = make_classification(n_samples=10, random_state=rng) + sample_weight = np.ones(len(y)) + sample_weight[: len(y) // 2] = 2 + kwargs = {"sample_weight": sample_weight} + + with config_context(enable_metadata_routing=False): + scorer1 = get_scorer("accuracy") + lr_cv1 = LogisticRegressionCV(scoring=scorer1) + lr_cv1.fit(X, y, **kwargs) + + with config_context(enable_metadata_routing=True): + scorer2 = get_scorer("accuracy") + 
scorer2.set_score_request(sample_weight=True) + lr_cv2 = LogisticRegressionCV(scoring=scorer2) + lr_cv2.fit(X, y, **kwargs) + + assert_almost_equal(lr_cv1.scores_[1], lr_cv2.scores_[1]) From 7e8b8249cf46dfe3dd107d4bc2afd6a1ed51788a Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 13 Jun 2023 20:22:39 +0500 Subject: [PATCH 067/194] Add metaestimator tests and fix passing routed params in score method --- sklearn/linear_model/_logistic.py | 12 +- sklearn/tests/test_metadata_routing.py | 14 ++- .../test_metaestimators_metadata_routing.py | 105 ++++++++++++++++++ 3 files changed, 128 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 0e20a8d8fd778..a399fbae8af6f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -2141,9 +2141,17 @@ def score(self, X, y, sample_weight=None, **score_params): other_params=score_params, ) else: - routed_params = {} + routed_params = Bunch() + routed_params.scorer = Bunch(score=score_params) + if sample_weight is not None: + routed_params.scorer.score["sample_weight"] = sample_weight - return scoring(self, X, y, sample_weight=sample_weight, **routed_params) + return scoring( + self, + X, + y, + **routed_params.scorer.score, + ) def get_metadata_routing(self): """Get metadata routing of this object. diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 3b00b5a244ee8..25542cee72bee 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -104,7 +104,19 @@ def check_recorded_metadata(obj, method, **kwargs): records = getattr(obj, "_records", dict()).get(method, dict()) assert set(kwargs.keys()) == set(records.keys()) for key, value in kwargs.items(): - assert records[key] is value + recorded_value = records[key] + # The following condition is used to check for the sample weight in an + # underlying estimator if the sample weight is taken as a subset of the + # original sample weight for example if it extracted based on the test + # indices. 
+ if ( + key == "sample_weight" + and recorded_value is not None + and len(recorded_value) < len(value) + ): + assert np.isin(recorded_value, value).all() + else: + assert recorded_value is value class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 892b0f21dbe8a..bdc02225da31c 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -9,6 +9,10 @@ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.calibration import CalibratedClassifierCV from sklearn.exceptions import UnsetMetadataPassedError +from sklearn.linear_model import LogisticRegressionCV +from sklearn.metrics._scorer import _BaseScorer +from sklearn.model_selection import BaseCrossValidator +from sklearn.model_selection._split import GroupsConsumerMixin from sklearn.multioutput import ( ClassifierChain, MultiOutputClassifier, @@ -29,6 +33,7 @@ y_multi = rng.randint(0, 2, size=(N, 3)) metadata = rng.randint(0, 10, size=N) sample_weight = rng.rand(N) +groups = np.array([0, 1] * (len(y) // 2)) @pytest.fixture(autouse=True) @@ -171,6 +176,49 @@ def predict_log_proba(self, X, sample_weight="default", metadata="default"): # return np.zeros(shape=(len(X), 2)) +class ConsumingScorer(_BaseScorer): + def __init__(self, registry=None): + super().__init__(score_func="test", sign=1, kwargs={}) + self.registry = registry + + def __repr__(self): + return "Consuming_Scorer" + + def __call__( + self, estimator, X, y_true, sample_weight="default", metadata="default" + ): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "score", sample_weight=sample_weight, metadata=metadata + ) + + return 0.0 + + +class ConsumingSplitter(BaseCrossValidator, GroupsConsumerMixin): + def __init__(self, registry=None): + self.registry = registry + + def split(self, X, y=None, groups="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, "split", groups=groups) + + split_index = len(X) - 10 + train_indices = range(0, split_index) + test_indices = range(split_index, len(X)) + yield test_indices, train_indices + + def get_n_splits(self, X=None, y=None, groups=None): + pass + + def _iter_test_indices(self, X=None, y=None, groups=None): + pass + + METAESTIMATORS = [ { "metaestimator": MultiOutputRegressor, @@ -234,6 +282,22 @@ def predict_log_proba(self, X, sample_weight="default", metadata="default"): # ids used for pytest fixture METAESTIMATOR_IDS = [str(row["metaestimator"].__name__) for row in METAESTIMATORS] +CV_SCORERS = [ + { + "cv_estimator": LogisticRegressionCV, + "scorer_name": "scoring", + "routing_methods": ["fit", "score"], + }, +] + +CV_SPLITTERS = [ + { + "cv_estimator": LogisticRegressionCV, + "splitter_name": "cv", + "routing_methods": ["fit"], + } +] + def test_registry_copy(): # test that _Registry is not copied into a new instance. 
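From the user's side, the flow that these consuming helpers mimic for a splitter looks roughly like the sketch below. It assumes the routing support added in this patch and `enable_metadata_routing=True`; the groups are artificial::

    import numpy as np

    from sklearn import config_context
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.model_selection import GroupKFold

    X, y = make_classification(n_samples=100, random_state=0)
    groups = np.tile(np.arange(5), 20)  # five artificial groups of 20 samples

    with config_context(enable_metadata_routing=True):
        # Group-aware splitters request `groups` by default, so the metadata
        # passed to `fit` is routed down to `GroupKFold.split`.
        model = LogisticRegressionCV(cv=GroupKFold(n_splits=5))
        model.fit(X, y, groups=groups)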
@@ -327,3 +391,44 @@ def set_request(estimator, method_name): assert registry for estimator in registry: check_recorded_metadata(estimator, method_name, **kwargs) + + +@pytest.mark.parametrize( + "cv_scorer", + CV_SCORERS, +) +def test_metadata_is_routed_correctly_to_scorer(cv_scorer): + registry = _Registry() + cls = cv_scorer["cv_estimator"] + scorer_name = cv_scorer["scorer_name"] + scorer = ConsumingScorer(registry=registry) + scorer.set_score_request(sample_weight=True) + routing_methods = cv_scorer["routing_methods"] + + for method_name in routing_methods: + instance = cls(**{scorer_name: scorer}) + method = getattr(instance, method_name) + kwargs = {"sample_weight": sample_weight} + method(X, y, **kwargs) + for _scorer in registry: + check_recorded_metadata(obj=_scorer, method="score", **kwargs) + + +@pytest.mark.parametrize( + "cv_splitter", + CV_SPLITTERS, +) +def test_metadata_is_routed_correctly_to_splitter(cv_splitter): + registry = _Registry() + cls = cv_splitter["cv_estimator"] + splitter_name = cv_splitter["splitter_name"] + splitter = ConsumingSplitter(registry=registry) + routing_methods = cv_splitter["routing_methods"] + + for method_name in routing_methods: + instance = cls(**{splitter_name: splitter}) + method = getattr(instance, method_name) + kwargs = {"groups": groups} + method(X, y, **kwargs) + for _splitter in registry: + check_recorded_metadata(obj=_splitter, method="split", **kwargs) From d7e50a6aa63e2b29eef9044923394902dcc7f0b9 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Sun, 25 Jun 2023 18:11:20 +0500 Subject: [PATCH 068/194] PR suggestions --- sklearn/calibration.py | 4 +- sklearn/linear_model/_logistic.py | 76 +++++++++++-------- sklearn/linear_model/tests/test_logistic.py | 15 +++- sklearn/model_selection/_search.py | 4 +- sklearn/model_selection/_validation.py | 8 +- sklearn/multioutput.py | 4 +- sklearn/tests/test_metadata_routing.py | 14 +--- .../test_metaestimators_metadata_routing.py | 11 ++- sklearn/utils/tests/test_validation.py | 4 +- sklearn/utils/validation.py | 2 +- 10 files changed, 83 insertions(+), 59 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 5e7bfe2ab4a31..542e29acfcb59 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -44,7 +44,7 @@ ) from .utils._plotting import _BinaryClassifierCurveDisplayMixin from .utils.validation import ( - _check_fit_params, + _check_method_params, _check_pos_label_consistency, _check_sample_weight, _num_samples, @@ -611,7 +611,7 @@ def _fit_classifier_calibrator_pair( ------- calibrated_classifier : _CalibratedClassifier instance """ - fit_params_train = _check_fit_params(X, fit_params, train) + fit_params_train = _check_method_params(X, fit_params, train) X_train, y_train = _safe_indexing(X, train), _safe_indexing(y, train) X_test, y_test = _safe_indexing(X, test), _safe_indexing(y, test) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index a399fbae8af6f..77e77fff758a5 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -32,7 +32,11 @@ from ..utils.extmath import softmax from ..utils.extmath import row_norms from ..utils.optimize import _newton_cg, _check_optimize_result -from ..utils.validation import check_is_fitted, _check_sample_weight, _check_fit_params +from ..utils.validation import ( + check_is_fitted, + _check_sample_weight, + _check_method_params, +) from ..utils.multiclass import check_classification_targets from ..utils.parallel import delayed, Parallel from 
..utils._param_validation import StrOptions, Interval @@ -576,24 +580,25 @@ def _log_reg_scoring_path( y, train, test, - pos_class=None, - Cs=10, - scoring=None, - fit_intercept=False, - max_iter=100, - tol=1e-4, - class_weight=None, - verbose=0, - solver="lbfgs", - penalty="l2", - dual=False, - intercept_scaling=1.0, - multi_class="auto", - random_state=None, - max_squared_sum=None, - sample_weight=None, - l1_ratio=None, - score_params=None, + *, + pos_class, + Cs, + scoring, + fit_intercept, + max_iter, + tol, + class_weight, + verbose, + solver, + penalty, + dual, + intercept_scaling, + multi_class, + random_state, + max_squared_sum, + sample_weight, + l1_ratio, + score_params, ): """Computes scores across logistic_regression_path @@ -789,7 +794,9 @@ def _log_reg_scoring_path( scores.append(log_reg.score(X_test, y_test)) else: score_params = score_params or {} - score_params = _check_fit_params(X=X, fit_params=score_params, indices=test) + score_params = _check_method_params( + X=X, fit_params=score_params, indices=test + ) scores.append(scoring(log_reg, X_test, y_test, **score_params)) return coefs, Cs, np.array(scores), n_iter @@ -1754,7 +1761,7 @@ def __init__( self.random_state = random_state self.l1_ratios = l1_ratios - def fit(self, X, y, sample_weight=None, **fit_params): + def fit(self, X, y, sample_weight=None, **params): """Fit the model according to the given training data. Parameters @@ -1770,16 +1777,21 @@ def fit(self, X, y, sample_weight=None, **fit_params): Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. - **fit_params : dict + **params : dict Parameters to pass to the underlying splitter and scorer. - .. versionadded:: 1.3 + .. versionadded:: 1.4 Returns ------- self : object Fitted LogisticRegressionCV estimator. """ + if params and not _routing_enabled(): + raise ValueError( + "params is only supported if enable_metadata_routing=True." + " See the User Guide for more information." + ) self._validate_params() @@ -1849,12 +1861,12 @@ def fit(self, X, y, sample_weight=None, **fit_params): obj=self, method="fit", sample_weight=sample_weight, - other_params=fit_params, + other_params=params, ) else: routed_params = Bunch() routed_params.splitter = Bunch(split={}) - routed_params.scorer = Bunch(score=fit_params) + routed_params.scorer = Bunch(score=params) if sample_weight is not None: routed_params.scorer.score["sample_weight"] = sample_weight @@ -2125,13 +2137,19 @@ def score(self, X, y, sample_weight=None, **score_params): **score_params : dict Parameters to pass to the `score` method of the underlying scorer. - .. versionadded:: 1.3 + .. versionadded:: 1.4 Returns ------- score : float Score of self.predict(X) w.r.t. y. """ + if score_params and not _routing_enabled(): + raise ValueError( + "score_params is only supported if enable_metadata_routing=True." + " See the User Guide for more information." + ) + scoring = self._get_scorer() if _routing_enabled(): routed_params = process_routing( @@ -2142,9 +2160,7 @@ def score(self, X, y, sample_weight=None, **score_params): ) else: routed_params = Bunch() - routed_params.scorer = Bunch(score=score_params) - if sample_weight is not None: - routed_params.scorer.score["sample_weight"] = sample_weight + routed_params.scorer = Bunch(score={}) return scoring( self, @@ -2159,7 +2175,7 @@ def get_metadata_routing(self): Please check :ref:`User Guide ` on how the routing mechanism works. - .. versionadded:: 1.3 + .. 
versionadded:: 1.4 Returns ------- diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 2684cd7ca8308..aca4ad411de17 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -533,7 +533,17 @@ def test_logistic_cv_multinomial_score(scoring, multiclass_agg_list): scorer = get_scorer(scoring + averaging) assert_array_almost_equal( _log_reg_scoring_path( - X, y, train, test, Cs=[1.0], scoring=scorer, **params + X, + y, + train, + test, + Cs=[1.0], + scoring=scorer, + pos_class=None, + max_squared_sum=None, + sample_weight=None, + score_params=None, + **params, )[2][0], scorer(lr, X[test], y[test]), ) @@ -2077,8 +2087,7 @@ def test_lr_cv_scores_differ_when_sample_weight_is_requested(): lr_cv2 = LogisticRegressionCV(scoring=scorer2) lr_cv2.fit(X, y, **kwargs) - with pytest.raises(AssertionError): - assert_almost_equal(lr_cv1.scores_[1], lr_cv2.scores_[1]) + assert pytest.approx(lr_cv1.scores_[1]) != lr_cv2.scores_[1] def test_lr_cv_scores_without_enabling_metadata_routing(): diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1621dd324f81c..43ceb692098d4 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -37,7 +37,7 @@ from ..utils.random import sample_without_replacement from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils._tags import _safe_tags -from ..utils.validation import indexable, check_is_fitted, _check_fit_params +from ..utils.validation import indexable, check_is_fitted, _check_method_params from ..utils.metaestimators import available_if from ..utils.parallel import delayed, Parallel from ..metrics._scorer import _check_multimetric_scoring, get_scorer_names @@ -800,7 +800,7 @@ def fit(self, X, y=None, *, groups=None, **fit_params): refit_metric = self.refit X, y, groups = indexable(X, y, groups) - fit_params = _check_fit_params(X, fit_params) + fit_params = _check_method_params(X, fit_params) cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator)) n_splits = cv_orig.get_n_splits(X, y, groups) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index a103139c1640d..6fc344f2f5e43 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -26,7 +26,7 @@ from ..base import is_classifier, clone from ..utils import indexable, check_random_state, _safe_indexing -from ..utils.validation import _check_fit_params +from ..utils.validation import _check_method_params from ..utils.validation import _num_samples from ..utils.parallel import delayed, Parallel from ..utils.metaestimators import _safe_split @@ -709,7 +709,7 @@ def _fit_and_score( # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = _check_fit_params(X, fit_params, train) + fit_params = _check_method_params(X, fit_params, train) if parameters is not None: # clone after setting parameters in case any parameters @@ -1109,7 +1109,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method): """ # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = _check_fit_params(X, fit_params, train) + fit_params = _check_method_params(X, fit_params, train) X_train, y_train = _safe_split(estimator, X, y, train) X_test, _ = _safe_split(estimator, X, y, test, train) @@ -1414,7 +1414,7 @@ def 
_permutation_test_score(estimator, X, y, groups, cv, scorer, fit_params): for train, test in cv.split(X, y, groups): X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) - fit_params = _check_fit_params(X, fit_params, train) + fit_params = _check_method_params(X, fit_params, train) estimator.fit(X_train, y_train, **fit_params) avg_score.append(scorer(estimator, X_test, y_test)) return np.mean(avg_score) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 90c1f04f7e46a..3a9264f95f9c6 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -39,7 +39,7 @@ ) from .utils.metaestimators import available_if from .utils.multiclass import check_classification_targets -from .utils.validation import _check_fit_params, check_is_fitted, has_fit_parameter +from .utils.validation import _check_method_params, check_is_fitted, has_fit_parameter from .utils.parallel import delayed, Parallel from .utils._param_validation import HasMethods, StrOptions @@ -261,7 +261,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): "Underlying estimator does not support sample weights." ) - fit_params_validated = _check_fit_params(X, fit_params) + fit_params_validated = _check_method_params(X, fit_params) routed_params = Bunch(estimator=Bunch(fit=fit_params_validated)) if sample_weight is not None: routed_params.estimator.fit["sample_weight"] = sample_weight diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 25542cee72bee..455248ab90348 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -99,21 +99,15 @@ def record_metadata(obj, method, record_default=True, **kwargs): obj._records[method] = kwargs -def check_recorded_metadata(obj, method, **kwargs): +def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): """Check whether the expected metadata is passed to the object's method.""" records = getattr(obj, "_records", dict()).get(method, dict()) assert set(kwargs.keys()) == set(records.keys()) for key, value in kwargs.items(): recorded_value = records[key] - # The following condition is used to check for the sample weight in an - # underlying estimator if the sample weight is taken as a subset of the - # original sample weight for example if it extracted based on the test - # indices. 
- if ( - key == "sample_weight" - and recorded_value is not None - and len(recorded_value) < len(value) - ): + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: assert np.isin(recorded_value, value).all() else: assert recorded_value is value diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index bdc02225da31c..61616c72899f6 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -213,10 +213,10 @@ def split(self, X, y=None, groups="default"): yield test_indices, train_indices def get_n_splits(self, X=None, y=None, groups=None): - pass + pass # pragma: no cover def _iter_test_indices(self, X=None, y=None, groups=None): - pass + pass # pragma: no cover METAESTIMATORS = [ @@ -411,7 +411,12 @@ def test_metadata_is_routed_correctly_to_scorer(cv_scorer): kwargs = {"sample_weight": sample_weight} method(X, y, **kwargs) for _scorer in registry: - check_recorded_metadata(obj=_scorer, method="score", **kwargs) + check_recorded_metadata( + obj=_scorer, + method="score", + split_params=("sample_weight",), + **kwargs, + ) @pytest.mark.parametrize( diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 4a765d1404794..ddc112b3f0c4a 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -58,7 +58,7 @@ FLOAT_DTYPES, _get_feature_names, _check_feature_names_in, - _check_fit_params, + _check_method_params, _check_response_method, ) from sklearn.base import BaseEstimator @@ -1500,7 +1500,7 @@ def test_check_fit_params(indices): "scalar-str": "xxx", "None": None, } - result = _check_fit_params(X, fit_params, indices) + result = _check_method_params(X, fit_params, indices) indices_ = indices if indices is not None else list(range(X.shape[0])) for key in ["sparse-row", "scalar-int", "scalar-str", "None"]: diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 6179d91c2a491..0b5c2dcd148f6 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1923,7 +1923,7 @@ def _check_response_method(estimator, response_method): return prediction_method -def _check_fit_params(X, fit_params, indices=None): +def _check_method_params(X, fit_params, indices=None): """Check and validate the parameters passed during `fit`. Parameters From 0866c424f5e3cd707f318a8c92f856235d9d6182 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 26 Jun 2023 11:37:00 +0500 Subject: [PATCH 069/194] Add changelog entry --- doc/whats_new/v1.4.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 4946576d83056..df359125cc88f 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -58,3 +58,12 @@ TODO: update at the time of the release. :meth:`base.OutlierMixin.fit_predict` now accept ``**kwargs`` which are passed to the ``fit`` method of the the estimator. :pr:`26506` by `Adrin Jalali`_. + +:mod:`sklearn.linear_model` +........................... + +- |Enhancement| :class:`linear_model.LogisticRegressionCV` now supports + metadata routing. :meth:`linear_model.LogisticRegressionCV.fit` now + accepts ``**params`` which are passed to the underlying splitter and + scorer. 
:meth:`linear_model.LogisticRegressionCV.score` now accepts + ``**score_params`` which are passed to the underlying scorer. From 43f971b8a73a4d07470b1ca9d217753b48973c29 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 26 Jun 2023 11:44:09 +0500 Subject: [PATCH 070/194] Add user and pr information --- doc/whats_new/v1.4.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index df359125cc88f..3fad3bfc07492 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -67,3 +67,4 @@ TODO: update at the time of the release. accepts ``**params`` which are passed to the underlying splitter and scorer. :meth:`linear_model.LogisticRegressionCV.score` now accepts ``**score_params`` which are passed to the underlying scorer. + :pr:`26525` by :user:`Omar Salman`_ From db63769a951b7696e814b2144c1c186c577c8dbc Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 26 Jun 2023 12:24:22 +0500 Subject: [PATCH 071/194] Changelog adjustment --- doc/whats_new/v1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 3fad3bfc07492..d5db3ee1c6ad2 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -67,4 +67,4 @@ TODO: update at the time of the release. accepts ``**params`` which are passed to the underlying splitter and scorer. :meth:`linear_model.LogisticRegressionCV.score` now accepts ``**score_params`` which are passed to the underlying scorer. - :pr:`26525` by :user:`Omar Salman`_ + :pr:`26525` by :user:`Omar Salman`_. From a9b984fb13588733a952c5ee46bfb677966723d2 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 26 Jun 2023 12:26:57 +0500 Subject: [PATCH 072/194] Remove repr method from ConsumingScorer --- sklearn/tests/test_metaestimators_metadata_routing.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index b97238e784688..f74a8330d80c0 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -181,9 +181,6 @@ def __init__(self, registry=None): super().__init__(score_func="test", sign=1, kwargs={}) self.registry = registry - def __repr__(self): - return "Consuming_Scorer" - def __call__( self, estimator, X, y_true, sample_weight="default", metadata="default" ): From 52f59212c19ad895399e9e63fbfe5e978a636703 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 3 Jul 2023 12:17:30 +0200 Subject: [PATCH 073/194] handle the np.inf case in roc-curve --- sklearn/model_selection/_prediction.py | 7 ++++--- sklearn/model_selection/tests/test_prediction.py | 11 +++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py index e0946e0e77569..98cc868d16b7d 100644 --- a/sklearn/model_selection/_prediction.py +++ b/sklearn/model_selection/_prediction.py @@ -23,15 +23,14 @@ from ..utils.parallel import Parallel, delayed from ..utils.validation import ( _check_fit_params, - _check_sample_weight, _check_pos_label_consistency, + _check_sample_weight, _num_samples, check_consistent_length, check_is_fitted, indexable, ) - -from ._split import check_cv, StratifiedShuffleSplit +from ._split import StratifiedShuffleSplit, check_cv def _estimator_has(attr): @@ -134,6 +133,8 @@ def _fit_and_score( fpr, tpr, potential_thresholds = scorer( classifier, X_val, y_val, sample_weight=sw_val ) + # For 
fpr=0/tpr=0, the threshold is set to `np.inf`. We need to remove it. + fpr, tpr, potential_thresholds = fpr[1:], tpr[1:], potential_thresholds[1:] # thresholds are in decreasing order return potential_thresholds[::-1], ((1 - fpr)[::-1], tpr[::-1]) elif score_method in { diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py index afbfa62401634..a30c737b3f59f 100644 --- a/sklearn/model_selection/tests/test_prediction.py +++ b/sklearn/model_selection/tests/test_prediction.py @@ -10,8 +10,8 @@ from sklearn.metrics import ( balanced_accuracy_score, confusion_matrix, - fbeta_score, f1_score, + fbeta_score, make_scorer, precision_recall_curve, precision_score, @@ -19,6 +19,8 @@ roc_curve, ) from sklearn.metrics._scorer import _ContinuousScorer +from sklearn.model_selection import CutOffClassifier +from sklearn.model_selection._prediction import _fit_and_score from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC @@ -30,9 +32,6 @@ assert_array_equal, ) -from sklearn.model_selection import CutOffClassifier -from sklearn.model_selection._prediction import _fit_and_score - @pytest.mark.parametrize( "scorer, score_method", @@ -110,12 +109,12 @@ def test_fit_and_score_scorers(scorer, score_method): ( make_scorer(roc_curve, needs_proba=True), "max_tnr_at_tpr_constraint", - [[0.0, 1.0, 1.0], [1.0, 1.0, 0.0]], + [[0.0, 1.0], [1.0, 1.0]], ), ( make_scorer(roc_curve, needs_proba=True), "max_tpr_at_tnr_constraint", - [[0.0, 1.0, 1.0], [1.0, 1.0, 0.0]], + [[0.0, 1.0], [1.0, 1.0]], ), ( make_scorer(precision_recall_curve, needs_proba=True), From 314bc83018e5e58f02ff630e57280cf0aa907127 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 7 Jul 2023 16:40:39 +0500 Subject: [PATCH 074/194] Adjust changelog --- doc/whats_new/v1.4.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index f1e4458f8b103..d69e89f2b4996 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -78,6 +78,16 @@ Changelog - |Fix| :func:`feature_selection.mutual_info_regression` now correctly computes the result when `X` is of integer dtype. :pr:`26748` by :user:`Yao Xiao `. +:mod:`sklearn.linear_model` +........................... + +- |Enhancement| :class:`linear_model.LogisticRegressionCV` now supports + metadata routing. :meth:`linear_model.LogisticRegressionCV.fit` now + accepts ``**params`` which are passed to the underlying splitter and + scorer. :meth:`linear_model.LogisticRegressionCV.score` now accepts + ``**score_params`` which are passed to the underlying scorer. + :pr:`26525` by :user:`Omar Salman `. + :mod:`sklearn.tree` ................... @@ -104,13 +114,3 @@ TODO: update at the time of the release. :meth:`base.OutlierMixin.fit_predict` now accept ``**kwargs`` which are passed to the ``fit`` method of the the estimator. :pr:`26506` by `Adrin Jalali`_. - -:mod:`sklearn.linear_model` -........................... - -- |Enhancement| :class:`linear_model.LogisticRegressionCV` now supports - metadata routing. :meth:`linear_model.LogisticRegressionCV.fit` now - accepts ``**params`` which are passed to the underlying splitter and - scorer. :meth:`linear_model.LogisticRegressionCV.score` now accepts - ``**score_params`` which are passed to the underlying scorer. - :pr:`26525` by :user:`Omar Salman`_. 
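In practice, the behaviour described by this changelog entry amounts to the sketch below, mirroring the new tests. It assumes `enable_metadata_routing=True`; the sample weights are arbitrary::

    import numpy as np

    from sklearn import config_context
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.metrics import get_scorer

    X, y = make_classification(n_samples=100, random_state=0)
    sample_weight = np.where(y == 1, 2.0, 1.0)  # arbitrary per-sample weights

    with config_context(enable_metadata_routing=True):
        # The scorer explicitly requests `sample_weight`, so LogisticRegressionCV
        # routes the weights passed to `fit` down to the scorer as well.
        weighted_accuracy = get_scorer("accuracy")
        weighted_accuracy.set_score_request(sample_weight=True)
        model = LogisticRegressionCV(scoring=weighted_accuracy)
        model.fit(X, y, sample_weight=sample_weight)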
From 9a8ef4e449d837460fbe6e007b862116ba13bd5a Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 10 Jul 2023 15:58:06 +0500 Subject: [PATCH 075/194] Add tests for error when passing params when routing not enabled in LogisticRegressionCV --- sklearn/linear_model/tests/test_logistic.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 4aa9625d60f49..814915600e990 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -2141,3 +2141,18 @@ def test_zero_max_iter(solver): np.full(shape=(X.shape[0], 2), fill_value=0.5), ) assert clf.score(X, y) < 0.7 + + +def test_passing_params_without_enabling_metadata_routing(): + X, y = make_classification(n_samples=10, random_state=0) + lr_cv = LogisticRegressionCV() + msg = "params is only supported if enable_metadata_routing=True" + + with config_context(enable_metadata_routing=False): + params = {"extra_param": 1.0} + + with pytest.raises(ValueError, match=msg): + lr_cv.fit(X, y, **params) + + with pytest.raises(ValueError, match=msg): + lr_cv.score(X, y, **params) From 5b723a0fa2ebe234aa3da6460f312f69a6c9ae23 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 13 Jul 2023 12:25:01 +0500 Subject: [PATCH 076/194] Address PR suggestions partially --- sklearn/linear_model/tests/test_logistic.py | 13 +++++++++++-- sklearn/tests/test_metadata_routing.py | 10 +++++++++- .../test_metaestimators_metadata_routing.py | 16 ++++++++-------- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 814915600e990..9192111757330 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -2077,6 +2077,10 @@ def test_liblinear_not_stuck(): def test_lr_cv_scores_differ_when_sample_weight_is_requested(): + """Test sample_weight is correctly passed to the scorer in + LogisticRegressionCV :meth:`fit` by checking the difference + in scores with the case when sample_weight is not requested. 
+ """ rng = np.random.RandomState(10) X, y = make_classification(n_samples=10, random_state=rng) sample_weight = np.ones(len(y)) @@ -2093,10 +2097,13 @@ def test_lr_cv_scores_differ_when_sample_weight_is_requested(): lr_cv2 = LogisticRegressionCV(scoring=scorer2) lr_cv2.fit(X, y, **kwargs) - assert pytest.approx(lr_cv1.scores_[1]) != lr_cv2.scores_[1] + assert not np.allclose(lr_cv1.scores_[1], lr_cv2.scores_[1]) def test_lr_cv_scores_without_enabling_metadata_routing(): + """Test that sample_weight is passed correctly to the scorer in + LogisticRegressionCV :meth:`fit` even when `enable_metadata_routing=False` + """ rng = np.random.RandomState(10) X, y = make_classification(n_samples=10, random_state=rng) sample_weight = np.ones(len(y)) @@ -2114,7 +2121,7 @@ def test_lr_cv_scores_without_enabling_metadata_routing(): lr_cv2 = LogisticRegressionCV(scoring=scorer2) lr_cv2.fit(X, y, **kwargs) - assert_almost_equal(lr_cv1.scores_[1], lr_cv2.scores_[1]) + assert_allclose(lr_cv1.scores_[1], lr_cv2.scores_[1]) @pytest.mark.parametrize("solver", SOLVERS) @@ -2144,6 +2151,8 @@ def test_zero_max_iter(solver): def test_passing_params_without_enabling_metadata_routing(): + """Test that the right error message is raised when metadata params + are passed while not supported when `enable_metadata_routing=False`.""" X, y = make_classification(n_samples=10, random_state=0) lr_cv = LogisticRegressionCV() msg = "params is only supported if enable_metadata_routing=True" diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index c110a88990fc2..daa8fe795e4fb 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -110,7 +110,15 @@ def record_metadata(obj, method, record_default=True, **kwargs): def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): - """Check whether the expected metadata is passed to the object's method.""" + """Check whether the expected metadata is passed to the object's method. + + Parameters + ---------- + split_params : tuple, default=empty + specifies any parameters which are to be checked as being a subset + of the original values. + + """ records = getattr(obj, "_records", dict()).get(method, dict()) assert set(kwargs.keys()) == set(records.keys()) for key, value in kwargs.items(): diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index f74a8330d80c0..d055dff496bf7 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -390,11 +390,11 @@ def set_request(estimator, method_name): check_recorded_metadata(estimator, method_name, **kwargs) -@pytest.mark.parametrize( - "cv_scorer", - CV_SCORERS, -) +@pytest.mark.parametrize("cv_scorer", CV_SCORERS) def test_metadata_is_routed_correctly_to_scorer(cv_scorer): + """Test that any requested metadata is correctly routed to the underlying + scorers in CV estimators. + """ registry = _Registry() cls = cv_scorer["cv_estimator"] scorer_name = cv_scorer["scorer_name"] @@ -416,11 +416,11 @@ def test_metadata_is_routed_correctly_to_scorer(cv_scorer): ) -@pytest.mark.parametrize( - "cv_splitter", - CV_SPLITTERS, -) +@pytest.mark.parametrize("cv_splitter", CV_SPLITTERS) def test_metadata_is_routed_correctly_to_splitter(cv_splitter): + """Test that any requested metadata is correctly routed to the underlying + splitters in CV estimators. 
+ """ registry = _Registry() cls = cv_splitter["cv_estimator"] splitter_name = cv_splitter["splitter_name"] From 9ce463dd396368747776f139458b775d01187f34 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 12:04:33 +0200 Subject: [PATCH 077/194] address comment Tim --- doc/modules/prediction.rst | 51 ++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst index 4a5a014d38cdf..6a83261a49111 100644 --- a/doc/modules/prediction.rst +++ b/doc/modules/prediction.rst @@ -7,17 +7,23 @@ Tuning cut-off decision threshold for classes prediction ======================================================== Classifiers are predictive models: they use statistical learning to predict -outcomes. The outcomes of a classifier takes two forms: a "soft" score for each -sample in relation to each class, and a "hard" categorical prediction (i.e. -class label). Soft predictions are obtained using :term:`predict_proba` or -:term:`decision_function` while hard predictions are obtained using -:term:`predict`. - -In scikit-learn, there is a connection between soft and hard prediction. In the -case of a binary classification, hard predictions are obtained by associating -the positive class with probability value greater than 0.5 (obtained with -:term:`predict_proba`) or decision function value greater than 0 (obtained with -:term:`decision_function`). +outcomes. The outcomes of a classifier are scores for each sample in relation +to each class and categorical prediction (class label). Scores are obtained +from :term:`predict_proba` or :term:`decision_function`. The former returns +posterior probability estimates for each class while the latter returns a +decision function value for each class. The decision function value is a +measure of how strongly the sample is predicted to belong to the positive +class (e.g. the distance to the decisin boundary). A decision rule is then +defined by thresholding the scores and obtained the class label for each +sample. Those labels are obtained with :term:`predict`. + +For binary classification in scikit-learn, class labels are obtained by +associating the positive class with probability estimates greater than 0.5 +(obtained with :term:`predict_proba`) or decision function values greater than +0 (obtained with :term:`decision_function`). + +Here, we show an example that illustrates the relation between posterior +probability estimates and class labels:: >>> from sklearn.datasets import make_classification >>> from sklearn.tree import DecisionTreeClassifier @@ -31,33 +37,30 @@ the positive class with probability value greater than 0.5 (obtained with >>> classifier.predict(X[:4]) array([0, 0, 1, 1]) - -Similar rules apply for other classification problems. - -While these approaches are reasonable as default behaviors, they might not be -adapted to some cases. The context and nature of the use case define the +While these approaches are reasonable as default behaviors, they are not be +ideal for all cases. The context and nature of the use case defines the expected behavior of the classifier and thus the strategy to convert soft predictions into hard predictions. We illustrate this point with an example. Let's imagine the deployment of a predictive model helping medical doctors to -detect cancers. In a setting where this model was a tool to discard obvious +detect tumour. 
In a setting where this model was a tool to discard obvious cases and false positives don't lead to potentially harmful treatments, doctors -might be interested in having a high recall (all cancers cases should be tagged +might be interested in having a high recall (all cancer cases should be tagged as such) to not miss any patient with a cancer. However, that is at the cost of having more false positive predictions (i.e. lower precision). Thus, in terms of decision threshold, it may be better to classify a patient as having a cancer -for a probability lower than 0.5. +for a probability estimate lower than 0.5. Post-tuning of the decision threshold ===================================== One solution to address the problem stated in the introduction is to tune the decision -threshold of the classifier once this model has been trained. The -:class:`~sklearn.model_selection.CutOffClassifier` allows to tune this threshold using -an internal cross-validation. The optimum threshold is tuned to maximize a given metric +threshold of the classifier once the model has been trained. The +:class:`~sklearn.model_selection.CutOffClassifier` tunes this threshold using +an internal cross-validation. The optimum threshold is chosen to maximize a given metric with or without constraints. -The following image illustrate the tuning of the cut-off point for a gradient +The following image illustrates the tuning of the cut-off point for a gradient boosting classifier. While the vanilla and tuned classifiers provide the same Receiver Operating Characteristic (ROC) and Precision-Recall curves, and thus the same :term:`predict_proba` outputs, the "hard" predictions differ because of @@ -125,7 +128,7 @@ For these options, the `constraint_value` parameter needs to be defined. In addi you can use the `pos_label` parameter to indicate the label of the class of interest. The final strategy maximizes a custom utility function. This problem is also known as -cost-sensitive learning. The utility function is defined by providing dictionary +cost-sensitive learning. The utility function is defined by providing a dictionary containing the cost-gain associated with the entries of the confusion matrix. The keys are defined as `{"tn", "fp", "fn", "tp"}`. The class of interest is defined using the `pos_label` parameter. Refer to :ref:`cost_sensitive_learning_example` for an example From bba8f555eca57f1bcf97adb08f1ae49a6ccf9084 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 12:06:48 +0200 Subject: [PATCH 078/194] iter --- examples/model_selection/plot_cutoff_tuning.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_cutoff_tuning.py index d9fff88512586..1fb748f6a798b 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_cutoff_tuning.py @@ -22,7 +22,8 @@ .. topic:: References .. [1] "Statlog (German Credit Data) Data Set", UCI Machine Learning Repository, - `Link `_. # noqa + `Link + `_. """ # %% @@ -82,6 +83,7 @@ # From these four metrics, scikit-learn does not provide a scorer for the FPR. We # therefore need to define a small custom function to compute it. 
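One possible shape for such a helper is sketched below. The definition actually used by the example is not reproduced in this hunk and may differ, notably in how it handles `pos_label`::

    from sklearn.metrics import confusion_matrix, make_scorer

    def fpr_score(y_true, y_pred):
        # False positive rate: fraction of true negatives wrongly flagged positive.
        tn, fp, _, _ = confusion_matrix(y_true, y_pred).ravel()
        return fp / (fp + tn)

    # The FPR should be as low as possible, hence `greater_is_better=False`.
    fpr_scorer = make_scorer(fpr_score, greater_is_better=False)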
import numpy as np + from sklearn.metrics import confusion_matrix, make_scorer, precision_score, recall_score @@ -155,7 +157,8 @@ def gain_cost_score(y, y_pred, **kwargs): # columns, we use the helper function # :func:`~sklearn.compose.make_column_selector` and the fact that the # categorical features are stored as `category` dtype. -from sklearn.compose import ColumnTransformer, make_column_selector as selector +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_selector as selector from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.pipeline import Pipeline from sklearn.preprocessing import OrdinalEncoder @@ -195,6 +198,7 @@ def gain_cost_score(y, y_pred, **kwargs): # We evaluate the performance of our predictive model using the ROC and Precision-Recall # curves. import matplotlib.pyplot as plt + from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 6)) From d302678eff93b6e3010fa1a42627bef68b1f0fdd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 12:15:19 +0200 Subject: [PATCH 079/194] MAINT rename modules as per olivier comment --- doc/model_selection.rst | 2 +- doc/modules/prediction.rst | 159 ---- sklearn/model_selection/__init__.py | 2 +- sklearn/model_selection/_prediction.py | 802 ----------------- .../model_selection/tests/test_prediction.py | 832 ------------------ 5 files changed, 2 insertions(+), 1795 deletions(-) delete mode 100644 doc/modules/prediction.rst delete mode 100644 sklearn/model_selection/_prediction.py delete mode 100644 sklearn/model_selection/tests/test_prediction.py diff --git a/doc/model_selection.rst b/doc/model_selection.rst index 3a46315e65156..522544aefc820 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -14,6 +14,6 @@ Model selection and evaluation modules/cross_validation modules/grid_search - modules/prediction + modules/classification_threshold modules/model_evaluation modules/learning_curve diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst deleted file mode 100644 index 6a83261a49111..0000000000000 --- a/doc/modules/prediction.rst +++ /dev/null @@ -1,159 +0,0 @@ -.. currentmodule:: sklearn.model_selection - -.. _cutoffclassifier: - -======================================================== -Tuning cut-off decision threshold for classes prediction -======================================================== - -Classifiers are predictive models: they use statistical learning to predict -outcomes. The outcomes of a classifier are scores for each sample in relation -to each class and categorical prediction (class label). Scores are obtained -from :term:`predict_proba` or :term:`decision_function`. The former returns -posterior probability estimates for each class while the latter returns a -decision function value for each class. The decision function value is a -measure of how strongly the sample is predicted to belong to the positive -class (e.g. the distance to the decisin boundary). A decision rule is then -defined by thresholding the scores and obtained the class label for each -sample. Those labels are obtained with :term:`predict`. - -For binary classification in scikit-learn, class labels are obtained by -associating the positive class with probability estimates greater than 0.5 -(obtained with :term:`predict_proba`) or decision function values greater than -0 (obtained with :term:`decision_function`). 
- -Here, we show an example that illustrates the relation between posterior -probability estimates and class labels:: - - >>> from sklearn.datasets import make_classification - >>> from sklearn.tree import DecisionTreeClassifier - >>> X, y = make_classification(random_state=0) - >>> classifier = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y) - >>> classifier.predict_proba(X[:4]) - array([[0.94 , 0.06 ], - [0.94 , 0.06 ], - [0.04..., 0.95...], - [0.04..., 0.95...]]) - >>> classifier.predict(X[:4]) - array([0, 0, 1, 1]) - -While these approaches are reasonable as default behaviors, they are not be -ideal for all cases. The context and nature of the use case defines the -expected behavior of the classifier and thus the strategy to convert soft -predictions into hard predictions. We illustrate this point with an example. - -Let's imagine the deployment of a predictive model helping medical doctors to -detect tumour. In a setting where this model was a tool to discard obvious -cases and false positives don't lead to potentially harmful treatments, doctors -might be interested in having a high recall (all cancer cases should be tagged -as such) to not miss any patient with a cancer. However, that is at the cost of -having more false positive predictions (i.e. lower precision). Thus, in terms of -decision threshold, it may be better to classify a patient as having a cancer -for a probability estimate lower than 0.5. - -Post-tuning of the decision threshold -===================================== - -One solution to address the problem stated in the introduction is to tune the decision -threshold of the classifier once the model has been trained. The -:class:`~sklearn.model_selection.CutOffClassifier` tunes this threshold using -an internal cross-validation. The optimum threshold is chosen to maximize a given metric -with or without constraints. - -The following image illustrates the tuning of the cut-off point for a gradient -boosting classifier. While the vanilla and tuned classifiers provide the same -Receiver Operating Characteristic (ROC) and Precision-Recall curves, and thus -the same :term:`predict_proba` outputs, the "hard" predictions differ because of -the tuned cut-off point. The vanilla classifier predicts the class of interest -for a probability greater than 0.5 while the tuned classifier predicts the -class of interest for a very low probability (around 0.02). This cut-off point -optimizes a utility metric defined by the business case (in this case an -insurance company). - -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cutoff_tuning_002.png - :target: ../auto_examples/model_selection/plot_cutoff_tuning.html - :align: center - -Available options to tune the cut-off point -------------------------------------------- - -The cut-off point can be tuned with different strategies controlled by the parameter -`objective_metric`. - -A straightforward use case is to maximize a pre-defined scikit-learn metric. These -metrics can be found by calling the function :func:`~sklearn.metrics.get_scorer_names`. -We provide an example where we maximize the balanced accuracy. - -.. note:: - - It is important to notice that these metrics come with default parameters, notably - the label of the class of interested (i.e. `pos_label`). Thus, if this label is not - the right one for your application, you need to define a scorer and pass the right - `pos_label` (and additional parameters) using the - :func:`~sklearn.metrics.make_scorer`. 
You should refer to :ref:`scoring` to get all - information to define your own scoring function. For instance, we show how to pass - the information to the scorer that the label of interest is `0` when maximizing the - :func:`~sklearn.metrics.f1_score`: - - >>> from sklearn.linear_model import LogisticRegression - >>> from sklearn.model_selection import CutOffClassifier, train_test_split - >>> from sklearn.metrics import make_scorer, f1_score - >>> X, y = make_classification( - ... n_samples=1_000, weights=[0.1, 0.9], random_state=0) - >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - >>> pos_label = 0 - >>> scorer = make_scorer(f1_score, pos_label=pos_label) - >>> model = CutOffClassifier(LogisticRegression(), objective_metric=scorer).fit( - ... X_train, y_train) - >>> scorer(model, X_test, y_test) - 0.82... - >>> # compare it with the internal score found by cross-validation - >>> model.objective_score_ - 0.86... - -A second strategy aims at maximizing a metric while imposing constraints on another -metric. Four pre-defined options exist, 2 that uses the Receiver Operating -Characteristic (ROC) statistics and 2 that uses the Precision-Recall statistics. - -- `"max_tpr_at_tnr_constraint"`: maximizes the True Positive Rate (TPR) such that the - True Negative Rate (TNR) is the closest to a given value. -- `"max_tnr_at_tpr_constraint"`: maximizes the TNR such that the TPR is the closest to - a given value. -- `"max_precision_at_recall_constraint"`: maximizes the precision such that the recall - is the closest to a given value. -- `"max_recall_at_precision_constraint"`: maximizes the recall such that the precision - is the closest to a given value. - -For these options, the `constraint_value` parameter needs to be defined. In addition, -you can use the `pos_label` parameter to indicate the label of the class of interest. - -The final strategy maximizes a custom utility function. This problem is also known as -cost-sensitive learning. The utility function is defined by providing a dictionary -containing the cost-gain associated with the entries of the confusion matrix. The keys -are defined as `{"tn", "fp", "fn", "tp"}`. The class of interest is defined using the -`pos_label` parameter. Refer to :ref:`cost_sensitive_learning_example` for an example -depicting the use of such a utility function. - -Important notes regarding the internal cross-validation -------------------------------------------------------- - -By default :class:`~sklearn.model_selection.CutOffClassifier` uses a 5-fold stratified -cross-validation to tune the cut-off point. The parameter `cv` allows to control the -cross-validation strategy. It is possible to go around cross-validation by passing -`cv="prefit"` and provide an already fitted classifier. In this case, the cut-off point -is tuned on the data provided to the `fit` method. - -However, you should be extremely careful when using this option. You should never use -the same data for training the classifier and tuning the cut-off point at the risk of -overfitting. Refer to :ref:`cutoffclassifier_no_cv` that shows such overfitting. If -you are in a situation where you have limited resources, you should consider using -a float number that will use a single split internally. - -The option `cv="prefit"` should only be used when the provided classifier was already -trained on some data and you want to tune (or re-tune) on a new validation set. 
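As a sketch of this intended usage pattern (illustrative only, combining the pieces
described above), one would fit the classifier on a training split and then tune the
cut-off point with `cv="prefit"` on a separate validation split::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import CutOffClassifier, train_test_split

    X, y = make_classification(n_samples=1_000, weights=[0.7, 0.3], random_state=0)
    # keep a validation set that is never seen during training
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, random_state=0
    )

    classifier = LogisticRegression().fit(X_train, y_train)

    # `cv="prefit"` bypasses the internal cross-validation: the cut-off point is
    # tuned on the data passed to `fit`, here the held-out validation set only
    tuned_classifier = CutOffClassifier(
        classifier, objective_metric="balanced_accuracy", cv="prefit"
    ).fit(X_val, y_val)

    print(f"tuned cut-off point: {tuned_classifier.decision_threshold_:.3f}")

With `cv="prefit"`, the default `refit="auto"` resolves to `refit=False`, so the
already fitted classifier is left untouched and only the decision threshold is learned
from the validation data.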
- -Examples --------- - -- See :ref:`sphx_glr_auto_examples_model_selection_plot_cutoff_tuning.py` example for - an example of tuning the decision threshold of a classifier. diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 22ce40bec0c19..c1ce326c3201a 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -1,7 +1,7 @@ import typing +from ._classification_threshold import CutOffClassifier from ._plot import LearningCurveDisplay, ValidationCurveDisplay -from ._prediction import CutOffClassifier from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV from ._split import ( BaseCrossValidator, diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py deleted file mode 100644 index 98cc868d16b7d..0000000000000 --- a/sklearn/model_selection/_prediction.py +++ /dev/null @@ -1,802 +0,0 @@ -from collections.abc import MutableMapping -from inspect import signature -from numbers import Integral, Real - -import numpy as np - -from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, clone -from ..exceptions import NotFittedError -from ..metrics import ( - check_scoring, - confusion_matrix, - get_scorer_names, - make_scorer, - precision_recall_curve, - roc_curve, -) -from ..metrics._scorer import _ContinuousScorer -from ..utils import _safe_indexing -from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions -from ..utils._response import _get_response_values_binary -from ..utils.metaestimators import available_if -from ..utils.multiclass import type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_fit_params, - _check_pos_label_consistency, - _check_sample_weight, - _num_samples, - check_consistent_length, - check_is_fitted, - indexable, -) -from ._split import StratifiedShuffleSplit, check_cv - - -def _estimator_has(attr): - """Check if we can delegate a method to the underlying estimator. - - First, we check the first fitted estimator if available, otherwise we - check the unfitted estimator. - """ - return lambda self: ( - hasattr(self.estimator_, attr) - if hasattr(self, "estimator_") - else hasattr(self.estimator, attr) - ) - - -def _fit_and_score( - classifier, - X, - y, - sample_weight, - fit_params, - train_idx, - val_idx, - scorer, - score_method, -): - """Fit a classifier and compute the scores for different decision thresholds. - - Parameters - ---------- - classifier : estimator instance - The classifier to fit and used for scoring. If `classifier` is already fitted, - it will be used as is. - - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The entire dataset. - - y : array-like of shape (n_samples,) - The entire target vector. - - sample_weight : array-like of shape (n_samples,) - Some optional associated sample weights. - - fit_params : dict - Parameters to pass to the `fit` method of the underlying classifier. - - train_idx : ndarray of shape (n_train_samples,) or None - The indices of the training set. If `None`, `classifier` is expected to be - already fitted. - - val_idx : ndarray of shape (n_val_samples,) - The indices of the validation set used to score `classifier`. If `train_idx`, - the entire set will be used. - - scorer : scorer instance - The scorer taking `classifier` and the validation set as input and outputting - decision thresholds and scores. - - score_method : str or callable - The scoring method to use. 
Used to detect if we compute TPR/TNR or precision/ - recall. - - Returns - ------- - thresholds : ndarray of shape (n_thresholds,) - The decision thresholds used to compute the scores. They are returned in - ascending order. - - scores : ndarray of shape (n_thresholds,) or tuple os such arrays - The scores computed for each decision threshold. When TPR/TNR or precision/ - recall are computed, `scores` is a tuple of two arrays. - """ - arrays = (X, y) if sample_weight is None else (X, y, sample_weight) - check_consistent_length(*arrays) - - fit_parameters = signature(classifier.fit).parameters - supports_sw = "sample_weight" in fit_parameters - - if train_idx is not None: - X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) - y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) - if sample_weight is not None: - sw_train, sw_val = ( - _safe_indexing(sample_weight, train_idx), - _safe_indexing(sample_weight, val_idx), - ) - else: - sw_train, sw_val = None, None - fit_params_train = _check_fit_params(X, fit_params, indices=train_idx) - if supports_sw: - classifier.fit(X_train, y_train, sample_weight=sw_train, **fit_params_train) - else: - classifier.fit(X_train, y_train, **fit_params_train) - else: # prefit estimator, only a validation set is provided - X_val, y_val, sw_val = X, y, sample_weight - check_is_fitted(classifier, "classes_") - - if isinstance(score_method, str): - if score_method in {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"}: - fpr, tpr, potential_thresholds = scorer( - classifier, X_val, y_val, sample_weight=sw_val - ) - # For fpr=0/tpr=0, the threshold is set to `np.inf`. We need to remove it. - fpr, tpr, potential_thresholds = fpr[1:], tpr[1:], potential_thresholds[1:] - # thresholds are in decreasing order - return potential_thresholds[::-1], ((1 - fpr)[::-1], tpr[::-1]) - elif score_method in { - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - }: - precision, recall, potential_thresholds = scorer( - classifier, X_val, y_val, sample_weight=sw_val - ) - # thresholds are in increasing order - # the last element of the precision and recall is not associated with any - # threshold and should be discarded - return potential_thresholds, (precision[:-1], recall[:-1]) - return scorer(classifier, X_val, y_val, sample_weight=sw_val) - - -class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): - """Decision threshold tuning for binary classification. - - This estimator post-tunes the decision threshold (cut-off point) that is - used for converting probabilities (i.e. output of `predict_proba`) or - decision function (i.e. output of `decision_function`) into a predicted - class. The tuning is done by maximizing a binary metric, potentially - constrained by a another metric. - - Read more in the :ref:`User Guide `. - - .. versionadded:: 1.3 - - Parameters - ---------- - estimator : estimator instance - The classifier, fitted or not fitted, for which we want to optimize - the decision threshold used during `predict`. - - objective_metric : {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint", \ - "max_precision_at_recall_constraint, "max_recall_at_precision_constraint"} \ - , str, dict or callable, default="balanced_accuracy" - The objective metric to be optimized. 
Can be one of: - - * a string associated to a scoring function (see model evaluation - documentation); - * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`; - * `"max_tnr_at_tpr_constraint"`: find the decision threshold for a true - positive ratio (TPR) of `constraint_value`; - * `"max_tpr_at_tnr_constraint"`: find the decision threshold for a true - negative ratio (TNR) of `constraint_value`. - * `"max_precision_at_recall_constraint"`: find the decision threshold for a - recall of `constraint_value`; - * `"max_recall_at_precision_constraint"`: find the decision threshold for a - precision of `constraint_value`. - * a dictionary to be used as cost-sensitive matrix. The keys of the - dictionary should be: `("tp", "fp", "tn", "fn")`. The values of the - dictionary corresponds costs (negative values) and gains (positive - values). - - constraint_value : float, default=None - The value associated with the `objective_metric` metric for which we - want to find the decision threshold when `objective_metric` is equal one of - `"max_tnr_at_tpr_constraint"`, `"max_tpr_at_tnr_constraint"`, - `"max_precision_at_recall_constraint"`, or - `"max_recall_at_precision_constraint"`. - - pos_label : int, float, bool or str, default=None - The label of the positive class. Used when `objective_metric` is - `"max_tnr_at_tpr_constraint"`"`, `"max_tpr_at_tnr_constraint"`, or a dictionary. - When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`, - `pos_label` is set to 1, otherwise an error will be raised. When using a - scorer, `pos_label` can be passed as a keyword argument to - :func:`~sklearn.metrics.make_scorer`. - - response_method : {"auto", "decision_function", "predict_proba"}, default="auto" - Methods by the classifier `base_estimator` corresponding to the - decision function for which we want to find a threshold. It can be: - - * if `"auto"`, it will try to invoke, for each classifier, - `"predict_proba"` or `"decision_function"` in that order. - * otherwise, one of `"predict_proba"` or `"decision_function"`. - If the method is not implemented by the classifier, it will raise an - error. - - n_thresholds : int, default=100 - The number of decision threshold to use when discretizing the output - of the classifier `method`. - - cv : int, float, cross-validation generator, iterable or "prefit", default=None - Determines the cross-validation splitting strategy to train classifier. - Possible inputs for cv are: - - * `None`, to use the default 5-fold stratified K-fold cross validation; - * An integer number, to specify the number of folds in a stratified k-fold; - * A float number, to specify a single shuffle split. The floating number should - be in (0, 1) and represent the size of the validation set; - * An object to be used as a cross-validation generator; - * An iterable yielding train, test splits; - * `"prefit"`, to bypass the cross-validation. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - .. warning:: - Using `cv="prefit"` and passing the same dataset for fitting `estimator` - and tuning the cut-off point is subject to undesired overfitting. You can - refer to :ref:`cutoffclassifier_no_cv` for an example. - - This option should only be used when the set used to fit `estimator` is - different from the one used to tune the cut-off point (by calling - :meth:`CutOffClassifier.fit`). 
- - refit : "auto" or bool, default="auto" - Whether or not to refit the classifier on the entire training set once - the decision threshold has been found. By default, `refit="auto"` is - equivalent to `refit=False` when `cv` is a float number using a single - shuffle split or `cv="prefit"` otherwise `refit=True` in all other - cases. Note that forcing `refit=False` on cross-validation having more - than a single split will raise an error. Similarly, `refit=True` in - conjunction with `cv="prefit"` will raise an error. - - n_jobs : int, default=None - The number of jobs to run in parallel. When `cv` represents a - cross-validation strategy, the fitting and scoring on each data split - is done in parallel. ``None`` means 1 unless in a - :obj:`joblib.parallel_backend` context. ``-1`` means using all - processors. See :term:`Glossary ` for more details. - - random_state : int, RandomState instance or None, default=None - Controls the randomness of cross-validation when `cv` is a float. - See :term:`Glossary `. - - Attributes - ---------- - estimator_ : estimator instance - The fitted classifier used when predicting. - - decision_threshold_ : float - The new decision threshold. - - decision_thresholds_ : ndarray of shape (n_thresholds,) - All decision thresholds that were evaluated. - - objective_score_ : float or tuple of floats - The score of the objective metric associated with the decision threshold found. - When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, - `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, - `"max_recall_at_precision_constraint"`, it will corresponds to a tuple of - two float values: the first one is the score of the metric which is constrained - and the second one is the score of the maximized metric. - - objective_scores_ : ndarray of shape (n_thresholds,) - The scores of the objective metric associated with the decision thresholds. - - classes_ : ndarray of shape (n_classes,) - The class labels. - - n_features_in_ : int - Number of features seen during :term:`fit`. Only defined if the - underlying estimator exposes such an attribute when fit. - - feature_names_in_ : ndarray of shape (`n_features_in_`,) - Names of features seen during :term:`fit`. Only defined if the - underlying estimator exposes such an attribute when fit. - - See Also - -------- - sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates - probabilities. - - Examples - -------- - >>> from sklearn.datasets import make_classification - >>> from sklearn.ensemble import RandomForestClassifier - >>> from sklearn.metrics import classification_report - >>> from sklearn.model_selection import CutOffClassifier, train_test_split - >>> X, y = make_classification( - ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 - ... ) - >>> X_train, X_test, y_train, y_test = train_test_split( - ... X, y, stratify=y, random_state=42 - ... ) - >>> classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train) - >>> print(classification_report(y_test, classifier.predict(X_test))) - precision recall f1-score support - - 0 0.94 0.99 0.96 224 - 1 0.80 0.46 0.59 26 - - accuracy 0.93 250 - macro avg 0.87 0.72 0.77 250 - weighted avg 0.93 0.93 0.92 250 - - >>> classifier_tuned = CutOffClassifier( - ... classifier, objective_metric="max_precision_at_recall_constraint", - ... constraint_value=0.7, - ... ).fit(X_train, y_train) - >>> print( - ... f"Cut-off point found at {classifier_tuned.decision_threshold_:.3f} for a " - ... 
f"recall of {classifier_tuned.objective_score_[0]:.3f} and a precision of " - ... f"{classifier_tuned.objective_score_[1]:.3f}." - ... ) - Cut-off point found at 0.3... for a recall of 0.7... and a precision of 0.7... - >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) - precision recall f1-score support - - 0 0.96 0.96 0.96 224 - 1 0.68 0.65 0.67 26 - - accuracy 0.93 250 - macro avg 0.82 0.81 0.81 250 - weighted avg 0.93 0.93 0.93 250 - - """ - - _parameter_constraints: dict = { - "estimator": [ - HasMethods(["fit", "predict_proba"]), - HasMethods(["fit", "decision_function"]), - ], - "objective_metric": [ - StrOptions( - set(get_scorer_names()) - | { - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - } - ), - callable, - MutableMapping, - ], - "constraint_value": [Real, None], - "pos_label": [Real, str, "boolean", None], - "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], - "n_thresholds": [Interval(Integral, 1, None, closed="left")], - "cv": [ - "cv_object", - StrOptions({"prefit"}), - Interval(RealNotInt, 0.0, 1.0, closed="right"), - ], - "refit": ["boolean", StrOptions({"auto"})], - "n_jobs": [Integral, None], - "random_state": ["random_state"], - } - - def __init__( - self, - estimator, - *, - objective_metric="balanced_accuracy", - constraint_value=None, - pos_label=None, - response_method="auto", - n_thresholds=100, - cv=None, - refit="auto", - n_jobs=None, - random_state=None, - ): - self.estimator = estimator - self.objective_metric = objective_metric - self.constraint_value = constraint_value - self.pos_label = pos_label - self.response_method = response_method - self.n_thresholds = n_thresholds - self.cv = cv - self.refit = refit - self.n_jobs = n_jobs - self.random_state = random_state - - def fit(self, X, y, sample_weight=None, **fit_params): - """Fit the classifier and post-tune the decision threshold. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training data. - - y : array-like of shape (n_samples,) - Target values. - - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. If `None`, then samples are equally weighted. - - **fit_params : dict - Parameters to pass to the `fit` method of the underlying - classifier. - - Returns - ------- - self : object - Returns an instance of self. - """ - self._validate_params() - X, y = indexable(X, y) - - y_type = type_of_target(y, input_name="y") - if y_type != "binary": - raise ValueError( - f"Only binary classification is supported. 
Unknown label type: {y_type}" - ) - - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) - - if isinstance(self.cv, Real) and 0 < self.cv <= 1: - cv = StratifiedShuffleSplit( - n_splits=1, test_size=self.cv, random_state=self.random_state - ) - refit = False if self.refit == "auto" else self.refit - elif self.cv == "prefit": - if self.refit is True: - raise ValueError("When cv='prefit', refit cannot be True.") - try: - check_is_fitted(self.estimator, "classes_") - except NotFittedError as exc: - raise NotFittedError( - """When cv='prefit', `estimator` must be fitted.""" - ) from exc - cv, refit = self.cv, False - else: - cv = check_cv(self.cv, y=y, classifier=True) - if self.refit is False and cv.get_n_splits() > 1: - raise ValueError("When cv has several folds, refit cannot be False.") - if self.refit == "auto": - refit = cv.get_n_splits() > 1 - else: - refit = self.refit - - if self.response_method == "auto": - self._response_method = ["predict_proba", "decision_function"] - else: - self._response_method = self.response_method - - if isinstance(self.objective_metric, str) and self.objective_metric in { - "max_tpr_at_tnr_constraint", - "max_tnr_at_tpr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - }: - if self.constraint_value is None: - raise ValueError( - "When `objective_metric` is 'max_tpr_at_tnr_constraint', " - "'max_tnr_at_tpr_constraint', 'max_precision_at_recall_constraint'," - " or 'max_recall_at_precision_constraint', `constraint_value` must " - "be provided. Got None instead." - ) - constraint_value = self.constraint_value - else: - constraint_value = "highest" - - fit_parameters = signature(self.estimator.fit).parameters - supports_sw = "sample_weight" in fit_parameters - - # in the following block, we: - # - define the final classifier `self.estimator_` and train it if necessary - # - define `classifier` to be used to post-tune the decision threshold - # - define `split` to be used to fit/score `classifier` - if cv == "prefit": - self.estimator_ = self.estimator - classifier = self.estimator_ - splits = ([None, range(_num_samples(X))],) - else: - self.estimator_ = clone(self.estimator) - classifier = clone(self.estimator) - splits = cv.split(X, y) - - if refit: - # train on the whole dataset - X_train, y_train, sw_train = X, y, sample_weight - fit_params_train = _check_fit_params(X, fit_params, indices=None) - else: - # single split cross-validation - train_idx, _ = next(cv.split(X, y)) - X_train = _safe_indexing(X, train_idx) - y_train = _safe_indexing(y, train_idx) - if sample_weight is not None: - sw_train = _safe_indexing(sample_weight, train_idx) - else: - sw_train = None - fit_params_train = _check_fit_params(X, fit_params, indices=train_idx) - - if sw_train is not None and supports_sw: - self.estimator_.fit( - X_train, y_train, sample_weight=sw_train, **fit_params_train - ) - else: - self.estimator_.fit(X_train, y_train, **fit_params_train) - - if isinstance(self.objective_metric, MutableMapping): - keys = set(self.objective_metric.keys()) - if not keys == {"tp", "tn", "fp", "fn"}: - raise ValueError( - "Invalid keys in `objective_metric`. Valid keys are " - f"'tp', 'tn', 'fp', and 'fn'. Got {keys} instead." 
- ) - pos_label = _check_pos_label_consistency(self.pos_label, y) - - def cost_sensitive_score_func(y_true, y_pred, **kwargs): - costs_and_gain = np.array( - [ - [kwargs["tn"], kwargs["fp"]], - [kwargs["fn"], kwargs["tp"]], - ] - ) - - sample_weight = kwargs.get("sample_weight", None) - cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) - - pos_label, classes = kwargs["pos_label"], np.unique(y_true) - pos_label_idx = np.searchsorted(classes, pos_label) - if pos_label_idx == 0: - # reorder the confusion matrix to be aligned with the cost-matrix - cm = cm[::-1, ::-1] - - return (costs_and_gain * cm).sum() - - self._scorer = _ContinuousScorer( - score_func=cost_sensitive_score_func, - sign=1, - response_method=self._response_method, - kwargs={ - **self.objective_metric, - "pos_label": pos_label, - }, - ) - elif self.objective_metric in { - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - }: - if self._response_method == "predict_proba": - params_scorer = {"needs_proba": True, "pos_label": self.pos_label} - elif ( - isinstance(self._response_method, list) - and self._response_method[0] == "predict_proba" - and hasattr(classifier, "predict_proba") - ): - # TODO: this is due to a limitation in `make_scorer`: ideally, we should - # be able to pass a list of response methods to `make_scorer` and give - # priority to `predict_proba` other `decision_function`. - # Here, we manually check if the classifier provide `predict_proba` to - # use `needs_proba` instead and ensure that no error will be raised. - params_scorer = {"needs_proba": True, "pos_label": self.pos_label} - else: - params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} - - if "tpr" in self.objective_metric: # tpr/tnr - score_func = roc_curve - else: # precision/recall - score_func = precision_recall_curve - self._scorer = make_scorer(score_func, **params_scorer) - else: - scoring = check_scoring(classifier, scoring=self.objective_metric) - # add `pos_label` if requested by the scorer function - scorer_kwargs = {**scoring._kwargs} - signature_scoring_func = signature(scoring._score_func) - if ( - "pos_label" in signature_scoring_func.parameters - and "pos_label" not in scorer_kwargs - ): - if self.pos_label is None: - # Since the provided `pos_label` is the default, we need to - # use the default value of the scoring function that can be either - # `None` or `1`. - scorer_kwargs["pos_label"] = signature_scoring_func.parameters[ - "pos_label" - ].default - else: - scorer_kwargs["pos_label"] = self.pos_label - # transform a binary metric into a curve metric for all possible decision - # thresholds - self._scorer = _ContinuousScorer( - score_func=scoring._score_func, - sign=scoring._sign, - response_method=self._response_method, - kwargs=scorer_kwargs, - ) - - cv_thresholds, cv_scores = zip( - *Parallel(n_jobs=self.n_jobs)( - delayed(_fit_and_score)( - classifier, - X, - y, - sample_weight, - fit_params, - train_idx, - val_idx, - self._scorer, - self.objective_metric, - ) - for train_idx, val_idx in splits - ) - ) - - if any(len(th) == 1 for th in cv_thresholds): - raise ValueError( - "The provided estimator makes constant predictions. Therefore, it is " - "impossible to optimize the decision threshold." 
- ) - - # find the global min and max thresholds across all folds - min_threshold = np.min([th.min() for th in cv_thresholds]) - max_threshold = np.max([th.max() for th in cv_thresholds]) - self.decision_thresholds_ = np.linspace( - min_threshold, max_threshold, num=self.n_thresholds - ) - - def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): - return np.mean( - [ - np.interp(threshold_interpolated, th, sc) - for th, sc in zip(cv_thresholds, cv_scores) - ], - axis=0, - ) - - if constraint_value == "highest": # find best score - self.objective_scores_ = _mean_interpolated_score( - self.decision_thresholds_, cv_thresholds, cv_scores - ) - best_idx = self.objective_scores_.argmax() - self.objective_score_ = self.objective_scores_[best_idx] - self.decision_threshold_ = self.decision_thresholds_[best_idx] - else: - if "tpr" in self.objective_metric: # tpr/tnr - mean_tnr, mean_tpr = [ - _mean_interpolated_score( - self.decision_thresholds_, cv_thresholds, sc - ) - for sc in zip(*cv_scores) - ] - else: # precision/recall - mean_precision, mean_recall = [ - _mean_interpolated_score( - self.decision_thresholds_, cv_thresholds, sc - ) - for sc in zip(*cv_scores) - ] - - def _get_best_idx(constrained_score, maximized_score): - """Find the index of the best score constrained by another score.""" - indices = np.arange(len(constrained_score)) - mask = constrained_score >= constraint_value - mask_idx = maximized_score[mask].argmax() - return indices[mask][mask_idx] - - if self.objective_metric == "max_tpr_at_tnr_constraint": - constrained_score, maximized_score = mean_tnr, mean_tpr - elif self.objective_metric == "max_tnr_at_tpr_constraint": - constrained_score, maximized_score = mean_tpr, mean_tnr - elif self.objective_metric == "max_precision_at_recall_constraint": - constrained_score, maximized_score = mean_recall, mean_precision - else: # max_recall_at_precision_constraint - constrained_score, maximized_score = mean_precision, mean_recall - - self.objective_scores_ = (constrained_score, maximized_score) - best_idx = _get_best_idx(constrained_score, maximized_score) - self.objective_score_ = ( - constrained_score[best_idx], - maximized_score[best_idx], - ) - self.decision_threshold_ = self.decision_thresholds_[best_idx] - - if hasattr(self.estimator_, "n_features_in_"): - self.n_features_in_ = self.estimator_.n_features_in_ - if hasattr(self.estimator_, "feature_names_in_"): - self.feature_names_in_ = self.estimator_.feature_names_in_ - - return self - - @property - def classes_(self): - """Classes labels.""" - return self.estimator_.classes_ - - def predict(self, X): - """Predict the target of new samples. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The samples, as accepted by `estimator.predict`. - - Returns - ------- - C : ndarray of shape (n_samples,) - The predicted class. - """ - check_is_fitted(self, "estimator_") - pos_label = self._scorer._get_pos_label() - y_score, _ = _get_response_values_binary( - self.estimator_, X, self._response_method, pos_label=pos_label - ) - return self._scorer._from_scores_to_class_labels( - y_score, self.decision_threshold_, self.classes_ - ) - - @available_if(_estimator_has("predict_proba")) - def predict_proba(self, X): - """Predict class probabilities for `X` using the fitted estimator. 
- - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training vectors, where `n_samples` is the number of samples and - `n_features` is the number of features. - - Returns - ------- - probabilities : ndarray of shape (n_samples, n_classes) - The class probabilities of the input samples. - """ - check_is_fitted(self, "estimator_") - return self.estimator_.predict_proba(X) - - @available_if(_estimator_has("predict_log_proba")) - def predict_log_proba(self, X): - """Predict logarithm class probabilities for `X` using the fitted estimator. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training vectors, where `n_samples` is the number of samples and - `n_features` is the number of features. - - Returns - ------- - log_probabilities : ndarray of shape (n_samples, n_classes) - The logarithm class probabilities of the input samples. - """ - check_is_fitted(self, "estimator_") - return self.estimator_.predict_log_proba(X) - - @available_if(_estimator_has("decision_function")) - def decision_function(self, X): - """Decision function for samples in `X` using the fitted estimator. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training vectors, where `n_samples` is the number of samples and - `n_features` is the number of features. - - Returns - ------- - decisions : ndarray of shape (n_samples,) - The decision function computed the fitted estimator. - """ - check_is_fitted(self, "estimator_") - return self.estimator_.decision_function(X) - - def _more_tags(self): - return { - "binary_only": True, - "_xfail_checks": { - "check_classifiers_train": "Threshold at probability 0.5 does not hold", - "check_sample_weights_invariance": ( - "Due to the cross-validation and sample ordering, removing a sample" - " is not strictly equal to putting is weight to zero. Specific unit" - " tests are added for CutOffClassifier specifically." 
- ), - }, - } diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py deleted file mode 100644 index a30c737b3f59f..0000000000000 --- a/sklearn/model_selection/tests/test_prediction.py +++ /dev/null @@ -1,832 +0,0 @@ -import numpy as np -import pytest - -from sklearn.base import clone -from sklearn.datasets import load_breast_cancer, load_iris, make_classification -from sklearn.dummy import DummyClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.exceptions import NotFittedError -from sklearn.linear_model import LogisticRegression -from sklearn.metrics import ( - balanced_accuracy_score, - confusion_matrix, - f1_score, - fbeta_score, - make_scorer, - precision_recall_curve, - precision_score, - recall_score, - roc_curve, -) -from sklearn.metrics._scorer import _ContinuousScorer -from sklearn.model_selection import CutOffClassifier -from sklearn.model_selection._prediction import _fit_and_score -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from sklearn.tree import DecisionTreeClassifier -from sklearn.utils._mocking import CheckingClassifier -from sklearn.utils._testing import ( - _convert_container, - assert_allclose, - assert_array_equal, -) - - -@pytest.mark.parametrize( - "scorer, score_method", - [ - ( - _ContinuousScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - kwargs={}, - ), - "balanced_accuracy", - ), - ( - make_scorer(roc_curve, needs_proba=True), - "max_tnr_at_tpr_constraint", - ), - ( - make_scorer(roc_curve, needs_proba=True), - "max_tpr_at_tnr_constraint", - ), - ( - make_scorer(precision_recall_curve, needs_proba=True), - "max_precision_at_recall_constraint", - ), - ( - make_scorer(precision_recall_curve, needs_proba=True), - "max_recall_at_precision_constraint", - ), - ], -) -def test_fit_and_score_scorers(scorer, score_method): - """Check that `_fit_and_score` returns thresholds in ascending order for the - different accepted scorers.""" - X, y = make_classification(n_samples=100, random_state=0) - train_idx, val_idx = np.arange(50), np.arange(50, 100) - classifier = LogisticRegression() - - thresholds, scores = _fit_and_score( - classifier, - X, - y, - sample_weight=None, - fit_params={}, - train_idx=train_idx, - val_idx=val_idx, - scorer=scorer, - score_method=score_method, - ) - - if score_method.startswith("max_"): - assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) - assert isinstance(scores, tuple) and len(scores) == 2 - for sc in scores: - assert np.logical_and(sc >= 0, sc <= 1).all() - else: - assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) - assert isinstance(scores, np.ndarray) - assert np.logical_and(scores >= 0, scores <= 1).all() - - -@pytest.mark.parametrize( - "scorer, score_method, expected_score", - [ - ( - _ContinuousScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - kwargs={}, - ), - "balanced_accuracy", - [0.5, 1.0], - ), - ( - make_scorer(roc_curve, needs_proba=True), - "max_tnr_at_tpr_constraint", - [[0.0, 1.0], [1.0, 1.0]], - ), - ( - make_scorer(roc_curve, needs_proba=True), - "max_tpr_at_tnr_constraint", - [[0.0, 1.0], [1.0, 1.0]], - ), - ( - make_scorer(precision_recall_curve, needs_proba=True), - "max_precision_at_recall_constraint", - [[0.5, 1.0], [1.0, 1.0]], - ), - ( - make_scorer(precision_recall_curve, needs_proba=True), - 
"max_recall_at_precision_constraint", - [[0.5, 1.0], [1.0, 1.0]], - ), - ], -) -def test_fit_and_score_prefit(scorer, score_method, expected_score): - """Check the behaviour with a prefit classifier.""" - X, y = make_classification(n_samples=100, random_state=0) - - # `train_idx is None` to indicate that the classifier is prefit - train_idx, val_idx = None, np.arange(50, 100) - classifier = DecisionTreeClassifier(random_state=0) - - with pytest.raises(NotFittedError): - _fit_and_score( - classifier, - X, - y, - sample_weight=None, - fit_params={}, - train_idx=train_idx, - val_idx=val_idx, - scorer=scorer, - score_method=score_method, - ) - - classifier.fit(X, y) - # make sure that the classifier memorized the full dataset such that - # we get perfect predictions and thus match the expected score - assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) - - thresholds, scores = _fit_and_score( - classifier, - X, - y, - sample_weight=None, - fit_params={}, - train_idx=train_idx, - val_idx=val_idx, - scorer=scorer, - score_method=score_method, - ) - assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) - assert_allclose(scores, expected_score) - - -@pytest.mark.parametrize( - "scorer, score_method", - [ - ( - _ContinuousScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - kwargs={}, - ), - "balanced_accuracy", - ), - ( - make_scorer(roc_curve, needs_proba=True), - "max_tnr_at_tpr_constraint", - ), - ( - make_scorer(roc_curve, needs_proba=True), - "max_tpr_at_tnr_constraint", - ), - ( - make_scorer(precision_recall_curve, needs_proba=True), - "max_precision_at_recall_constraint", - ), - ( - make_scorer(precision_recall_curve, needs_proba=True), - "max_recall_at_precision_constraint", - ), - ], -) -def test_fit_and_score_sample_weight(scorer, score_method): - """Check that we dispatch the sample-weight to fit and score the classifier.""" - X, y = load_iris(return_X_y=True) - X, y = X[:100], y[:100] # only 2 classes - - # create a dataset and repeat twice the sample of class #0 - X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) - # create a sample weight vector that is equivalent to the repeated dataset - sample_weight = np.ones_like(y) - sample_weight[:50] *= 2 - - classifier = LogisticRegression() - train_repeated_idx = np.arange(X_repeated.shape[0]) - val_repeated_idx = np.arange(X_repeated.shape[0]) - thresholds_repeated, scores_repeated = _fit_and_score( - classifier, - X_repeated, - y_repeated, - sample_weight=None, - fit_params={}, - train_idx=train_repeated_idx, - val_idx=val_repeated_idx, - scorer=scorer, - score_method=score_method, - ) - - train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) - thresholds, scores = _fit_and_score( - classifier, - X, - y, - sample_weight=sample_weight, - fit_params={}, - train_idx=train_idx, - val_idx=val_idx, - scorer=scorer, - score_method=score_method, - ) - - assert_allclose(thresholds_repeated, thresholds) - assert_allclose(scores_repeated, scores) - - -@pytest.mark.parametrize( - "scorer, score_method", - [ - ( - _ContinuousScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - kwargs={}, - ), - "balanced_accuracy", - ), - ( - make_scorer(roc_curve, needs_proba=True), - "max_tnr_at_tpr_constraint", - ), - ( - make_scorer(roc_curve, needs_proba=True), - "max_tpr_at_tnr_constraint", - ), - ( - make_scorer(precision_recall_curve, needs_proba=True), - "max_precision_at_recall_constraint", - ), - ( - 
make_scorer(precision_recall_curve, needs_proba=True), - "max_recall_at_precision_constraint", - ), - ], -) -@pytest.mark.parametrize("fit_params_type", ["list", "array"]) -def test_fit_and_score_fit_params(scorer, score_method, fit_params_type): - """Check that we pass `fit_params` to the classifier when calling `fit`.""" - X, y = make_classification(n_samples=100, random_state=0) - fit_params = { - "a": _convert_container(y, fit_params_type), - "b": _convert_container(y, fit_params_type), - } - - classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) - train_idx, val_idx = np.arange(50), np.arange(50, 100) - - _fit_and_score( - classifier, - X, - y, - sample_weight=None, - fit_params=fit_params, - train_idx=train_idx, - val_idx=val_idx, - scorer=scorer, - score_method=score_method, - ) - - -def test_cutoffclassifier_no_binary(): - """Check that we raise an informative error message for non-binary problem.""" - X, y = make_classification(n_classes=3, n_clusters_per_class=1) - err_msg = "Only binary classification is supported." - with pytest.raises(ValueError, match=err_msg): - CutOffClassifier(LogisticRegression()).fit(X, y) - - -@pytest.mark.parametrize( - "params, err_type, err_msg", - [ - ( - {"cv": "prefit", "refit": True}, - ValueError, - "When cv='prefit', refit cannot be True.", - ), - ( - {"cv": 10, "refit": False}, - ValueError, - "When cv has several folds, refit cannot be False.", - ), - ( - {"cv": "prefit", "refit": False}, - NotFittedError, - "`estimator` must be fitted.", - ), - ], -) -def test_cutoffclassifier_conflict_cv_refit(params, err_type, err_msg): - """Check that we raise an informative error message when `cv` and `refit` - cannot be used together. - """ - X, y = make_classification(n_samples=100, random_state=0) - with pytest.raises(err_type, match=err_msg): - CutOffClassifier(LogisticRegression(), **params).fit(X, y) - - -@pytest.mark.parametrize( - "estimator", - [LogisticRegression(), SVC(), GradientBoostingClassifier(n_estimators=4)], -) -@pytest.mark.parametrize( - "response_method", ["predict_proba", "predict_log_proba", "decision_function"] -) -def test_cutoffclassifier_estimator_response_methods(estimator, response_method): - """Check that `CutOffClassifier` exposes the same response methods as the - underlying estimator. 
- """ - X, y = make_classification(n_samples=100, random_state=0) - - model = CutOffClassifier(estimator) - assert hasattr(model, response_method) == hasattr(estimator, response_method) - - model.fit(X, y) - assert hasattr(model, response_method) == hasattr(estimator, response_method) - - if hasattr(model, response_method): - y_pred_cutoff = getattr(model, response_method)(X) - y_pred_underlying_estimator = getattr(model.estimator_, response_method)(X) - - assert_allclose(y_pred_cutoff, y_pred_underlying_estimator) - - -@pytest.mark.parametrize( - "response_method", ["auto", "decision_function", "predict_proba"] -) -def test_cutoffclassifier_with_constraint_value(response_method): - """Check that `CutOffClassifier` is optimizing a given objective metric.""" - X, y = load_breast_cancer(return_X_y=True) - # remove feature to degrade performances - X = X[:, :5] - - # make the problem completely imbalanced such that the balanced accuracy is low - indices_pos = np.flatnonzero(y == 1) - indices_pos = indices_pos[: indices_pos.size // 50] - indices_neg = np.flatnonzero(y == 0) - - X = np.vstack([X[indices_neg], X[indices_pos]]) - y = np.hstack([y[indices_neg], y[indices_pos]]) - - lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) - n_thresholds = 100 - model = CutOffClassifier( - estimator=lr, - objective_metric="balanced_accuracy", - response_method=response_method, - n_thresholds=n_thresholds, - ) - score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) - score_baseline = balanced_accuracy_score(y, lr.predict(X)) - assert score_optimized > score_baseline - assert model.decision_thresholds_.shape == (n_thresholds,) - assert model.objective_scores_.shape == (n_thresholds,) - - -@pytest.mark.parametrize( - "metrics", - [ - ("max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"), - ("max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"), - ], -) -def test_cutoffclassifier_limit_metric_tradeoff(metrics): - """Check that an objective value of 0 give opposite predictions with tnr/tpr and - precision/recall. - """ - X, y = load_breast_cancer(return_X_y=True) - estimator = make_pipeline(StandardScaler(), LogisticRegression()) - model = CutOffClassifier( - estimator=estimator, - objective_metric=metrics[0], - constraint_value=0, - ) - y_pred_1 = model.fit(X, y).predict(X) - model.set_params(objective_metric=metrics[1]) - y_pred_2 = (~model.fit(X, y).predict(X).astype(bool)).astype(int) - assert np.mean(y_pred_1 == y_pred_2) > 0.98 - - -def test_cutoffclassifier_metric_with_parameter(): - """Check that we can pass a metric with a parameter in addition check that - `f_beta with beta=1` is equivalent to `f1`. 
- """ - X, y = load_breast_cancer(return_X_y=True) - lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) - model_fbeta = CutOffClassifier( - estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1) - ).fit(X, y) - model_f1 = CutOffClassifier( - estimator=lr, objective_metric=make_scorer(f1_score) - ).fit(X, y) - - assert model_fbeta.decision_threshold_ == pytest.approx( - model_f1.decision_threshold_ - ) - - -@pytest.mark.parametrize( - "response_method", ["auto", "decision_function", "predict_proba"] -) -@pytest.mark.parametrize( - "metric", - [ - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - make_scorer(balanced_accuracy_score), - make_scorer(f1_score, pos_label="cancer"), - {"tp": 5, "tn": 1, "fp": -1, "fn": -1}, - ], -) -def test_cutoffclassifier_with_string_targets(response_method, metric): - """Check that targets represented by str are properly managed. - Also, check with several metrics to be sure that `pos_label` is properly - dispatched. - """ - X, y = load_breast_cancer(return_X_y=True) - # Encode numeric targets by meaningful strings. We purposely designed the class - # names such that the `pos_label` is the first alphabetically sorted class and thus - # encoded as 0. - classes = np.array(["cancer", "healthy"], dtype=object) - y = classes[y] - model = CutOffClassifier( - estimator=make_pipeline(StandardScaler(), LogisticRegression()), - objective_metric=metric, - constraint_value=0.9, - pos_label="cancer", - response_method=response_method, - n_thresholds=100, - ).fit(X, y) - assert_array_equal(model.classes_, np.sort(classes)) - y_pred = model.predict(X) - assert_array_equal(np.sort(np.unique(y_pred)), np.sort(classes)) - - -@pytest.mark.parametrize("with_sample_weight", [True, False]) -def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): - """Check the behaviour of the `refit` parameter.""" - rng = np.random.RandomState(global_random_seed) - X, y = make_classification(n_samples=100, random_state=0) - if with_sample_weight: - sample_weight = rng.randn(X.shape[0]) - sample_weight = np.abs(sample_weight, out=sample_weight) - else: - sample_weight = None - - # check that `estimator_` if fitted on the full dataset when `refit=True` - estimator = LogisticRegression() - model = CutOffClassifier(estimator, refit=True).fit( - X, y, sample_weight=sample_weight - ) - - assert model.estimator_ is not estimator - estimator.fit(X, y, sample_weight=sample_weight) - assert_allclose(model.estimator_.coef_, estimator.coef_) - assert_allclose(model.estimator_.intercept_, estimator.intercept_) - - # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` - estimator = LogisticRegression().fit(X, y, sample_weight=sample_weight) - coef = estimator.coef_.copy() - model = CutOffClassifier(estimator, cv="prefit", refit=False).fit( - X, y, sample_weight=sample_weight - ) - - assert model.estimator_ is estimator - assert_allclose(model.estimator_.coef_, coef) - - # check that we train `estimator_` on the training split of a given cross-validation - estimator = LogisticRegression() - cv = [ - (np.arange(50), np.arange(50, 100)), - ] # single split - model = CutOffClassifier(estimator, cv=cv, refit=False).fit( - X, y, sample_weight=sample_weight - ) - - assert model.estimator_ is not estimator - if with_sample_weight: - sw_train = sample_weight[cv[0][0]] - else: - sw_train = None - estimator.fit(X[cv[0][0]], y[cv[0][0]], sample_weight=sw_train) - 
assert_allclose(model.estimator_.coef_, estimator.coef_) - - -@pytest.mark.parametrize( - "objective_metric", - [ - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - "balanced_accuracy", - ], -) -@pytest.mark.parametrize("fit_params_type", ["list", "array"]) -def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): - """Check that we pass `fit_params` to the classifier when calling `fit`.""" - X, y = make_classification(n_samples=100, random_state=0) - fit_params = { - "a": _convert_container(y, fit_params_type), - "b": _convert_container(y, fit_params_type), - } - - classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) - model = CutOffClassifier( - classifier, objective_metric=objective_metric, constraint_value=0.5 - ) - model.fit(X, y, **fit_params) - - -@pytest.mark.parametrize( - "objective_metric, constraint_value", - [ - ("max_tnr_at_tpr_constraint", 0.5), - ("max_tpr_at_tnr_constraint", 0.5), - ("max_precision_at_recall_constraint", 0.5), - ("max_recall_at_precision_constraint", 0.5), - ], -) -@pytest.mark.parametrize( - "response_method", ["auto", "decision_function", "predict_proba"] -) -def test_cutoffclassifier_response_method_scorer_with_constraint_metric( - objective_metric, constraint_value, response_method, global_random_seed -): - """Check that we use the proper scorer and forwarding the requested response method - for TNR/TPR and precision/recall metrics. - """ - X, y = make_classification(n_samples=100, random_state=global_random_seed) - classifier = LogisticRegression() - - n_thresholds = 100 - model = CutOffClassifier( - classifier, - objective_metric=objective_metric, - constraint_value=constraint_value, - response_method=response_method, - n_thresholds=n_thresholds, - ) - model.fit(X, y) - assert model.decision_thresholds_.shape == (n_thresholds,) - assert all(score.shape == (n_thresholds,) for score in model.objective_scores_) - - if response_method in ("auto", "predict_proba"): - # "auto" will fall back in priority on `predict_proba` if `estimator` - # supports it. - # we expect the decision threshold to be in [0, 1] - if objective_metric in ( - "max_tnr_at_tpr_constraint", - "max_precision_at_recall_constraint", - ): - assert 0.5 <= model.decision_threshold_ <= 1 - else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" - assert 0 <= model.decision_threshold_ <= 0.5 - else: # "decision_function" - # we expect the decision function to be centered in 0.0 and to be larger than - # -1 and 1. - if objective_metric in ( - "max_tnr_at_tpr_constraint", - "max_precision_at_recall_constraint", - ): - assert 0 < model.decision_threshold_ < 20 - else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" - assert -20 < model.decision_threshold_ < 0 - - -def test_cutoffclassifier_objective_metric_dict(global_random_seed): - """Check that we can pass a custom objective metric.""" - X, y = make_classification(n_samples=500, random_state=global_random_seed) - classifier = LogisticRegression() - - # we need to set a small number of thresholds to avoid ties and picking a too low - # threshold. - n_thresholds = 5 - - # affect a high gain to true negative and force the classifier to mainly - # predict the negative class. 
- costs_and_again = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} - model = CutOffClassifier( - classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds - ) - model.fit(X, y) - - assert model.decision_thresholds_.shape == (n_thresholds,) - assert model.objective_scores_.shape == (n_thresholds,) - - assert model.decision_threshold_ > 0.99 - assert np.mean(model.predict(X) == 0) > 0.9 - - # use the true positive now - costs_and_again = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} - model = CutOffClassifier( - classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds - ) - model.fit(X, y) - - assert model.decision_thresholds_.shape == (n_thresholds,) - assert model.objective_scores_.shape == (n_thresholds,) - - assert model.decision_threshold_ < 0.01 - assert np.mean(model.predict(X) == 1) > 0.9 - - # flipping the `pos_label` to zero should force the classifier to always predict 0 - # and thus have a low threshold - pos_label = 0 - model = CutOffClassifier( - classifier, - objective_metric=costs_and_again, - n_thresholds=n_thresholds, - pos_label=pos_label, - ) - model.fit(X, y) - - assert model.decision_thresholds_.shape == (n_thresholds,) - assert model.objective_scores_.shape == (n_thresholds,) - - assert model.decision_threshold_ < 0.01 - assert np.mean(model.predict(X) == 0) > 0.9 - - -def test_cutoffclassifier_sample_weight_costs_and_again(): - """Check that we dispatch the `sample_weight` to the scorer when computing the - confusion matrix.""" - X, y = load_iris(return_X_y=True) - X, y = X[:100], y[:100] # only 2 classes - - # create a dataset and repeat twice the sample of class #0 - X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) - # create a sample weight vector that is equivalent to the repeated dataset - sample_weight = np.ones_like(y) - sample_weight[:50] *= 2 - - # we use a prefit classifier to simplify the test - cv = "prefit" - estimator = LogisticRegression().fit(X, y) - costs_and_again = {"tp": 1, "tn": 1, "fp": -1, "fn": -1} - - model_repeat = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) - model_repeat.fit(X_repeated, y_repeated, sample_weight=None) - - model_sw = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) - model_sw.fit(X, y, sample_weight=sample_weight) - - assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) - - -def test_cutoffclassifier_cv_zeros_sample_weights_equivalence(): - """Check that passing removing some sample from the dataset `X` is - equivalent to passing a `sample_weight` with a factor 0.""" - X, y = load_iris(return_X_y=True) - # Scale the data to avoid any convergence issue - X = StandardScaler().fit_transform(X) - # Only use 2 classes and select samples such that 2-fold cross-validation - # split will lead to an equivalence with a `sample_weight` of 0 - X = np.vstack((X[:40], X[50:90])) - y = np.hstack((y[:40], y[50:90])) - sample_weight = np.zeros_like(y) - sample_weight[::2] = 1 - - estimator = LogisticRegression() - model_without_weights = CutOffClassifier(estimator, cv=2) - model_with_weights = clone(model_without_weights) - - model_with_weights.fit(X, y, sample_weight=sample_weight) - model_without_weights.fit(X[::2], y[::2]) - - assert_allclose( - model_with_weights.estimator_.coef_, model_without_weights.estimator_.coef_ - ) - - y_pred_with_weights = model_with_weights.predict_proba(X) - y_pred_without_weights = model_without_weights.predict_proba(X) - assert_allclose(y_pred_with_weights, y_pred_without_weights) - - -def 
test_cutoffclassifier_error_constant_learner(): - """Check that we raise an error message when providing an estimator that predicts - only a single class.""" - X, y = make_classification(random_state=0) - estimator = DummyClassifier(strategy="constant", constant=1) - err_msg = "The provided estimator makes constant predictions." - with pytest.raises(ValueError, match=err_msg): - CutOffClassifier(estimator).fit(X, y) - - -@pytest.mark.parametrize( - "objective_metric", - ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], -) -@pytest.mark.parametrize("pos_label", [0, 1]) -def test_cutoffclassifier_pos_label_precision_recall(objective_metric, pos_label): - """Check that `pos_label` is dispatched correctly by checking the precision and - recall score found during the optimization and the one found at `predict` time.""" - X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) - - # prefit the estimator to avoid variability due to the cross-validation - estimator = LogisticRegression().fit(X, y) - - constraint_value = 0.7 - model = CutOffClassifier( - estimator, - objective_metric=objective_metric, - constraint_value=constraint_value, - cv="prefit", - pos_label=pos_label, - ).fit(X, y) - - precision = precision_score(y, model.predict(X), pos_label=pos_label) - recall = recall_score(y, model.predict(X), pos_label=pos_label) - - # due to internal interpolation, the scores will vary slightly - if objective_metric == "max_precision_at_recall_constraint": - assert recall == pytest.approx(model.objective_score_[0], abs=1e-3) - assert precision == pytest.approx(model.objective_score_[1], abs=1e-3) - else: - assert precision == pytest.approx(model.objective_score_[0], abs=1e-3) - assert recall == pytest.approx(model.objective_score_[1], abs=1e-3) - - -@pytest.mark.parametrize( - "objective_metric", ["max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"] -) -@pytest.mark.parametrize("pos_label", [0, 1]) -def test_cutoffclassifier_pos_label_tnr_tpr(objective_metric, pos_label): - """Check that `pos_label` is dispatched correctly by checking the TNR and TPR - score found during the optimization and the one found at `predict` time.""" - X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) - - # prefit the estimator to avoid variability due to the cross-validation - estimator = LogisticRegression().fit(X, y) - - constraint_value = 0.7 - model = CutOffClassifier( - estimator, - objective_metric=objective_metric, - constraint_value=constraint_value, - cv="prefit", - pos_label=pos_label, - ).fit(X, y) - - def tnr_tpr_score(y_true, y_pred, pos_label=pos_label): - cm = confusion_matrix(y_true, y_pred) - if pos_label == 0: - cm = cm[::-1, ::-1] - tn, fp, fn, tp = cm.ravel() - tnr = tn / (tn + fp) - tpr = tp / (tp + fn) - return tnr, tpr - - tnr, tpr = tnr_tpr_score(y, model.predict(X), pos_label=pos_label) - # due to internal interpolation, the scores will vary slightly - if objective_metric == "max_tnr_at_tpr_constraint": - assert tpr == pytest.approx(model.objective_score_[0], abs=0.05) - assert tnr == pytest.approx(model.objective_score_[1], abs=0.05) - else: - assert tnr == pytest.approx(model.objective_score_[0], abs=0.05) - assert tpr == pytest.approx(model.objective_score_[1], abs=0.05) - - -@pytest.mark.parametrize( - "metric_type", - ["string", "scorer_without_pos_label", "scorer_with_pos_label"], -) -@pytest.mark.parametrize("pos_label", [0, 1]) -def test_cutoffclassifier_pos_label_single_metric(pos_label, metric_type): - 
"""Check that `pos_label` is dispatched correctly when getting a scorer linked to - a known metric. By default, the scorer in scikit-learn only have a default value - for `pos_label` which is 1. - """ - X, y = make_classification(n_samples=100, weights=[0.6, 0.4], random_state=42) - - # prefit the estimator to avoid variability due to the cross-validation - estimator = LogisticRegression().fit(X, y) - - if metric_type == "string": - objective_metric = "precision" - elif metric_type == "scorer_without_pos_label": - objective_metric = make_scorer(precision_score) - else: # metric_type == "scorer_with_pos_label" - objective_metric = make_scorer(precision_score, pos_label=pos_label) - - model = CutOffClassifier( - estimator, - objective_metric=objective_metric, - cv="prefit", - pos_label=pos_label, - n_thresholds=500, - ).fit(X, y) - - precision = precision_score(y, model.predict(X), pos_label=pos_label) - assert precision == pytest.approx(model.objective_score_, abs=1e-3) From 2ade221f398d45ea3230e46adde02f4867370968 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 12:15:32 +0200 Subject: [PATCH 080/194] add missing module --- .../_classification_threshold.py | 802 +++++++++++++++++ .../tests/test_classification_threshold.py | 832 ++++++++++++++++++ 2 files changed, 1634 insertions(+) create mode 100644 sklearn/model_selection/_classification_threshold.py create mode 100644 sklearn/model_selection/tests/test_classification_threshold.py diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py new file mode 100644 index 0000000000000..98cc868d16b7d --- /dev/null +++ b/sklearn/model_selection/_classification_threshold.py @@ -0,0 +1,802 @@ +from collections.abc import MutableMapping +from inspect import signature +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, clone +from ..exceptions import NotFittedError +from ..metrics import ( + check_scoring, + confusion_matrix, + get_scorer_names, + make_scorer, + precision_recall_curve, + roc_curve, +) +from ..metrics._scorer import _ContinuousScorer +from ..utils import _safe_indexing +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils._response import _get_response_values_binary +from ..utils.metaestimators import available_if +from ..utils.multiclass import type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_fit_params, + _check_pos_label_consistency, + _check_sample_weight, + _num_samples, + check_consistent_length, + check_is_fitted, + indexable, +) +from ._split import StratifiedShuffleSplit, check_cv + + +def _estimator_has(attr): + """Check if we can delegate a method to the underlying estimator. + + First, we check the first fitted estimator if available, otherwise we + check the unfitted estimator. + """ + return lambda self: ( + hasattr(self.estimator_, attr) + if hasattr(self, "estimator_") + else hasattr(self.estimator, attr) + ) + + +def _fit_and_score( + classifier, + X, + y, + sample_weight, + fit_params, + train_idx, + val_idx, + scorer, + score_method, +): + """Fit a classifier and compute the scores for different decision thresholds. + + Parameters + ---------- + classifier : estimator instance + The classifier to fit and used for scoring. If `classifier` is already fitted, + it will be used as is. 
+ + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The entire dataset. + + y : array-like of shape (n_samples,) + The entire target vector. + + sample_weight : array-like of shape (n_samples,) + Some optional associated sample weights. + + fit_params : dict + Parameters to pass to the `fit` method of the underlying classifier. + + train_idx : ndarray of shape (n_train_samples,) or None + The indices of the training set. If `None`, `classifier` is expected to be + already fitted. + + val_idx : ndarray of shape (n_val_samples,) + The indices of the validation set used to score `classifier`. If `train_idx`, + the entire set will be used. + + scorer : scorer instance + The scorer taking `classifier` and the validation set as input and outputting + decision thresholds and scores. + + score_method : str or callable + The scoring method to use. Used to detect if we compute TPR/TNR or precision/ + recall. + + Returns + ------- + thresholds : ndarray of shape (n_thresholds,) + The decision thresholds used to compute the scores. They are returned in + ascending order. + + scores : ndarray of shape (n_thresholds,) or tuple os such arrays + The scores computed for each decision threshold. When TPR/TNR or precision/ + recall are computed, `scores` is a tuple of two arrays. + """ + arrays = (X, y) if sample_weight is None else (X, y, sample_weight) + check_consistent_length(*arrays) + + fit_parameters = signature(classifier.fit).parameters + supports_sw = "sample_weight" in fit_parameters + + if train_idx is not None: + X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) + y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) + if sample_weight is not None: + sw_train, sw_val = ( + _safe_indexing(sample_weight, train_idx), + _safe_indexing(sample_weight, val_idx), + ) + else: + sw_train, sw_val = None, None + fit_params_train = _check_fit_params(X, fit_params, indices=train_idx) + if supports_sw: + classifier.fit(X_train, y_train, sample_weight=sw_train, **fit_params_train) + else: + classifier.fit(X_train, y_train, **fit_params_train) + else: # prefit estimator, only a validation set is provided + X_val, y_val, sw_val = X, y, sample_weight + check_is_fitted(classifier, "classes_") + + if isinstance(score_method, str): + if score_method in {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"}: + fpr, tpr, potential_thresholds = scorer( + classifier, X_val, y_val, sample_weight=sw_val + ) + # For fpr=0/tpr=0, the threshold is set to `np.inf`. We need to remove it. + fpr, tpr, potential_thresholds = fpr[1:], tpr[1:], potential_thresholds[1:] + # thresholds are in decreasing order + return potential_thresholds[::-1], ((1 - fpr)[::-1], tpr[::-1]) + elif score_method in { + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + }: + precision, recall, potential_thresholds = scorer( + classifier, X_val, y_val, sample_weight=sw_val + ) + # thresholds are in increasing order + # the last element of the precision and recall is not associated with any + # threshold and should be discarded + return potential_thresholds, (precision[:-1], recall[:-1]) + return scorer(classifier, X_val, y_val, sample_weight=sw_val) + + +class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Decision threshold tuning for binary classification. + + This estimator post-tunes the decision threshold (cut-off point) that is + used for converting probabilities (i.e. output of `predict_proba`) or + decision function (i.e. 
output of `decision_function`) into a predicted
+    class. The tuning is done by maximizing a binary metric, potentially
+    constrained by another metric.
+
+    Read more in the :ref:`User Guide `.
+
+    .. versionadded:: 1.3
+
+    Parameters
+    ----------
+    estimator : estimator instance
+        The classifier, fitted or not fitted, for which we want to optimize
+        the decision threshold used during `predict`.
+
+    objective_metric : {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint", \
+            "max_precision_at_recall_constraint", "max_recall_at_precision_constraint"} \
+            , str, dict or callable, default="balanced_accuracy"
+        The objective metric to be optimized. Can be one of:
+
+        * a string associated with a scoring function (see model evaluation
+          documentation);
+        * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`;
+        * `"max_tnr_at_tpr_constraint"`: find the decision threshold for a true
+          positive ratio (TPR) of `constraint_value`;
+        * `"max_tpr_at_tnr_constraint"`: find the decision threshold for a true
+          negative ratio (TNR) of `constraint_value`;
+        * `"max_precision_at_recall_constraint"`: find the decision threshold for a
+          recall of `constraint_value`;
+        * `"max_recall_at_precision_constraint"`: find the decision threshold for a
+          precision of `constraint_value`;
+        * a dictionary to be used as a cost-sensitive matrix. The keys of the
+          dictionary should be: `("tp", "fp", "tn", "fn")`. The values of the
+          dictionary correspond to costs (negative values) and gains (positive
+          values).
+
+    constraint_value : float, default=None
+        The value associated with the `objective_metric` metric for which we
+        want to find the decision threshold when `objective_metric` is equal to
+        one of `"max_tnr_at_tpr_constraint"`, `"max_tpr_at_tnr_constraint"`,
+        `"max_precision_at_recall_constraint"`, or
+        `"max_recall_at_precision_constraint"`.
+
+    pos_label : int, float, bool or str, default=None
+        The label of the positive class. Used when `objective_metric` is
+        `"max_tnr_at_tpr_constraint"`, `"max_tpr_at_tnr_constraint"`, or a dictionary.
+        When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`,
+        `pos_label` is set to 1, otherwise an error will be raised. When using a
+        scorer, `pos_label` can be passed as a keyword argument to
+        :func:`~sklearn.metrics.make_scorer`.
+
+    response_method : {"auto", "decision_function", "predict_proba"}, default="auto"
+        Method of the classifier `estimator` corresponding to the
+        decision function for which we want to find a threshold. It can be:
+
+        * if `"auto"`, it will try to invoke, for each classifier,
+          `"predict_proba"` or `"decision_function"` in that order.
+        * otherwise, one of `"predict_proba"` or `"decision_function"`.
+          If the method is not implemented by the classifier, it will raise an
+          error.
+
+    n_thresholds : int, default=100
+        The number of decision thresholds to use when discretizing the output
+        of the classifier's `response_method`.
+
+    cv : int, float, cross-validation generator, iterable or "prefit", default=None
+        Determines the cross-validation splitting strategy to train the classifier.
+        Possible inputs for cv are:
+
+        * `None`, to use the default 5-fold stratified K-fold cross validation;
+        * An integer number, to specify the number of folds in a stratified k-fold;
+        * A float number, to specify a single shuffle split. 
The floating number should + be in (0, 1) and represent the size of the validation set; + * An object to be used as a cross-validation generator; + * An iterable yielding train, test splits; + * `"prefit"`, to bypass the cross-validation. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. warning:: + Using `cv="prefit"` and passing the same dataset for fitting `estimator` + and tuning the cut-off point is subject to undesired overfitting. You can + refer to :ref:`cutoffclassifier_no_cv` for an example. + + This option should only be used when the set used to fit `estimator` is + different from the one used to tune the cut-off point (by calling + :meth:`CutOffClassifier.fit`). + + refit : "auto" or bool, default="auto" + Whether or not to refit the classifier on the entire training set once + the decision threshold has been found. By default, `refit="auto"` is + equivalent to `refit=False` when `cv` is a float number using a single + shuffle split or `cv="prefit"` otherwise `refit=True` in all other + cases. Note that forcing `refit=False` on cross-validation having more + than a single split will raise an error. Similarly, `refit=True` in + conjunction with `cv="prefit"` will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel. When `cv` represents a + cross-validation strategy, the fitting and scoring on each data split + is done in parallel. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of cross-validation when `cv` is a float. + See :term:`Glossary `. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + decision_threshold_ : float + The new decision threshold. + + decision_thresholds_ : ndarray of shape (n_thresholds,) + All decision thresholds that were evaluated. + + objective_score_ : float or tuple of floats + The score of the objective metric associated with the decision threshold found. + When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, + `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, + `"max_recall_at_precision_constraint"`, it will corresponds to a tuple of + two float values: the first one is the score of the metric which is constrained + and the second one is the score of the maximized metric. + + objective_scores_ : ndarray of shape (n_thresholds,) + The scores of the objective metric associated with the decision thresholds. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.metrics import classification_report + >>> from sklearn.model_selection import CutOffClassifier, train_test_split + >>> X, y = make_classification( + ... 
n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train) + >>> print(classification_report(y_test, classifier.predict(X_test))) + precision recall f1-score support + + 0 0.94 0.99 0.96 224 + 1 0.80 0.46 0.59 26 + + accuracy 0.93 250 + macro avg 0.87 0.72 0.77 250 + weighted avg 0.93 0.93 0.92 250 + + >>> classifier_tuned = CutOffClassifier( + ... classifier, objective_metric="max_precision_at_recall_constraint", + ... constraint_value=0.7, + ... ).fit(X_train, y_train) + >>> print( + ... f"Cut-off point found at {classifier_tuned.decision_threshold_:.3f} for a " + ... f"recall of {classifier_tuned.objective_score_[0]:.3f} and a precision of " + ... f"{classifier_tuned.objective_score_[1]:.3f}." + ... ) + Cut-off point found at 0.3... for a recall of 0.7... and a precision of 0.7... + >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) + precision recall f1-score support + + 0 0.96 0.96 0.96 224 + 1 0.68 0.65 0.67 26 + + accuracy 0.93 250 + macro avg 0.82 0.81 0.81 250 + weighted avg 0.93 0.93 0.93 250 + + """ + + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "objective_metric": [ + StrOptions( + set(get_scorer_names()) + | { + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + } + ), + callable, + MutableMapping, + ], + "constraint_value": [Real, None], + "pos_label": [Real, str, "boolean", None], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + "n_thresholds": [Interval(Integral, 1, None, closed="left")], + "cv": [ + "cv_object", + StrOptions({"prefit"}), + Interval(RealNotInt, 0.0, 1.0, closed="right"), + ], + "refit": ["boolean", StrOptions({"auto"})], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator, + *, + objective_metric="balanced_accuracy", + constraint_value=None, + pos_label=None, + response_method="auto", + n_thresholds=100, + cv=None, + refit="auto", + n_jobs=None, + random_state=None, + ): + self.estimator = estimator + self.objective_metric = objective_metric + self.constraint_value = constraint_value + self.pos_label = pos_label + self.response_method = response_method + self.n_thresholds = n_thresholds + self.cv = cv + self.refit = refit + self.n_jobs = n_jobs + self.random_state = random_state + + def fit(self, X, y, sample_weight=None, **fit_params): + """Fit the classifier and post-tune the decision threshold. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If `None`, then samples are equally weighted. + + **fit_params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + self._validate_params() + X, y = indexable(X, y) + + y_type = type_of_target(y, input_name="y") + if y_type != "binary": + raise ValueError( + f"Only binary classification is supported. 
Unknown label type: {y_type}" + ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if isinstance(self.cv, Real) and 0 < self.cv <= 1: + cv = StratifiedShuffleSplit( + n_splits=1, test_size=self.cv, random_state=self.random_state + ) + refit = False if self.refit == "auto" else self.refit + elif self.cv == "prefit": + if self.refit is True: + raise ValueError("When cv='prefit', refit cannot be True.") + try: + check_is_fitted(self.estimator, "classes_") + except NotFittedError as exc: + raise NotFittedError( + """When cv='prefit', `estimator` must be fitted.""" + ) from exc + cv, refit = self.cv, False + else: + cv = check_cv(self.cv, y=y, classifier=True) + if self.refit is False and cv.get_n_splits() > 1: + raise ValueError("When cv has several folds, refit cannot be False.") + if self.refit == "auto": + refit = cv.get_n_splits() > 1 + else: + refit = self.refit + + if self.response_method == "auto": + self._response_method = ["predict_proba", "decision_function"] + else: + self._response_method = self.response_method + + if isinstance(self.objective_metric, str) and self.objective_metric in { + "max_tpr_at_tnr_constraint", + "max_tnr_at_tpr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + }: + if self.constraint_value is None: + raise ValueError( + "When `objective_metric` is 'max_tpr_at_tnr_constraint', " + "'max_tnr_at_tpr_constraint', 'max_precision_at_recall_constraint'," + " or 'max_recall_at_precision_constraint', `constraint_value` must " + "be provided. Got None instead." + ) + constraint_value = self.constraint_value + else: + constraint_value = "highest" + + fit_parameters = signature(self.estimator.fit).parameters + supports_sw = "sample_weight" in fit_parameters + + # in the following block, we: + # - define the final classifier `self.estimator_` and train it if necessary + # - define `classifier` to be used to post-tune the decision threshold + # - define `split` to be used to fit/score `classifier` + if cv == "prefit": + self.estimator_ = self.estimator + classifier = self.estimator_ + splits = ([None, range(_num_samples(X))],) + else: + self.estimator_ = clone(self.estimator) + classifier = clone(self.estimator) + splits = cv.split(X, y) + + if refit: + # train on the whole dataset + X_train, y_train, sw_train = X, y, sample_weight + fit_params_train = _check_fit_params(X, fit_params, indices=None) + else: + # single split cross-validation + train_idx, _ = next(cv.split(X, y)) + X_train = _safe_indexing(X, train_idx) + y_train = _safe_indexing(y, train_idx) + if sample_weight is not None: + sw_train = _safe_indexing(sample_weight, train_idx) + else: + sw_train = None + fit_params_train = _check_fit_params(X, fit_params, indices=train_idx) + + if sw_train is not None and supports_sw: + self.estimator_.fit( + X_train, y_train, sample_weight=sw_train, **fit_params_train + ) + else: + self.estimator_.fit(X_train, y_train, **fit_params_train) + + if isinstance(self.objective_metric, MutableMapping): + keys = set(self.objective_metric.keys()) + if not keys == {"tp", "tn", "fp", "fn"}: + raise ValueError( + "Invalid keys in `objective_metric`. Valid keys are " + f"'tp', 'tn', 'fp', and 'fn'. Got {keys} instead." 
+ ) + pos_label = _check_pos_label_consistency(self.pos_label, y) + + def cost_sensitive_score_func(y_true, y_pred, **kwargs): + costs_and_gain = np.array( + [ + [kwargs["tn"], kwargs["fp"]], + [kwargs["fn"], kwargs["tp"]], + ] + ) + + sample_weight = kwargs.get("sample_weight", None) + cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + + pos_label, classes = kwargs["pos_label"], np.unique(y_true) + pos_label_idx = np.searchsorted(classes, pos_label) + if pos_label_idx == 0: + # reorder the confusion matrix to be aligned with the cost-matrix + cm = cm[::-1, ::-1] + + return (costs_and_gain * cm).sum() + + self._scorer = _ContinuousScorer( + score_func=cost_sensitive_score_func, + sign=1, + response_method=self._response_method, + kwargs={ + **self.objective_metric, + "pos_label": pos_label, + }, + ) + elif self.objective_metric in { + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + }: + if self._response_method == "predict_proba": + params_scorer = {"needs_proba": True, "pos_label": self.pos_label} + elif ( + isinstance(self._response_method, list) + and self._response_method[0] == "predict_proba" + and hasattr(classifier, "predict_proba") + ): + # TODO: this is due to a limitation in `make_scorer`: ideally, we should + # be able to pass a list of response methods to `make_scorer` and give + # priority to `predict_proba` other `decision_function`. + # Here, we manually check if the classifier provide `predict_proba` to + # use `needs_proba` instead and ensure that no error will be raised. + params_scorer = {"needs_proba": True, "pos_label": self.pos_label} + else: + params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} + + if "tpr" in self.objective_metric: # tpr/tnr + score_func = roc_curve + else: # precision/recall + score_func = precision_recall_curve + self._scorer = make_scorer(score_func, **params_scorer) + else: + scoring = check_scoring(classifier, scoring=self.objective_metric) + # add `pos_label` if requested by the scorer function + scorer_kwargs = {**scoring._kwargs} + signature_scoring_func = signature(scoring._score_func) + if ( + "pos_label" in signature_scoring_func.parameters + and "pos_label" not in scorer_kwargs + ): + if self.pos_label is None: + # Since the provided `pos_label` is the default, we need to + # use the default value of the scoring function that can be either + # `None` or `1`. + scorer_kwargs["pos_label"] = signature_scoring_func.parameters[ + "pos_label" + ].default + else: + scorer_kwargs["pos_label"] = self.pos_label + # transform a binary metric into a curve metric for all possible decision + # thresholds + self._scorer = _ContinuousScorer( + score_func=scoring._score_func, + sign=scoring._sign, + response_method=self._response_method, + kwargs=scorer_kwargs, + ) + + cv_thresholds, cv_scores = zip( + *Parallel(n_jobs=self.n_jobs)( + delayed(_fit_and_score)( + classifier, + X, + y, + sample_weight, + fit_params, + train_idx, + val_idx, + self._scorer, + self.objective_metric, + ) + for train_idx, val_idx in splits + ) + ) + + if any(len(th) == 1 for th in cv_thresholds): + raise ValueError( + "The provided estimator makes constant predictions. Therefore, it is " + "impossible to optimize the decision threshold." 
+ ) + + # find the global min and max thresholds across all folds + min_threshold = np.min([th.min() for th in cv_thresholds]) + max_threshold = np.max([th.max() for th in cv_thresholds]) + self.decision_thresholds_ = np.linspace( + min_threshold, max_threshold, num=self.n_thresholds + ) + + def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): + return np.mean( + [ + np.interp(threshold_interpolated, th, sc) + for th, sc in zip(cv_thresholds, cv_scores) + ], + axis=0, + ) + + if constraint_value == "highest": # find best score + self.objective_scores_ = _mean_interpolated_score( + self.decision_thresholds_, cv_thresholds, cv_scores + ) + best_idx = self.objective_scores_.argmax() + self.objective_score_ = self.objective_scores_[best_idx] + self.decision_threshold_ = self.decision_thresholds_[best_idx] + else: + if "tpr" in self.objective_metric: # tpr/tnr + mean_tnr, mean_tpr = [ + _mean_interpolated_score( + self.decision_thresholds_, cv_thresholds, sc + ) + for sc in zip(*cv_scores) + ] + else: # precision/recall + mean_precision, mean_recall = [ + _mean_interpolated_score( + self.decision_thresholds_, cv_thresholds, sc + ) + for sc in zip(*cv_scores) + ] + + def _get_best_idx(constrained_score, maximized_score): + """Find the index of the best score constrained by another score.""" + indices = np.arange(len(constrained_score)) + mask = constrained_score >= constraint_value + mask_idx = maximized_score[mask].argmax() + return indices[mask][mask_idx] + + if self.objective_metric == "max_tpr_at_tnr_constraint": + constrained_score, maximized_score = mean_tnr, mean_tpr + elif self.objective_metric == "max_tnr_at_tpr_constraint": + constrained_score, maximized_score = mean_tpr, mean_tnr + elif self.objective_metric == "max_precision_at_recall_constraint": + constrained_score, maximized_score = mean_recall, mean_precision + else: # max_recall_at_precision_constraint + constrained_score, maximized_score = mean_precision, mean_recall + + self.objective_scores_ = (constrained_score, maximized_score) + best_idx = _get_best_idx(constrained_score, maximized_score) + self.objective_score_ = ( + constrained_score[best_idx], + maximized_score[best_idx], + ) + self.decision_threshold_ = self.decision_thresholds_[best_idx] + + if hasattr(self.estimator_, "n_features_in_"): + self.n_features_in_ = self.estimator_.n_features_in_ + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + + return self + + @property + def classes_(self): + """Classes labels.""" + return self.estimator_.classes_ + + def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + C : ndarray of shape (n_samples,) + The predicted class. + """ + check_is_fitted(self, "estimator_") + pos_label = self._scorer._get_pos_label() + y_score, _ = _get_response_values_binary( + self.estimator_, X, self._response_method, pos_label=pos_label + ) + return self._scorer._from_scores_to_class_labels( + y_score, self.decision_threshold_, self.classes_ + ) + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for `X` using the fitted estimator. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + check_is_fitted(self, "estimator_") + return self.estimator_.predict_proba(X) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict logarithm class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + log_probabilities : ndarray of shape (n_samples, n_classes) + The logarithm class probabilities of the input samples. + """ + check_is_fitted(self, "estimator_") + return self.estimator_.predict_log_proba(X) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Decision function for samples in `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,) + The decision function computed the fitted estimator. + """ + check_is_fitted(self, "estimator_") + return self.estimator_.decision_function(X) + + def _more_tags(self): + return { + "binary_only": True, + "_xfail_checks": { + "check_classifiers_train": "Threshold at probability 0.5 does not hold", + "check_sample_weights_invariance": ( + "Due to the cross-validation and sample ordering, removing a sample" + " is not strictly equal to putting is weight to zero. Specific unit" + " tests are added for CutOffClassifier specifically." 
+ ), + }, + } diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py new file mode 100644 index 0000000000000..e803a0b5ff3a5 --- /dev/null +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -0,0 +1,832 @@ +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.datasets import load_breast_cancer, load_iris, make_classification +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + balanced_accuracy_score, + confusion_matrix, + f1_score, + fbeta_score, + make_scorer, + precision_recall_curve, + precision_score, + recall_score, + roc_curve, +) +from sklearn.metrics._scorer import _ContinuousScorer +from sklearn.model_selection import CutOffClassifier +from sklearn.model_selection._classification_threshold import _fit_and_score +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._mocking import CheckingClassifier +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) + + +@pytest.mark.parametrize( + "scorer, score_method", + [ + ( + _ContinuousScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={}, + ), + "balanced_accuracy", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "max_tnr_at_tpr_constraint", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "max_tpr_at_tnr_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_precision_at_recall_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_recall_at_precision_constraint", + ), + ], +) +def test_fit_and_score_scorers(scorer, score_method): + """Check that `_fit_and_score` returns thresholds in ascending order for the + different accepted scorers.""" + X, y = make_classification(n_samples=100, random_state=0) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + classifier = LogisticRegression() + + thresholds, scores = _fit_and_score( + classifier, + X, + y, + sample_weight=None, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) + + if score_method.startswith("max_"): + assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) + assert isinstance(scores, tuple) and len(scores) == 2 + for sc in scores: + assert np.logical_and(sc >= 0, sc <= 1).all() + else: + assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) + assert isinstance(scores, np.ndarray) + assert np.logical_and(scores >= 0, scores <= 1).all() + + +@pytest.mark.parametrize( + "scorer, score_method, expected_score", + [ + ( + _ContinuousScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={}, + ), + "balanced_accuracy", + [0.5, 1.0], + ), + ( + make_scorer(roc_curve, needs_proba=True), + "max_tnr_at_tpr_constraint", + [[0.0, 1.0], [1.0, 1.0]], + ), + ( + make_scorer(roc_curve, needs_proba=True), + "max_tpr_at_tnr_constraint", + [[0.0, 1.0], [1.0, 1.0]], + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_precision_at_recall_constraint", + [[0.5, 1.0], [1.0, 1.0]], + ), + ( + make_scorer(precision_recall_curve, 
needs_proba=True), + "max_recall_at_precision_constraint", + [[0.5, 1.0], [1.0, 1.0]], + ), + ], +) +def test_fit_and_score_prefit(scorer, score_method, expected_score): + """Check the behaviour with a prefit classifier.""" + X, y = make_classification(n_samples=100, random_state=0) + + # `train_idx is None` to indicate that the classifier is prefit + train_idx, val_idx = None, np.arange(50, 100) + classifier = DecisionTreeClassifier(random_state=0) + + with pytest.raises(NotFittedError): + _fit_and_score( + classifier, + X, + y, + sample_weight=None, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) + + classifier.fit(X, y) + # make sure that the classifier memorized the full dataset such that + # we get perfect predictions and thus match the expected score + assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) + + thresholds, scores = _fit_and_score( + classifier, + X, + y, + sample_weight=None, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) + assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) + assert_allclose(scores, expected_score) + + +@pytest.mark.parametrize( + "scorer, score_method", + [ + ( + _ContinuousScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={}, + ), + "balanced_accuracy", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "max_tnr_at_tpr_constraint", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "max_tpr_at_tnr_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_precision_at_recall_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_recall_at_precision_constraint", + ), + ], +) +def test_fit_and_score_sample_weight(scorer, score_method): + """Check that we dispatch the sample-weight to fit and score the classifier.""" + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] # only 2 classes + + # create a dataset and repeat twice the sample of class #0 + X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) + # create a sample weight vector that is equivalent to the repeated dataset + sample_weight = np.ones_like(y) + sample_weight[:50] *= 2 + + classifier = LogisticRegression() + train_repeated_idx = np.arange(X_repeated.shape[0]) + val_repeated_idx = np.arange(X_repeated.shape[0]) + thresholds_repeated, scores_repeated = _fit_and_score( + classifier, + X_repeated, + y_repeated, + sample_weight=None, + fit_params={}, + train_idx=train_repeated_idx, + val_idx=val_repeated_idx, + scorer=scorer, + score_method=score_method, + ) + + train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) + thresholds, scores = _fit_and_score( + classifier, + X, + y, + sample_weight=sample_weight, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) + + assert_allclose(thresholds_repeated, thresholds) + assert_allclose(scores_repeated, scores) + + +@pytest.mark.parametrize( + "scorer, score_method", + [ + ( + _ContinuousScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + kwargs={}, + ), + "balanced_accuracy", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "max_tnr_at_tpr_constraint", + ), + ( + make_scorer(roc_curve, needs_proba=True), + "max_tpr_at_tnr_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + 
"max_precision_at_recall_constraint", + ), + ( + make_scorer(precision_recall_curve, needs_proba=True), + "max_recall_at_precision_constraint", + ), + ], +) +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +def test_fit_and_score_fit_params(scorer, score_method, fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + + _fit_and_score( + classifier, + X, + y, + sample_weight=None, + fit_params=fit_params, + train_idx=train_idx, + val_idx=val_idx, + scorer=scorer, + score_method=score_method, + ) + + +def test_cutoffclassifier_no_binary(): + """Check that we raise an informative error message for non-binary problem.""" + X, y = make_classification(n_classes=3, n_clusters_per_class=1) + err_msg = "Only binary classification is supported." + with pytest.raises(ValueError, match=err_msg): + CutOffClassifier(LogisticRegression()).fit(X, y) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ( + {"cv": "prefit", "refit": True}, + ValueError, + "When cv='prefit', refit cannot be True.", + ), + ( + {"cv": 10, "refit": False}, + ValueError, + "When cv has several folds, refit cannot be False.", + ), + ( + {"cv": "prefit", "refit": False}, + NotFittedError, + "`estimator` must be fitted.", + ), + ], +) +def test_cutoffclassifier_conflict_cv_refit(params, err_type, err_msg): + """Check that we raise an informative error message when `cv` and `refit` + cannot be used together. + """ + X, y = make_classification(n_samples=100, random_state=0) + with pytest.raises(err_type, match=err_msg): + CutOffClassifier(LogisticRegression(), **params).fit(X, y) + + +@pytest.mark.parametrize( + "estimator", + [LogisticRegression(), SVC(), GradientBoostingClassifier(n_estimators=4)], +) +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "decision_function"] +) +def test_cutoffclassifier_estimator_response_methods(estimator, response_method): + """Check that `CutOffClassifier` exposes the same response methods as the + underlying estimator. 
+ """ + X, y = make_classification(n_samples=100, random_state=0) + + model = CutOffClassifier(estimator) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + model.fit(X, y) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + if hasattr(model, response_method): + y_pred_cutoff = getattr(model, response_method)(X) + y_pred_underlying_estimator = getattr(model.estimator_, response_method)(X) + + assert_allclose(y_pred_cutoff, y_pred_underlying_estimator) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_cutoffclassifier_with_constraint_value(response_method): + """Check that `CutOffClassifier` is optimizing a given objective metric.""" + X, y = load_breast_cancer(return_X_y=True) + # remove feature to degrade performances + X = X[:, :5] + + # make the problem completely imbalanced such that the balanced accuracy is low + indices_pos = np.flatnonzero(y == 1) + indices_pos = indices_pos[: indices_pos.size // 50] + indices_neg = np.flatnonzero(y == 0) + + X = np.vstack([X[indices_neg], X[indices_pos]]) + y = np.hstack([y[indices_neg], y[indices_pos]]) + + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + n_thresholds = 100 + model = CutOffClassifier( + estimator=lr, + objective_metric="balanced_accuracy", + response_method=response_method, + n_thresholds=n_thresholds, + ) + score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) + score_baseline = balanced_accuracy_score(y, lr.predict(X)) + assert score_optimized > score_baseline + assert model.decision_thresholds_.shape == (n_thresholds,) + assert model.objective_scores_.shape == (n_thresholds,) + + +@pytest.mark.parametrize( + "metrics", + [ + ("max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"), + ("max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"), + ], +) +def test_cutoffclassifier_limit_metric_tradeoff(metrics): + """Check that an objective value of 0 give opposite predictions with tnr/tpr and + precision/recall. + """ + X, y = load_breast_cancer(return_X_y=True) + estimator = make_pipeline(StandardScaler(), LogisticRegression()) + model = CutOffClassifier( + estimator=estimator, + objective_metric=metrics[0], + constraint_value=0, + ) + y_pred_1 = model.fit(X, y).predict(X) + model.set_params(objective_metric=metrics[1]) + y_pred_2 = (~model.fit(X, y).predict(X).astype(bool)).astype(int) + assert np.mean(y_pred_1 == y_pred_2) > 0.98 + + +def test_cutoffclassifier_metric_with_parameter(): + """Check that we can pass a metric with a parameter in addition check that + `f_beta with beta=1` is equivalent to `f1`. 
+ """ + X, y = load_breast_cancer(return_X_y=True) + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model_fbeta = CutOffClassifier( + estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1) + ).fit(X, y) + model_f1 = CutOffClassifier( + estimator=lr, objective_metric=make_scorer(f1_score) + ).fit(X, y) + + assert model_fbeta.decision_threshold_ == pytest.approx( + model_f1.decision_threshold_ + ) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +@pytest.mark.parametrize( + "metric", + [ + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + make_scorer(balanced_accuracy_score), + make_scorer(f1_score, pos_label="cancer"), + {"tp": 5, "tn": 1, "fp": -1, "fn": -1}, + ], +) +def test_cutoffclassifier_with_string_targets(response_method, metric): + """Check that targets represented by str are properly managed. + Also, check with several metrics to be sure that `pos_label` is properly + dispatched. + """ + X, y = load_breast_cancer(return_X_y=True) + # Encode numeric targets by meaningful strings. We purposely designed the class + # names such that the `pos_label` is the first alphabetically sorted class and thus + # encoded as 0. + classes = np.array(["cancer", "healthy"], dtype=object) + y = classes[y] + model = CutOffClassifier( + estimator=make_pipeline(StandardScaler(), LogisticRegression()), + objective_metric=metric, + constraint_value=0.9, + pos_label="cancer", + response_method=response_method, + n_thresholds=100, + ).fit(X, y) + assert_array_equal(model.classes_, np.sort(classes)) + y_pred = model.predict(X) + assert_array_equal(np.sort(np.unique(y_pred)), np.sort(classes)) + + +@pytest.mark.parametrize("with_sample_weight", [True, False]) +def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): + """Check the behaviour of the `refit` parameter.""" + rng = np.random.RandomState(global_random_seed) + X, y = make_classification(n_samples=100, random_state=0) + if with_sample_weight: + sample_weight = rng.randn(X.shape[0]) + sample_weight = np.abs(sample_weight, out=sample_weight) + else: + sample_weight = None + + # check that `estimator_` if fitted on the full dataset when `refit=True` + estimator = LogisticRegression() + model = CutOffClassifier(estimator, refit=True).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is not estimator + estimator.fit(X, y, sample_weight=sample_weight) + assert_allclose(model.estimator_.coef_, estimator.coef_) + assert_allclose(model.estimator_.intercept_, estimator.intercept_) + + # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` + estimator = LogisticRegression().fit(X, y, sample_weight=sample_weight) + coef = estimator.coef_.copy() + model = CutOffClassifier(estimator, cv="prefit", refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is estimator + assert_allclose(model.estimator_.coef_, coef) + + # check that we train `estimator_` on the training split of a given cross-validation + estimator = LogisticRegression() + cv = [ + (np.arange(50), np.arange(50, 100)), + ] # single split + model = CutOffClassifier(estimator, cv=cv, refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is not estimator + if with_sample_weight: + sw_train = sample_weight[cv[0][0]] + else: + sw_train = None + estimator.fit(X[cv[0][0]], y[cv[0][0]], sample_weight=sw_train) + 
assert_allclose(model.estimator_.coef_, estimator.coef_) + + +@pytest.mark.parametrize( + "objective_metric", + [ + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + "balanced_accuracy", + ], +) +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + model = CutOffClassifier( + classifier, objective_metric=objective_metric, constraint_value=0.5 + ) + model.fit(X, y, **fit_params) + + +@pytest.mark.parametrize( + "objective_metric, constraint_value", + [ + ("max_tnr_at_tpr_constraint", 0.5), + ("max_tpr_at_tnr_constraint", 0.5), + ("max_precision_at_recall_constraint", 0.5), + ("max_recall_at_precision_constraint", 0.5), + ], +) +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_cutoffclassifier_response_method_scorer_with_constraint_metric( + objective_metric, constraint_value, response_method, global_random_seed +): + """Check that we use the proper scorer and forwarding the requested response method + for TNR/TPR and precision/recall metrics. + """ + X, y = make_classification(n_samples=100, random_state=global_random_seed) + classifier = LogisticRegression() + + n_thresholds = 100 + model = CutOffClassifier( + classifier, + objective_metric=objective_metric, + constraint_value=constraint_value, + response_method=response_method, + n_thresholds=n_thresholds, + ) + model.fit(X, y) + assert model.decision_thresholds_.shape == (n_thresholds,) + assert all(score.shape == (n_thresholds,) for score in model.objective_scores_) + + if response_method in ("auto", "predict_proba"): + # "auto" will fall back in priority on `predict_proba` if `estimator` + # supports it. + # we expect the decision threshold to be in [0, 1] + if objective_metric in ( + "max_tnr_at_tpr_constraint", + "max_precision_at_recall_constraint", + ): + assert 0.5 <= model.decision_threshold_ <= 1 + else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" + assert 0 <= model.decision_threshold_ <= 0.5 + else: # "decision_function" + # we expect the decision function to be centered in 0.0 and to be larger than + # -1 and 1. + if objective_metric in ( + "max_tnr_at_tpr_constraint", + "max_precision_at_recall_constraint", + ): + assert 0 < model.decision_threshold_ < 20 + else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" + assert -20 < model.decision_threshold_ < 0 + + +def test_cutoffclassifier_objective_metric_dict(global_random_seed): + """Check that we can pass a custom objective metric.""" + X, y = make_classification(n_samples=500, random_state=global_random_seed) + classifier = LogisticRegression() + + # we need to set a small number of thresholds to avoid ties and picking a too low + # threshold. + n_thresholds = 5 + + # affect a high gain to true negative and force the classifier to mainly + # predict the negative class. 
+ costs_and_again = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} + model = CutOffClassifier( + classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds + ) + model.fit(X, y) + + assert model.decision_thresholds_.shape == (n_thresholds,) + assert model.objective_scores_.shape == (n_thresholds,) + + assert model.decision_threshold_ > 0.99 + assert np.mean(model.predict(X) == 0) > 0.9 + + # use the true positive now + costs_and_again = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} + model = CutOffClassifier( + classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds + ) + model.fit(X, y) + + assert model.decision_thresholds_.shape == (n_thresholds,) + assert model.objective_scores_.shape == (n_thresholds,) + + assert model.decision_threshold_ < 0.01 + assert np.mean(model.predict(X) == 1) > 0.9 + + # flipping the `pos_label` to zero should force the classifier to always predict 0 + # and thus have a low threshold + pos_label = 0 + model = CutOffClassifier( + classifier, + objective_metric=costs_and_again, + n_thresholds=n_thresholds, + pos_label=pos_label, + ) + model.fit(X, y) + + assert model.decision_thresholds_.shape == (n_thresholds,) + assert model.objective_scores_.shape == (n_thresholds,) + + assert model.decision_threshold_ < 0.01 + assert np.mean(model.predict(X) == 0) > 0.9 + + +def test_cutoffclassifier_sample_weight_costs_and_again(): + """Check that we dispatch the `sample_weight` to the scorer when computing the + confusion matrix.""" + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] # only 2 classes + + # create a dataset and repeat twice the sample of class #0 + X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) + # create a sample weight vector that is equivalent to the repeated dataset + sample_weight = np.ones_like(y) + sample_weight[:50] *= 2 + + # we use a prefit classifier to simplify the test + cv = "prefit" + estimator = LogisticRegression().fit(X, y) + costs_and_again = {"tp": 1, "tn": 1, "fp": -1, "fn": -1} + + model_repeat = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) + model_repeat.fit(X_repeated, y_repeated, sample_weight=None) + + model_sw = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) + model_sw.fit(X, y, sample_weight=sample_weight) + + assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) + + +def test_cutoffclassifier_cv_zeros_sample_weights_equivalence(): + """Check that passing removing some sample from the dataset `X` is + equivalent to passing a `sample_weight` with a factor 0.""" + X, y = load_iris(return_X_y=True) + # Scale the data to avoid any convergence issue + X = StandardScaler().fit_transform(X) + # Only use 2 classes and select samples such that 2-fold cross-validation + # split will lead to an equivalence with a `sample_weight` of 0 + X = np.vstack((X[:40], X[50:90])) + y = np.hstack((y[:40], y[50:90])) + sample_weight = np.zeros_like(y) + sample_weight[::2] = 1 + + estimator = LogisticRegression() + model_without_weights = CutOffClassifier(estimator, cv=2) + model_with_weights = clone(model_without_weights) + + model_with_weights.fit(X, y, sample_weight=sample_weight) + model_without_weights.fit(X[::2], y[::2]) + + assert_allclose( + model_with_weights.estimator_.coef_, model_without_weights.estimator_.coef_ + ) + + y_pred_with_weights = model_with_weights.predict_proba(X) + y_pred_without_weights = model_without_weights.predict_proba(X) + assert_allclose(y_pred_with_weights, y_pred_without_weights) + + +def 
test_cutoffclassifier_error_constant_learner(): + """Check that we raise an error message when providing an estimator that predicts + only a single class.""" + X, y = make_classification(random_state=0) + estimator = DummyClassifier(strategy="constant", constant=1) + err_msg = "The provided estimator makes constant predictions." + with pytest.raises(ValueError, match=err_msg): + CutOffClassifier(estimator).fit(X, y) + + +@pytest.mark.parametrize( + "objective_metric", + ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_cutoffclassifier_pos_label_precision_recall(objective_metric, pos_label): + """Check that `pos_label` is dispatched correctly by checking the precision and + recall score found during the optimization and the one found at `predict` time.""" + X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) + + # prefit the estimator to avoid variability due to the cross-validation + estimator = LogisticRegression().fit(X, y) + + constraint_value = 0.7 + model = CutOffClassifier( + estimator, + objective_metric=objective_metric, + constraint_value=constraint_value, + cv="prefit", + pos_label=pos_label, + ).fit(X, y) + + precision = precision_score(y, model.predict(X), pos_label=pos_label) + recall = recall_score(y, model.predict(X), pos_label=pos_label) + + # due to internal interpolation, the scores will vary slightly + if objective_metric == "max_precision_at_recall_constraint": + assert recall == pytest.approx(model.objective_score_[0], abs=1e-3) + assert precision == pytest.approx(model.objective_score_[1], abs=1e-3) + else: + assert precision == pytest.approx(model.objective_score_[0], abs=1e-3) + assert recall == pytest.approx(model.objective_score_[1], abs=1e-3) + + +@pytest.mark.parametrize( + "objective_metric", ["max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"] +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_cutoffclassifier_pos_label_tnr_tpr(objective_metric, pos_label): + """Check that `pos_label` is dispatched correctly by checking the TNR and TPR + score found during the optimization and the one found at `predict` time.""" + X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) + + # prefit the estimator to avoid variability due to the cross-validation + estimator = LogisticRegression().fit(X, y) + + constraint_value = 0.7 + model = CutOffClassifier( + estimator, + objective_metric=objective_metric, + constraint_value=constraint_value, + cv="prefit", + pos_label=pos_label, + ).fit(X, y) + + def tnr_tpr_score(y_true, y_pred, pos_label=pos_label): + cm = confusion_matrix(y_true, y_pred) + if pos_label == 0: + cm = cm[::-1, ::-1] + tn, fp, fn, tp = cm.ravel() + tnr = tn / (tn + fp) + tpr = tp / (tp + fn) + return tnr, tpr + + tnr, tpr = tnr_tpr_score(y, model.predict(X), pos_label=pos_label) + # due to internal interpolation, the scores will vary slightly + if objective_metric == "max_tnr_at_tpr_constraint": + assert tpr == pytest.approx(model.objective_score_[0], abs=0.05) + assert tnr == pytest.approx(model.objective_score_[1], abs=0.05) + else: + assert tnr == pytest.approx(model.objective_score_[0], abs=0.05) + assert tpr == pytest.approx(model.objective_score_[1], abs=0.05) + + +@pytest.mark.parametrize( + "metric_type", + ["string", "scorer_without_pos_label", "scorer_with_pos_label"], +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_cutoffclassifier_pos_label_single_metric(pos_label, metric_type): + 
"""Check that `pos_label` is dispatched correctly when getting a scorer linked to + a known metric. By default, the scorer in scikit-learn only have a default value + for `pos_label` which is 1. + """ + X, y = make_classification(n_samples=100, weights=[0.6, 0.4], random_state=42) + + # prefit the estimator to avoid variability due to the cross-validation + estimator = LogisticRegression().fit(X, y) + + if metric_type == "string": + objective_metric = "precision" + elif metric_type == "scorer_without_pos_label": + objective_metric = make_scorer(precision_score) + else: # metric_type == "scorer_with_pos_label" + objective_metric = make_scorer(precision_score, pos_label=pos_label) + + model = CutOffClassifier( + estimator, + objective_metric=objective_metric, + cv="prefit", + pos_label=pos_label, + n_thresholds=500, + ).fit(X, y) + + precision = precision_score(y, model.predict(X), pos_label=pos_label) + assert precision == pytest.approx(model.objective_score_, abs=1e-3) From dca5770eca1f02d6e241ca669bab2c4caebab312 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 12:16:23 +0200 Subject: [PATCH 081/194] update changelog --- doc/whats_new/v1.3.rst | 5 ----- doc/whats_new/v1.4.rst | 8 ++++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 7d89254c74464..8d39ca2fed143 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -615,11 +615,6 @@ Changelog `return_indices` to return the train-test indices of each cv split. :pr:`25659` by :user:`Guillaume Lemaitre `. -- |MajorFeature| :class:`model_selection.CutOffClassifier` calibrates decision threshold - function of a binary classifier by maximizing a classification metric through - cross-validation. - :pr:`26120` by :user:`Guillaume Lemaitre `. - :mod:`sklearn.multioutput` .......................... diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 4ba357c52d136..149a435389b52 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -78,6 +78,14 @@ Changelog - |Fix| :func:`feature_selection.mutual_info_regression` now correctly computes the result when `X` is of integer dtype. :pr:`26748` by :user:`Yao Xiao `. +:mod:`sklearn.model_selection` +.............................. + +- |MajorFeature| :class:`model_selection.CutOffClassifier` calibrates decision threshold + function of a binary classifier by maximizing a classification metric through + cross-validation. + :pr:`26120` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.tree` ................... 
From 8897533fd50ae864404a5b573102bac3b83e31c8 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Thu, 13 Jul 2023 12:24:24 +0200
Subject: [PATCH 082/194] more renaming

---
 doc/modules/classes.rst                       |   2 +-
 doc/modules/classification_threshold.rst      | 164 ++++++++++++++++++
 doc/whats_new/v1.4.rst                        |   6 +-
 ....py => plot_tuned_threshold_classifier.py} |  26 +--
 sklearn/model_selection/__init__.py           |   4 +-
 .../_classification_threshold.py              |  12 +-
 .../tests/test_classification_threshold.py    |  94 +++++-----
 7 files changed, 240 insertions(+), 68 deletions(-)
 create mode 100644 doc/modules/classification_threshold.rst
 rename examples/model_selection/{plot_cutoff_tuning.py => plot_tuned_threshold_classifier.py} (95%)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 76eac663329d2..25f0292fb4928 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1230,7 +1230,7 @@ Model post-fit tuning
    :toctree: generated/
    :template: class.rst

-   model_selection.CutOffClassifier
+   model_selection.TunedThresholdClassifier

 Model validation
 ----------------
diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst
new file mode 100644
index 0000000000000..868bb0747f7b7
--- /dev/null
+++ b/doc/modules/classification_threshold.rst
@@ -0,0 +1,164 @@
+.. currentmodule:: sklearn.model_selection
+
+.. _tunedthresholdclassifier:
+
+========================================================
+Tuning cut-off decision threshold for classes prediction
+========================================================
+
+Classifiers are predictive models: they use statistical learning to predict
+outcomes. The outcomes of a classifier are scores for each sample in relation
+to each class and a categorical prediction (class label). Scores are obtained
+from :term:`predict_proba` or :term:`decision_function`. The former returns
+posterior probability estimates for each class while the latter returns a
+decision function value for each class. The decision function value is a
+measure of how strongly the sample is predicted to belong to the positive
+class (e.g. the distance to the decision boundary). A decision rule is then
+defined by thresholding the scores and obtaining the class label for each
+sample. Those labels are obtained with :term:`predict`.
+
+For binary classification in scikit-learn, class labels are obtained by
+associating the positive class with probability estimates greater than 0.5
+(obtained with :term:`predict_proba`) or decision function values greater than
+0 (obtained with :term:`decision_function`).
+
+Here, we show an example that illustrates the relation between posterior
+probability estimates and class labels::
+
+    >>> from sklearn.datasets import make_classification
+    >>> from sklearn.tree import DecisionTreeClassifier
+    >>> X, y = make_classification(random_state=0)
+    >>> classifier = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
+    >>> classifier.predict_proba(X[:4])
+    array([[0.94 , 0.06 ],
+           [0.94 , 0.06 ],
+           [0.04..., 0.95...],
+           [0.04..., 0.95...]])
+    >>> classifier.predict(X[:4])
+    array([0, 0, 1, 1])
+
+While these approaches are reasonable as default behaviors, they are not
+ideal for all cases. The context and nature of the use case define the
+expected behavior of the classifier and thus the strategy to convert soft
+predictions into hard predictions. We illustrate this point with an example.
+
+Let's imagine the deployment of a predictive model helping medical doctors to
+detect tumours. In a setting where this model was a tool to discard obvious
+cases and false positives don't lead to potentially harmful treatments, doctors
+might be interested in having a high recall (all cancer cases should be tagged
+as such) to not miss any patient with cancer. However, that is at the cost of
+having more false positive predictions (i.e. lower precision). Thus, in terms of
+decision threshold, it may be better to classify a patient as having cancer
+for a probability estimate lower than 0.5.
+
+Post-tuning of the decision threshold
+=====================================
+
+One solution to address the problem stated in the introduction is to tune the decision
+threshold of the classifier once the model has been trained. The
+:class:`~sklearn.model_selection.TunedThresholdClassifier` tunes this threshold using
+an internal cross-validation. The optimum threshold is chosen to maximize a given metric
+with or without constraints.
+
+The following image illustrates the tuning of the cut-off point for a gradient
+boosting classifier. While the vanilla and tuned classifiers provide the same
+Receiver Operating Characteristic (ROC) and Precision-Recall curves, and thus
+the same :term:`predict_proba` outputs, the "hard" predictions differ because of
+the tuned cut-off point. The vanilla classifier predicts the class of interest
+for a probability greater than 0.5 while the tuned classifier predicts the
+class of interest for a very low probability (around 0.02). This cut-off point
+optimizes a utility metric defined by the business case (in this case an
+insurance company).
+
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_tuned_threshold_classifier_002.png
+   :target: ../auto_examples/model_selection/plot_tuned_threshold_classifier.html
+   :align: center
+
+Available options to tune the cut-off point
+-------------------------------------------
+
+The cut-off point can be tuned with different strategies controlled by the parameter
+`objective_metric`.
+
+A straightforward use case is to maximize a pre-defined scikit-learn metric. These
+metrics can be found by calling the function :func:`~sklearn.metrics.get_scorer_names`.
+We provide an example where we maximize the balanced accuracy.
+
+.. note::
+
+    It is important to notice that these metrics come with default parameters, notably
+    the label of the class of interest (i.e. `pos_label`). Thus, if this label is not
+    the right one for your application, you need to define a scorer and pass the right
+    `pos_label` (and additional parameters) using
+    :func:`~sklearn.metrics.make_scorer`. You should refer to :ref:`scoring` for all the
+    information needed to define your own scoring function. For instance, we show how to
+    pass the information to the scorer that the label of interest is `0` when maximizing
+    the :func:`~sklearn.metrics.f1_score`:
+
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from sklearn.model_selection import (
+    ...     TunedThresholdClassifier, train_test_split
+    ... )
+    >>> from sklearn.metrics import make_scorer, f1_score
+    >>> X, y = make_classification(
+    ...     n_samples=1_000, weights=[0.1, 0.9], random_state=0)
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    >>> pos_label = 0
+    >>> scorer = make_scorer(f1_score, pos_label=pos_label)
+    >>> base_model = LogisticRegression()
+    >>> model = TunedThresholdClassifier(base_model, objective_metric=scorer).fit(
+    ...     X_train, y_train)
+    >>> scorer(model, X_test, y_test)
+    0.82...
+    >>> # compare it with the internal score found by cross-validation
+    >>> model.objective_score_
+    0.86...
+
+A second strategy aims at maximizing a metric while imposing constraints on another
+metric. Four pre-defined options exist: two that use the Receiver Operating
+Characteristic (ROC) statistics and two that use the Precision-Recall statistics.
+
+- `"max_tpr_at_tnr_constraint"`: maximizes the True Positive Rate (TPR) such that the
+  True Negative Rate (TNR) is the closest to a given value.
+- `"max_tnr_at_tpr_constraint"`: maximizes the TNR such that the TPR is the closest to
+  a given value.
+- `"max_precision_at_recall_constraint"`: maximizes the precision such that the recall
+  is the closest to a given value.
+- `"max_recall_at_precision_constraint"`: maximizes the recall such that the precision
+  is the closest to a given value.
+
+For these options, the `constraint_value` parameter needs to be defined. In addition,
+you can use the `pos_label` parameter to indicate the label of the class of interest.
+
+The final strategy maximizes a custom utility function. This problem is also known as
+cost-sensitive learning. The utility function is defined by providing a dictionary
+containing the cost-gain associated with the entries of the confusion matrix. The keys
+are defined as `{"tn", "fp", "fn", "tp"}`. The class of interest is defined using the
+`pos_label` parameter. Refer to :ref:`cost_sensitive_learning_example` for an example
+depicting the use of such a utility function.
+
+Important notes regarding the internal cross-validation
+-------------------------------------------------------
+
+By default, :class:`~sklearn.model_selection.TunedThresholdClassifier` uses a
+5-fold stratified cross-validation to tune the cut-off point. The parameter
+`cv` controls the cross-validation strategy. It is possible to bypass
+cross-validation by passing `cv="prefit"` and providing an already fitted
+classifier. In this case, the cut-off point is tuned on the data provided to
+the `fit` method.
+
+However, you should be extremely careful when using this option. You should never use
+the same data for training the classifier and tuning the cut-off point because of the
+risk of overfitting. Refer to :ref:`tunedthresholdclassifier_no_cv`, which shows such
+overfitting. If you are in a situation where you have limited resources, you should
+consider using a float number, which will use a single split internally.
+
+The option `cv="prefit"` should only be used when the provided classifier was already
+trained on some data and you want to tune (or re-tune) on a new validation set.
+
+Examples
+--------
+
+- See
+  :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_threshold_classifier.py`
+  for an example of tuning the decision threshold of a classifier.
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index 149a435389b52..1801fa190eb35 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -81,9 +81,9 @@ Changelog
 :mod:`sklearn.model_selection`
 ..............................

-- |MajorFeature| :class:`model_selection.CutOffClassifier` calibrates decision threshold
-  function of a binary classifier by maximizing a classification metric through
-  cross-validation.
+- |MajorFeature| :class:`model_selection.TunedThresholdClassifier` calibrates
+  decision threshold function of a binary classifier by maximizing a
+  classification metric through cross-validation.
   :pr:`26120` by :user:`Guillaume Lemaitre `.
:mod:`sklearn.tree` diff --git a/examples/model_selection/plot_cutoff_tuning.py b/examples/model_selection/plot_tuned_threshold_classifier.py similarity index 95% rename from examples/model_selection/plot_cutoff_tuning.py rename to examples/model_selection/plot_tuned_threshold_classifier.py index 1fb748f6a798b..dda6caba7185b 100644 --- a/examples/model_selection/plot_cutoff_tuning.py +++ b/examples/model_selection/plot_tuned_threshold_classifier.py @@ -15,7 +15,7 @@ misclassification. Specifically, misclassifying a "bad" credit as "good" is five times more costly than misclassifying a "good" credit as "bad". -We use the :class:`~sklearn.model_selection.CutOffClassifier` to select the +We use the :class:`~sklearn.model_selection.TunedThresholdClassifier` to select the cut-off point of the decision function that minimizes the provided business cost. @@ -259,7 +259,7 @@ def gain_cost_score(y, y_pred, **kwargs): # To find the optimal one, we need to compute the cost-gain using the business # metric for all possible cut-off points and choose the best. This strategy can # be quite tedious to implement by hand, but the -# :class:`~sklearn.metrics.CutOffClassifier` class is here to help us. It +# :class:`~sklearn.metrics.TunedThresholdClassifier` class is here to help us. It # automatically computes the cost-gain for all possible cut-off points and # optimizes for the `objective_metric`. # @@ -268,14 +268,14 @@ def gain_cost_score(y, y_pred, **kwargs): # Tuning the cut-off point # ------------------------ # -# We use :class:`~sklearn.model_selection.CutOffClassifier` to tune the cut-off +# We use :class:`~sklearn.model_selection.TunedThresholdClassifier` to tune the cut-off # point. We need to provide the business metric to optimize as well as the # positive label. Internally, the optimum cut-off point is chosen such that it # maximizes the business metric via cross-validation. By default a 5-fold # stratified cross-validation is used. -from sklearn.model_selection import CutOffClassifier +from sklearn.model_selection import TunedThresholdClassifier -model_tuned = CutOffClassifier( +model_tuned = TunedThresholdClassifier( estimator=model, pos_label=pos_label, objective_metric=cost_gain_matrix, @@ -376,13 +376,13 @@ def gain_cost_score(y, y_pred, **kwargs): # We observe that the decision generalized on the testing set leading to a better # business score. # -# .. _cutoffclassifier_no_cv: +# .. _tunedthresholdclassifier_no_cv: # # Consideration regarding model refitting and cross-validation # ------------------------------------------------------------ # # In the above experiment, we use the default setting of the -# :class:`~sklearn.model_selection.CutOffClassifier`. In particular, the cut-off +# :class:`~sklearn.model_selection.TunedThresholdClassifier`. In particular, the cut-off # point is tuned using a 5-fold stratified cross-validation. Also, the # underlying predictive model is refitted on the entire training data once the # cut-off point is chosen. @@ -477,12 +477,12 @@ def gain_cost_score(y, y_pred, **kwargs): # the same set as the model was trained on, and this is the reason for the observed # overfitting. # -# This option should therefore be used with caution. One needs to make sure that the -# data providing at fitting time to the -# :class:`~sklearn.model_selection.CutOffClassifier` is not the same as the data used to -# train the underlying classifier. 
This could happen sometimes when the idea is just -# to tune the predictive model on a completely new validation set without a costly -# complete refit. +# This option should therefore be used with caution. One needs to make sure +# that the data providing at fitting time to the +# :class:`~sklearn.model_selection.TunedThresholdClassifier` is not the same as +# the data used to train the underlying classifier. This could happen sometimes +# when the idea is just to tune the predictive model on a completely new +# validation set without a costly complete refit. # # In the case that cross-validation is too costly, a potential alternative is to use # a single train-test split by providing a floating number in range `[0, 1]` to the diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index c1ce326c3201a..5facb793e3922 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -1,6 +1,6 @@ import typing -from ._classification_threshold import CutOffClassifier +from ._classification_threshold import TunedThresholdClassifier from ._plot import LearningCurveDisplay, ValidationCurveDisplay from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV from ._split import ( @@ -64,7 +64,7 @@ "StratifiedKFold", "StratifiedGroupKFold", "StratifiedShuffleSplit", - "CutOffClassifier", + "TunedThresholdClassifier", "check_cv", "cross_val_predict", "cross_val_score", diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 98cc868d16b7d..82b8608864432 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -151,7 +151,7 @@ def _fit_and_score( return scorer(classifier, X_val, y_val, sample_weight=sw_val) -class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): +class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Decision threshold tuning for binary classification. This estimator post-tunes the decision threshold (cut-off point) that is @@ -160,7 +160,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): class. The tuning is done by maximizing a binary metric, potentially constrained by a another metric. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. .. versionadded:: 1.3 @@ -242,7 +242,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): This option should only be used when the set used to fit `estimator` is different from the one used to tune the cut-off point (by calling - :meth:`CutOffClassifier.fit`). + :meth:`TunedThresholdClassifier.fit`). refit : "auto" or bool, default="auto" Whether or not to refit the classifier on the entire training set once @@ -307,7 +307,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): >>> from sklearn.datasets import make_classification >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.metrics import classification_report - >>> from sklearn.model_selection import CutOffClassifier, train_test_split + >>> from sklearn.model_selection import TunedThresholdClassifier, train_test_split >>> X, y = make_classification( ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 ... 
) @@ -325,7 +325,7 @@ class CutOffClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): macro avg 0.87 0.72 0.77 250 weighted avg 0.93 0.93 0.92 250 - >>> classifier_tuned = CutOffClassifier( + >>> classifier_tuned = TunedThresholdClassifier( ... classifier, objective_metric="max_precision_at_recall_constraint", ... constraint_value=0.7, ... ).fit(X_train, y_train) @@ -796,7 +796,7 @@ def _more_tags(self): "check_sample_weights_invariance": ( "Due to the cross-validation and sample ordering, removing a sample" " is not strictly equal to putting is weight to zero. Specific unit" - " tests are added for CutOffClassifier specifically." + " tests are added for TunedThresholdClassifier specifically." ), }, } diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index e803a0b5ff3a5..e822b41dfce6e 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -19,7 +19,7 @@ roc_curve, ) from sklearn.metrics._scorer import _ContinuousScorer -from sklearn.model_selection import CutOffClassifier +from sklearn.model_selection import TunedThresholdClassifier from sklearn.model_selection._classification_threshold import _fit_and_score from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler @@ -297,12 +297,12 @@ def test_fit_and_score_fit_params(scorer, score_method, fit_params_type): ) -def test_cutoffclassifier_no_binary(): +def test_tunedthresholdclassifier_no_binary(): """Check that we raise an informative error message for non-binary problem.""" X, y = make_classification(n_classes=3, n_clusters_per_class=1) err_msg = "Only binary classification is supported." with pytest.raises(ValueError, match=err_msg): - CutOffClassifier(LogisticRegression()).fit(X, y) + TunedThresholdClassifier(LogisticRegression()).fit(X, y) @pytest.mark.parametrize( @@ -325,13 +325,13 @@ def test_cutoffclassifier_no_binary(): ), ], ) -def test_cutoffclassifier_conflict_cv_refit(params, err_type, err_msg): +def test_tunedthresholdclassifier_conflict_cv_refit(params, err_type, err_msg): """Check that we raise an informative error message when `cv` and `refit` cannot be used together. """ X, y = make_classification(n_samples=100, random_state=0) with pytest.raises(err_type, match=err_msg): - CutOffClassifier(LogisticRegression(), **params).fit(X, y) + TunedThresholdClassifier(LogisticRegression(), **params).fit(X, y) @pytest.mark.parametrize( @@ -341,13 +341,15 @@ def test_cutoffclassifier_conflict_cv_refit(params, err_type, err_msg): @pytest.mark.parametrize( "response_method", ["predict_proba", "predict_log_proba", "decision_function"] ) -def test_cutoffclassifier_estimator_response_methods(estimator, response_method): - """Check that `CutOffClassifier` exposes the same response methods as the +def test_tunedthresholdclassifier_estimator_response_methods( + estimator, response_method +): + """Check that `TunedThresholdClassifier` exposes the same response methods as the underlying estimator. 
""" X, y = make_classification(n_samples=100, random_state=0) - model = CutOffClassifier(estimator) + model = TunedThresholdClassifier(estimator) assert hasattr(model, response_method) == hasattr(estimator, response_method) model.fit(X, y) @@ -363,8 +365,8 @@ def test_cutoffclassifier_estimator_response_methods(estimator, response_method) @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) -def test_cutoffclassifier_with_constraint_value(response_method): - """Check that `CutOffClassifier` is optimizing a given objective metric.""" +def test_tunedthresholdclassifier_with_constraint_value(response_method): + """Check that `TunedThresholdClassifier` is optimizing a given objective metric.""" X, y = load_breast_cancer(return_X_y=True) # remove feature to degrade performances X = X[:, :5] @@ -379,7 +381,7 @@ def test_cutoffclassifier_with_constraint_value(response_method): lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) n_thresholds = 100 - model = CutOffClassifier( + model = TunedThresholdClassifier( estimator=lr, objective_metric="balanced_accuracy", response_method=response_method, @@ -399,13 +401,13 @@ def test_cutoffclassifier_with_constraint_value(response_method): ("max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"), ], ) -def test_cutoffclassifier_limit_metric_tradeoff(metrics): +def test_tunedthresholdclassifier_limit_metric_tradeoff(metrics): """Check that an objective value of 0 give opposite predictions with tnr/tpr and precision/recall. """ X, y = load_breast_cancer(return_X_y=True) estimator = make_pipeline(StandardScaler(), LogisticRegression()) - model = CutOffClassifier( + model = TunedThresholdClassifier( estimator=estimator, objective_metric=metrics[0], constraint_value=0, @@ -416,16 +418,16 @@ def test_cutoffclassifier_limit_metric_tradeoff(metrics): assert np.mean(y_pred_1 == y_pred_2) > 0.98 -def test_cutoffclassifier_metric_with_parameter(): +def test_tunedthresholdclassifier_metric_with_parameter(): """Check that we can pass a metric with a parameter in addition check that `f_beta with beta=1` is equivalent to `f1`. """ X, y = load_breast_cancer(return_X_y=True) lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) - model_fbeta = CutOffClassifier( + model_fbeta = TunedThresholdClassifier( estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1) ).fit(X, y) - model_f1 = CutOffClassifier( + model_f1 = TunedThresholdClassifier( estimator=lr, objective_metric=make_scorer(f1_score) ).fit(X, y) @@ -449,7 +451,7 @@ def test_cutoffclassifier_metric_with_parameter(): {"tp": 5, "tn": 1, "fp": -1, "fn": -1}, ], ) -def test_cutoffclassifier_with_string_targets(response_method, metric): +def test_tunedthresholdclassifier_with_string_targets(response_method, metric): """Check that targets represented by str are properly managed. Also, check with several metrics to be sure that `pos_label` is properly dispatched. @@ -460,7 +462,7 @@ def test_cutoffclassifier_with_string_targets(response_method, metric): # encoded as 0. 
classes = np.array(["cancer", "healthy"], dtype=object) y = classes[y] - model = CutOffClassifier( + model = TunedThresholdClassifier( estimator=make_pipeline(StandardScaler(), LogisticRegression()), objective_metric=metric, constraint_value=0.9, @@ -474,7 +476,7 @@ def test_cutoffclassifier_with_string_targets(response_method, metric): @pytest.mark.parametrize("with_sample_weight", [True, False]) -def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): +def test_tunedthresholdclassifier_refit(with_sample_weight, global_random_seed): """Check the behaviour of the `refit` parameter.""" rng = np.random.RandomState(global_random_seed) X, y = make_classification(n_samples=100, random_state=0) @@ -486,7 +488,7 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): # check that `estimator_` if fitted on the full dataset when `refit=True` estimator = LogisticRegression() - model = CutOffClassifier(estimator, refit=True).fit( + model = TunedThresholdClassifier(estimator, refit=True).fit( X, y, sample_weight=sample_weight ) @@ -498,7 +500,7 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` estimator = LogisticRegression().fit(X, y, sample_weight=sample_weight) coef = estimator.coef_.copy() - model = CutOffClassifier(estimator, cv="prefit", refit=False).fit( + model = TunedThresholdClassifier(estimator, cv="prefit", refit=False).fit( X, y, sample_weight=sample_weight ) @@ -510,7 +512,7 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): cv = [ (np.arange(50), np.arange(50, 100)), ] # single split - model = CutOffClassifier(estimator, cv=cv, refit=False).fit( + model = TunedThresholdClassifier(estimator, cv=cv, refit=False).fit( X, y, sample_weight=sample_weight ) @@ -534,7 +536,7 @@ def test_cutoffclassifier_refit(with_sample_weight, global_random_seed): ], ) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) -def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): +def test_tunedthresholdclassifier_fit_params(objective_metric, fit_params_type): """Check that we pass `fit_params` to the classifier when calling `fit`.""" X, y = make_classification(n_samples=100, random_state=0) fit_params = { @@ -543,7 +545,7 @@ def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): } classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) - model = CutOffClassifier( + model = TunedThresholdClassifier( classifier, objective_metric=objective_metric, constraint_value=0.5 ) model.fit(X, y, **fit_params) @@ -561,7 +563,7 @@ def test_cutoffclassifier_fit_params(objective_metric, fit_params_type): @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) -def test_cutoffclassifier_response_method_scorer_with_constraint_metric( +def test_tunedthresholdclassifier_response_method_scorer_with_constraint_metric( objective_metric, constraint_value, response_method, global_random_seed ): """Check that we use the proper scorer and forwarding the requested response method @@ -571,7 +573,7 @@ def test_cutoffclassifier_response_method_scorer_with_constraint_metric( classifier = LogisticRegression() n_thresholds = 100 - model = CutOffClassifier( + model = TunedThresholdClassifier( classifier, objective_metric=objective_metric, constraint_value=constraint_value, @@ -605,7 +607,7 @@ def test_cutoffclassifier_response_method_scorer_with_constraint_metric( assert -20 < 
model.decision_threshold_ < 0 -def test_cutoffclassifier_objective_metric_dict(global_random_seed): +def test_tunedthresholdclassifier_objective_metric_dict(global_random_seed): """Check that we can pass a custom objective metric.""" X, y = make_classification(n_samples=500, random_state=global_random_seed) classifier = LogisticRegression() @@ -617,7 +619,7 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): # affect a high gain to true negative and force the classifier to mainly # predict the negative class. costs_and_again = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} - model = CutOffClassifier( + model = TunedThresholdClassifier( classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds ) model.fit(X, y) @@ -630,7 +632,7 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): # use the true positive now costs_and_again = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} - model = CutOffClassifier( + model = TunedThresholdClassifier( classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds ) model.fit(X, y) @@ -644,7 +646,7 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): # flipping the `pos_label` to zero should force the classifier to always predict 0 # and thus have a low threshold pos_label = 0 - model = CutOffClassifier( + model = TunedThresholdClassifier( classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds, @@ -659,7 +661,7 @@ def test_cutoffclassifier_objective_metric_dict(global_random_seed): assert np.mean(model.predict(X) == 0) > 0.9 -def test_cutoffclassifier_sample_weight_costs_and_again(): +def test_tunedthresholdclassifier_sample_weight_costs_and_again(): """Check that we dispatch the `sample_weight` to the scorer when computing the confusion matrix.""" X, y = load_iris(return_X_y=True) @@ -676,16 +678,20 @@ def test_cutoffclassifier_sample_weight_costs_and_again(): estimator = LogisticRegression().fit(X, y) costs_and_again = {"tp": 1, "tn": 1, "fp": -1, "fn": -1} - model_repeat = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) + model_repeat = TunedThresholdClassifier( + estimator, cv=cv, objective_metric=costs_and_again + ) model_repeat.fit(X_repeated, y_repeated, sample_weight=None) - model_sw = CutOffClassifier(estimator, cv=cv, objective_metric=costs_and_again) + model_sw = TunedThresholdClassifier( + estimator, cv=cv, objective_metric=costs_and_again + ) model_sw.fit(X, y, sample_weight=sample_weight) assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) -def test_cutoffclassifier_cv_zeros_sample_weights_equivalence(): +def test_tunedthresholdclassifier_cv_zeros_sample_weights_equivalence(): """Check that passing removing some sample from the dataset `X` is equivalent to passing a `sample_weight` with a factor 0.""" X, y = load_iris(return_X_y=True) @@ -699,7 +705,7 @@ def test_cutoffclassifier_cv_zeros_sample_weights_equivalence(): sample_weight[::2] = 1 estimator = LogisticRegression() - model_without_weights = CutOffClassifier(estimator, cv=2) + model_without_weights = TunedThresholdClassifier(estimator, cv=2) model_with_weights = clone(model_without_weights) model_with_weights.fit(X, y, sample_weight=sample_weight) @@ -714,14 +720,14 @@ def test_cutoffclassifier_cv_zeros_sample_weights_equivalence(): assert_allclose(y_pred_with_weights, y_pred_without_weights) -def test_cutoffclassifier_error_constant_learner(): +def test_tunedthresholdclassifier_error_constant_learner(): """Check that we raise an error message when providing an 
estimator that predicts only a single class.""" X, y = make_classification(random_state=0) estimator = DummyClassifier(strategy="constant", constant=1) err_msg = "The provided estimator makes constant predictions." with pytest.raises(ValueError, match=err_msg): - CutOffClassifier(estimator).fit(X, y) + TunedThresholdClassifier(estimator).fit(X, y) @pytest.mark.parametrize( @@ -729,7 +735,9 @@ def test_cutoffclassifier_error_constant_learner(): ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], ) @pytest.mark.parametrize("pos_label", [0, 1]) -def test_cutoffclassifier_pos_label_precision_recall(objective_metric, pos_label): +def test_tunedthresholdclassifier_pos_label_precision_recall( + objective_metric, pos_label +): """Check that `pos_label` is dispatched correctly by checking the precision and recall score found during the optimization and the one found at `predict` time.""" X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) @@ -738,7 +746,7 @@ def test_cutoffclassifier_pos_label_precision_recall(objective_metric, pos_label estimator = LogisticRegression().fit(X, y) constraint_value = 0.7 - model = CutOffClassifier( + model = TunedThresholdClassifier( estimator, objective_metric=objective_metric, constraint_value=constraint_value, @@ -762,7 +770,7 @@ def test_cutoffclassifier_pos_label_precision_recall(objective_metric, pos_label "objective_metric", ["max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"] ) @pytest.mark.parametrize("pos_label", [0, 1]) -def test_cutoffclassifier_pos_label_tnr_tpr(objective_metric, pos_label): +def test_tunedthresholdclassifier_pos_label_tnr_tpr(objective_metric, pos_label): """Check that `pos_label` is dispatched correctly by checking the TNR and TPR score found during the optimization and the one found at `predict` time.""" X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) @@ -771,7 +779,7 @@ def test_cutoffclassifier_pos_label_tnr_tpr(objective_metric, pos_label): estimator = LogisticRegression().fit(X, y) constraint_value = 0.7 - model = CutOffClassifier( + model = TunedThresholdClassifier( estimator, objective_metric=objective_metric, constraint_value=constraint_value, @@ -803,7 +811,7 @@ def tnr_tpr_score(y_true, y_pred, pos_label=pos_label): ["string", "scorer_without_pos_label", "scorer_with_pos_label"], ) @pytest.mark.parametrize("pos_label", [0, 1]) -def test_cutoffclassifier_pos_label_single_metric(pos_label, metric_type): +def test_tunedthresholdclassifier_pos_label_single_metric(pos_label, metric_type): """Check that `pos_label` is dispatched correctly when getting a scorer linked to a known metric. By default, the scorer in scikit-learn only have a default value for `pos_label` which is 1. 
@@ -820,7 +828,7 @@ def test_cutoffclassifier_pos_label_single_metric(pos_label, metric_type): else: # metric_type == "scorer_with_pos_label" objective_metric = make_scorer(precision_score, pos_label=pos_label) - model = CutOffClassifier( + model = TunedThresholdClassifier( estimator, objective_metric=objective_metric, cv="prefit", From 75bd7ac8bafd74473847df5525182a1e93e0d4ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 14:20:58 +0200 Subject: [PATCH 083/194] iter --- sklearn/model_selection/_classification_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 82b8608864432..3d40352ec6252 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -238,7 +238,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato .. warning:: Using `cv="prefit"` and passing the same dataset for fitting `estimator` and tuning the cut-off point is subject to undesired overfitting. You can - refer to :ref:`cutoffclassifier_no_cv` for an example. + refer to :ref:`tunedthresholdclassifier_no_cv` for an example. This option should only be used when the set used to fit `estimator` is different from the one used to tune the cut-off point (by calling From c07a980ffba65858387948ee2ff16a2e650d4750 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 13 Jul 2023 17:25:41 +0500 Subject: [PATCH 084/194] Adjust and change the name of params in _check_method_params --- sklearn/calibration.py | 2 +- sklearn/conftest.py | 8 ++++++ sklearn/linear_model/_logistic.py | 4 +-- sklearn/linear_model/tests/test_logistic.py | 16 ++++++------ sklearn/model_selection/_search.py | 2 +- sklearn/model_selection/_validation.py | 6 ++--- sklearn/multioutput.py | 2 +- sklearn/utils/tests/test_validation.py | 14 +++++------ sklearn/utils/validation.py | 27 +++++++++++---------- 9 files changed, 44 insertions(+), 37 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 43b1e3f0231ba..5f6bc31118199 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -612,7 +612,7 @@ def _fit_classifier_calibrator_pair( ------- calibrated_classifier : _CalibratedClassifier instance """ - fit_params_train = _check_method_params(X, fit_params, train) + fit_params_train = _check_method_params(X, params=fit_params, indices=train) X_train, y_train = _safe_indexing(X, train), _safe_indexing(y, train) X_test, y_test = _safe_indexing(X, test), _safe_indexing(y, test) diff --git a/sklearn/conftest.py b/sklearn/conftest.py index c3db0453918f8..d9a26e4928d26 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -12,6 +12,7 @@ from _pytest.doctest import DoctestItem from threadpoolctl import threadpool_limits +from sklearn import config_context from sklearn._min_dependencies import PYTEST_MIN_VERSION from sklearn.datasets import ( fetch_20newsgroups, @@ -35,6 +36,13 @@ scipy_datasets_require_network = sp_version >= parse_version("1.10") +@pytest.fixture +def enable_slep006(): + """Enable SLEP006 for all tests.""" + with config_context(enable_metadata_routing=True): + yield + + def raccoon_face_or_skip(): # SciPy >= 1.10 requires network to access to get data if scipy_datasets_require_network: diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 0343b13cb8410..8fa188c7f368e 100644 --- a/sklearn/linear_model/_logistic.py +++ 
b/sklearn/linear_model/_logistic.py @@ -801,9 +801,7 @@ def _log_reg_scoring_path( scores.append(log_reg.score(X_test, y_test)) else: score_params = score_params or {} - score_params = _check_method_params( - X=X, fit_params=score_params, indices=test - ) + score_params = _check_method_params(X=X, params=score_params, indices=test) scores.append(scoring(log_reg, X_test, y_test, **score_params)) return coefs, Cs, np.array(scores), n_iter diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 9192111757330..37e697c6b7eb0 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -2076,6 +2076,7 @@ def test_liblinear_not_stuck(): clf.fit(X_prep, y) +@pytest.mark.usefixtures("enable_slep006") def test_lr_cv_scores_differ_when_sample_weight_is_requested(): """Test sample_weight is correctly passed to the scorer in LogisticRegressionCV :meth:`fit` by checking the difference @@ -2087,15 +2088,14 @@ def test_lr_cv_scores_differ_when_sample_weight_is_requested(): sample_weight[: len(y) // 2] = 2 kwargs = {"sample_weight": sample_weight} - with config_context(enable_metadata_routing=True): - scorer1 = get_scorer("accuracy") - lr_cv1 = LogisticRegressionCV(scoring=scorer1) - lr_cv1.fit(X, y, **kwargs) + scorer1 = get_scorer("accuracy") + lr_cv1 = LogisticRegressionCV(scoring=scorer1) + lr_cv1.fit(X, y, **kwargs) - scorer2 = get_scorer("accuracy") - scorer2.set_score_request(sample_weight=True) - lr_cv2 = LogisticRegressionCV(scoring=scorer2) - lr_cv2.fit(X, y, **kwargs) + scorer2 = get_scorer("accuracy") + scorer2.set_score_request(sample_weight=True) + lr_cv2 = LogisticRegressionCV(scoring=scorer2) + lr_cv2.fit(X, y, **kwargs) assert not np.allclose(lr_cv1.scores_[1], lr_cv2.scores_[1]) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 6aabf68e2c97a..f8dcd5be7a97f 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -804,7 +804,7 @@ def fit(self, X, y=None, *, groups=None, **fit_params): refit_metric = self.refit X, y, groups = indexable(X, y, groups) - fit_params = _check_method_params(X, fit_params) + fit_params = _check_method_params(X, params=fit_params) cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator)) n_splits = cv_orig.get_n_splits(X, y, groups) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 18b37986100a1..6c3d601d59b8e 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -723,7 +723,7 @@ def _fit_and_score( # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = _check_method_params(X, fit_params, train) + fit_params = _check_method_params(X, params=fit_params, indices=train) if parameters is not None: # clone after setting parameters in case any parameters @@ -1148,7 +1148,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method): """ # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = _check_method_params(X, fit_params, train) + fit_params = _check_method_params(X, params=fit_params, indices=train) X_train, y_train = _safe_split(estimator, X, y, train) X_test, _ = _safe_split(estimator, X, y, test, train) @@ -1454,7 +1454,7 @@ def _permutation_test_score(estimator, X, y, groups, cv, scorer, fit_params): for train, test in cv.split(X, y, groups): X_train, y_train 
= _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) - fit_params = _check_method_params(X, fit_params, train) + fit_params = _check_method_params(X, params=fit_params, indices=train) estimator.fit(X_train, y_train, **fit_params) avg_score.append(scorer(estimator, X_test, y_test)) return np.mean(avg_score) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index e217d46be177b..bc975156d791d 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -265,7 +265,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): "Underlying estimator does not support sample weights." ) - fit_params_validated = _check_method_params(X, fit_params) + fit_params_validated = _check_method_params(X, params=fit_params) routed_params = Bunch(estimator=Bunch(fit=fit_params_validated)) if sample_weight is not None: routed_params.estimator.fit["sample_weight"] = sample_weight diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 13bbe79a8c0e7..46b2b8262b957 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1503,9 +1503,9 @@ def __init__(self, a=1, b=1, *, c=1, d=1): @pytest.mark.parametrize("indices", [None, [1, 3]]) -def test_check_fit_params(indices): +def test_check_method_params(indices): X = np.random.randn(4, 2) - fit_params = { + _params = { "list": [1, 2, 3, 4], "array": np.array([1, 2, 3, 4]), "sparse-col": sp.csc_matrix([1, 2, 3, 4]).T, @@ -1514,16 +1514,16 @@ def test_check_fit_params(indices): "scalar-str": "xxx", "None": None, } - result = _check_method_params(X, fit_params, indices) + result = _check_method_params(X, params=_params, indices=indices) indices_ = indices if indices is not None else list(range(X.shape[0])) for key in ["sparse-row", "scalar-int", "scalar-str", "None"]: - assert result[key] is fit_params[key] + assert result[key] is _params[key] - assert result["list"] == _safe_indexing(fit_params["list"], indices_) - assert_array_equal(result["array"], _safe_indexing(fit_params["array"], indices_)) + assert result["list"] == _safe_indexing(_params["list"], indices_) + assert_array_equal(result["array"], _safe_indexing(_params["array"], indices_)) assert_allclose_dense_sparse( - result["sparse-col"], _safe_indexing(fit_params["sparse-col"], indices_) + result["sparse-col"], _safe_indexing(_params["sparse-col"], indices_) ) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b1f5b3d7d7122..641459fd75f92 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1946,44 +1946,45 @@ def _check_response_method(estimator, response_method): return prediction_method -def _check_method_params(X, fit_params, indices=None): - """Check and validate the parameters passed during `fit`. +def _check_method_params(X, params, indices=None): + """Check and validate the parameters passed to a specific + method like `fit`. Parameters ---------- X : array-like of shape (n_samples, n_features) Data array. - fit_params : dict - Dictionary containing the parameters passed at fit. + params : dict + Dictionary containing the parameters passed to the method. indices : array-like of shape (n_samples,), default=None Indices to be selected if the parameter has the same size as `X`. Returns ------- - fit_params_validated : dict + method_params_validated : dict Validated parameters. We ensure that the values support indexing. """ from . 
import _safe_indexing - fit_params_validated = {} - for param_key, param_value in fit_params.items(): + method_params_validated = {} + for param_key, param_value in params.items(): if not _is_arraylike(param_value) or _num_samples(param_value) != _num_samples( X ): # Non-indexable pass-through (for now for backward-compatibility). # https://github.com/scikit-learn/scikit-learn/issues/15805 - fit_params_validated[param_key] = param_value + method_params_validated[param_key] = param_value else: - # Any other fit_params should support indexing + # Any other method_params should support indexing # (e.g. for cross-validation). - fit_params_validated[param_key] = _make_indexable(param_value) - fit_params_validated[param_key] = _safe_indexing( - fit_params_validated[param_key], indices + method_params_validated[param_key] = _make_indexable(param_value) + method_params_validated[param_key] = _safe_indexing( + method_params_validated[param_key], indices ) - return fit_params_validated + return method_params_validated def _is_pandas_df(X): From cc5ba48072cb4df4c3a6e6aeec907a79d37aeafe Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 13 Jul 2023 17:27:15 +0500 Subject: [PATCH 085/194] Resolve conflict in changelog --- doc/whats_new/v1.4.rst | 8 -------- 1 file changed, 8 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index d69e89f2b4996..33a0cd477504f 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -106,11 +106,3 @@ Thanks to everyone who has contributed to the maintenance and improvement of the project since version 1.3, including: TODO: update at the time of the release. - -:mod:`sklearn.base` -................... - -- |Enhancement| :meth:`base.ClusterMixin.fit_predict` and - :meth:`base.OutlierMixin.fit_predict` now accept ``**kwargs`` which are - passed to the ``fit`` method of the the estimator. :pr:`26506` by `Adrin - Jalali`_. From 66c4c7f7077af2d4ca31c10f5e7976b936212f7d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 14:36:39 +0200 Subject: [PATCH 086/194] iter --- sklearn/model_selection/_classification_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 3d40352ec6252..9ee04fa4c157f 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -160,7 +160,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato class. The tuning is done by maximizing a binary metric, potentially constrained by a another metric. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. .. 
versionadded:: 1.3 From 378930e1d52e48d81fd605bb2a5c725abae5c208 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 18:24:46 +0200 Subject: [PATCH 087/194] iter --- sklearn/metrics/_scorer.py | 30 ++++----- .../_classification_threshold.py | 65 ++++++++++++++----- .../tests/test_classification_threshold.py | 59 +++++++++++++---- 3 files changed, 109 insertions(+), 45 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 7cc3e9fb13314..162dd8c7493bc 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -269,20 +269,6 @@ def _factory_args(self): """Return non-default make_scorer arguments for repr.""" return "" - def _from_scores_to_class_labels(self, y_score, threshold, classes): - """Threshold `y_score` and return the associated class labels.""" - pos_label = self._get_pos_label() - if pos_label is None: - map_thresholded_score_to_label = np.array([0, 1]) - else: - pos_label_idx = np.flatnonzero(classes == pos_label)[0] - neg_label_idx = np.flatnonzero(classes != pos_label)[0] - map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) - - return classes[ - map_thresholded_score_to_label[(y_score >= threshold).astype(int)] - ] - def _warn_overlap(self, message, kwargs): """Warn if there is any overlap between ``self._kwargs`` and ``kwargs``. @@ -488,6 +474,18 @@ def _factory_args(self): return ", needs_threshold=True" +def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): + """Threshold `y_score` and return the associated class labels.""" + if pos_label is None: + map_thresholded_score_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(classes == pos_label)[0] + neg_label_idx = np.flatnonzero(classes != pos_label)[0] + map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) + + return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] + + class _ContinuousScorer(_BaseScorer): """Scorer taking a continuous response and output a score for each threshold.""" @@ -536,7 +534,9 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): self._sign * score_func( y_true, - self._from_scores_to_class_labels(y_score, th, estimator.classes_), + _threshold_scores_to_class_labels( + y_score, th, estimator.classes_, self._get_pos_label() + ), **self._kwargs, ) for th in potential_thresholds diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 9ee04fa4c157f..2403a23a213c4 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -14,7 +14,7 @@ precision_recall_curve, roc_curve, ) -from ..metrics._scorer import _ContinuousScorer +from ..metrics._scorer import _ContinuousScorer, _threshold_scores_to_class_labels from ..utils import _safe_indexing from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._response import _get_response_values_binary @@ -155,14 +155,14 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato """Decision threshold tuning for binary classification. This estimator post-tunes the decision threshold (cut-off point) that is - used for converting probabilities (i.e. output of `predict_proba`) or - decision function (i.e. output of `decision_function`) into a predicted - class. The tuning is done by maximizing a binary metric, potentially - constrained by a another metric. 
+ used for converting posterior probability estimates (i.e. output of + `predict_proba`) or decision scores (i.e. output of `decision_function`) + into a class label. The tuning is done by maximizing a binary metric, + potentially constrained by a another metric. Read more in the :ref:`User Guide `. - .. versionadded:: 1.3 + .. versionadded:: 1.4 Parameters ---------- @@ -170,6 +170,13 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato The classifier, fitted or not fitted, for which we want to optimize the decision threshold used during `predict`. + strategy : {"optimum", "constant"}, default="optimum" + The strategy to use for tuning the decision threshold: + + * `"optimum"`: the decision threshold is tuned to optimize the objective + metric; + * `"constant"`: the decision threshold is set to `constant_value`. + objective_metric : {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint", \ "max_precision_at_recall_constraint, "max_recall_at_precision_constraint"} \ , str, dict or callable, default="balanced_accuracy" @@ -198,6 +205,9 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato `"max_precision_at_recall_constraint"`, or `"max_recall_at_precision_constraint"`. + constant_threshold : float, default=0.5 + The constant threshold to use when `strategy` is `"constant"`. + pos_label : int, float, bool or str, default=None The label of the positive class. Used when `objective_metric` is `"max_tnr_at_tpr_constraint"`"`, `"max_tpr_at_tnr_constraint"`, or a dictionary. @@ -272,8 +282,9 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato decision_threshold_ : float The new decision threshold. - decision_thresholds_ : ndarray of shape (n_thresholds,) - All decision thresholds that were evaluated. + decision_thresholds_ : ndarray of shape (n_thresholds,) or None + All decision thresholds that were evaluated. If `strategy="constant"`, + `decision_thresholds_` is None. objective_score_ : float or tuple of floats The score of the objective metric associated with the decision threshold found. @@ -281,10 +292,12 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, `"max_recall_at_precision_constraint"`, it will corresponds to a tuple of two float values: the first one is the score of the metric which is constrained - and the second one is the score of the maximized metric. + and the second one is the score of the maximized metric. If + `strategy="constant"`, `objective_score_` is None. objective_scores_ : ndarray of shape (n_thresholds,) The scores of the objective metric associated with the decision thresholds. + If `strategy="constant"`, `objective_scores_` is None. classes_ : ndarray of shape (n_classes,) The class labels. 
@@ -352,6 +365,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato HasMethods(["fit", "predict_proba"]), HasMethods(["fit", "decision_function"]), ], + "strategy": [StrOptions({"optimum", "constant"})], "objective_metric": [ StrOptions( set(get_scorer_names()) @@ -366,6 +380,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato MutableMapping, ], "constraint_value": [Real, None], + "constant_threshold": [Real], "pos_label": [Real, str, "boolean", None], "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], "n_thresholds": [Interval(Integral, 1, None, closed="left")], @@ -383,8 +398,10 @@ def __init__( self, estimator, *, + strategy="optimum", objective_metric="balanced_accuracy", constraint_value=None, + constant_threshold=0.5, pos_label=None, response_method="auto", n_thresholds=100, @@ -394,8 +411,10 @@ def __init__( random_state=None, ): self.estimator = estimator + self.strategy = strategy self.objective_metric = objective_metric self.constraint_value = constraint_value + self.constant_threshold = constant_threshold self.pos_label = pos_label self.response_method = response_method self.n_thresholds = n_thresholds @@ -523,6 +542,18 @@ def fit(self, X, y, sample_weight=None, **fit_params): else: self.estimator_.fit(X_train, y_train, **fit_params_train) + if hasattr(self.estimator_, "n_features_in_"): + self.n_features_in_ = self.estimator_.n_features_in_ + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + + if self.strategy == "constant": + # early exit when we don't need to find the optimal threshold + self.decision_threshold_ = self.constant_threshold + self.decision_thresholds_ = None + self.objective_score_, self.objective_scores_ = None, None + return self + if isinstance(self.objective_metric, MutableMapping): keys = set(self.objective_metric.keys()) if not keys == {"tp", "tn", "fp", "fn"}: @@ -700,11 +731,6 @@ def _get_best_idx(constrained_score, maximized_score): ) self.decision_threshold_ = self.decision_thresholds_[best_idx] - if hasattr(self.estimator_, "n_features_in_"): - self.n_features_in_ = self.estimator_.n_features_in_ - if hasattr(self.estimator_, "feature_names_in_"): - self.feature_names_in_ = self.estimator_.feature_names_in_ - return self @property @@ -726,12 +752,17 @@ def predict(self, X): The predicted class. 
""" check_is_fitted(self, "estimator_") - pos_label = self._scorer._get_pos_label() + if self.strategy == "optimum": + # `pos_label` has been validated and is stored in the scorer + pos_label = self._scorer._get_pos_label() + else: + pos_label = self.pos_label y_score, _ = _get_response_values_binary( self.estimator_, X, self._response_method, pos_label=pos_label ) - return self._scorer._from_scores_to_class_labels( - y_score, self.decision_threshold_, self.classes_ + + return _threshold_scores_to_class_labels( + y_score, self.decision_threshold_, self.classes_, pos_label ) @available_if(_estimator_has("predict_proba")) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index e822b41dfce6e..6b778d2422104 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -325,13 +325,18 @@ def test_tunedthresholdclassifier_no_binary(): ), ], ) -def test_tunedthresholdclassifier_conflict_cv_refit(params, err_type, err_msg): +@pytest.mark.parametrize("strategy", ["optimum", "constant"]) +def test_tunedthresholdclassifier_conflict_cv_refit( + strategy, params, err_type, err_msg +): """Check that we raise an informative error message when `cv` and `refit` cannot be used together. """ X, y = make_classification(n_samples=100, random_state=0) with pytest.raises(err_type, match=err_msg): - TunedThresholdClassifier(LogisticRegression(), **params).fit(X, y) + TunedThresholdClassifier(LogisticRegression(), strategy=strategy, **params).fit( + X, y + ) @pytest.mark.parametrize( @@ -341,15 +346,16 @@ def test_tunedthresholdclassifier_conflict_cv_refit(params, err_type, err_msg): @pytest.mark.parametrize( "response_method", ["predict_proba", "predict_log_proba", "decision_function"] ) +@pytest.mark.parametrize("strategy", ["optimum", "constant"]) def test_tunedthresholdclassifier_estimator_response_methods( - estimator, response_method + estimator, strategy, response_method ): """Check that `TunedThresholdClassifier` exposes the same response methods as the underlying estimator. 
""" X, y = make_classification(n_samples=100, random_state=0) - model = TunedThresholdClassifier(estimator) + model = TunedThresholdClassifier(estimator, strategy=strategy) assert hasattr(model, response_method) == hasattr(estimator, response_method) model.fit(X, y) @@ -475,8 +481,11 @@ def test_tunedthresholdclassifier_with_string_targets(response_method, metric): assert_array_equal(np.sort(np.unique(y_pred)), np.sort(classes)) +@pytest.mark.parametrize("strategy", ["optimum", "constant"]) @pytest.mark.parametrize("with_sample_weight", [True, False]) -def test_tunedthresholdclassifier_refit(with_sample_weight, global_random_seed): +def test_tunedthresholdclassifier_refit( + strategy, with_sample_weight, global_random_seed +): """Check the behaviour of the `refit` parameter.""" rng = np.random.RandomState(global_random_seed) X, y = make_classification(n_samples=100, random_state=0) @@ -488,7 +497,7 @@ def test_tunedthresholdclassifier_refit(with_sample_weight, global_random_seed): # check that `estimator_` if fitted on the full dataset when `refit=True` estimator = LogisticRegression() - model = TunedThresholdClassifier(estimator, refit=True).fit( + model = TunedThresholdClassifier(estimator, strategy=strategy, refit=True).fit( X, y, sample_weight=sample_weight ) @@ -500,9 +509,9 @@ def test_tunedthresholdclassifier_refit(with_sample_weight, global_random_seed): # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` estimator = LogisticRegression().fit(X, y, sample_weight=sample_weight) coef = estimator.coef_.copy() - model = TunedThresholdClassifier(estimator, cv="prefit", refit=False).fit( - X, y, sample_weight=sample_weight - ) + model = TunedThresholdClassifier( + estimator, strategy=strategy, cv="prefit", refit=False + ).fit(X, y, sample_weight=sample_weight) assert model.estimator_ is estimator assert_allclose(model.estimator_.coef_, coef) @@ -512,9 +521,9 @@ def test_tunedthresholdclassifier_refit(with_sample_weight, global_random_seed): cv = [ (np.arange(50), np.arange(50, 100)), ] # single split - model = TunedThresholdClassifier(estimator, cv=cv, refit=False).fit( - X, y, sample_weight=sample_weight - ) + model = TunedThresholdClassifier( + estimator, strategy=strategy, cv=cv, refit=False + ).fit(X, y, sample_weight=sample_weight) assert model.estimator_ is not estimator if with_sample_weight: @@ -661,7 +670,7 @@ def test_tunedthresholdclassifier_objective_metric_dict(global_random_seed): assert np.mean(model.predict(X) == 0) > 0.9 -def test_tunedthresholdclassifier_sample_weight_costs_and_again(): +def test_tunedthresholdclassifier_sample_weight_costs_and_gain(): """Check that we dispatch the `sample_weight` to the scorer when computing the confusion matrix.""" X, y = load_iris(return_X_y=True) @@ -838,3 +847,27 @@ def test_tunedthresholdclassifier_pos_label_single_metric(pos_label, metric_type precision = precision_score(y, model.predict(X), pos_label=pos_label) assert precision == pytest.approx(model.objective_score_, abs=1e-3) + + +@pytest.mark.parametrize( + "predict_method", + ["predict", "predict_proba", "decision_function", "predict_log_proba"], +) +def test_tunedthresholdclassifier_constant_strategy(predict_method): + """Check the behavior when `strategy='contant'.""" + X, y = make_classification(n_samples=100, weights=[0.6, 0.4], random_state=42) + + # With a constant strategy and a threshold at 0.5, we should get the same than the + # original model + estimator = LogisticRegression().fit(X, y) + constant_threshold = 0.5 + tuned_model = 
TunedThresholdClassifier( + estimator, strategy="constant", constant_threshold=constant_threshold + ).fit(X, y) + assert tuned_model.decision_threshold_ == pytest.approx(constant_threshold) + for attribute in ("decision_thresholds_", "objective_score_", "objective_scores_"): + assert getattr(tuned_model, attribute) is None + + assert_allclose( + getattr(tuned_model, predict_method)(X), getattr(estimator, predict_method)(X) + ) From c88ed9415f44b36051db9b7d69e6c039fae90a61 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 18:35:29 +0200 Subject: [PATCH 088/194] iter --- doc/modules/classification_threshold.rst | 83 +++++++++++++----------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 868bb0747f7b7..d3fbeea67174a 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -6,21 +6,20 @@ Tuning cut-off decision threshold for classes prediction ======================================================== -Classifiers are predictive models: they use statistical learning to predict -outcomes. The outcomes of a classifier are scores for each sample in relation -to each class and categorical prediction (class label). Scores are obtained -from :term:`predict_proba` or :term:`decision_function`. The former returns -posterior probability estimates for each class while the latter returns a -decision function value for each class. The decision function value is a -measure of how strongly the sample is predicted to belong to the positive -class (e.g. the distance to the decisin boundary). A decision rule is then -defined by thresholding the scores and obtained the class label for each -sample. Those labels are obtained with :term:`predict`. - -For binary classification in scikit-learn, class labels are obtained by -associating the positive class with probability estimates greater than 0.5 -(obtained with :term:`predict_proba`) or decision function values greater than -0 (obtained with :term:`decision_function`). +Classifiers are predictive models: they use statistical learning to predict outcomes. +The outcomes of a classifier are scores for each sample in relation to each class and +categorical prediction (class label). Scores are obtained from :term:`predict_proba` or +:term:`decision_function`. The former returns posterior probability estimates for each +class while the latter returns a decision score for each class. The decision score is a +measure of how strongly the sample is predicted to belong to the positive class (e.g. +the distance to the decisin boundary). A decision rule is then defined by thresholding +the scores and obtained the class label for each sample. Those labels are obtained with +:term:`predict`. + +For binary classification in scikit-learn, class labels are obtained by associating the +positive class with posterior probability estimates greater than 0.5 (obtained with +:term:`predict_proba`) or decision scores greater than 0 (obtained with +:term:`decision_function`). Here, we show an example that illustrates the relation between posterior probability estimates and class labels:: @@ -37,38 +36,37 @@ probability estimates and class labels:: >>> classifier.predict(X[:4]) array([0, 0, 1, 1]) -While these approaches are reasonable as default behaviors, they are not be -ideal for all cases. 
The context and nature of the use case defines the -expected behavior of the classifier and thus the strategy to convert soft -predictions into hard predictions. We illustrate this point with an example. +While these approaches are reasonable as default behaviors, they are not be ideal for +all cases. The context and nature of the use case defines the expected behavior of the +classifier and thus the strategy to convert soft predictions into hard predictions. We +illustrate this point with an example. -Let's imagine the deployment of a predictive model helping medical doctors to -detect tumour. In a setting where this model was a tool to discard obvious -cases and false positives don't lead to potentially harmful treatments, doctors -might be interested in having a high recall (all cancer cases should be tagged -as such) to not miss any patient with a cancer. However, that is at the cost of -having more false positive predictions (i.e. lower precision). Thus, in terms of -decision threshold, it may be better to classify a patient as having a cancer -for a probability estimate lower than 0.5. +Let's imagine the deployment of a predictive model helping medical doctors to detect +tumour. In a setting where this model was a tool to discard obvious cases and false +positives don't lead to potentially harmful treatments, doctors might be interested in +having a high recall (all cancer cases should be tagged as such) to not miss any patient +with a cancer. However, that is at the cost of having more false positive predictions +(i.e. lower precision). Thus, in terms of decision threshold, it may be better to +classify a patient as having a cancer for a posterior probability estimate lower than +0.5. Post-tuning of the decision threshold ===================================== One solution to address the problem stated in the introduction is to tune the decision threshold of the classifier once the model has been trained. The -:class:`~sklearn.model_selection.TunedThresholdClassifier` tunes this threshold using -an internal cross-validation. The optimum threshold is chosen to maximize a given metric +:class:`~sklearn.model_selection.TunedThresholdClassifier` tunes this threshold using an +internal cross-validation. The optimum threshold is chosen to maximize a given metric with or without constraints. -The following image illustrates the tuning of the cut-off point for a gradient -boosting classifier. While the vanilla and tuned classifiers provide the same -Receiver Operating Characteristic (ROC) and Precision-Recall curves, and thus -the same :term:`predict_proba` outputs, the "hard" predictions differ because of -the tuned cut-off point. The vanilla classifier predicts the class of interest -for a probability greater than 0.5 while the tuned classifier predicts the -class of interest for a very low probability (around 0.02). This cut-off point -optimizes a utility metric defined by the business case (in this case an -insurance company). +The following image illustrates the tuning of the cut-off point for a gradient boosting +classifier. While the vanilla and tuned classifiers provide the same Receiver Operating +Characteristic (ROC) and Precision-Recall curves, and thus the same +:term:`predict_proba` outputs, the class label predictions differ because of the tuned +decision threshold. The vanilla classifier predicts the class of interest for a +posterior probability greater than 0.5 while the tuned classifier predicts the class of +interest for a very low probability (around 0.02). 
This cut-off point optimizes a +utility metric defined by the business case (in this case an insurance company). .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_tuned_threshold_classifier_002.png :target: ../auto_examples/model_selection/plot_tuned_threshold_classifier.html @@ -156,6 +154,15 @@ a float number that will use a single split internally. The option `cv="prefit"` should only be used when the provided classifier was already trained on some data and you want to tune (or re-tune) on a new validation set. +Manually setting the decision thresholding +------------------------------------------- + +The previous sections discussed strategies to find an optimal decision threshold. It is +also possible to manually set the decision threshold in +:class`~sklearn.model_selection.TunedThresholdClassifier` by setting the parameter +`strategy` to `"constant"` and provide the desired threshold using the parameter +`constant_threshold`. + Examples -------- From 4715e673962e08edcdc7f1430abf80a9cae3fade Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 18:39:17 +0200 Subject: [PATCH 089/194] iter --- .../plot_tuned_threshold_classifier.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/model_selection/plot_tuned_threshold_classifier.py b/examples/model_selection/plot_tuned_threshold_classifier.py index dda6caba7185b..5e9206b81a634 100644 --- a/examples/model_selection/plot_tuned_threshold_classifier.py +++ b/examples/model_selection/plot_tuned_threshold_classifier.py @@ -3,10 +3,11 @@ Post-tuning the cut-off point of decision function ================================================== -Once a classifier is trained, the output of the :term:`predict` method output hard -predictions corresponding to a thresholding of either the :term:`decision function` -or the :term:`predict_proba` output. For binary classifier, the default threshold is -defined as a probability score of 0.5 or a decision function value of 0.0. +Once a classifier is trained, the output of the :term:`predict` method output class +label predictions corresponding to a thresholding of either the :term:`decision +function` or the :term:`predict_proba` output. For a binary classifier, the default +threshold is defined as a posterior probability estimate of 0.5 or a decision score of +0.0. However, this default strategy may not be optimal for the task at hand. Here, we use the "Statlog" German credit dataset [1]_ to illustrate a use case. @@ -244,11 +245,11 @@ def gain_cost_score(y, y_pred, **kwargs): # reported metrics are the precision and recall and for the ROC curve, the reported # metrics are the TPR (same as recall) and FPR. # -# Here, the different cut-off points correspond to different levels of probability -# scores ranging between 0 and 1. By default, `model.predict` uses a cut-off point at -# a probability of 0.5. The metrics for such cut-off point are reported with the -# blue dot on the curves: it corresponds to the statistical performance of the model -# when using `model.predict`. +# Here, the different cut-off points correspond to different levels of posterior +# probability estimates ranging between 0 and 1. By default, `model.predict` uses a +# cut-off point at a probability estimate of 0.5. The metrics for such cut-off point are +# reported with the blue dot on the curves: it corresponds to the statistical +# performance of the model when using `model.predict`. 
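
To make the default cut-off point concrete, here is a minimal stand-alone sketch: for a binary classifier, `predict` amounts to thresholding the positive-class probability at 0.5, and a manually chosen cut-off simply moves that threshold. The `TunedThresholdClassifier` import path and the `strategy="constant"` / `constant_threshold` parameters are assumed from the patches above; the 0.1 cut-off is arbitrary and only illustrative.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
# assumed import path, as used throughout this patch series
from sklearn.model_selection import TunedThresholdClassifier

X, y = make_classification(n_samples=500, weights=[0.7, 0.3], random_state=0)
classifier = LogisticRegression().fit(X, y)

# `predict` agrees with thresholding the positive-class probability at 0.5
# (ties at exactly 0.5 aside)
proba_pos = classifier.predict_proba(X)[:, 1]
labels_at_05 = classifier.classes_[(proba_pos > 0.5).astype(int)]
assert np.array_equal(labels_at_05, classifier.predict(X))

# lowering the cut-off flags more samples as positive: higher recall, lower precision
labels_at_01 = classifier.classes_[(proba_pos > 0.1).astype(int)]
print((labels_at_05 == 1).sum(), (labels_at_01 == 1).sum())

# the same manual cut-off expressed with the meta-estimator and a prefit classifier
tuned = TunedThresholdClassifier(
    classifier, strategy="constant", constant_threshold=0.1, cv="prefit", refit=False
).fit(X, y)
print(np.mean(tuned.predict(X) == labels_at_01))  # expected to be ~1.0

The 0.1 value here is hard-coded only to show the mechanics; the surrounding documentation is precisely about choosing such a cut-off from data instead.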
# # However, we recall that the original aim was to minimize the cost (or maximize the # gain) by the business metric. We can compute the value of the business metric: From b3bb39fe264cd08c976c44a92392020ca4741a1b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 18:40:36 +0200 Subject: [PATCH 090/194] iter --- .../plot_tuned_threshold_classifier.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/model_selection/plot_tuned_threshold_classifier.py b/examples/model_selection/plot_tuned_threshold_classifier.py index 5e9206b81a634..3d8a6e9348916 100644 --- a/examples/model_selection/plot_tuned_threshold_classifier.py +++ b/examples/model_selection/plot_tuned_threshold_classifier.py @@ -478,17 +478,17 @@ def gain_cost_score(y, y_pred, **kwargs): # the same set as the model was trained on, and this is the reason for the observed # overfitting. # -# This option should therefore be used with caution. One needs to make sure -# that the data providing at fitting time to the -# :class:`~sklearn.model_selection.TunedThresholdClassifier` is not the same as -# the data used to train the underlying classifier. This could happen sometimes -# when the idea is just to tune the predictive model on a completely new -# validation set without a costly complete refit. +# This option should therefore be used with caution. One needs to make sure that the +# data providing at fitting time to the +# :class:`~sklearn.model_selection.TunedThresholdClassifier` is not the same as the data +# used to train the underlying classifier. This could happen sometimes when the idea is +# just to tune the predictive model on a completely new validation set without a costly +# complete refit. # -# In the case that cross-validation is too costly, a potential alternative is to use -# a single train-test split by providing a floating number in range `[0, 1]` to the -# `cv` parameter. It splits the data into a training and testing set. Let's -# explore this option: +# In the case that cross-validation is too costly, a potential alternative is to use a +# single train-test split by providing a floating number in range `[0, 1]` to the `cv` +# parameter. It splits the data into a training and testing set. Let's explore this +# option: model_tuned.set_params(cv=0.75).fit(X_train, y_train) # %% @@ -570,7 +570,7 @@ def gain_cost_score(y, y_pred, **kwargs): # is any variance in the cut-off point. The repeated cross-validation averages out # this effect. # -# Another observation concerns the ROC and Precision-Recall curves of the tuned -# model. As expected, these curves differ from those of the vanilla model, given -# that we trained the underlying classifier on a subset of the data provided -# during fitting and reserved a validation set for tuning the cut-off point. +# Another observation concerns the ROC and Precision-Recall curves of the tuned model. +# As expected, these curves differ from those of the vanilla model, given that we +# trained the underlying classifier on a subset of the data provided during fitting and +# reserved a validation set for tuning the cut-off point. 
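
As a compact recap of the cross-validation options discussed above, the sketch below contrasts the default internal cross-validation, a single split requested through a float `cv`, and the `cv="prefit"` path on a held-out validation set. It assumes the `TunedThresholdClassifier` defined in these patches is importable from `sklearn.model_selection` with the parameter names used here.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TunedThresholdClassifier, train_test_split

X, y = make_classification(n_samples=1_000, weights=[0.7, 0.3], random_state=0)
X_fit, X_val, y_fit, y_val = train_test_split(X, y, stratify=y, random_state=0)

# default: the threshold is tuned with an internal stratified cross-validation
# and the classifier is refit on the full data passed to `fit`
tuned_cv = TunedThresholdClassifier(LogisticRegression()).fit(X, y)

# float `cv`: a single internal train/validation split, cheaper than full CV
tuned_single_split = TunedThresholdClassifier(LogisticRegression(), cv=0.75).fit(X, y)

# `cv="prefit"`: the classifier is already trained elsewhere; only the threshold
# is tuned, on data that must be disjoint from the training data to avoid the
# overfitting discussed above
prefit_classifier = LogisticRegression().fit(X_fit, y_fit)
tuned_prefit = TunedThresholdClassifier(
    prefit_classifier, cv="prefit", refit=False
).fit(X_val, y_val)

for model in (tuned_cv, tuned_single_split, tuned_prefit):
    print(model.decision_threshold_)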
From b72a72a1f7cdb5ab574423001b26246f68823456 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Jul 2023 19:52:38 +0200 Subject: [PATCH 091/194] iter --- sklearn/model_selection/_classification_threshold.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 2403a23a213c4..7ed41533e7d4f 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -4,7 +4,13 @@ import numpy as np -from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, clone +from ..base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + _fit_context, + clone, +) from ..exceptions import NotFittedError from ..metrics import ( check_scoring, @@ -423,6 +429,10 @@ def __init__( self.n_jobs = n_jobs self.random_state = random_state + @_fit_context( + # estimators in TunedThresholdClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the classifier and post-tune the decision threshold. From b4e67fbeac32d727807fbfd2d661c92681c51b02 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 14 Jul 2023 22:20:24 +0200 Subject: [PATCH 092/194] Add metadata routing --- .../plot_tuned_threshold_classifier.py | 51 ++-- sklearn/metrics/_scorer.py | 48 +++- .../_classification_threshold.py | 266 +++++++----------- .../tests/test_classification_threshold.py | 166 +++++------ 4 files changed, 261 insertions(+), 270 deletions(-) diff --git a/examples/model_selection/plot_tuned_threshold_classifier.py b/examples/model_selection/plot_tuned_threshold_classifier.py index 3d8a6e9348916..7c87fb5a09b50 100644 --- a/examples/model_selection/plot_tuned_threshold_classifier.py +++ b/examples/model_selection/plot_tuned_threshold_classifier.py @@ -32,12 +32,17 @@ # ------------------------------- # # We fetch the German credit dataset from OpenML. +import numpy as np + import sklearn from sklearn.datasets import fetch_openml sklearn.set_config(transform_output="pandas") +sklearn.set_config(enable_metadata_routing=True) + german_credit = fetch_openml(data_id=31, as_frame=True, parser="pandas") X, y = german_credit.data, german_credit.target +pure_loss = np.abs(np.random.RandomState(0).randint(0, 5, size=len(y))) # %% # We check the feature types available in `X`. @@ -66,7 +71,9 @@ # To carry our analysis, we split our dataset using a single stratified split. from sklearn.model_selection import train_test_split -X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) +X_train, X_test, y_train, y_test, pure_loss_train, pure_loss_test = train_test_split( + X, y, pure_loss, stratify=y, random_state=0 +) # %% # We are ready to design our predictive model and the associated evaluation strategy. @@ -122,28 +129,24 @@ def fpr_score(y, y_pred, **kwargs): # cost-matrix which encodes that predicting a "bad" credit as "good" is 5 times more # costly than the opposite. We define a dictionary containing this information and a # score function that computes the cost. 
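
The hunk below replaces the fixed cost matrix with a per-sample cost that reaches the scorer through metadata routing. As a stand-alone illustration of that pattern: the `pure_loss` name mirrors the example above, routing must be enabled with `sklearn.set_config(enable_metadata_routing=True)`, and the exact forwarding behaviour is assumed from these patches rather than from a released API.

import numpy as np
import sklearn
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer

sklearn.set_config(enable_metadata_routing=True)


def per_sample_cost(y_true, y_pred, pos_label, pure_loss):
    """Business metric: each false positive costs its own `pure_loss`,
    each false negative a flat unit cost."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    cost = np.zeros(len(y_true))
    fp = (y_true != pos_label) & (y_pred == pos_label)
    fn = (y_true == pos_label) & (y_pred != pos_label)
    cost[fp] = -pure_loss[fp]
    cost[fn] = -1.0
    return cost.sum()


# the scorer declares that it consumes `pure_loss`; routers (e.g. the
# meta-estimator's `fit`) are then responsible for forwarding it
cost_scorer = make_scorer(per_sample_cost, pos_label=1).set_score_request(
    pure_loss=True
)

X, y = make_classification(n_samples=200, random_state=0)
pure_loss = np.random.RandomState(0).randint(1, 5, size=len(y)).astype(float)
classifier = LogisticRegression().fit(X, y)
print(cost_scorer(classifier, X, y, pure_loss=pure_loss))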
-cost_gain_matrix = {"tp": 0, "tn": 0, "fp": -1, "fn": -5} -def gain_cost_score(y, y_pred, **kwargs): - cm = confusion_matrix(y, y_pred) - classes = np.unique(y) - pos_label = kwargs.get("pos_label", classes[-1]) - pos_label_idx = np.searchsorted(classes, pos_label) - if pos_label_idx == 0: - cm = cm[::-1, ::-1] - costs_and_gain = np.array( - [ - [kwargs["cost_gain_matrix"]["tn"], kwargs["cost_gain_matrix"]["fp"]], - [kwargs["cost_gain_matrix"]["fn"], kwargs["cost_gain_matrix"]["tp"]], - ] - ) - return (costs_and_gain * cm).sum() +def gain_cost_score(y, y_pred, pos_label, pure_loss): + cost_and_gain = np.zeros_like(y) + mask_tp = (y == pos_label) & (y_pred == pos_label) + cost_and_gain[mask_tp] = 0 + mask_fp = (y != pos_label) & (y_pred == pos_label) + cost_and_gain[mask_fp] = -pure_loss[mask_fp] + mask_fn = (y == pos_label) & (y_pred != pos_label) + cost_and_gain[mask_fn] = -1 + mask_tn = (y != pos_label) & (y_pred != pos_label) + cost_and_gain[mask_tn] = 0 + return cost_and_gain.sum() scoring["cost_gain"] = make_scorer( - gain_cost_score, pos_label=pos_label, cost_gain_matrix=cost_gain_matrix -) + gain_cost_score, pos_label=pos_label +).set_score_request(pure_loss=True) # %% # Vanilla predictive model # ------------------------ @@ -253,7 +256,7 @@ def gain_cost_score(y, y_pred, **kwargs): # # However, we recall that the original aim was to minimize the cost (or maximize the # gain) by the business metric. We can compute the value of the business metric: -scoring["cost_gain"](model, X_test, y_test) +scoring["cost_gain"](model, X_test, y_test, pure_loss=pure_loss_test) # %% # At this stage we don't know if any other cut-off can lead to a greater gain. @@ -279,9 +282,9 @@ def gain_cost_score(y, y_pred, **kwargs): model_tuned = TunedThresholdClassifier( estimator=model, pos_label=pos_label, - objective_metric=cost_gain_matrix, + objective_metric=scoring["cost_gain"], ) -model_tuned.fit(X_train, y_train) +model_tuned.fit(X_train, y_train, pure_loss=pure_loss_train) # %% # We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. @@ -371,7 +374,7 @@ def gain_cost_score(y, y_pred, **kwargs): # # We can now check if choosing this cut-off point leads to a better score on the testing # set: -scoring["cost_gain"](model_tuned, X_test, y_test) +scoring["cost_gain"](model_tuned, X_test, y_test, pure_loss=pure_loss_test) # %% # We observe that the decision generalized on the testing set leading to a better @@ -394,7 +397,7 @@ def gain_cost_score(y, y_pred, **kwargs): # Also, the underlying classifier is not be refitted. Here, we can try to do such # experiment. model.fit(X_train, y_train) -model_tuned.set_params(cv="prefit").fit(X_train, y_train) +model_tuned.set_params(cv="prefit").fit(X_train, y_train, pure_loss=pure_loss_train) # %% @@ -489,7 +492,7 @@ def gain_cost_score(y, y_pred, **kwargs): # single train-test split by providing a floating number in range `[0, 1]` to the `cv` # parameter. It splits the data into a training and testing set. 
Let's explore this # option: -model_tuned.set_params(cv=0.75).fit(X_train, y_train) +model_tuned.set_params(cv=0.75).fit(X_train, y_train, pure_loss=pure_loss_train) # %% fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 162dd8c7493bc..df1f2602c38ee 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -493,7 +493,38 @@ def __init__(self, score_func, sign, response_method, kwargs): super().__init__(score_func=score_func, sign=sign, kwargs=kwargs) self.response_method = response_method - def _score(self, method_caller, estimator, X, y_true, sample_weight=None): + @classmethod + def from_scorer(cls, scorer, response_method, pos_label): + """Create a continuous scorer from a normal scorer.""" + # add `pos_label` if requested by the scorer function + scorer_kwargs = {**scorer._kwargs} + signature_scoring_func = signature(scorer._score_func) + if ( + "pos_label" in signature_scoring_func.parameters + and "pos_label" not in scorer_kwargs + ): + if pos_label is None: + # Since the provided `pos_label` is the default, we need to + # use the default value of the scoring function that can be either + # `None` or `1`. + scorer_kwargs["pos_label"] = signature_scoring_func.parameters[ + "pos_label" + ].default + else: + scorer_kwargs["pos_label"] = pos_label + # transform a binary metric into a curve metric for all possible decision + # thresholds + instance = cls( + score_func=scorer._score_func, + sign=scorer._sign, + response_method=response_method, + kwargs=scorer_kwargs, + ) + # transfer the metadata request + instance._metadata_request = scorer._get_metadata_request() + return instance + + def _score(self, method_caller, estimator, X, y_true, **kwargs): """Evaluate predicted target values for X relative to y_true. Parameters @@ -507,8 +538,9 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): y_true : array-like of shape (n_samples,) Gold standard target values for X. - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. Returns ------- @@ -518,26 +550,22 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): pos_label = self._get_pos_label() y_score = method_caller(estimator, self.response_method, X, pos_label=pos_label) - if sample_weight is not None: - score_func = partial(self._score_func, sample_weight=sample_weight) - else: - score_func = self._score_func - # TODO: this is pretty slow if we have a lot of `potential_thresholds` # We could parallelize but then we are inside a nested parallel loop where the # external parallelism is on the CV. # Another guess would be to interpolate the potential thresholds at this moment. # Easy for the probability case but not for the decision function case since it # is not bounded. 
+ scoring_kwargs = {**self._kwargs, **kwargs} potential_thresholds = np.unique(y_score) score_thresholds = [ self._sign - * score_func( + * self._score_func( y_true, _threshold_scores_to_class_labels( y_score, th, estimator.classes_, self._get_pos_label() ), - **self._kwargs, + **scoring_kwargs, ) for th in potential_thresholds ] diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index b5cde9b33bbd3..58446ed5babb4 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -1,5 +1,4 @@ from collections.abc import MutableMapping -from inspect import signature from numbers import Integral, Real import numpy as np @@ -14,7 +13,6 @@ from ..exceptions import NotFittedError from ..metrics import ( check_scoring, - confusion_matrix, get_scorer_names, make_scorer, precision_recall_curve, @@ -24,13 +22,17 @@ from ..utils import _safe_indexing from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._response import _get_response_values_binary +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) from ..utils.metaestimators import available_if from ..utils.multiclass import type_of_target from ..utils.parallel import Parallel, delayed from ..utils.validation import ( - _check_fit_params, - _check_pos_label_consistency, - _check_sample_weight, + _check_method_params, _num_samples, check_consistent_length, check_is_fitted, @@ -56,12 +58,13 @@ def _fit_and_score( classifier, X, y, - sample_weight, + *, fit_params, train_idx, val_idx, scorer, score_method, + score_params, ): """Fit a classifier and compute the scores for different decision thresholds. @@ -77,9 +80,6 @@ def _fit_and_score( y : array-like of shape (n_samples,) The entire target vector. - sample_weight : array-like of shape (n_samples,) - Some optional associated sample weights. - fit_params : dict Parameters to pass to the `fit` method of the underlying classifier. @@ -99,6 +99,9 @@ def _fit_and_score( The scoring method to use. Used to detect if we compute TPR/TNR or precision/ recall. + score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + Returns ------- thresholds : ndarray of shape (n_thresholds,) @@ -109,35 +112,22 @@ def _fit_and_score( The scores computed for each decision threshold. When TPR/TNR or precision/ recall are computed, `scores` is a tuple of two arrays. 
""" - arrays = (X, y) if sample_weight is None else (X, y, sample_weight) - check_consistent_length(*arrays) - - fit_parameters = signature(classifier.fit).parameters - supports_sw = "sample_weight" in fit_parameters + check_consistent_length(X, y) if train_idx is not None: X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) - if sample_weight is not None: - sw_train, sw_val = ( - _safe_indexing(sample_weight, train_idx), - _safe_indexing(sample_weight, val_idx), - ) - else: - sw_train, sw_val = None, None - fit_params_train = _check_fit_params(X, fit_params, indices=train_idx) - if supports_sw: - classifier.fit(X_train, y_train, sample_weight=sw_train, **fit_params_train) - else: - classifier.fit(X_train, y_train, **fit_params_train) + fit_params_train = _check_method_params(X, fit_params, indices=train_idx) + score_params_val = _check_method_params(X, score_params, indices=val_idx) + classifier.fit(X_train, y_train, **fit_params_train) else: # prefit estimator, only a validation set is provided - X_val, y_val, sw_val = X, y, sample_weight + X_val, y_val, score_params_val = X, y, score_params check_is_fitted(classifier, "classes_") if isinstance(score_method, str): if score_method in {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"}: fpr, tpr, potential_thresholds = scorer( - classifier, X_val, y_val, sample_weight=sw_val + classifier, X_val, y_val, **score_params_val ) # For fpr=0/tpr=0, the threshold is set to `np.inf`. We need to remove it. fpr, tpr, potential_thresholds = fpr[1:], tpr[1:], potential_thresholds[1:] @@ -148,13 +138,13 @@ def _fit_and_score( "max_recall_at_precision_constraint", }: precision, recall, potential_thresholds = scorer( - classifier, X_val, y_val, sample_weight=sw_val + classifier, X_val, y_val, **score_params_val ) # thresholds are in increasing order # the last element of the precision and recall is not associated with any # threshold and should be discarded return potential_thresholds, (precision[:-1], recall[:-1]) - return scorer(classifier, X_val, y_val, sample_weight=sw_val) + return scorer(classifier, X_val, y_val, **score_params_val) class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): @@ -199,10 +189,6 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato recall of `constraint_value`; * `"max_recall_at_precision_constraint"`: find the decision threshold for a precision of `constraint_value`. - * a dictionary to be used as cost-sensitive matrix. The keys of the - dictionary should be: `("tp", "fp", "tn", "fn")`. The values of the - dictionary corresponds costs (negative values) and gains (positive - values). constraint_value : float, default=None The value associated with the `objective_metric` metric for which we @@ -433,7 +419,7 @@ def __init__( # estimators in TunedThresholdClassifier.estimator is not validated yet prefer_skip_nested_validation=True ) - def fit(self, X, y, sample_weight=None, **fit_params): + def fit(self, X, y, **params): """Fit the classifier and post-tune the decision threshold. Parameters @@ -444,18 +430,18 @@ def fit(self, X, y, sample_weight=None, **fit_params): y : array-like of shape (n_samples,) Target values. - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. If `None`, then samples are equally weighted. - - **fit_params : dict + **params : dict Parameters to pass to the `fit` method of the underlying - classifier. 
+ classifier and to the `objective_metric` scorer. Returns ------- self : object Returns an instance of self. """ + if params and not _routing_enabled(): + raise ValueError + self._validate_params() X, y = indexable(X, y) @@ -465,9 +451,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): f"Only binary classification is supported. Unknown label type: {y_type}" ) - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) - if isinstance(self.cv, Real) and 0 < self.cv <= 1: cv = StratifiedShuffleSplit( n_splits=1, test_size=self.cv, random_state=self.random_state @@ -514,8 +497,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): else: constraint_value = "highest" - fit_parameters = signature(self.estimator.fit).parameters - supports_sw = "sample_weight" in fit_parameters + routed_params = process_routing( + obj=self, method="fit", other_params={}, **params + ) + self._scorer = self._get_scorer() # in the following block, we: # - define the final classifier `self.estimator_` and train it if necessary @@ -528,29 +513,21 @@ def fit(self, X, y, sample_weight=None, **fit_params): else: self.estimator_ = clone(self.estimator) classifier = clone(self.estimator) - splits = cv.split(X, y) + splits = cv.split(X, y, **routed_params.splitter.split) if refit: # train on the whole dataset - X_train, y_train, sw_train = X, y, sample_weight - fit_params_train = _check_fit_params(X, fit_params, indices=None) + X_train, y_train, fit_params_train = X, y, routed_params.estimator.fit else: # single split cross-validation - train_idx, _ = next(cv.split(X, y)) + train_idx, _ = next(cv.split(X, y, **routed_params.splitter.split)) X_train = _safe_indexing(X, train_idx) y_train = _safe_indexing(y, train_idx) - if sample_weight is not None: - sw_train = _safe_indexing(sample_weight, train_idx) - else: - sw_train = None - fit_params_train = _check_fit_params(X, fit_params, indices=train_idx) - - if sw_train is not None and supports_sw: - self.estimator_.fit( - X_train, y_train, sample_weight=sw_train, **fit_params_train + fit_params_train = _check_method_params( + X, routed_params.estimator.fit, indices=train_idx ) - else: - self.estimator_.fit(X_train, y_train, **fit_params_train) + + self.estimator_.fit(X_train, y_train, **fit_params_train) if hasattr(self.estimator_, "n_features_in_"): self.n_features_in_ = self.estimator_.n_features_in_ @@ -564,109 +541,18 @@ def fit(self, X, y, sample_weight=None, **fit_params): self.objective_score_, self.objective_scores_ = None, None return self - if isinstance(self.objective_metric, MutableMapping): - keys = set(self.objective_metric.keys()) - if not keys == {"tp", "tn", "fp", "fn"}: - raise ValueError( - "Invalid keys in `objective_metric`. Valid keys are " - f"'tp', 'tn', 'fp', and 'fn'. Got {keys} instead." 
- ) - pos_label = _check_pos_label_consistency(self.pos_label, y) - - def cost_sensitive_score_func(y_true, y_pred, **kwargs): - costs_and_gain = np.array( - [ - [kwargs["tn"], kwargs["fp"]], - [kwargs["fn"], kwargs["tp"]], - ] - ) - - sample_weight = kwargs.get("sample_weight", None) - cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) - - pos_label, classes = kwargs["pos_label"], np.unique(y_true) - pos_label_idx = np.searchsorted(classes, pos_label) - if pos_label_idx == 0: - # reorder the confusion matrix to be aligned with the cost-matrix - cm = cm[::-1, ::-1] - - return (costs_and_gain * cm).sum() - - self._scorer = _ContinuousScorer( - score_func=cost_sensitive_score_func, - sign=1, - response_method=self._response_method, - kwargs={ - **self.objective_metric, - "pos_label": pos_label, - }, - ) - elif self.objective_metric in { - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - }: - if self._response_method == "predict_proba": - params_scorer = {"needs_proba": True, "pos_label": self.pos_label} - elif ( - isinstance(self._response_method, list) - and self._response_method[0] == "predict_proba" - and hasattr(classifier, "predict_proba") - ): - # TODO: this is due to a limitation in `make_scorer`: ideally, we should - # be able to pass a list of response methods to `make_scorer` and give - # priority to `predict_proba` other `decision_function`. - # Here, we manually check if the classifier provide `predict_proba` to - # use `needs_proba` instead and ensure that no error will be raised. - params_scorer = {"needs_proba": True, "pos_label": self.pos_label} - else: - params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} - - if "tpr" in self.objective_metric: # tpr/tnr - score_func = roc_curve - else: # precision/recall - score_func = precision_recall_curve - self._scorer = make_scorer(score_func, **params_scorer) - else: - scoring = check_scoring(classifier, scoring=self.objective_metric) - # add `pos_label` if requested by the scorer function - scorer_kwargs = {**scoring._kwargs} - signature_scoring_func = signature(scoring._score_func) - if ( - "pos_label" in signature_scoring_func.parameters - and "pos_label" not in scorer_kwargs - ): - if self.pos_label is None: - # Since the provided `pos_label` is the default, we need to - # use the default value of the scoring function that can be either - # `None` or `1`. - scorer_kwargs["pos_label"] = signature_scoring_func.parameters[ - "pos_label" - ].default - else: - scorer_kwargs["pos_label"] = self.pos_label - # transform a binary metric into a curve metric for all possible decision - # thresholds - self._scorer = _ContinuousScorer( - score_func=scoring._score_func, - sign=scoring._sign, - response_method=self._response_method, - kwargs=scorer_kwargs, - ) - cv_thresholds, cv_scores = zip( *Parallel(n_jobs=self.n_jobs)( delayed(_fit_and_score)( classifier, X, y, - sample_weight, - fit_params, - train_idx, - val_idx, - self._scorer, - self.objective_metric, + fit_params=routed_params.estimator.fit, + train_idx=train_idx, + val_idx=val_idx, + scorer=self._scorer, + score_method=self.objective_metric, + score_params=routed_params.scorer.score, ) for train_idx, val_idx in splits ) @@ -829,6 +715,72 @@ def decision_function(self, X): check_is_fitted(self, "estimator_") return self.estimator_.decision_function(X) + def get_metadata_routing(self): + """Get metadata routing of this object. 
+ + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + .add( + scorer=self._get_scorer(), + method_mapping=MethodMapping().add(callee="score", caller="fit"), + ) + ) + return router + + def _get_scorer(self): + """Get the scorer based on the objective metric used.""" + if self.objective_metric in { + "max_tnr_at_tpr_constraint", + "max_tpr_at_tnr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + }: + if self._response_method == "predict_proba": + params_scorer = {"needs_proba": True, "pos_label": self.pos_label} + elif ( + isinstance(self._response_method, list) + and self._response_method[0] == "predict_proba" + and hasattr(self.estimator, "predict_proba") + ): + # TODO: this is due to a limitation in `make_scorer`: ideally, we should + # be able to pass a list of response methods to `make_scorer` and give + # priority to `predict_proba` other `decision_function`. + # Here, we manually check if the classifier provide `predict_proba` to + # use `needs_proba` instead and ensure that no error will be raised. + params_scorer = {"needs_proba": True, "pos_label": self.pos_label} + else: + params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} + + if "tpr" in self.objective_metric: # tpr/tnr + score_func = roc_curve + else: # precision/recall + score_func = precision_recall_curve + scorer = make_scorer(score_func, **params_scorer) + else: + scoring = check_scoring(self.estimator, scoring=self.objective_metric) + scorer = _ContinuousScorer.from_scorer( + scoring, self._response_method, self.pos_label + ) + return scorer + def _more_tags(self): return { "binary_only": True, diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 6b778d2422104..ea509833873f3 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -74,12 +74,12 @@ def test_fit_and_score_scorers(scorer, score_method): classifier, X, y, - sample_weight=None, fit_params={}, train_idx=train_idx, val_idx=val_idx, scorer=scorer, score_method=score_method, + score_params={}, ) if score_method.startswith("max_"): @@ -141,12 +141,12 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): classifier, X, y, - sample_weight=None, fit_params={}, train_idx=train_idx, val_idx=val_idx, scorer=scorer, score_method=score_method, + score_params={}, ) classifier.fit(X, y) @@ -158,17 +158,18 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): classifier, X, y, - sample_weight=None, fit_params={}, train_idx=train_idx, val_idx=val_idx, scorer=scorer, score_method=score_method, + score_params={}, ) assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) assert_allclose(scores, expected_score) +@pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( "scorer, score_method", [ @@ -217,31 +218,32 @@ def test_fit_and_score_sample_weight(scorer, score_method): classifier, X_repeated, y_repeated, - 
sample_weight=None, fit_params={}, train_idx=train_repeated_idx, val_idx=val_repeated_idx, scorer=scorer, score_method=score_method, + score_params={}, ) train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) thresholds, scores = _fit_and_score( - classifier, + classifier.set_fit_request(sample_weight=True), X, y, - sample_weight=sample_weight, - fit_params={}, + fit_params={"sample_weight": sample_weight}, train_idx=train_idx, val_idx=val_idx, - scorer=scorer, + scorer=scorer.set_score_request(sample_weight=True), score_method=score_method, + score_params={"sample_weight": sample_weight}, ) assert_allclose(thresholds_repeated, thresholds) assert_allclose(scores_repeated, scores) +@pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( "scorer, score_method", [ @@ -282,18 +284,19 @@ def test_fit_and_score_fit_params(scorer, score_method, fit_params_type): } classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) train_idx, val_idx = np.arange(50), np.arange(50, 100) _fit_and_score( classifier, X, y, - sample_weight=None, fit_params=fit_params, train_idx=train_idx, val_idx=val_idx, scorer=scorer, score_method=score_method, + score_params={}, ) @@ -454,7 +457,7 @@ def test_tunedthresholdclassifier_metric_with_parameter(): "max_recall_at_precision_constraint", make_scorer(balanced_accuracy_score), make_scorer(f1_score, pos_label="cancer"), - {"tp": 5, "tn": 1, "fp": -1, "fn": -1}, + # {"tp": 5, "tn": 1, "fp": -1, "fn": -1}, ], ) def test_tunedthresholdclassifier_with_string_targets(response_method, metric): @@ -481,6 +484,7 @@ def test_tunedthresholdclassifier_with_string_targets(response_method, metric): assert_array_equal(np.sort(np.unique(y_pred)), np.sort(classes)) +@pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize("strategy", ["optimum", "constant"]) @pytest.mark.parametrize("with_sample_weight", [True, False]) def test_tunedthresholdclassifier_refit( @@ -496,7 +500,7 @@ def test_tunedthresholdclassifier_refit( sample_weight = None # check that `estimator_` if fitted on the full dataset when `refit=True` - estimator = LogisticRegression() + estimator = LogisticRegression().set_fit_request(sample_weight=True) model = TunedThresholdClassifier(estimator, strategy=strategy, refit=True).fit( X, y, sample_weight=sample_weight ) @@ -507,7 +511,8 @@ def test_tunedthresholdclassifier_refit( assert_allclose(model.estimator_.intercept_, estimator.intercept_) # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` - estimator = LogisticRegression().fit(X, y, sample_weight=sample_weight) + estimator = LogisticRegression().set_fit_request(sample_weight=True) + estimator.fit(X, y, sample_weight=sample_weight) coef = estimator.coef_.copy() model = TunedThresholdClassifier( estimator, strategy=strategy, cv="prefit", refit=False @@ -517,7 +522,7 @@ def test_tunedthresholdclassifier_refit( assert_allclose(model.estimator_.coef_, coef) # check that we train `estimator_` on the training split of a given cross-validation - estimator = LogisticRegression() + estimator = LogisticRegression().set_fit_request(sample_weight=True) cv = [ (np.arange(50), np.arange(50, 100)), ] # single split @@ -534,6 +539,7 @@ def test_tunedthresholdclassifier_refit( assert_allclose(model.estimator_.coef_, estimator.coef_) +@pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( "objective_metric", [ @@ -554,6 +560,7 @@ def test_tunedthresholdclassifier_fit_params(objective_metric, 
fit_params_type): } classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) model = TunedThresholdClassifier( classifier, objective_metric=objective_metric, constraint_value=0.5 ) @@ -616,90 +623,91 @@ def test_tunedthresholdclassifier_response_method_scorer_with_constraint_metric( assert -20 < model.decision_threshold_ < 0 -def test_tunedthresholdclassifier_objective_metric_dict(global_random_seed): - """Check that we can pass a custom objective metric.""" - X, y = make_classification(n_samples=500, random_state=global_random_seed) - classifier = LogisticRegression() +# def test_tunedthresholdclassifier_objective_metric_dict(global_random_seed): +# """Check that we can pass a custom objective metric.""" +# X, y = make_classification(n_samples=500, random_state=global_random_seed) +# classifier = LogisticRegression() - # we need to set a small number of thresholds to avoid ties and picking a too low - # threshold. - n_thresholds = 5 +# # we need to set a small number of thresholds to avoid ties and picking a too low +# # threshold. +# n_thresholds = 5 - # affect a high gain to true negative and force the classifier to mainly - # predict the negative class. - costs_and_again = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} - model = TunedThresholdClassifier( - classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds - ) - model.fit(X, y) +# # affect a high gain to true negative and force the classifier to mainly +# # predict the negative class. +# costs_and_again = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} +# model = TunedThresholdClassifier( +# classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds +# ) +# model.fit(X, y) - assert model.decision_thresholds_.shape == (n_thresholds,) - assert model.objective_scores_.shape == (n_thresholds,) +# assert model.decision_thresholds_.shape == (n_thresholds,) +# assert model.objective_scores_.shape == (n_thresholds,) - assert model.decision_threshold_ > 0.99 - assert np.mean(model.predict(X) == 0) > 0.9 +# assert model.decision_threshold_ > 0.99 +# assert np.mean(model.predict(X) == 0) > 0.9 - # use the true positive now - costs_and_again = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} - model = TunedThresholdClassifier( - classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds - ) - model.fit(X, y) +# # use the true positive now +# costs_and_again = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} +# model = TunedThresholdClassifier( +# classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds +# ) +# model.fit(X, y) - assert model.decision_thresholds_.shape == (n_thresholds,) - assert model.objective_scores_.shape == (n_thresholds,) +# assert model.decision_thresholds_.shape == (n_thresholds,) +# assert model.objective_scores_.shape == (n_thresholds,) - assert model.decision_threshold_ < 0.01 - assert np.mean(model.predict(X) == 1) > 0.9 +# assert model.decision_threshold_ < 0.01 +# assert np.mean(model.predict(X) == 1) > 0.9 - # flipping the `pos_label` to zero should force the classifier to always predict 0 - # and thus have a low threshold - pos_label = 0 - model = TunedThresholdClassifier( - classifier, - objective_metric=costs_and_again, - n_thresholds=n_thresholds, - pos_label=pos_label, - ) - model.fit(X, y) +# # flipping the `pos_label` to zero should force the classifier to always predict 0 +# # and thus have a low threshold +# pos_label = 0 +# model = TunedThresholdClassifier( +# classifier, +# objective_metric=costs_and_again, +# 
n_thresholds=n_thresholds, +# pos_label=pos_label, +# ) +# model.fit(X, y) - assert model.decision_thresholds_.shape == (n_thresholds,) - assert model.objective_scores_.shape == (n_thresholds,) +# assert model.decision_thresholds_.shape == (n_thresholds,) +# assert model.objective_scores_.shape == (n_thresholds,) - assert model.decision_threshold_ < 0.01 - assert np.mean(model.predict(X) == 0) > 0.9 +# assert model.decision_threshold_ < 0.01 +# assert np.mean(model.predict(X) == 0) > 0.9 -def test_tunedthresholdclassifier_sample_weight_costs_and_gain(): - """Check that we dispatch the `sample_weight` to the scorer when computing the - confusion matrix.""" - X, y = load_iris(return_X_y=True) - X, y = X[:100], y[:100] # only 2 classes +# def test_tunedthresholdclassifier_sample_weight_costs_and_gain(): +# """Check that we dispatch the `sample_weight` to the scorer when computing the +# confusion matrix.""" +# X, y = load_iris(return_X_y=True) +# X, y = X[:100], y[:100] # only 2 classes - # create a dataset and repeat twice the sample of class #0 - X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) - # create a sample weight vector that is equivalent to the repeated dataset - sample_weight = np.ones_like(y) - sample_weight[:50] *= 2 +# # create a dataset and repeat twice the sample of class #0 +# X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) +# # create a sample weight vector that is equivalent to the repeated dataset +# sample_weight = np.ones_like(y) +# sample_weight[:50] *= 2 - # we use a prefit classifier to simplify the test - cv = "prefit" - estimator = LogisticRegression().fit(X, y) - costs_and_again = {"tp": 1, "tn": 1, "fp": -1, "fn": -1} +# # we use a prefit classifier to simplify the test +# cv = "prefit" +# estimator = LogisticRegression().fit(X, y) +# costs_and_again = {"tp": 1, "tn": 1, "fp": -1, "fn": -1} - model_repeat = TunedThresholdClassifier( - estimator, cv=cv, objective_metric=costs_and_again - ) - model_repeat.fit(X_repeated, y_repeated, sample_weight=None) +# model_repeat = TunedThresholdClassifier( +# estimator, cv=cv, objective_metric=costs_and_again +# ) +# model_repeat.fit(X_repeated, y_repeated, sample_weight=None) - model_sw = TunedThresholdClassifier( - estimator, cv=cv, objective_metric=costs_and_again - ) - model_sw.fit(X, y, sample_weight=sample_weight) +# model_sw = TunedThresholdClassifier( +# estimator, cv=cv, objective_metric=costs_and_again +# ) +# model_sw.fit(X, y, sample_weight=sample_weight) - assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) +# assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) +@pytest.mark.usefixtures("enable_slep006") def test_tunedthresholdclassifier_cv_zeros_sample_weights_equivalence(): """Check that passing removing some sample from the dataset `X` is equivalent to passing a `sample_weight` with a factor 0.""" @@ -713,7 +721,7 @@ def test_tunedthresholdclassifier_cv_zeros_sample_weights_equivalence(): sample_weight = np.zeros_like(y) sample_weight[::2] = 1 - estimator = LogisticRegression() + estimator = LogisticRegression().set_fit_request(sample_weight=True) model_without_weights = TunedThresholdClassifier(estimator, cv=2) model_with_weights = clone(model_without_weights) From 005126a5fedcdca94b0ff7ba1b67d7048820f346 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 00:42:44 +0200 Subject: [PATCH 093/194] Apply suggestions from code review Co-authored-by: Soledad Galli --- 
doc/modules/classification_threshold.rst | 36 +++++++++---------- doc/whats_new/v1.4.rst | 2 +- .../_classification_threshold.py | 6 ++-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index d3fbeea67174a..2ba5b539fab3b 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -10,9 +10,9 @@ Classifiers are predictive models: they use statistical learning to predict outc The outcomes of a classifier are scores for each sample in relation to each class and categorical prediction (class label). Scores are obtained from :term:`predict_proba` or :term:`decision_function`. The former returns posterior probability estimates for each -class while the latter returns a decision score for each class. The decision score is a -measure of how strongly the sample is predicted to belong to the positive class (e.g. -the distance to the decisin boundary). A decision rule is then defined by thresholding +class, while the latter returns a decision score for each class. The decision score is a +measure of how strongly the sample is predicted to belong to the positive class (e.g., +the distance to the decision boundary). A decision rule is then defined by thresholding the scores and obtained the class label for each sample. Those labels are obtained with :term:`predict`. @@ -36,9 +36,9 @@ probability estimates and class labels:: >>> classifier.predict(X[:4]) array([0, 0, 1, 1]) -While these approaches are reasonable as default behaviors, they are not be ideal for +While these approaches are reasonable as default behaviors, they are not ideal for all cases. The context and nature of the use case defines the expected behavior of the -classifier and thus the strategy to convert soft predictions into hard predictions. We +classifier and thus, the strategy to convert soft predictions into hard predictions. We illustrate this point with an example. Let's imagine the deployment of a predictive model helping medical doctors to detect @@ -66,21 +66,21 @@ Characteristic (ROC) and Precision-Recall curves, and thus the same decision threshold. The vanilla classifier predicts the class of interest for a posterior probability greater than 0.5 while the tuned classifier predicts the class of interest for a very low probability (around 0.02). This cut-off point optimizes a -utility metric defined by the business case (in this case an insurance company). +utility metric defined by the business (in this case an insurance company). .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_tuned_threshold_classifier_002.png :target: ../auto_examples/model_selection/plot_tuned_threshold_classifier.html :align: center -Available options to tune the cut-off point +Options to tune the cut-off point ------------------------------------------- -The cut-off point can be tuned with different strategies controlled by the parameter +The cut-off point can be tuned through different strategies controlled by the parameter `objective_metric`. -A straightforward use case is to maximize a pre-defined scikit-learn metric. These +One way to tune the threshold is by maximizing a pre-defined scikit-learn metric. These metrics can be found by calling the function :func:`~sklearn.metrics.get_scorer_names`. -We provide an example where we maximize the balanced accuracy. +In this example, we maximize the balanced accuracy. .. 
note:: @@ -88,7 +88,7 @@ We provide an example where we maximize the balanced accuracy. the label of the class of interested (i.e. `pos_label`). Thus, if this label is not the right one for your application, you need to define a scorer and pass the right `pos_label` (and additional parameters) using the - :func:`~sklearn.metrics.make_scorer`. You should refer to :ref:`scoring` to get all + :func:`~sklearn.metrics.make_scorer`. Refer to :ref:`scoring` to get information to define your own scoring function. For instance, we show how to pass the information to the scorer that the label of interest is `0` when maximizing the :func:`~sklearn.metrics.f1_score`: @@ -112,9 +112,9 @@ We provide an example where we maximize the balanced accuracy. >>> model.objective_score_ 0.86... -A second strategy aims at maximizing a metric while imposing constraints on another -metric. Four pre-defined options exist, 2 that uses the Receiver Operating -Characteristic (ROC) statistics and 2 that uses the Precision-Recall statistics. +A second strategy aims to maximize one metric while imposing constraints on another +metric. There are four pre-defined options, 2 use the Receiver Operating +Characteristic (ROC) statistics and 2 use the Precision-Recall statistics. - `"max_tpr_at_tnr_constraint"`: maximizes the True Positive Rate (TPR) such that the True Negative Rate (TNR) is the closest to a given value. @@ -141,18 +141,18 @@ Important notes regarding the internal cross-validation By default :class:`~sklearn.model_selection.TunedThresholdClassifier` uses a 5-fold stratified cross-validation to tune the cut-off point. The parameter `cv` allows to control the cross-validation strategy. It is possible to go -around cross-validation by passing `cv="prefit"` and provide an already fitted +around cross-validation by passing `cv="prefit"` and providing a fitted classifier. In this case, the cut-off point is tuned on the data provided to the `fit` method. However, you should be extremely careful when using this option. You should never use the same data for training the classifier and tuning the cut-off point at the risk of -overfitting. Refer to :ref:`tunedthresholdclassifier_no_cv` that shows such overfitting. If +overfitting. Refer to :ref:`tunedthresholdclassifier_no_cv` for an example. If you are in a situation where you have limited resources, you should consider using a float number that will use a single split internally. The option `cv="prefit"` should only be used when the provided classifier was already -trained on some data and you want to tune (or re-tune) on a new validation set. +trained, and you just want to find the best cut-off using a new validation set. Manually setting the decision thresholding ------------------------------------------- @@ -160,7 +160,7 @@ Manually setting the decision thresholding The previous sections discussed strategies to find an optimal decision threshold. It is also possible to manually set the decision threshold in :class`~sklearn.model_selection.TunedThresholdClassifier` by setting the parameter -`strategy` to `"constant"` and provide the desired threshold using the parameter +`strategy` to `"constant"` and providing the desired threshold using the parameter `constant_threshold`. Examples diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index b28b25452d236..76b3bc12a1e8d 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -108,7 +108,7 @@ Changelog .............................. 
- |MajorFeature| :class:`model_selection.TunedThresholdClassifier` calibrates - decision threshold function of a binary classifier by maximizing a + the decision threshold function of a binary classifier by maximizing a classification metric through cross-validation. :pr:`26120` by :user:`Guillaume Lemaitre `. diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 58446ed5babb4..aa45df40abaa8 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -71,7 +71,7 @@ def _fit_and_score( Parameters ---------- classifier : estimator instance - The classifier to fit and used for scoring. If `classifier` is already fitted, + The classifier to fit and use for scoring. If `classifier` is already fitted, it will be used as is. X : {array-like, sparse matrix} of shape (n_samples, n_features) @@ -163,7 +163,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato Parameters ---------- estimator : estimator instance - The classifier, fitted or not fitted, for which we want to optimize + The classifier, fitted or not, for which we want to optimize the decision threshold used during `predict`. strategy : {"optimum", "constant"}, default="optimum" @@ -192,7 +192,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato constraint_value : float, default=None The value associated with the `objective_metric` metric for which we - want to find the decision threshold when `objective_metric` is equal one of + want to find the decision threshold when `objective_metric` is either `"max_tnr_at_tpr_constraint"`, `"max_tpr_at_tnr_constraint"`, `"max_precision_at_recall_constraint"`, or `"max_recall_at_precision_constraint"`. From 767a05ffefb700265216e3f8c4ddbb1b1be197a7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 14 Jul 2023 15:42:18 +0200 Subject: [PATCH 094/194] CLN clean up some repeated code related to SLEP006 --- sklearn/metrics/_scorer.py | 7 +- sklearn/metrics/tests/test_score_objects.py | 134 ++++++++++---------- sklearn/multioutput.py | 13 +- sklearn/pipeline.py | 19 +-- sklearn/utils/_metadata_requests.py | 34 +++++ sklearn/utils/metadata_routing.py | 1 + 6 files changed, 111 insertions(+), 97 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index df1f2602c38ee..df33c52d25afb 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -35,6 +35,7 @@ MetadataRequest, MetadataRouter, _MetadataRequester, + _raise_for_params, _routing_enabled, get_routing_for_object, process_routing, @@ -253,11 +254,7 @@ def __call__(self, estimator, X, y_true, sample_weight=None, **kwargs): score : float Score function applied to prediction of estimator on X. """ - if kwargs and not _routing_enabled(): - raise ValueError( - "kwargs is only supported if enable_metadata_routing=True. See" - " the User Guide for more information." 
- ) + _raise_for_params(kwargs, self, None) _kwargs = copy.deepcopy(kwargs) if sample_weight is not None: diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 9e37671ac1eba..c97b1bac28545 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -1277,6 +1277,7 @@ def test_continuous_scorer_pos_label(global_random_seed): assert scores_pos_label_1.max() == pytest.approx(1.0) +@pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize("name", get_scorer_names(), ids=get_scorer_names()) def test_scorer_metadata_request(name): """Testing metadata requests for scorers. @@ -1284,90 +1285,87 @@ def test_scorer_metadata_request(name): This test checks many small things in a large test, to reduce the boilerplate required for each section. """ - with config_context(enable_metadata_routing=True): - # Make sure they expose the routing methods. - scorer = get_scorer(name) - assert hasattr(scorer, "set_score_request") - assert hasattr(scorer, "get_metadata_routing") - - # Check that by default no metadata is requested. - assert_request_is_empty(scorer.get_metadata_routing()) - - weighted_scorer = scorer.set_score_request(sample_weight=True) - # set_score_request should mutate the instance, rather than returning a - # new instance - assert weighted_scorer is scorer - - # make sure the scorer doesn't request anything on methods other than - # `score`, and that the requested value on `score` is correct. - assert_request_is_empty(weighted_scorer.get_metadata_routing(), exclude="score") - assert ( - weighted_scorer.get_metadata_routing().score.requests["sample_weight"] - is True - ) + # Make sure they expose the routing methods. + scorer = get_scorer(name) + assert hasattr(scorer, "set_score_request") + assert hasattr(scorer, "get_metadata_routing") + + # Check that by default no metadata is requested. + assert_request_is_empty(scorer.get_metadata_routing()) + + weighted_scorer = scorer.set_score_request(sample_weight=True) + # set_score_request should mutate the instance, rather than returning a + # new instance + assert weighted_scorer is scorer + + # make sure the scorer doesn't request anything on methods other than + # `score`, and that the requested value on `score` is correct. + assert_request_is_empty(weighted_scorer.get_metadata_routing(), exclude="score") + assert ( + weighted_scorer.get_metadata_routing().score.requests["sample_weight"] is True + ) - # make sure putting the scorer in a router doesn't request anything by - # default - router = MetadataRouter(owner="test").add( - method_mapping="score", scorer=get_scorer(name) - ) - # make sure `sample_weight` is refused if passed. - with pytest.raises(TypeError, match="got unexpected argument"): - router.validate_metadata(params={"sample_weight": 1}, method="score") - # make sure `sample_weight` is not routed even if passed. - routed_params = router.route_params(params={"sample_weight": 1}, caller="score") - assert not routed_params.scorer.score - - # make sure putting weighted_scorer in a router requests sample_weight - router = MetadataRouter(owner="test").add( - scorer=weighted_scorer, method_mapping="score" - ) + # make sure putting the scorer in a router doesn't request anything by + # default + router = MetadataRouter(owner="test").add( + method_mapping="score", scorer=get_scorer(name) + ) + # make sure `sample_weight` is refused if passed. 
+ with pytest.raises(TypeError, match="got unexpected argument"): router.validate_metadata(params={"sample_weight": 1}, method="score") - routed_params = router.route_params(params={"sample_weight": 1}, caller="score") - assert list(routed_params.scorer.score.keys()) == ["sample_weight"] + # make sure `sample_weight` is not routed even if passed. + routed_params = router.route_params(params={"sample_weight": 1}, caller="score") + assert not routed_params.scorer.score + + # make sure putting weighted_scorer in a router requests sample_weight + router = MetadataRouter(owner="test").add( + scorer=weighted_scorer, method_mapping="score" + ) + router.validate_metadata(params={"sample_weight": 1}, method="score") + routed_params = router.route_params(params={"sample_weight": 1}, caller="score") + assert list(routed_params.scorer.score.keys()) == ["sample_weight"] +@pytest.mark.usefixtures("enable_slep006") def test_metadata_kwarg_conflict(): """This test makes sure the right warning is raised if the user passes some metadata both as a constructor to make_scorer, and during __call__. """ - with config_context(enable_metadata_routing=True): - X, y = make_classification( - n_classes=3, n_informative=3, n_samples=20, random_state=0 - ) - lr = LogisticRegression().fit(X, y) + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) - scorer = make_scorer( - roc_auc_score, - needs_proba=True, - multi_class="ovo", - labels=lr.classes_, - ) - with pytest.warns(UserWarning, match="already set as kwargs"): - scorer.set_score_request(labels=True) + scorer = make_scorer( + roc_auc_score, + needs_proba=True, + multi_class="ovo", + labels=lr.classes_, + ) + with pytest.warns(UserWarning, match="already set as kwargs"): + scorer.set_score_request(labels=True) - with config_context(enable_metadata_routing=True): - with pytest.warns(UserWarning, match="There is an overlap"): - scorer(lr, X, y, labels=lr.classes_) + with pytest.warns(UserWarning, match="There is an overlap"): + scorer(lr, X, y, labels=lr.classes_) +@pytest.mark.usefixtures("enable_slep006") def test_PassthroughScorer_metadata_request(): """Test that _PassthroughScorer properly routes metadata. _PassthroughScorer should behave like a consumer, mirroring whatever is the underlying score method. """ - with config_context(enable_metadata_routing=True): - scorer = _PassthroughScorer( - estimator=LinearSVC() - .set_score_request(sample_weight="alias") - .set_fit_request(sample_weight=True) - ) - # test that _PassthroughScorer leaves everything other than `score` empty - assert_request_is_empty(scorer.get_metadata_routing(), exclude="score") - # test that _PassthroughScorer doesn't behave like a router and leaves - # the request as is. - assert scorer.get_metadata_routing().score.requests["sample_weight"] == "alias" + scorer = _PassthroughScorer( + estimator=LinearSVC() + .set_score_request(sample_weight="alias") + .set_fit_request(sample_weight=True) + ) + # test that _PassthroughScorer leaves everything other than `score` empty + assert_request_is_empty(scorer.get_metadata_routing(), exclude="score") + # test that _PassthroughScorer doesn't behave like a router and leaves + # the request as is. 
+ assert scorer.get_metadata_routing().score.requests["sample_weight"] == "alias" def test_multimetric_scoring_metadata_routing(): @@ -1424,5 +1422,7 @@ def score(y_true, y_pred, param=None): clf = DecisionTreeClassifier().fit(X, y) scorer = make_scorer(score) with config_context(enable_metadata_routing=False): - with pytest.raises(ValueError, match="kwargs is only supported if"): + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): scorer(clf, X, y, param="blah") diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 89baa96893d39..8bd71924f954b 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -36,6 +36,7 @@ from .utils.metadata_routing import ( MetadataRouter, MethodMapping, + _raise_for_params, _routing_enabled, process_routing, ) @@ -148,11 +149,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para self : object Returns a fitted instance. """ - if partial_fit_params and not _routing_enabled(): - raise ValueError( - "partial_fit_params is only supported if enable_metadata_routing=True." - " See the User Guide for more information." - ) + _raise_for_params(partial_fit_params, self, "partial_fit") first_time = not hasattr(self, "estimators_") @@ -919,11 +916,7 @@ def fit(self, X, Y, **fit_params): self : object Class instance. """ - if fit_params and not _routing_enabled(): - raise ValueError( - "fit_params is only supported if enable_metadata_routing=True. " - "See the User Guide for more information." - ) + _raise_for_params(fit_params, self, "fit") super().fit(X, Y, **fit_params) self.classes_ = [ diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6cc22a7101da5..d6ad0001ad257 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -27,6 +27,7 @@ from .utils.metadata_routing import ( MetadataRouter, MethodMapping, + _raise_for_params, _routing_enabled, process_routing, ) @@ -742,11 +743,7 @@ def decision_function(self, X, **params): y_score : ndarray of shape (n_samples, n_classes) Result of calling `decision_function` on the final estimator. """ - if params and not _routing_enabled(): - raise ValueError( - "params is only supported if enable_metadata_routing=True." - " See the User Guide for more information." - ) + _raise_for_params(params, self, "decision_function") # not branching here since params is only available if # enable_metadata_routing=True @@ -881,11 +878,7 @@ def transform(self, X, **params): Xt : ndarray of shape (n_samples, n_transformed_features) Transformed data. """ - if not _routing_enabled() and params: - raise ValueError( - "params is only supported if enable_metadata_routing=True." - " See the User Guide for more information." - ) + _raise_for_params(params, self, "transform") # not branching here since params is only available if # enable_metadata_routing=True @@ -928,11 +921,7 @@ def inverse_transform(self, Xt, **params): Inverse transformed data, that is, data in the original feature space. """ - if not _routing_enabled() and params: - raise ValueError( - "params is only supported if enable_metadata_routing=True. See" - " the User Guide for more information." - ) + _raise_for_params(params, self, "inverse_transform") # we don't have to branch here, since params is only non-empty if # enable_metadata_routing=True. 
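A minimal usage sketch (not part of the diff) of the behaviour consolidated by the new `_raise_for_params` helper, mirroring the updated scorer test above. It assumes a scikit-learn build that includes this patch; `my_metric` and `param` are illustrative names only.

import sklearn
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.tree import DecisionTreeClassifier


def my_metric(y_true, y_pred, param=None):
    # `param` stands in for any metadata a user-defined metric might accept.
    return accuracy_score(y_true, y_pred)


X, y = make_classification(random_state=0)
clf = DecisionTreeClassifier(random_state=0).fit(X, y)
scorer = make_scorer(my_metric)

with sklearn.config_context(enable_metadata_routing=False):
    try:
        # Extra keyword arguments are rejected while metadata routing is disabled.
        scorer(clf, X, y, param="blah")
    except ValueError as exc:
        print(exc)  # "... is only supported if enable_metadata_routing=True ..."

With `enable_metadata_routing=True`, the same keyword is instead forwarded to the underlying metric, as exercised by the routing tests elsewhere in this patch.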
diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 8dbd5a4ad386d..6027e562fa1d8 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -127,6 +127,40 @@ def _routing_enabled(): return get_config().get("enable_metadata_routing", False) +def _raise_for_params(params, owner, method): + """Raise an error if metadata routing is not enabled and params are passed. + + .. versionadded:: 1.4 + + Parameters + ---------- + params : dict + The metadata passed to a method. + + owner : object + The object to which the method belongs. + + method : str + The name of the method, e.g. "fit". + + Raises + ------ + ValueError + If metadata routing is not enabled and params are passed. + """ + caller = ( + f"{owner.__class__.__name__}.{method}" if method else owner.__class__.__name__ + ) + if not _routing_enabled() and params: + raise ValueError( + f"Passing extra keyword arguments to {caller} is only supported if" + " enable_metadata_routing=True, which you can set using" + " `sklearn.set_config`. See the User Guide" + " for more" + " details." + ) + + # Request values # ============== # Each request value needs to be one of the following values, or an alias. diff --git a/sklearn/utils/metadata_routing.py b/sklearn/utils/metadata_routing.py index 0dd25951376c0..b8af5794ee248 100644 --- a/sklearn/utils/metadata_routing.py +++ b/sklearn/utils/metadata_routing.py @@ -16,3 +16,4 @@ from ._metadata_requests import process_routing # noqa from ._metadata_requests import _MetadataRequester # noqa from ._metadata_requests import _routing_enabled # noqa +from ._metadata_requests import _raise_for_params # noqa From 080ba5c88c3bc804fa903ce8776648631e9c11aa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 00:47:35 +0200 Subject: [PATCH 095/194] iter --- sklearn/model_selection/_classification_threshold.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index aa45df40abaa8..5a723bd4199d9 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -25,7 +25,7 @@ from ..utils.metadata_routing import ( MetadataRouter, MethodMapping, - _routing_enabled, + _raise_for_params, process_routing, ) from ..utils.metaestimators import available_if @@ -439,8 +439,7 @@ def fit(self, X, y, **params): self : object Returns an instance of self. 
""" - if params and not _routing_enabled(): - raise ValueError + _raise_for_params(params, self, None) self._validate_params() X, y = indexable(X, y) From 05ec85df04ef16c4c2bc7628a0cd9b76aef386d4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 00:57:06 +0200 Subject: [PATCH 096/194] iter --- .../tests/test_classification_threshold.py | 85 ------------------- 1 file changed, 85 deletions(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index ea509833873f3..8e9e2e9e87905 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -457,7 +457,6 @@ def test_tunedthresholdclassifier_metric_with_parameter(): "max_recall_at_precision_constraint", make_scorer(balanced_accuracy_score), make_scorer(f1_score, pos_label="cancer"), - # {"tp": 5, "tn": 1, "fp": -1, "fn": -1}, ], ) def test_tunedthresholdclassifier_with_string_targets(response_method, metric): @@ -623,90 +622,6 @@ def test_tunedthresholdclassifier_response_method_scorer_with_constraint_metric( assert -20 < model.decision_threshold_ < 0 -# def test_tunedthresholdclassifier_objective_metric_dict(global_random_seed): -# """Check that we can pass a custom objective metric.""" -# X, y = make_classification(n_samples=500, random_state=global_random_seed) -# classifier = LogisticRegression() - -# # we need to set a small number of thresholds to avoid ties and picking a too low -# # threshold. -# n_thresholds = 5 - -# # affect a high gain to true negative and force the classifier to mainly -# # predict the negative class. -# costs_and_again = {"tp": 0, "tn": 10, "fp": 0, "fn": 0} -# model = TunedThresholdClassifier( -# classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds -# ) -# model.fit(X, y) - -# assert model.decision_thresholds_.shape == (n_thresholds,) -# assert model.objective_scores_.shape == (n_thresholds,) - -# assert model.decision_threshold_ > 0.99 -# assert np.mean(model.predict(X) == 0) > 0.9 - -# # use the true positive now -# costs_and_again = {"tp": 10, "tn": 0, "fp": 0, "fn": 0} -# model = TunedThresholdClassifier( -# classifier, objective_metric=costs_and_again, n_thresholds=n_thresholds -# ) -# model.fit(X, y) - -# assert model.decision_thresholds_.shape == (n_thresholds,) -# assert model.objective_scores_.shape == (n_thresholds,) - -# assert model.decision_threshold_ < 0.01 -# assert np.mean(model.predict(X) == 1) > 0.9 - -# # flipping the `pos_label` to zero should force the classifier to always predict 0 -# # and thus have a low threshold -# pos_label = 0 -# model = TunedThresholdClassifier( -# classifier, -# objective_metric=costs_and_again, -# n_thresholds=n_thresholds, -# pos_label=pos_label, -# ) -# model.fit(X, y) - -# assert model.decision_thresholds_.shape == (n_thresholds,) -# assert model.objective_scores_.shape == (n_thresholds,) - -# assert model.decision_threshold_ < 0.01 -# assert np.mean(model.predict(X) == 0) > 0.9 - - -# def test_tunedthresholdclassifier_sample_weight_costs_and_gain(): -# """Check that we dispatch the `sample_weight` to the scorer when computing the -# confusion matrix.""" -# X, y = load_iris(return_X_y=True) -# X, y = X[:100], y[:100] # only 2 classes - -# # create a dataset and repeat twice the sample of class #0 -# X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) -# # create a sample weight vector that is equivalent to the repeated 
dataset -# sample_weight = np.ones_like(y) -# sample_weight[:50] *= 2 - -# # we use a prefit classifier to simplify the test -# cv = "prefit" -# estimator = LogisticRegression().fit(X, y) -# costs_and_again = {"tp": 1, "tn": 1, "fp": -1, "fn": -1} - -# model_repeat = TunedThresholdClassifier( -# estimator, cv=cv, objective_metric=costs_and_again -# ) -# model_repeat.fit(X_repeated, y_repeated, sample_weight=None) - -# model_sw = TunedThresholdClassifier( -# estimator, cv=cv, objective_metric=costs_and_again -# ) -# model_sw.fit(X, y, sample_weight=sample_weight) - -# assert model_repeat.objective_score_ == pytest.approx(model_sw.objective_score_) - - @pytest.mark.usefixtures("enable_slep006") def test_tunedthresholdclassifier_cv_zeros_sample_weights_equivalence(): """Check that passing removing some sample from the dataset `X` is From 63c32bd249715a3c65184dbf9662fa514db37316 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 13:35:52 +0200 Subject: [PATCH 097/194] ENH add new response_method in make_scorer --- doc/whats_new/v1.4.rst | 7 ++ sklearn/metrics/_scorer.py | 71 +++++++++++++++------ sklearn/metrics/tests/test_score_objects.py | 15 +++-- sklearn/utils/_response.py | 24 ++++--- 4 files changed, 84 insertions(+), 33 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index c2b7d19404af9..acc534a5c6b65 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -111,6 +111,13 @@ Changelog to :ref:`metadata routing user guide `. :pr:`26789` by `Adrin Jalali`_. +:mod:`sklearn.metrics` +...................... + +- |Enhancement| add a parameter `response_method` to define a list of priority + of response methods to use with metrics requiring the option `needs_threshold=True`. + :pr:`xxxx` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index c8969acba1744..247a44d484a76 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -25,8 +25,6 @@ from inspect import signature from traceback import format_exc -import numpy as np - from ..base import is_regressor from ..utils import Bunch from ..utils._param_validation import HasMethods, StrOptions, validate_params @@ -40,6 +38,7 @@ process_routing, ) from ..utils.multiclass import type_of_target +from ..utils.validation import _check_response_method from . 
import ( accuracy_score, average_precision_score, @@ -130,7 +129,21 @@ def __call__(self, estimator, *args, **kwargs): **{name: Bunch(score=kwargs) for name in self._scorers} ) - for name, scorer in self._scorers.items(): + # to have the highest cache hit rate, we need to check if we have any + # _ThresholdScorer and choose a single response method based on the current + # estimator + scorers = copy.deepcopy(self._scorers) + for name, scorer in scorers.items(): + if isinstance(scorer, _ThresholdScorer): + if scorer._response_method is None: + response_method = ("decision_function", "predict_proba") + else: + response_method = scorer._response_method + scorer._response_method = _check_response_method( + estimator, response_method + ).__name__ + + for name, scorer in scorers.items(): try: if isinstance(scorer, _BaseScorer): score = scorer._score( @@ -405,6 +418,10 @@ def _factory_args(self): class _ThresholdScorer(_BaseScorer): + def __init__(self, score_func, sign, kwargs, response_method=None): + super().__init__(score_func, sign, kwargs) + self._response_method = response_method + def _score(self, method_caller, clf, X, y, **kwargs): """Evaluate decision function output for X relative to y_true. @@ -452,20 +469,18 @@ def _score(self, method_caller, clf, X, y, **kwargs): raise ValueError("{0} format is not supported".format(y_type)) if is_regressor(clf): - y_pred = method_caller(clf, "predict", X) + response_method = self._response_method or "predict" + y_pred = method_caller(clf, response_method, X) else: + if self._response_method is None: + response_method = ("decision_function", "predict_proba") + elif isinstance(self._response_method, list): + # Need immutable for potential caching + response_method = tuple(self._response_method) + else: + response_method = self._response_method pos_label = self._get_pos_label() - try: - y_pred = method_caller(clf, "decision_function", X, pos_label=pos_label) - - if isinstance(y_pred, list): - # For multi-output multi-class estimator - y_pred = np.vstack([p for p in y_pred]).T - - except (NotImplementedError, AttributeError): - y_pred = method_caller(clf, "predict_proba", X, pos_label=pos_label) - if isinstance(y_pred, list): - y_pred = np.vstack([p[:, -1] for p in y_pred]).T + y_pred = method_caller(clf, response_method, X, pos_label=pos_label) scoring_kwargs = {**self._kwargs, **kwargs} return self._sign * self._score_func(y, y_pred, **scoring_kwargs) @@ -647,6 +662,7 @@ def make_scorer( greater_is_better=True, needs_proba=False, needs_threshold=False, + response_method=None, **kwargs, ): """Make a scorer from a performance metric or loss function. @@ -696,6 +712,24 @@ def make_scorer( For example `average_precision` or the area under the roc curve can not be computed using discrete predictions alone. + response_method : {"predict_proba", "decision_function", "predict"} or \ + list of such str, default=None + + Specifies the response method to use get prediction from an estimator + (i.e. :term:`predict_proba`, :term:`decision_function` or + :term:`predict`). Possible choices are: + + - if `str`, it corresponds to the name to the method to return; + - if a list of `str`, it provides the method names in order of + preference. The method returned corresponds to the first method in + the list and which is implemented by `estimator`. + - if `None`, the default order of methods is + `["predict_proba", "decision_function"]`. + + Only used when `needs_threshold=True`. + + .. 
versionadded:: 1.4 + **kwargs : additional arguments Additional parameters to be passed to `score_func`. @@ -731,10 +765,11 @@ def make_scorer( raise ValueError( "Set either needs_proba or needs_threshold to True, but not both." ) - if needs_proba: - cls = _ProbaScorer - elif needs_threshold: + if needs_threshold: cls = _ThresholdScorer + return cls(score_func, sign, kwargs, response_method) + elif needs_proba: + cls = _ProbaScorer else: cls = _PredictScorer return cls(score_func, sign, kwargs) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index ffcb0ed4fda10..0cbca98526c3d 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -541,12 +541,15 @@ def test_thresholded_scorers_multilabel_indicator_data(): # Multi-output multi-class decision_function # TODO Is there any yet? - clf = DecisionTreeClassifier() - clf.fit(X_train, y_train) - clf._predict_proba = clf.predict_proba - clf.predict_proba = None - clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)] + class TreeWithDecisionFunction(DecisionTreeClassifier): + # disable predict_proba + predict_proba = None + def decision_function(self, X): + return [p[:, 1] for p in DecisionTreeClassifier.predict_proba(self, X)] + + clf = TreeWithDecisionFunction() + clf.fit(X_train, y_train) y_proba = clf.decision_function(X_test) score1 = get_scorer("roc_auc")(clf, X_test, y_test) score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T) @@ -837,7 +840,7 @@ def predict(self, X): clf = MockDecisionTreeRegressor() clf.fit(X, y) - scorers = {"neg_mse": "neg_mean_squared_error", "r2": "roc_auc"} + scorers = {"neg_mse": "neg_mean_squared_error", "r2": "r2"} scorer_dict = _check_multimetric_scoring(clf, scorers) scorer = _MultimetricScorer(scorers=scorer_dict) scorer(clf, X, y) diff --git a/sklearn/utils/_response.py b/sklearn/utils/_response.py index e753ced045e1e..45138394a09ef 100644 --- a/sklearn/utils/_response.py +++ b/sklearn/utils/_response.py @@ -72,15 +72,17 @@ def _get_response_values( if is_classifier(estimator): prediction_method = _check_response_method(estimator, response_method) classes = estimator.classes_ - target_type = "binary" if len(classes) <= 2 else "multiclass" - - if pos_label is not None and pos_label not in classes.tolist(): - raise ValueError( - f"pos_label={pos_label} is not a valid label: It should be " - f"one of {classes}" - ) - elif pos_label is None and target_type == "binary": - pos_label = pos_label if pos_label is not None else classes[-1] + if isinstance(classes, list): + target_type = "multilabel-indicator" + else: + target_type = "binary" if len(classes) <= 2 else "multiclass" + if pos_label is not None and pos_label not in classes.tolist(): + raise ValueError( + f"pos_label={pos_label} is not a valid label: It should be " + f"one of {classes}" + ) + elif pos_label is None and target_type == "binary": + pos_label = pos_label if pos_label is not None else classes[-1] y_pred = prediction_method(X) if prediction_method.__name__ == "predict_proba": @@ -94,10 +96,14 @@ def _get_response_values( "classifier with two classes." 
) raise ValueError(err_msg) + elif target_type == "multilabel-indicator" and isinstance(y_pred, list): + y_pred = np.vstack([p[:, -1] for p in y_pred]).T elif prediction_method.__name__ == "decision_function": if target_type == "binary": if pos_label == classes[0]: y_pred *= -1 + elif target_type == "multilabel-indicator" and isinstance(y_pred, list): + y_pred = np.vstack([p for p in y_pred]).T else: # estimator is a regressor if response_method != "predict": raise ValueError( From 1584c5b00a373f30d324a0cfb7d293aec270a3b7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 13:49:47 +0200 Subject: [PATCH 098/194] add non-regression test --- doc/whats_new/v1.4.rst | 6 +++++- sklearn/metrics/tests/test_score_objects.py | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index acc534a5c6b65..478fc8a1f1c2a 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -116,7 +116,11 @@ Changelog - |Enhancement| add a parameter `response_method` to define a list of priority of response methods to use with metrics requiring the option `needs_threshold=True`. - :pr:`xxxx` by :user:`Guillaume Lemaitre `. + :pr:`26840` by :user:`Guillaume Lemaitre `. + +- |Fix| scorers used with :func:`metrics.get_scorer` handles properly + multilabel-indicator matrix. + :pr:`26840` by :user:`Guillaume Lemaitre `. :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 0cbca98526c3d..d0d37e5d2a837 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -52,6 +52,7 @@ ) from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split from sklearn.multiclass import OneVsRestClassifier +from sklearn.multioutput import ClassifierChain from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.svm import LinearSVC @@ -1349,3 +1350,20 @@ def score(y_true, y_pred, param=None): with config_context(enable_metadata_routing=False): with pytest.raises(ValueError, match="kwargs is only supported if"): scorer(clf, X, y, param="blah") + + +def test_get_scorer_multilabel_indicator(): + """Check that our scorer deal with multi-label indicator matrices. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26817 + """ + X, Y = make_multilabel_classification(n_samples=72, n_classes=3, random_state=0) + X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0) + + base_lr = LogisticRegression(solver="lbfgs", random_state=0) + chain = ClassifierChain(base_lr, order="random", random_state=0) + chain.fit(X_train, Y_train) + + score = get_scorer("average_precision")(chain, X_test, Y_test) + assert score > 0.8 From 1a5a2476a6344d9b6023846a7186e516df1d78b4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 14:01:48 +0200 Subject: [PATCH 099/194] update validation param --- sklearn/metrics/_scorer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 247a44d484a76..803bbca0c1cd9 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -653,6 +653,11 @@ def _check_multimetric_scoring(estimator, scoring): "greater_is_better": ["boolean"], "needs_proba": ["boolean"], "needs_threshold": ["boolean"], + "response_method": [ + None, + list, + StrOptions({"predict", "predict_proba", "decision_function"}), + ], }, prefer_skip_nested_validation=True, ) From 4cc61b91af722d03340b34843ab9b22079249b0f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 21:20:11 +0200 Subject: [PATCH 100/194] more coverage --- sklearn/metrics/_scorer.py | 3 --- sklearn/metrics/tests/test_score_objects.py | 19 +++++++++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 803bbca0c1cd9..0aae306d33365 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -474,9 +474,6 @@ def _score(self, method_caller, clf, X, y, **kwargs): else: if self._response_method is None: response_method = ("decision_function", "predict_proba") - elif isinstance(self._response_method, list): - # Need immutable for potential caching - response_method = tuple(self._response_method) else: response_method = self._response_method pos_label = self._get_pos_label() diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index d0d37e5d2a837..807265b206263 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -803,7 +803,23 @@ def test_multimetric_scorer_calls_method_once( assert decision_function_func.call_count == expected_decision_func_count -def test_multimetric_scorer_calls_method_once_classifier_no_decision(): +@pytest.mark.parametrize( + "scorers", + [ + (["roc_auc", "neg_log_loss"]), + ( + { + "roc_auc": make_scorer( + roc_auc_score, + needs_threshold=True, + response_method=["predict_proba", "decision_function"], + ), + "neg_log_loss": make_scorer(log_loss, needs_proba=True), + } + ), + ], +) +def test_multimetric_scorer_calls_method_once_classifier_no_decision(scorers): predict_proba_call_cnt = 0 class MockKNeighborsClassifier(KNeighborsClassifier): @@ -818,7 +834,6 @@ def predict_proba(self, X): clf = MockKNeighborsClassifier(n_neighbors=1) clf.fit(X, y) - scorers = ["roc_auc", "neg_log_loss"] scorer_dict = _check_multimetric_scoring(clf, scorers) scorer = _MultimetricScorer(scorers=scorer_dict) scorer(clf, X, y) From 8f36235db896e210e94b1b209e9820f99277741a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 21:28:25 +0200 Subject: [PATCH 101/194] TST add mulitlabel test --- 
sklearn/utils/tests/test_response.py | 30 +++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_response.py b/sklearn/utils/tests/test_response.py index a67346e5697ec..c428256558b62 100644 --- a/sklearn/utils/tests/test_response.py +++ b/sklearn/utils/tests/test_response.py @@ -1,11 +1,17 @@ import numpy as np import pytest -from sklearn.datasets import load_iris, make_classification, make_regression +from sklearn.datasets import ( + load_iris, + make_classification, + make_multilabel_classification, + make_regression, +) from sklearn.linear_model import ( LinearRegression, LogisticRegression, ) +from sklearn.multioutput import ClassifierChain from sklearn.preprocessing import scale from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils._mocking import _MockEstimatorOnOffPrediction @@ -230,3 +236,25 @@ def test_get_response_values_multiclass(estimator, response_method): assert predictions.shape == (X.shape[0], len(estimator.classes_)) if response_method == "predict_proba": assert np.logical_and(predictions >= 0, predictions <= 1).all() + + +@pytest.mark.parametrize( + "response_method", ["predict_proba", "decision_function", "predict"] +) +def test_get_response_values_multilabel_indicator(response_method): + X, Y = make_multilabel_classification(random_state=0) + estimator = ClassifierChain(LogisticRegression()).fit(X, Y) + + y_pred, pos_label = _get_response_values( + estimator, X, response_method=response_method + ) + assert pos_label is None + assert y_pred.shape == Y.shape + + if response_method == "predict_proba": + assert np.logical_and(y_pred >= 0, y_pred <= 1).all() + elif response_method == "decision_function": + assert (y_pred < 0).sum() > 0 + assert (y_pred > 0).sum() > 0 + else: # response_method == "predict" + assert np.logical_or(y_pred == 0, y_pred == 1).all() From 5490ce46fcdd61968f3e3ead6dea50504197a4ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 23:04:57 +0200 Subject: [PATCH 102/194] simplify scorer --- sklearn/metrics/_scorer.py | 2 ++ .../_classification_threshold.py | 23 +++++-------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 398d7e606d3ca..f6cbf8d281906 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -25,6 +25,8 @@ from inspect import signature from traceback import format_exc +import numpy as np + from ..base import is_regressor from ..utils import Bunch from ..utils._param_validation import HasMethods, StrOptions, validate_params diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 5a723bd4199d9..bda92484ef64e 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -752,27 +752,16 @@ def _get_scorer(self): "max_precision_at_recall_constraint", "max_recall_at_precision_constraint", }: - if self._response_method == "predict_proba": - params_scorer = {"needs_proba": True, "pos_label": self.pos_label} - elif ( - isinstance(self._response_method, list) - and self._response_method[0] == "predict_proba" - and hasattr(self.estimator, "predict_proba") - ): - # TODO: this is due to a limitation in `make_scorer`: ideally, we should - # be able to pass a list of response methods to `make_scorer` and give - # priority to `predict_proba` other `decision_function`. 
- # Here, we manually check if the classifier provide `predict_proba` to - # use `needs_proba` instead and ensure that no error will be raised. - params_scorer = {"needs_proba": True, "pos_label": self.pos_label} - else: - params_scorer = {"needs_threshold": True, "pos_label": self.pos_label} - if "tpr" in self.objective_metric: # tpr/tnr score_func = roc_curve else: # precision/recall score_func = precision_recall_curve - scorer = make_scorer(score_func, **params_scorer) + scorer = make_scorer( + score_func, + needs_threshold=True, + response_method=self._response_method, + pos_label=self.pos_label, + ) else: scoring = check_scoring(self.estimator, scoring=self.objective_metric) scorer = _ContinuousScorer.from_scorer( From 8dad0a4a693e89e893e374a8349e33e76df8866f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 23:20:54 +0200 Subject: [PATCH 103/194] iter --- doc/modules/classification_threshold.rst | 55 ++++++++++++------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 2ba5b539fab3b..c87e90778717e 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -13,7 +13,7 @@ categorical prediction (class label). Scores are obtained from :term:`predict_pr class, while the latter returns a decision score for each class. The decision score is a measure of how strongly the sample is predicted to belong to the positive class (e.g., the distance to the decision boundary). A decision rule is then defined by thresholding -the scores and obtained the class label for each sample. Those labels are obtained with +the scores, leading to a class label for each sample. Those labels are obtained with :term:`predict`. For binary classification in scikit-learn, class labels are obtained by associating the @@ -41,17 +41,19 @@ all cases. The context and nature of the use case defines the expected behavior classifier and thus, the strategy to convert soft predictions into hard predictions. We illustrate this point with an example. -Let's imagine the deployment of a predictive model helping medical doctors to detect -tumour. In a setting where this model was a tool to discard obvious cases and false -positives don't lead to potentially harmful treatments, doctors might be interested in -having a high recall (all cancer cases should be tagged as such) to not miss any patient -with a cancer. However, that is at the cost of having more false positive predictions -(i.e. lower precision). Thus, in terms of decision threshold, it may be better to -classify a patient as having a cancer for a posterior probability estimate lower than -0.5. +Let's consider a scenario where a predictive model is being deployed to assist medical +doctors in detecting tumors. In this setting, doctors will be most likely interested in +correctly identifying all patients with cancer so that they can provide them with the +right treatment. In other words, doctors prioritize achieving a high recall rate, +meaning they want to identify all cases of cancer without missing any patients who have +it. This emphasis on recall comes, of course, with the trade-off of potentially more +false-positive predictions, reducing the precision of the model, but that is a risk +doctors are willing to take. 
Consequently, when it comes to deciding whether to classify +a patient as having cancer or not, it may be more beneficial to classify them as +positive for cancer when the posterior probability estimate is lower than 0.5. -Post-tuning of the decision threshold -===================================== +Post-tuning the decision threshold +================================== One solution to address the problem stated in the introduction is to tune the decision threshold of the classifier once the model has been trained. The @@ -73,7 +75,7 @@ utility metric defined by the business (in this case an insurance company). :align: center Options to tune the cut-off point -------------------------------------------- +--------------------------------- The cut-off point can be tuned through different strategies controlled by the parameter `objective_metric`. @@ -88,7 +90,7 @@ In this example, we maximize the balanced accuracy. the label of the class of interested (i.e. `pos_label`). Thus, if this label is not the right one for your application, you need to define a scorer and pass the right `pos_label` (and additional parameters) using the - :func:`~sklearn.metrics.make_scorer`. Refer to :ref:`scoring` to get + :func:`~sklearn.metrics.make_scorer`. Refer to :ref:`scoring` to get information to define your own scoring function. For instance, we show how to pass the information to the scorer that the label of interest is `0` when maximizing the :func:`~sklearn.metrics.f1_score`: @@ -138,24 +140,23 @@ depicting the use of such a utility function. Important notes regarding the internal cross-validation ------------------------------------------------------- -By default :class:`~sklearn.model_selection.TunedThresholdClassifier` uses a -5-fold stratified cross-validation to tune the cut-off point. The parameter -`cv` allows to control the cross-validation strategy. It is possible to go -around cross-validation by passing `cv="prefit"` and providing a fitted -classifier. In this case, the cut-off point is tuned on the data provided to -the `fit` method. +By default :class:`~sklearn.model_selection.TunedThresholdClassifier` uses a 5-fold +stratified cross-validation to tune the cut-off point. The parameter `cv` allows to +control the cross-validation strategy. It is possible to bypass cross-validation by +setting `cv="prefit"` and providing a fitted classifier. In this case, the cut-off point +is tuned on the data provided to the `fit` method. However, you should be extremely careful when using this option. You should never use the same data for training the classifier and tuning the cut-off point at the risk of -overfitting. Refer to :ref:`tunedthresholdclassifier_no_cv` for an example. If -you are in a situation where you have limited resources, you should consider using -a float number that will use a single split internally. +overfitting. Refer to the following example section for more details (cf. +:ref:`tunedthresholdclassifier_no_cv`). If you have limited resources, consider using a +float number to limit to an internal single train-test split. The option `cv="prefit"` should only be used when the provided classifier was already trained, and you just want to find the best cut-off using a new validation set. -Manually setting the decision thresholding -------------------------------------------- +Manually setting the decision threshold +--------------------------------------- The previous sections discussed strategies to find an optimal decision threshold. 
It is also possible to manually set the decision threshold in @@ -166,6 +167,6 @@ also possible to manually set the decision threshold in Examples -------- -- See - :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_threshold_classifier.py` - example for an example of tuning the decision threshold of a classifier. +- See the example entitled + :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_threshold_classifier.py`, + to learn about tuning the decision threshold of a classifier. From b9187086be83bbbe106d8b019401029c9829bd7e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 23:22:28 +0200 Subject: [PATCH 104/194] remove unecessary part in doc --- doc/modules/classification_threshold.rst | 7 ------- 1 file changed, 7 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index c87e90778717e..d2f18dd640f72 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -130,13 +130,6 @@ Characteristic (ROC) statistics and 2 use the Precision-Recall statistics. For these options, the `constraint_value` parameter needs to be defined. In addition, you can use the `pos_label` parameter to indicate the label of the class of interest. -The final strategy maximizes a custom utility function. This problem is also known as -cost-sensitive learning. The utility function is defined by providing a dictionary -containing the cost-gain associated with the entries of the confusion matrix. The keys -are defined as `{"tn", "fp", "fn", "tp"}`. The class of interest is defined using the -`pos_label` parameter. Refer to :ref:`cost_sensitive_learning_example` for an example -depicting the use of such a utility function. - Important notes regarding the internal cross-validation ------------------------------------------------------- From 5e2352381929115d4350556b17621726567c0869 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jul 2023 23:29:02 +0200 Subject: [PATCH 105/194] iter --- examples/model_selection/plot_tuned_threshold_classifier.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/model_selection/plot_tuned_threshold_classifier.py b/examples/model_selection/plot_tuned_threshold_classifier.py index 7c87fb5a09b50..facbab5caa2fb 100644 --- a/examples/model_selection/plot_tuned_threshold_classifier.py +++ b/examples/model_selection/plot_tuned_threshold_classifier.py @@ -25,6 +25,11 @@ .. [1] "Statlog (German Credit Data) Data Set", UCI Machine Learning Repository, `Link `_. + + .. [2] `Charles Elkan, "The Foundations of Cost-Sensitive Learning", + International joint conference on artificial intelligence. + Vol. 17. No. 1. Lawrence Erlbaum Associates Ltd, 2001. + `_ """ # %% From d5578f99eadfc360aa0109bee64145279641aa17 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jul 2023 13:45:16 +0200 Subject: [PATCH 106/194] iter --- .../plot_tuned_threshold_classifier.py | 60 +++++++------------ 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/examples/model_selection/plot_tuned_threshold_classifier.py b/examples/model_selection/plot_tuned_threshold_classifier.py index facbab5caa2fb..5a7b0467a4a03 100644 --- a/examples/model_selection/plot_tuned_threshold_classifier.py +++ b/examples/model_selection/plot_tuned_threshold_classifier.py @@ -37,8 +37,6 @@ # ------------------------------- # # We fetch the German credit dataset from OpenML. 
-import numpy as np - import sklearn from sklearn.datasets import fetch_openml @@ -47,7 +45,6 @@ german_credit = fetch_openml(data_id=31, as_frame=True, parser="pandas") X, y = german_credit.data, german_credit.target -pure_loss = np.abs(np.random.RandomState(0).randint(0, 5, size=len(y))) # %% # We check the feature types available in `X`. @@ -70,15 +67,13 @@ # (e.g. precision and recall) require to provide the label of interest also called # the "positive label". Here, we define that our goal is to predict whether or not # a sample is a "bad" credit. -pos_label = "bad" +pos_label, neg_label = "bad", "good" # %% # To carry our analysis, we split our dataset using a single stratified split. from sklearn.model_selection import train_test_split -X_train, X_test, y_train, y_test, pure_loss_train, pure_loss_test = train_test_split( - X, y, pure_loss, stratify=y, random_state=0 -) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) # %% # We are ready to design our predictive model and the associated evaluation strategy. @@ -95,18 +90,11 @@ # # From these four metrics, scikit-learn does not provide a scorer for the FPR. We # therefore need to define a small custom function to compute it. -import numpy as np +from sklearn.metrics import confusion_matrix -from sklearn.metrics import confusion_matrix, make_scorer, precision_score, recall_score - -def fpr_score(y, y_pred, **kwargs): - cm = confusion_matrix(y, y_pred) - classes = np.unique(y) - pos_label = kwargs.get("pos_label", classes[-1]) - pos_label_idx = np.searchsorted(classes, pos_label) - if pos_label_idx == 0: - cm = cm[::-1, ::-1] +def fpr_score(y, y_pred, neg_label, pos_label): + cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label]) tn, fp, _, _ = cm.ravel() tnr = tn / (tn + fp) return 1 - tnr @@ -121,37 +109,33 @@ def fpr_score(y, y_pred, **kwargs): # :func:`~sklearn.metrics.make_scorer` where the information is passed. We store all # the custom scorers in a dictionary. To use them, we need to pass the fitted model, # the data and the target on which we want to evaluate the predictive model. +from sklearn.metrics import make_scorer, precision_score, recall_score + tpr_score = recall_score # TPR and recall are the same metric scoring = { "precision": make_scorer(precision_score, pos_label=pos_label), "recall": make_scorer(recall_score, pos_label=pos_label), - "fpr": make_scorer(fpr_score, pos_label=pos_label), + "fpr": make_scorer(fpr_score, neg_label=neg_label, pos_label=pos_label), "tpr": make_scorer(tpr_score, pos_label=pos_label), } # %% # In addition, the original research [1]_ defines a business metric. They provide a # cost-matrix which encodes that predicting a "bad" credit as "good" is 5 times more -# costly than the opposite. We define a dictionary containing this information and a -# score function that computes the cost. +# costly than the opposite. We define a python function that will weight the confusion +# matrix and return the overall cost. 
+import numpy as np -def gain_cost_score(y, y_pred, pos_label, pure_loss): - cost_and_gain = np.zeros_like(y) - mask_tp = (y == pos_label) & (y_pred == pos_label) - cost_and_gain[mask_tp] = 0 - mask_fp = (y != pos_label) & (y_pred == pos_label) - cost_and_gain[mask_fp] = -pure_loss[mask_fp] - mask_fn = (y == pos_label) & (y_pred != pos_label) - cost_and_gain[mask_fn] = -1 - mask_tn = (y != pos_label) & (y_pred != pos_label) - cost_and_gain[mask_tn] = 0 - return cost_and_gain.sum() +def gain_cost_score(y, y_pred, neg_label, pos_label): + cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label]) + cost_matrix = np.array([[0, -1], [-5, 0]]) + return np.sum(cm * cost_matrix) scoring["cost_gain"] = make_scorer( - gain_cost_score, pos_label=pos_label -).set_score_request(pure_loss=True) + gain_cost_score, neg_label=neg_label, pos_label=pos_label +) # %% # Vanilla predictive model # ------------------------ @@ -261,7 +245,7 @@ def gain_cost_score(y, y_pred, pos_label, pure_loss): # # However, we recall that the original aim was to minimize the cost (or maximize the # gain) by the business metric. We can compute the value of the business metric: -scoring["cost_gain"](model, X_test, y_test, pure_loss=pure_loss_test) +scoring["cost_gain"](model, X_test, y_test) # %% # At this stage we don't know if any other cut-off can lead to a greater gain. @@ -289,7 +273,7 @@ def gain_cost_score(y, y_pred, pos_label, pure_loss): pos_label=pos_label, objective_metric=scoring["cost_gain"], ) -model_tuned.fit(X_train, y_train, pure_loss=pure_loss_train) +model_tuned.fit(X_train, y_train) # %% # We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. @@ -379,7 +363,7 @@ def gain_cost_score(y, y_pred, pos_label, pure_loss): # # We can now check if choosing this cut-off point leads to a better score on the testing # set: -scoring["cost_gain"](model_tuned, X_test, y_test, pure_loss=pure_loss_test) +scoring["cost_gain"](model_tuned, X_test, y_test) # %% # We observe that the decision generalized on the testing set leading to a better @@ -402,7 +386,7 @@ def gain_cost_score(y, y_pred, pos_label, pure_loss): # Also, the underlying classifier is not be refitted. Here, we can try to do such # experiment. model.fit(X_train, y_train) -model_tuned.set_params(cv="prefit").fit(X_train, y_train, pure_loss=pure_loss_train) +model_tuned.set_params(cv="prefit").fit(X_train, y_train) # %% @@ -497,7 +481,7 @@ def gain_cost_score(y, y_pred, pos_label, pure_loss): # single train-test split by providing a floating number in range `[0, 1]` to the `cv` # parameter. It splits the data into a training and testing set. 
Let's explore this # option: -model_tuned.set_params(cv=0.75).fit(X_train, y_train, pure_loss=pure_loss_train) +model_tuned.set_params(cv=0.75).fit(X_train, y_train) # %% fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) From f3f844ef7815c6f3cfa66e9792257dac5cd6f162 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 24 Jul 2023 14:36:23 +0200 Subject: [PATCH 107/194] address tim comments --- sklearn/metrics/_scorer.py | 5 ++++- sklearn/utils/tests/test_response.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 0aae306d33365..52b52602d9dff 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -469,7 +469,10 @@ def _score(self, method_caller, clf, X, y, **kwargs): raise ValueError("{0} format is not supported".format(y_type)) if is_regressor(clf): - response_method = self._response_method or "predict" + if self._response_method is None: + response_method = "predict" + else: + response_method = self._response_method y_pred = method_caller(clf, response_method, X) else: if self._response_method is None: diff --git a/sklearn/utils/tests/test_response.py b/sklearn/utils/tests/test_response.py index c428256558b62..e715e94cc60b1 100644 --- a/sklearn/utils/tests/test_response.py +++ b/sklearn/utils/tests/test_response.py @@ -254,7 +254,8 @@ def test_get_response_values_multilabel_indicator(response_method): if response_method == "predict_proba": assert np.logical_and(y_pred >= 0, y_pred <= 1).all() elif response_method == "decision_function": + # values returned by `decision_function` are not bounded in [0, 1] assert (y_pred < 0).sum() > 0 - assert (y_pred > 0).sum() > 0 + assert (y_pred > 1).sum() > 0 else: # response_method == "predict" assert np.logical_or(y_pred == 0, y_pred == 1).all() From 26dc94e9850d5f1ce1045e9dfb151dd4b76927e2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Jul 2023 09:17:31 +0200 Subject: [PATCH 108/194] iter --- ...eshold_classifier_with_metadata_routing.py | 590 ++++++++++++++++++ sklearn/ensemble/_base.py | 4 +- sklearn/metrics/_scorer.py | 15 +- .../_classification_threshold.py | 15 +- 4 files changed, 615 insertions(+), 9 deletions(-) create mode 100644 examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py diff --git a/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py b/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py new file mode 100644 index 0000000000000..98c1e1049ba21 --- /dev/null +++ b/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py @@ -0,0 +1,590 @@ +# %% +import pandas as pd + +filename_training = "~/Downloads/kdd98/epsilon_mirror/cup98lrn.zip" +index = ["CONTROLN"] +df_train = pd.read_csv( + filename_training, compression="zip", encoding="latin-1", low_memory=False +).set_index(index) +train_indices = df_train.index + +# %% +filename_data_test = "~/Downloads/kdd98/epsilon_mirror/cup98val.zip" +data_test = pd.read_csv( + filename_data_test, compression="zip", encoding="latin-1", low_memory=False +).set_index(index) + +# %% +filename_target_test = "~/Downloads/kdd98/epsilon_mirror/valtargt.txt" +target_test = pd.read_csv(filename_target_test).set_index(index) + +# %% +df_test = pd.concat([data_test, target_test], axis=1) +test_indices = df_test.index +# Do not convert string to avoid the pd.NA bug: +# xref: https://github.com/scikit-learn/scikit-learn/issues/26890 +df = pd.concat([df_train, 
df_test], axis=0) +# df = df.convert_dtypes(convert_string=False) + +# %% +# convert to categorical in case we don't want to use the TableVectorizer +categorical_columns = {} +for col_idx, col in enumerate(df.columns): + series = df[col] + dtype_series = series.dtype + # if isinstance(dtype_series, pd.StringDtype): + if dtype_series.kind == "O": + categorical_columns[col] = "category" + continue # skip the rest of the loop + unique_values = series.value_counts() + if len(unique_values) < 60: + # low-cardinality features are considered as categorical + categorical_columns[col] = "category" + +df = df.astype(categorical_columns) + +# %% +target_continuous_name = "TARGET_D" +target_binary_name = "TARGET_B" +neg_label, pos_label = 0, 1 + +# %% +from sklearn.model_selection import train_test_split + +# df_train, df_test = train_test_split(df, test_size=0.5, random_state=42) +df_train = df.loc[train_indices] +df_test = df.loc[test_indices] + +# %% +data_train = df_train.drop(columns=[target_continuous_name, target_binary_name]) + +# %% +data_train + +# %% +target_continuous_train = df_train[target_continuous_name] +target_binary_train = df_train[target_binary_name] + +# %% + + +def cost_metric(y_true, y_pred, neg_label, pos_label, donation_amount): + """Compute the business cost related to the prediction. + + The real cost is to send a mail to a person and is evaluated to $0.68. + The gain is the donation amount if the person donate. + """ + mask_true_positive = (y_true == pos_label) & (y_pred == pos_label) + mask_false_positive = (y_true == neg_label) & (y_pred == pos_label) + # mask_false_negative = (y_true == pos_label) & (y_pred == neg_label) + # mask_true_negative = (y_true == neg_label) & (y_pred == neg_label) + cost_sending_mail = -0.68 + cost_false_positive = cost_sending_mail * mask_false_positive.sum() + gain_true_positive = (cost_sending_mail + donation_amount[mask_true_positive]).sum() + return gain_true_positive + cost_false_positive + # loss_false_negative = donation_amount[mask_false_negative].sum() + # fp = - mask_false_positive.sum() * cost_sending_mail + # fn = - (donation_amount[mask_false_negative] - cost_sending_mail).sum() + # return fp + fn + + +# %% +import numpy as np + +gain = cost_metric( + y_true=target_binary_train, + y_pred=np.zeros_like(target_binary_train), + neg_label=neg_label, + pos_label=pos_label, + donation_amount=target_continuous_train, +) +print(f"Gain if we don't send any mail: ${gain:,.2f}") + +# %% +gain = cost_metric( + y_true=target_binary_train, + y_pred=np.ones_like(target_binary_train), + neg_label=neg_label, + pos_label=pos_label, + donation_amount=target_continuous_train, +) +print(f"Gain if we send mails to everyone: ${gain:,.2f}") + +# %% +gain = cost_metric( + y_true=target_binary_train, + y_pred=target_binary_train, + neg_label=neg_label, + pos_label=pos_label, + donation_amount=target_continuous_train, +) +print(f"Maximum gain on the training set: ${gain:,.2f}") + +# %% +import sklearn +from sklearn.metrics import make_scorer + +sklearn.set_config(enable_metadata_routing=True) + +cost_scorer = make_scorer( + cost_metric, neg_label=neg_label, pos_label=pos_label +).set_score_request(donation_amount=True) + +# %% +from sklearn.dummy import DummyClassifier + +stingy_classifier = DummyClassifier(strategy="constant", constant=0) +stingy_classifier.fit(data_train, target_binary_train) + +# %% +bling_bling_classifier = DummyClassifier(strategy="constant", constant=1) +bling_bling_classifier.fit(data_train, target_binary_train) + +# %% +gain = 
cost_scorer( + stingy_classifier, + data_train, + target_binary_train, + donation_amount=target_continuous_train, +) +print(f"Gain of the stingy classifier: ${gain:,.2f}") + +# %% +gain = cost_scorer( + bling_bling_classifier, + data_train, + target_binary_train, + donation_amount=target_continuous_train, +) +print(f"Gain of the bling-bling classifier: ${gain:,.2f}") + +# %% +data_test = df_test.drop(columns=[target_continuous_name, target_binary_name]) + +# %% +target_continuous_test = df_test[target_continuous_name] +target_binary_test = df_test[target_binary_name] + +# %% +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_selector as selector +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OrdinalEncoder + +sklearn.set_config(transform_output="pandas") + +categorical_columns = selector(dtype_include="category") +preprocessing = ColumnTransformer( + transformers=[ + ( + "categorical", + OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=-1, + min_frequency=0.05, + max_categories=255, + ), + categorical_columns, + ), + ], + remainder="passthrough", + n_jobs=-1, + verbose_feature_names_out=False, +) +model = Pipeline( + steps=[ + ("preprocessing", preprocessing), + ( + "classifier", + HistGradientBoostingClassifier( + max_iter=1_000, + early_stopping=True, + categorical_features=categorical_columns(data_train), + ), + # DecisionTreeClassifier(max_leaf_nodes=200, random_state=42), + ), + ] +) +model.fit(data_train, target_binary_train) + +# %% +model.score(data_test, target_binary_test) + +# %% +from sklearn.metrics import balanced_accuracy_score + +balanced_accuracy_score(target_binary_test, model.predict(data_test)) + +# %% +gain = cost_metric( + y_true=target_binary_test, + y_pred=model.predict(data_test), + neg_label=neg_label, + pos_label=pos_label, + donation_amount=target_continuous_test, +) +print(f"Gain of the model: ${gain:,.2f}") + +# %% +from sklearn.calibration import CalibrationDisplay + +CalibrationDisplay.from_estimator(model, data_test, target_binary_test, n_bins=10) + +# %% +from sklearn.calibration import CalibratedClassifierCV +from sklearn.model_selection import StratifiedKFold, TunedThresholdClassifier + +cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) +tuned_model = TunedThresholdClassifier( + # estimator=model, + estimator=CalibratedClassifierCV(model, cv=5, method="isotonic"), + n_thresholds=np.linspace(0.02, 0.05, 1_000), + # n_thresholds=1_000, + # strategy="constant", + # constant_threshold=0.0302, + pos_label=pos_label, + objective_metric=cost_scorer, + cv=cv, + n_jobs=-1, +) +tuned_model.fit( + data_train, target_binary_train, donation_amount=target_continuous_train +) + +# %% +CalibrationDisplay.from_estimator(tuned_model, data_test, target_binary_test, n_bins=10) + +# %% +tuned_model.score(data_test, target_binary_test) + +# %% +balanced_accuracy_score(target_binary_test, tuned_model.predict(data_test)) + +# %% +gain = cost_metric( + y_true=target_binary_test, + y_pred=tuned_model.predict(data_test), + neg_label=neg_label, + pos_label=pos_label, + donation_amount=target_continuous_test, +) +print(f"Gain of the model: ${gain:,.2f}") + +# %% +import matplotlib.pyplot as plt + +for thresholds, scores in zip(tuned_model.cv_thresholds_, tuned_model.cv_scores_): + plt.semilogx(thresholds, scores, alpha=0.5) + +# %% +import plotly.graph_objects as go + +fig = go.Figure() +for thresholds, scores in 
zip(tuned_model.cv_thresholds_, tuned_model.cv_scores_): + fig.add_trace(go.Scatter(x=thresholds, y=scores, mode="lines", name="lines")) +fig.update_xaxes(type="log") +fig.show() + +# %% + +# %% +from sklearn.ensemble import RandomForestClassifier +from sklearn.impute import SimpleImputer + +categorical_columns = selector(dtype_include="category") +preprocessing = ColumnTransformer( + transformers=[ + ( + "categorical", + OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), + categorical_columns, + ), + ], + remainder="passthrough", + n_jobs=-1, + verbose_feature_names_out=False, +) +model = Pipeline( + steps=[ + ("preprocessing", preprocessing), + ("imputer", SimpleImputer(strategy="mean")), + ("classifier", RandomForestClassifier(n_jobs=-1)), + ] +) +tuned_model = TunedThresholdClassifier( + estimator=CalibratedClassifierCV(model, cv=5, method="isotonic"), + pos_label=pos_label, + objective_metric=cost_scorer, + cv=0.8, +) +tuned_model.fit( + data_train, target_binary_train, donation_amount=target_continuous_train +) + +# %% +balanced_accuracy_score(target_binary_test, tuned_model.predict(data_test)) + +# %% +gain = cost_metric( + y_true=target_binary_test, + y_pred=tuned_model.predict(data_test), + neg_label=neg_label, + pos_label=pos_label, + donation_amount=target_continuous_test, +) +print(f"Gain of the model: ${gain:,.2f}") + +# %% +from imblearn.ensemble import BalancedRandomForestClassifier + +sklearn.set_config(enable_metadata_routing=False) + +categorical_columns = selector(dtype_include="category") +preprocessing = ColumnTransformer( + transformers=[ + ( + "categorical", + OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), + categorical_columns, + ), + ], + remainder="passthrough", + n_jobs=-1, + verbose_feature_names_out=False, +) +model = Pipeline( + steps=[ + ("preprocessing", preprocessing), + ("imputer", SimpleImputer(strategy="mean")), + ( + "classifier", + BalancedRandomForestClassifier( + sampling_strategy="all", replacement=True, bootstrap=False, n_jobs=-1 + ), + ), + ] +) + +# %% +balanced_accuracy_score(target_binary_test, model.predict(data_test)) + +# %% +gain = cost_metric( + y_true=target_binary_test, + y_pred=model.predict(data_test), + neg_label=neg_label, + pos_label=pos_label, + donation_amount=target_continuous_test, +) +print(f"Gain of the model: ${gain:,.2f}") + + +# %% +# %% +# %% +from sklearn.datasets import fetch_openml + +credit_card = fetch_openml(data_id=1597, as_frame=True, parser="pandas") +credit_card.frame + +# %% +data = credit_card.frame.drop(columns=["Class", "Amount"]) +target = credit_card.frame["Class"].astype(int).to_numpy() +amount = credit_card.frame["Amount"].to_numpy() + +# %% +target.value_counts() + +# %% +target.value_counts(normalize=True) + +# %% +fraud = target == 1 +amount_fraud = amount[fraud] +ax = amount_fraud.plot.hist(bins=100) +ax.set_title("Amount of fraud transaction") +_ = ax.set_xlabel("Amount ($)") + +# %% + + +def business_metric(y_true, y_pred, amount): + mask_true_positive = (y_true == 1) & (y_pred == 1) + mask_true_negative = (y_true == 0) & (y_pred == 0) + mask_false_positive = (y_true == 0) & (y_pred == 1) + mask_false_negative = (y_true == 1) & (y_pred == 0) + fraudulent_refuse = (mask_true_positive.sum() * 50) + amount[ + mask_true_positive + ].sum() + fraudulent_accept = -amount[mask_false_negative].sum() + legitimate_refuse = mask_false_positive.sum() * -5 + legitimate_accept = (amount[mask_true_negative] * 0.02).sum() + return fraudulent_refuse + fraudulent_accept + 
legitimate_refuse + legitimate_accept + + +# %% +import numpy as np + +benefit_cost = business_metric(target, np.zeros_like(target), amount) +print(f"Benefit/cost of not detecting fraud: ${benefit_cost:,.2f}") + +# %% +benefit_cost = business_metric(target, np.ones_like(target), amount) +print(f"Benefit/cost of tagging everything as fraud: ${benefit_cost:,.2f}") + +# %% +from sklearn.model_selection import train_test_split + +data_train, data_test, target_train, target_test, amount_train, amount_test = ( + train_test_split( + data, target, amount, stratify=target, test_size=0.5, random_state=42 + ) +) + +# %% +import sklearn +from sklearn.metrics import make_scorer + +sklearn.set_config(enable_metadata_routing=True) + +business_scorer = make_scorer(business_metric).set_score_request(amount=True) + +# %% +from sklearn.dummy import DummyClassifier + +easy_going_classifier = DummyClassifier(strategy="constant", constant=0) +easy_going_classifier.fit(data_train, target_train) + +# %% +benefit_cost = business_scorer( + easy_going_classifier, data_test, target_test, amount=amount_test +) +print(f"Benefit/cost of our easy-going classifier: ${benefit_cost:,.2f}") + +# %% +intolerant_classifier = DummyClassifier(strategy="constant", constant=1) +intolerant_classifier.fit(data_train, target_train) + +# %% +benefit_cost = business_scorer( + intolerant_classifier, data_test, target_test, amount=amount_test +) +print(f"Benefit/cost of our intolerant classifier: ${benefit_cost:,.2f}") + +# %% +from sklearn.linear_model import LogisticRegression + +logistic_regression = LogisticRegression(max_iter=1_000).fit(data_train, target_train) +benefit_cost = business_scorer( + logistic_regression, data_test, target_test, amount=amount_test +) +print(f"Benefit/cost of our logistic regression: ${benefit_cost:,.2f}") + +# %% +from sklearn.metrics import get_scorer + +balanced_accuracy_scorer = get_scorer("balanced_accuracy") +balanced_accuracy = balanced_accuracy_scorer( + logistic_regression, data_test, target_test +) +print(f"Balanced accuracy of our logistic regression: {balanced_accuracy:.2f}") + +# %% +from sklearn.model_selection import TunedThresholdClassifier + +tuned_model = TunedThresholdClassifier( + estimator=logistic_regression, + objective_metric=business_scorer, + n_thresholds=1_000, + n_jobs=-1, +) +tuned_model.fit(data_train, target_train, amount=amount_train) + +# %% +benefit_cost = business_scorer(tuned_model, data_test, target_test, amount=amount_test) +print(f"Benefit/cost of our tuned model: ${benefit_cost:,.2f}") + +# %% +balanced_accuracy = balanced_accuracy_scorer(tuned_model, data_test, target_test) +print(f"Balanced accuracy of our tuned model: {balanced_accuracy:.2f}") + +# %% +tuned_model.set_params(objective_metric="balanced_accuracy").fit( + data_train, target_train +) +balanced_accuracy = balanced_accuracy_scorer(tuned_model, data_test, target_test) +print(f"Balanced accuracy of our tuned model: {balanced_accuracy:.2f}") + +# %% +logistic_regression.set_params(class_weight="balanced").fit(data_train, target_train) +balanced_accuracy = balanced_accuracy_scorer( + logistic_regression, data_test, target_test +) +print(f"Balanced accuracy of our logistic regression: {balanced_accuracy:.2f}") + +# %% +from sklearn.calibration import CalibrationDisplay + +CalibrationDisplay.from_estimator( + logistic_regression, data_test, target_test, n_bins=10 +) + +# %% +CalibrationDisplay.from_estimator(tuned_model, data_test, target_test, n_bins=10) + +# %% +from sklearn.ensemble import 
RandomForestClassifier + +rf = RandomForestClassifier(n_jobs=-1).fit(data_train, target_train) +# %% +balanced_accuracy = balanced_accuracy_scorer(rf, data_test, target_test) +print(f"Balanced accuracy of our random forest: {balanced_accuracy:.2f}") +benefit_cost = business_scorer(rf, data_test, target_test, amount=amount_test) +print(f"Benefit/cost of our random forest: ${benefit_cost:,.2f}") + +# %% +from imblearn.ensemble import BalancedRandomForestClassifier + +brf = BalancedRandomForestClassifier( + sampling_strategy="all", replacement=True, bootstrap=False, n_jobs=-1 +) +brf.fit(data_train, target_train) + +# %% +balanced_accuracy = balanced_accuracy_scorer(brf, data_test, target_test) +print(f"Balanced accuracy of our balanced random forest: {balanced_accuracy:.2f}") +benefit_cost = business_scorer(brf, data_test, target_test, amount=amount_test) +print(f"Benefit/cost of our balanced random forest: ${benefit_cost:,.2f}") + +# %% +tuned_model = TunedThresholdClassifier( + estimator=brf, + objective_metric=business_scorer, + n_thresholds=1_000, + n_jobs=-1, +) +tuned_model.fit(data_train, target_train, amount=amount_train) + +# %% +balanced_accuracy = balanced_accuracy_scorer(tuned_model, data_test, target_test) +print(f"Balanced accuracy of our balanced random forest: {balanced_accuracy:.2f}") +benefit_cost = business_scorer(tuned_model, data_test, target_test, amount=amount_test) +print(f"Benefit/cost of our balanced random forest: ${benefit_cost:,.2f}") + +# %% +tuned_model = TunedThresholdClassifier( + estimator=rf, + objective_metric=business_scorer, + n_thresholds=1_000, + n_jobs=-1, +) +tuned_model.fit(data_train, target_train, amount=amount_train) + +# %% +balanced_accuracy = balanced_accuracy_scorer(tuned_model, data_test, target_test) +print(f"Balanced accuracy of our balanced random forest: {balanced_accuracy:.2f}") +benefit_cost = business_scorer(tuned_model, data_test, target_test, amount=amount_test) +print(f"Benefit/cost of our balanced random forest: ${benefit_cost:,.2f}") + +# %% diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 3107b4cf9a6c5..5cf3527f0cc43 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -197,7 +197,9 @@ def _make_estimator(self, append=True, random_state=None): def __len__(self): """Return the number of estimators in the ensemble.""" - return len(self.estimators_) + if hasattr(self, "estimators_"): + return len(self.estimators_) + return True def __getitem__(self, index): """Return the index'th estimator in the ensemble.""" diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index be8b7e4b5f239..760e9d13f8e25 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -23,6 +23,7 @@ from collections import Counter from functools import partial from inspect import signature +from numbers import Integral from traceback import format_exc import numpy as np @@ -503,12 +504,13 @@ def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): class _ContinuousScorer(_BaseScorer): """Scorer taking a continuous response and output a score for each threshold.""" - def __init__(self, score_func, sign, response_method, kwargs): + def __init__(self, score_func, sign, response_method, n_thresholds, kwargs): super().__init__(score_func=score_func, sign=sign, kwargs=kwargs) self.response_method = response_method + self.n_thresholds = n_thresholds @classmethod - def from_scorer(cls, scorer, response_method, pos_label): + def from_scorer(cls, scorer, response_method, 
n_thresholds, pos_label): """Create a continuous scorer from a normal scorer.""" # add `pos_label` if requested by the scorer function scorer_kwargs = {**scorer._kwargs} @@ -532,6 +534,7 @@ def from_scorer(cls, scorer, response_method, pos_label): score_func=scorer._score_func, sign=scorer._sign, response_method=response_method, + n_thresholds=n_thresholds, kwargs=scorer_kwargs, ) # transfer the metadata request @@ -571,7 +574,13 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): # Easy for the probability case but not for the decision function case since it # is not bounded. scoring_kwargs = {**self._kwargs, **kwargs} - potential_thresholds = np.unique(y_score) + # potential_thresholds = np.unique(y_score) + if isinstance(self.n_thresholds, Integral): + potential_thresholds = np.linspace( + np.min(y_score), np.max(y_score), self.n_thresholds + ) + else: + potential_thresholds = np.array(self.n_thresholds, copy=False) score_thresholds = [ self._sign * self._score_func( diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index bda92484ef64e..fb7e4a5e740d3 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -221,6 +221,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato n_thresholds : int, default=100 The number of decision threshold to use when discretizing the output of the classifier `method`. + # TODO: update to array-like cv : int, float, cross-validation generator, iterable or "prefit", default=None Determines the cross-validation splitting strategy to train classifier. @@ -375,7 +376,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato "constant_threshold": [Real], "pos_label": [Real, str, "boolean", None], "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], - "n_thresholds": [Interval(Integral, 1, None, closed="left")], + "n_thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], "cv": [ "cv_object", StrOptions({"prefit"}), @@ -566,9 +567,12 @@ def fit(self, X, y, **params): # find the global min and max thresholds across all folds min_threshold = np.min([th.min() for th in cv_thresholds]) max_threshold = np.max([th.max() for th in cv_thresholds]) - self.decision_thresholds_ = np.linspace( - min_threshold, max_threshold, num=self.n_thresholds - ) + if isinstance(self.n_thresholds, Integral): + self.decision_thresholds_ = np.linspace( + min_threshold, max_threshold, num=self.n_thresholds + ) + else: + self.decision_thresholds_ = np.array(self.n_thresholds, copy=False) def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): return np.mean( @@ -583,6 +587,7 @@ def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): self.objective_scores_ = _mean_interpolated_score( self.decision_thresholds_, cv_thresholds, cv_scores ) + self.cv_thresholds_, self.cv_scores_ = cv_thresholds, cv_scores best_idx = self.objective_scores_.argmax() self.objective_score_ = self.objective_scores_[best_idx] self.decision_threshold_ = self.decision_thresholds_[best_idx] @@ -765,7 +770,7 @@ def _get_scorer(self): else: scoring = check_scoring(self.estimator, scoring=self.objective_metric) scorer = _ContinuousScorer.from_scorer( - scoring, self._response_method, self.pos_label + scoring, self._response_method, self.n_thresholds, self.pos_label ) return scorer From 
6a1a6c71498dcd8368e4a5e80f800c57a55e0730 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Jul 2023 17:00:19 +0200 Subject: [PATCH 109/194] iter --- sklearn/metrics/tests/test_score_objects.py | 10 +++++++++- sklearn/model_selection/_classification_threshold.py | 2 +- .../tests/test_classification_threshold.py | 4 ++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index b9415333e5f80..86f14c4d1c85f 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -1222,7 +1222,11 @@ def test_continuous_scorer(): X, y = make_classification(random_state=0) estimator = LogisticRegression().fit(X, y) scorer = _ContinuousScorer( - balanced_accuracy_score, sign=1, response_method="predict_proba", kwargs={} + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + n_thresholds=10, + kwargs={}, ) thresholds, scores = scorer(estimator, X, y) @@ -1238,6 +1242,7 @@ def test_continuous_scorer(): balanced_accuracy_score, sign=1, response_method="predict_proba", + n_thresholds=10, kwargs={"adjusted": True}, ) thresholds, scores = scorer(estimator, X, y) @@ -1250,6 +1255,7 @@ def test_continuous_scorer(): balanced_accuracy_score, sign=-1, response_method="predict_proba", + n_thresholds=10, kwargs={"adjusted": True}, ) thresholds, scores = scorer(estimator, X, y) @@ -1269,6 +1275,7 @@ def test_continuous_scorer_pos_label(global_random_seed): recall_score, sign=1, response_method="predict_proba", + n_thresholds=1000, kwargs={"pos_label": 1}, ) thresholds_pos_label_1, scores_pos_label_1 = scorer(estimator, X, y) @@ -1277,6 +1284,7 @@ def test_continuous_scorer_pos_label(global_random_seed): recall_score, sign=1, response_method="predict_proba", + n_thresholds=1000, kwargs={"pos_label": 0}, ) thresholds_pos_label_0, scores_pos_label_0 = scorer(estimator, X, y) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index fb7e4a5e740d3..349da566a8a6b 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -558,7 +558,7 @@ def fit(self, X, y, **params): ) ) - if any(len(th) == 1 for th in cv_thresholds): + if any(np.isclose(th[0], th[-1]) for th in cv_thresholds): raise ValueError( "The provided estimator makes constant predictions. Therefore, it is " "impossible to optimize the decision threshold." 
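A standalone illustration (not part of the patch) of why this hunk replaces the previous `len(th) == 1` check with `np.isclose(th[0], th[-1])`: once the candidate thresholds come from `np.linspace` instead of `np.unique` (see the `_ContinuousScorer` change above), a constant predictor no longer yields a length-one array, but the endpoints of its grid coincide:

import numpy as np

y_score = np.full(50, 0.7)  # a classifier outputting a constant probability
thresholds = np.linspace(y_score.min(), y_score.max(), num=10)
print(len(thresholds) == 1)                       # False: 10 (identical) values
print(np.isclose(thresholds[0], thresholds[-1]))  # True: degenerate grid detected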
diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 8e9e2e9e87905..63551792c377f 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -41,6 +41,7 @@ score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", + n_thresholds=10, kwargs={}, ), "balanced_accuracy", @@ -101,6 +102,7 @@ def test_fit_and_score_scorers(scorer, score_method): score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", + n_thresholds=2, kwargs={}, ), "balanced_accuracy", @@ -178,6 +180,7 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", + n_thresholds=10, kwargs={}, ), "balanced_accuracy", @@ -252,6 +255,7 @@ def test_fit_and_score_sample_weight(scorer, score_method): score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", + n_thresholds=10, kwargs={}, ), "balanced_accuracy", From b17b59e0ab8232b7d73d4f87cc275d5078a1f56d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 28 Jul 2023 14:30:26 +0200 Subject: [PATCH 110/194] iter --- .../plot_tuned_decision_threshold.py | 377 +++++++++++++++++ ...eshold_classifier_with_metadata_routing.py | 394 +----------------- sklearn/ensemble/_base.py | 4 +- .../_classification_threshold.py | 8 +- 4 files changed, 388 insertions(+), 395 deletions(-) create mode 100644 examples/model_selection/plot_tuned_decision_threshold.py diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py new file mode 100644 index 0000000000000..4e903c5de3815 --- /dev/null +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -0,0 +1,377 @@ +""" +================================================== +Post-tuning the cut-off point of decision function +================================================== + +Once a classifier is trained, the output of the :term:`predict` method output class +label predictions corresponding to a thresholding of either the :term:`decision +function` or the :term:`predict_proba` output. For a binary classifier, the default +threshold is defined as a posterior probability estimate of 0.5 or a decision score of +0.0. However, this default strategy may not be optimal for the task at hand. + +This example shows how to use the +:class:`~sklearn.model_selection.TunedThresholdClassifier` to tune the decision function +threshold, depending on a metric of interest as well as under a specific constraint. +""" + +# %% +# The diabetes dataset +# -------------------- +# +# To illustrate the tuning of the decision threshold, we will use the diabetes dataset. +# This dataset is available on OpenML: https://www.openml.org/d/37. We use the +# :func:`~sklearn.datasets.fetch_openml` function to fetch this dataset. +from sklearn.datasets import fetch_openml + +diabetes = fetch_openml(data_id=37, as_frame=True, parser="pandas") +data, target = diabetes.data, diabetes.target + +# %% +# We look at the target to understand the type of problem we are dealing with. +target.value_counts() + +# %% +# We see that we are dealing with a binary classification problem. 
Since the labels are +# not encoded as 0 and 1, we will store which label we considered the negative class +# and which one we considered the positive class: "tested_negative" will be considered +# the negative class and "tested_positive" the positive class. +# +# We also observed that this binary problem is slightly imbalanced where we have around +# twice more samples from the negative class than from the positive class. When it +# comes to evaluation, we should consider this aspect to interpret the results. +neg_label, pos_label = target.value_counts().index + +# %% +# Our vanilla classifier +# ---------------------- +# +# We define a basic predictive model composed of a scaler followed by a logistic +# regression classifier. +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +model = make_pipeline(StandardScaler(), LogisticRegression()) +model + +# %% +# We evaluate our model using cross-validation. We use the accuracy and the balanced +# accuracy to report the performance of our model. The balanced accuracy is a metric +# that is less sensitive to class imbalance and will allow us to put the accuracy +# score in perspective. +import pandas as pd + +from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate + +scoring = ["accuracy", "balanced_accuracy"] +cv_scores = [ + "train_accuracy", + "test_accuracy", + "train_balanced_accuracy", + "test_balanced_accuracy", +] +cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42) +cv_results_vanilla_model = pd.DataFrame( + cross_validate( + model, + data, + target, + scoring=scoring, + cv=cv, + return_train_score=True, + return_estimator=True, + ) +) +cv_results_vanilla_model[cv_scores].aggregate(["mean", "std"]).T + +# %% +# Our predictive model succeed to grasp relationship between the data and the target. +# The training and testing scores are close to each other, meaning that our predictive +# model is not overfitting. We also observe that the balanced accuracy is lower than +# the accuracy, due to the class imbalanced previously mentioned. +# +# For this classifier, we used a decision threshold of 0.5 to convert the probability +# of the positive class into a class prediction. However, this threshold might not be +# optimal. If our interest is to maximize the balanced accuracy, we should select +# another threshold that would maximize this metric. +# +# The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to tune the +# decision threshold of a classifier given a metric of interest. +# +# Tuning the decision threshold +# ----------------------------- +# +# We create a :class:`~sklearn.model_selection.TunedThresholdClassifier` and we +# configure it to maximize the balanced accuracy. We evaluate the model using the same +# cross-validation strategy as previously. +from sklearn.model_selection import TunedThresholdClassifier + +tuned_model = TunedThresholdClassifier( + estimator=model, objective_metric="balanced_accuracy" +) +cv_results_tuned_model = pd.DataFrame( + cross_validate( + tuned_model, + data, + target, + scoring=scoring, + cv=cv, + return_train_score=True, + return_estimator=True, + ) +) +cv_results_tuned_model[cv_scores].aggregate(["mean", "std"]).T + +# %% +# In comparison with the vanilla model, we observe that the balanced accuracy score +# increased. Of course, it comes at the cost of a lower accuracy score. 
It means that +# our model is now more sensitive to the positive class but makes more mistakes on the +# negative class. +# +# However, it is important to note that this tuned predictive model is internally the +# same model as the vanilla model. +import matplotlib.pyplot as plt + +vanilla_model_coef = pd.DataFrame( + [est[-1].coef_.ravel() for est in cv_results_vanilla_model["estimator"]], + columns=diabetes.feature_names, +) +tuned_model_coef = pd.DataFrame( + [est.estimator_[-1].coef_.ravel() for est in cv_results_tuned_model["estimator"]], + columns=diabetes.feature_names, +) + +fig, ax = plt.subplots(ncols=2, figsize=(12, 4), sharex=True, sharey=True) +vanilla_model_coef.boxplot(ax=ax[0]) +ax[0].set_ylabel("Coefficient value") +ax[0].set_title("Vanilla model") +tuned_model_coef.boxplot(ax=ax[1]) +ax[1].set_title("Tuned model") +_ = fig.suptitle("Coefficients of the predictive models") + +# %% +# Only the decision threshold of each model was changed during the cross-validation. +decision_threshold = pd.Series( + [est.decision_threshold_ for est in cv_results_tuned_model["estimator"]], +) +ax = decision_threshold.plot.kde() +ax.axvline( + decision_threshold.mean(), + color="k", + linestyle="--", + label=f"Mean decision threshold: {decision_threshold.mean():.2f}", +) +ax.set_xlabel("Decision threshold") +ax.legend(loc="upper right") +_ = ax.set_title( + "Distribution of the decision threshold \nacross different cross-validation folds" +) + +# %% +# In average, a decision threshold around 0.32 is maximizing the balanced accuracy. It +# is thus different from the default decision threshold of 0.5. Tuning the decision +# threshold is thus particularly important when the output of the predictive model +# is used to make decisions. Besides, the metric used to tune the decision threshold +# should be chosen carefully. Here, we used the balanced accuracy but it might not be +# the most appropriate metric for the problem at hand. The choice of the "right" metric +# is usually problem-dependent and might require some domain knowledge. Refer to the +# example entitled, +# :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_threshold_classifier.py`, +# for more details. +# +# Tuning the decision threshold under constraint +# ---------------------------------------------- +# +# In some cases, we do not want only to maximize a specific metric but instead maximize +# a metric while satisfying a constraint on another metric. In the current example, we +# could imagine that the decision of our predictive model will be reviewed by a medical +# doctor. In this case, this doctor will only accept a ratio of false positive. +# Therefore, we are interesting at maximizing the true positive rate while having a +# a false positive rate lower than a given threshold. +# +# The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to tune the +# decision threshold with such specification. We should how to proceed using a single +# cross-validation split to display the Receiver Operating Characteristic (ROC) curves +# to get intuition on the problem. +# +# First, we split the data into a training and testing set. + +# %% +from sklearn.model_selection import train_test_split + +data_train, data_test, target_train, target_test = train_test_split( + data, target, random_state=42 +) + +# %% +# Now, we will train both the vanilla and tuned model on the training set. We recall +# that the tuned model is internally maximizing the balanced accuracy for the moment. 
+model.fit(data_train, target_train) +tuned_model.fit(data_train, target_train) + +# %% +# To show the benefit on optimizing a metric under constraint, we will evaluate the +# models using the ROC curves statistics: the true positive rate (TPR) and the false +# positive rate (FPR). +# +# The FPR is not defined in scikit-learn and we define it below: +from sklearn.metrics import confusion_matrix, make_scorer, recall_score + + +def fpr_score(y, y_pred, neg_label, pos_label): + cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label]) + tn, fp, _, _ = cm.ravel() + tnr = tn / (tn + fp) + return 1 - tnr + + +tpr_score = recall_score # TPR and recall are the same metric +scoring = { + "fpr": make_scorer(fpr_score, neg_label=neg_label, pos_label=pos_label), + "tpr": make_scorer(tpr_score, pos_label=pos_label), +} + +# %% +# Now, we plot the ROC curve of both models and the FPR and TPR statistics for the +# decision thresholds of both models. +from sklearn.metrics import RocCurveDisplay + +disp = RocCurveDisplay.from_estimator( + model, data_test, target_test, name="Vanilla model", linestyle="--", alpha=0.5 +) +RocCurveDisplay.from_estimator( + tuned_model, + data_test, + target_test, + name="Tuned model", + linestyle="-.", + alpha=0.5, + ax=disp.ax_, +) +disp.ax_.plot( + scoring["fpr"](model, data_test, target_test), + scoring["tpr"](model, data_test, target_test), + marker="o", + markersize=10, + color="tab:blue", + label="Default cut-off point at a probability of 0.5", +) +disp.ax_.plot( + scoring["fpr"](tuned_model, data_test, target_test), + scoring["tpr"](tuned_model, data_test, target_test), + marker=">", + markersize=10, + color="tab:orange", + label=f"Cut-off point at probability of {tuned_model.decision_threshold_:.2f}", +) +disp.ax_.legend() +_ = disp.ax_.set_title("ROC curves") + +# %% +# We observe that both models have the same ROC curves. This is expected since the tuned +# model is only a post-processing step of the vanilla model. The tuning is only +# changing the decision threshold threshold as displayed by the markers blue and orange. +# To optimize the balanced accuracy, the tuned model moved the decision threshold is +# moved from 0.5 to 0.22. By shifting this point, we increase the FPR while increasing +# the TPR: in short we make more false positive but also more true positive. This is +# exactly what we concluded in the previous section when looking at the balanced +# accuracy score. +# +# However, this decision threshold might not be acceptable for our medical doctor. He +# might be instead interested to have a low FPR, let say lower than 5%. For this level +# of FPR, he would like our predictive model to maximize the TPR. +# +# The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to specify such +# constraint by providing the name of the metric and the constraint value. Here, we use +# `max_tpr_at_tnr_constraint` which is exactly what we want. Since the true negative +# rate (TNR) is equal to 1 - FPR, we can rewrite the constraint value as +# `1 - 0.05 = 0.95`. + +# %% +constraint_value = 0.95 +tuned_model.set_params( + objective_metric="max_tpr_at_tnr_constraint", + constraint_value=constraint_value, + pos_label=pos_label, +) +tuned_model.fit(data_train, target_train) + +# %% +# Now, we can plot the ROC curves and analyse the results. 
+import matplotlib.pyplot as plt + +_, axs = plt.subplots(ncols=2, figsize=(12, 5)) + +disp = RocCurveDisplay( + fpr=1 - tuned_model.objective_scores_[0], + tpr=tuned_model.objective_scores_[1], + estimator_name="ROC of the tuned model", + pos_label=pos_label, +) +axs[0].plot( + 1 - tuned_model.objective_score_[0], + tuned_model.objective_score_[1], + marker="o", + markersize=10, + color="tab:blue", + label=f"Cut-off point at probability of {tuned_model.decision_threshold_:.2f}", +) +axs[0].axvline( + 1 - constraint_value, 0, 1, color="tab:blue", linestyle="--", label="FPR constraint" +) +axs[0].set_title("Average ROC curve for the tuned model\nacross CV folds") +RocCurveDisplay.from_estimator( + model, + data_test, + target_test, + name="Vanilla model", + linestyle="--", + alpha=0.5, + ax=axs[1], +) +RocCurveDisplay.from_estimator( + tuned_model, + data_test, + target_test, + name="Tuned model", + linestyle="-.", + alpha=0.5, + ax=axs[1], +) +axs[1].plot( + scoring["fpr"](model, data_test, target_test), + scoring["tpr"](model, data_test, target_test), + marker="o", + markersize=10, + color="tab:blue", + label="Default cut-off point at a probability of 0.5", +) +axs[1].plot( + 1 - tuned_model.objective_score_[0], + tuned_model.objective_score_[1], + marker="^", + markersize=10, + color="tab:orange", + label=f"Cut-off point at probability of {tuned_model.decision_threshold_:.2f}", +) +axs[1].legend() +axs[1].set_title("ROC curves") +disp.plot(ax=axs[0]) + +# %% +# We start with the right-hand side plot. It depicts the ROC curves as in the previous +# section. We observe that the control point of the tuned model moved to a low FPR +# that was defined by our constraint. To achieve this low FPR, the decision threshold +# was moved to a probability of 0.72. +# +# The left-hand side plot shows the averaged ROC curve on the internal validation set +# across the different cross-validation folds. This curve is used to define the decision +# threshold. The vertical dashed line represents the FPR constraint that we defined. +# The decision threshold corresponds to the maximum TPR on the left of this dashed line +# and is represented by a blue marker. +# +# An important point to note is that the decision threshold is defined on averaged +# statistics on an internal validation set. It means that the constraint is respected +# on the train/validation dataset but not necessarily on the test set, in case the +# statistical performance of the model differ from the train/validation set to the test +# set (i.e. overfitting). 
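Before moving on, here is a rough, standalone sketch of the selection rule described in the example above for `objective_metric="max_tpr_at_tnr_constraint"`: among the ROC operating points whose TNR (that is, 1 - FPR) is at least `constraint_value`, keep the one with the highest TPR. This is only an illustration on a toy dataset, not the estimator's internal implementation, which works on cross-validated curves.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

X, y = make_classification(random_state=0)
classifier = LogisticRegression().fit(X, y)
fpr, tpr, thresholds = roc_curve(y, classifier.predict_proba(X)[:, 1])

constraint_value = 0.95                    # minimum acceptable TNR, i.e. FPR <= 0.05
feasible = (1 - fpr) >= constraint_value   # ROC points satisfying the constraint
best = np.argmax(tpr[feasible])            # highest TPR among the feasible points
print(f"TPR={tpr[feasible][best]:.2f} at threshold {thresholds[feasible][best]:.2f}")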
diff --git a/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py b/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py index 98c1e1049ba21..7adfd65591904 100644 --- a/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py +++ b/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py @@ -1,388 +1,4 @@ # %% -import pandas as pd - -filename_training = "~/Downloads/kdd98/epsilon_mirror/cup98lrn.zip" -index = ["CONTROLN"] -df_train = pd.read_csv( - filename_training, compression="zip", encoding="latin-1", low_memory=False -).set_index(index) -train_indices = df_train.index - -# %% -filename_data_test = "~/Downloads/kdd98/epsilon_mirror/cup98val.zip" -data_test = pd.read_csv( - filename_data_test, compression="zip", encoding="latin-1", low_memory=False -).set_index(index) - -# %% -filename_target_test = "~/Downloads/kdd98/epsilon_mirror/valtargt.txt" -target_test = pd.read_csv(filename_target_test).set_index(index) - -# %% -df_test = pd.concat([data_test, target_test], axis=1) -test_indices = df_test.index -# Do not convert string to avoid the pd.NA bug: -# xref: https://github.com/scikit-learn/scikit-learn/issues/26890 -df = pd.concat([df_train, df_test], axis=0) -# df = df.convert_dtypes(convert_string=False) - -# %% -# convert to categorical in case we don't want to use the TableVectorizer -categorical_columns = {} -for col_idx, col in enumerate(df.columns): - series = df[col] - dtype_series = series.dtype - # if isinstance(dtype_series, pd.StringDtype): - if dtype_series.kind == "O": - categorical_columns[col] = "category" - continue # skip the rest of the loop - unique_values = series.value_counts() - if len(unique_values) < 60: - # low-cardinality features are considered as categorical - categorical_columns[col] = "category" - -df = df.astype(categorical_columns) - -# %% -target_continuous_name = "TARGET_D" -target_binary_name = "TARGET_B" -neg_label, pos_label = 0, 1 - -# %% -from sklearn.model_selection import train_test_split - -# df_train, df_test = train_test_split(df, test_size=0.5, random_state=42) -df_train = df.loc[train_indices] -df_test = df.loc[test_indices] - -# %% -data_train = df_train.drop(columns=[target_continuous_name, target_binary_name]) - -# %% -data_train - -# %% -target_continuous_train = df_train[target_continuous_name] -target_binary_train = df_train[target_binary_name] - -# %% - - -def cost_metric(y_true, y_pred, neg_label, pos_label, donation_amount): - """Compute the business cost related to the prediction. - - The real cost is to send a mail to a person and is evaluated to $0.68. - The gain is the donation amount if the person donate. 
- """ - mask_true_positive = (y_true == pos_label) & (y_pred == pos_label) - mask_false_positive = (y_true == neg_label) & (y_pred == pos_label) - # mask_false_negative = (y_true == pos_label) & (y_pred == neg_label) - # mask_true_negative = (y_true == neg_label) & (y_pred == neg_label) - cost_sending_mail = -0.68 - cost_false_positive = cost_sending_mail * mask_false_positive.sum() - gain_true_positive = (cost_sending_mail + donation_amount[mask_true_positive]).sum() - return gain_true_positive + cost_false_positive - # loss_false_negative = donation_amount[mask_false_negative].sum() - # fp = - mask_false_positive.sum() * cost_sending_mail - # fn = - (donation_amount[mask_false_negative] - cost_sending_mail).sum() - # return fp + fn - - -# %% -import numpy as np - -gain = cost_metric( - y_true=target_binary_train, - y_pred=np.zeros_like(target_binary_train), - neg_label=neg_label, - pos_label=pos_label, - donation_amount=target_continuous_train, -) -print(f"Gain if we don't send any mail: ${gain:,.2f}") - -# %% -gain = cost_metric( - y_true=target_binary_train, - y_pred=np.ones_like(target_binary_train), - neg_label=neg_label, - pos_label=pos_label, - donation_amount=target_continuous_train, -) -print(f"Gain if we send mails to everyone: ${gain:,.2f}") - -# %% -gain = cost_metric( - y_true=target_binary_train, - y_pred=target_binary_train, - neg_label=neg_label, - pos_label=pos_label, - donation_amount=target_continuous_train, -) -print(f"Maximum gain on the training set: ${gain:,.2f}") - -# %% -import sklearn -from sklearn.metrics import make_scorer - -sklearn.set_config(enable_metadata_routing=True) - -cost_scorer = make_scorer( - cost_metric, neg_label=neg_label, pos_label=pos_label -).set_score_request(donation_amount=True) - -# %% -from sklearn.dummy import DummyClassifier - -stingy_classifier = DummyClassifier(strategy="constant", constant=0) -stingy_classifier.fit(data_train, target_binary_train) - -# %% -bling_bling_classifier = DummyClassifier(strategy="constant", constant=1) -bling_bling_classifier.fit(data_train, target_binary_train) - -# %% -gain = cost_scorer( - stingy_classifier, - data_train, - target_binary_train, - donation_amount=target_continuous_train, -) -print(f"Gain of the stingy classifier: ${gain:,.2f}") - -# %% -gain = cost_scorer( - bling_bling_classifier, - data_train, - target_binary_train, - donation_amount=target_continuous_train, -) -print(f"Gain of the bling-bling classifier: ${gain:,.2f}") - -# %% -data_test = df_test.drop(columns=[target_continuous_name, target_binary_name]) - -# %% -target_continuous_test = df_test[target_continuous_name] -target_binary_test = df_test[target_binary_name] - -# %% -from sklearn.compose import ColumnTransformer -from sklearn.compose import make_column_selector as selector -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OrdinalEncoder - -sklearn.set_config(transform_output="pandas") - -categorical_columns = selector(dtype_include="category") -preprocessing = ColumnTransformer( - transformers=[ - ( - "categorical", - OrdinalEncoder( - handle_unknown="use_encoded_value", - unknown_value=-1, - min_frequency=0.05, - max_categories=255, - ), - categorical_columns, - ), - ], - remainder="passthrough", - n_jobs=-1, - verbose_feature_names_out=False, -) -model = Pipeline( - steps=[ - ("preprocessing", preprocessing), - ( - "classifier", - HistGradientBoostingClassifier( - max_iter=1_000, - early_stopping=True, - 
categorical_features=categorical_columns(data_train), - ), - # DecisionTreeClassifier(max_leaf_nodes=200, random_state=42), - ), - ] -) -model.fit(data_train, target_binary_train) - -# %% -model.score(data_test, target_binary_test) - -# %% -from sklearn.metrics import balanced_accuracy_score - -balanced_accuracy_score(target_binary_test, model.predict(data_test)) - -# %% -gain = cost_metric( - y_true=target_binary_test, - y_pred=model.predict(data_test), - neg_label=neg_label, - pos_label=pos_label, - donation_amount=target_continuous_test, -) -print(f"Gain of the model: ${gain:,.2f}") - -# %% -from sklearn.calibration import CalibrationDisplay - -CalibrationDisplay.from_estimator(model, data_test, target_binary_test, n_bins=10) - -# %% -from sklearn.calibration import CalibratedClassifierCV -from sklearn.model_selection import StratifiedKFold, TunedThresholdClassifier - -cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) -tuned_model = TunedThresholdClassifier( - # estimator=model, - estimator=CalibratedClassifierCV(model, cv=5, method="isotonic"), - n_thresholds=np.linspace(0.02, 0.05, 1_000), - # n_thresholds=1_000, - # strategy="constant", - # constant_threshold=0.0302, - pos_label=pos_label, - objective_metric=cost_scorer, - cv=cv, - n_jobs=-1, -) -tuned_model.fit( - data_train, target_binary_train, donation_amount=target_continuous_train -) - -# %% -CalibrationDisplay.from_estimator(tuned_model, data_test, target_binary_test, n_bins=10) - -# %% -tuned_model.score(data_test, target_binary_test) - -# %% -balanced_accuracy_score(target_binary_test, tuned_model.predict(data_test)) - -# %% -gain = cost_metric( - y_true=target_binary_test, - y_pred=tuned_model.predict(data_test), - neg_label=neg_label, - pos_label=pos_label, - donation_amount=target_continuous_test, -) -print(f"Gain of the model: ${gain:,.2f}") - -# %% -import matplotlib.pyplot as plt - -for thresholds, scores in zip(tuned_model.cv_thresholds_, tuned_model.cv_scores_): - plt.semilogx(thresholds, scores, alpha=0.5) - -# %% -import plotly.graph_objects as go - -fig = go.Figure() -for thresholds, scores in zip(tuned_model.cv_thresholds_, tuned_model.cv_scores_): - fig.add_trace(go.Scatter(x=thresholds, y=scores, mode="lines", name="lines")) -fig.update_xaxes(type="log") -fig.show() - -# %% - -# %% -from sklearn.ensemble import RandomForestClassifier -from sklearn.impute import SimpleImputer - -categorical_columns = selector(dtype_include="category") -preprocessing = ColumnTransformer( - transformers=[ - ( - "categorical", - OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), - categorical_columns, - ), - ], - remainder="passthrough", - n_jobs=-1, - verbose_feature_names_out=False, -) -model = Pipeline( - steps=[ - ("preprocessing", preprocessing), - ("imputer", SimpleImputer(strategy="mean")), - ("classifier", RandomForestClassifier(n_jobs=-1)), - ] -) -tuned_model = TunedThresholdClassifier( - estimator=CalibratedClassifierCV(model, cv=5, method="isotonic"), - pos_label=pos_label, - objective_metric=cost_scorer, - cv=0.8, -) -tuned_model.fit( - data_train, target_binary_train, donation_amount=target_continuous_train -) - -# %% -balanced_accuracy_score(target_binary_test, tuned_model.predict(data_test)) - -# %% -gain = cost_metric( - y_true=target_binary_test, - y_pred=tuned_model.predict(data_test), - neg_label=neg_label, - pos_label=pos_label, - donation_amount=target_continuous_test, -) -print(f"Gain of the model: ${gain:,.2f}") - -# %% -from imblearn.ensemble import 
BalancedRandomForestClassifier - -sklearn.set_config(enable_metadata_routing=False) - -categorical_columns = selector(dtype_include="category") -preprocessing = ColumnTransformer( - transformers=[ - ( - "categorical", - OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), - categorical_columns, - ), - ], - remainder="passthrough", - n_jobs=-1, - verbose_feature_names_out=False, -) -model = Pipeline( - steps=[ - ("preprocessing", preprocessing), - ("imputer", SimpleImputer(strategy="mean")), - ( - "classifier", - BalancedRandomForestClassifier( - sampling_strategy="all", replacement=True, bootstrap=False, n_jobs=-1 - ), - ), - ] -) - -# %% -balanced_accuracy_score(target_binary_test, model.predict(data_test)) - -# %% -gain = cost_metric( - y_true=target_binary_test, - y_pred=model.predict(data_test), - neg_label=neg_label, - pos_label=pos_label, - donation_amount=target_continuous_test, -) -print(f"Gain of the model: ${gain:,.2f}") - - -# %% -# %% -# %% from sklearn.datasets import fetch_openml credit_card = fetch_openml(data_id=1597, as_frame=True, parser="pandas") @@ -394,15 +10,17 @@ def cost_metric(y_true, y_pred, neg_label, pos_label, donation_amount): amount = credit_card.frame["Amount"].to_numpy() # %% -target.value_counts() +from collections import Counter -# %% -target.value_counts(normalize=True) +Counter(target) # %% +import matplotlib.pyplot as plt + fraud = target == 1 amount_fraud = amount[fraud] -ax = amount_fraud.plot.hist(bins=100) +_, ax = plt.subplots() +ax.hist(amount_fraud, bins=100) ax.set_title("Amount of fraud transaction") _ = ax.set_xlabel("Amount ($)") diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 5cf3527f0cc43..3107b4cf9a6c5 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -197,9 +197,7 @@ def _make_estimator(self, append=True, random_state=None): def __len__(self): """Return the number of estimators in the ensemble.""" - if hasattr(self, "estimators_"): - return len(self.estimators_) - return True + return len(self.estimators_) def __getitem__(self, index): """Return the index'th estimator in the ensemble.""" diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 349da566a8a6b..e6b204c2ba7b2 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -218,10 +218,10 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato If the method is not implemented by the classifier, it will raise an error. - n_thresholds : int, default=100 - The number of decision threshold to use when discretizing the output - of the classifier `method`. - # TODO: update to array-like + n_thresholds : int or array-like, default=100 + The number of decision threshold to use when discretizing the output of the + classifier `method`. Pass an array-like to manually specify the thresholds + to use. cv : int, float, cross-validation generator, iterable or "prefit", default=None Determines the cross-validation splitting strategy to train classifier. 
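With this patch, `n_thresholds` accepts either an integer or an array-like. Below is a simplified, standalone mirror of the new discretization logic (the helper name is illustrative, not a function from the codebase): an integer requests an evenly spaced grid between the extreme thresholds observed across folds, while an array-like is used as the candidate thresholds directly, as done with `np.linspace(0.02, 0.05, 1_000)` in the KDD98 script above.

from numbers import Integral

import numpy as np


def candidate_thresholds(n_thresholds, min_threshold, max_threshold):
    # Integer: evenly spaced grid between the extreme thresholds seen across folds.
    # Array-like: taken as the candidate thresholds directly.
    if isinstance(n_thresholds, Integral):
        return np.linspace(min_threshold, max_threshold, num=n_thresholds)
    return np.asarray(n_thresholds)


print(candidate_thresholds(5, 0.0, 1.0))
print(candidate_thresholds(np.linspace(0.02, 0.05, 4), 0.0, 1.0))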
From 43c1da84e96669811fd9e177e9368f618107c3f2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 28 Jul 2023 18:12:28 +0200 Subject: [PATCH 111/194] iter --- doc/modules/classification_threshold.rst | 1 + ...ier.py => plot_cost_sensitive_learning.py} | 268 +++++++++++++++--- ...eshold_classifier_with_metadata_routing.py | 208 -------------- 3 files changed, 229 insertions(+), 248 deletions(-) rename examples/model_selection/{plot_tuned_threshold_classifier.py => plot_cost_sensitive_learning.py} (65%) delete mode 100644 examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index d2f18dd640f72..eead33b621d65 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -160,6 +160,7 @@ also possible to manually set the decision threshold in Examples -------- +TODO: add more examples - See the example entitled :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_threshold_classifier.py`, to learn about tuning the decision threshold of a classifier. diff --git a/examples/model_selection/plot_tuned_threshold_classifier.py b/examples/model_selection/plot_cost_sensitive_learning.py similarity index 65% rename from examples/model_selection/plot_tuned_threshold_classifier.py rename to examples/model_selection/plot_cost_sensitive_learning.py index 5a7b0467a4a03..f4ac021122e37 100644 --- a/examples/model_selection/plot_tuned_threshold_classifier.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -1,7 +1,7 @@ """ -================================================== -Post-tuning the cut-off point of decision function -================================================== +=============================================================== +Post-tuning decision threshold based on cost-sensitive learning +=============================================================== Once a classifier is trained, the output of the :term:`predict` method output class label predictions corresponding to a thresholding of either the :term:`decision @@ -33,15 +33,23 @@ """ # %% +# Cost-sensitive learning with constant gains and costs +# ----------------------------------------------------- +# +# In this first section, we illustrate the use of the +# :class:`~sklearn.model_selection.TunedThresholdClassifier` in a setting of +# cost-sensitive learning when the gains and costs associated to each entry of the +# confusion matrix are constant. We use the problematic presented in [2]_ using the +# "Statlog" German credit dataset [1]_. +# # "Statlog" German credit dataset -# ------------------------------- +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # We fetch the German credit dataset from OpenML. import sklearn from sklearn.datasets import fetch_openml sklearn.set_config(transform_output="pandas") -sklearn.set_config(enable_metadata_routing=True) german_credit = fetch_openml(data_id=31, as_frame=True, parser="pandas") X, y = german_credit.data, german_credit.target @@ -79,7 +87,7 @@ # We are ready to design our predictive model and the associated evaluation strategy. # # Evaluation metrics -# ------------------ +# ^^^^^^^^^^^^^^^^^^ # # In this section, we define a set of metrics that we use later. To see # the effect of tuning the cut-off point, we evaluate the predictive model using @@ -122,7 +130,7 @@ def fpr_score(y, y_pred, neg_label, pos_label): # %% # In addition, the original research [1]_ defines a business metric. 
They provide a # cost-matrix which encodes that predicting a "bad" credit as "good" is 5 times more -# costly than the opposite. We define a python function that will weight the confusion +# costly than the opposite. We define a python function that weight the confusion # matrix and return the overall cost. import numpy as np @@ -138,17 +146,13 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): ) # %% # Vanilla predictive model -# ------------------------ -# -# Design of the predictive model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ^^^^^^^^^^^^^^^^^^^^^^^^ # -# In this section we design our predictive model consisting of a -# :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. We encode the -# categorical features with an :class:`~sklearn.preprocessing.OrdinalEncoder` -# but the numerical features are kept as they are. To identify the categorical -# columns, we use the helper function -# :func:`~sklearn.compose.make_column_selector` and the fact that the +# We first design our predictive model consisting of a +# :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. We encode the categorical +# features with an :class:`~sklearn.preprocessing.OrdinalEncoder` but the numerical +# features are kept as they are. To identify the categorical columns, we use the helper +# function :func:`~sklearn.compose.make_column_selector` and the fact that the # categorical features are stored as `category` dtype. from sklearn.compose import ColumnTransformer from sklearn.compose import make_column_selector as selector @@ -185,9 +189,6 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): model.fit(X_train, y_train) # %% -# Evaluation of the predictive model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# # We evaluate the performance of our predictive model using the ROC and Precision-Recall # curves. import matplotlib.pyplot as plt @@ -245,7 +246,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # # However, we recall that the original aim was to minimize the cost (or maximize the # gain) by the business metric. We can compute the value of the business metric: -scoring["cost_gain"](model, X_test, y_test) +print(f"Business defined metric: {scoring['cost_gain'](model, X_test, y_test)}") # %% # At this stage we don't know if any other cut-off can lead to a greater gain. @@ -259,7 +260,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # .. _cost_sensitive_learning_example: # # Tuning the cut-off point -# ------------------------ +# ^^^^^^^^^^^^^^^^^^^^^^^^ # # We use :class:`~sklearn.model_selection.TunedThresholdClassifier` to tune the cut-off # point. We need to provide the business metric to optimize as well as the @@ -268,12 +269,12 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # stratified cross-validation is used. from sklearn.model_selection import TunedThresholdClassifier -model_tuned = TunedThresholdClassifier( +tuned_model = TunedThresholdClassifier( estimator=model, pos_label=pos_label, objective_metric=scoring["cost_gain"], ) -model_tuned.fit(X_train, y_train) +tuned_model.fit(X_train, y_train) # %% # We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. 
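As a quick standalone sanity check of the cost-matrix weighting used by `gain_cost_score` (a toy example, not part of the patch; it assumes, as in the example, that "good" is the negative class and "bad" the positive class):

import numpy as np
from sklearn.metrics import confusion_matrix

neg_label, pos_label = "good", "bad"  # assumed label convention
y_true = np.array(["bad", "bad", "good", "good", "good"])
y_pred = np.array(["good", "bad", "good", "bad", "good"])
cm = confusion_matrix(y_true, y_pred, labels=[neg_label, pos_label])
cost_matrix = np.array([[0, -1], [-5, 0]])  # "bad" predicted as "good" is 5x more costly
print(np.sum(cm * cost_matrix))  # -6: one good-as-bad (-1) plus one bad-as-good (-5)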
@@ -285,7 +286,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): colors = ("tab:blue", "tab:orange") names = ("Vanilla GBDT", "Tuned GBDT") for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, model_tuned), linestyles, markerstyles, colors, names) + zip((model, tuned_model), linestyles, markerstyles, colors, names) ): decision_threshold = getattr(est, "decision_threshold_", 0.5) PrecisionRecallDisplay.from_estimator( @@ -332,11 +333,11 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): axs[1].legend() axs[2].plot( - model_tuned.decision_thresholds_, model_tuned.objective_scores_, color="tab:orange" + tuned_model.decision_thresholds_, tuned_model.objective_scores_, color="tab:orange" ) axs[2].plot( - model_tuned.decision_threshold_, - model_tuned.objective_score_, + tuned_model.decision_threshold_, + tuned_model.objective_score_, "o", markersize=10, color="tab:orange", @@ -363,7 +364,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # # We can now check if choosing this cut-off point leads to a better score on the testing # set: -scoring["cost_gain"](model_tuned, X_test, y_test) +print(f"Business defined metric: {scoring['cost_gain'](tuned_model, X_test, y_test)}") # %% # We observe that the decision generalized on the testing set leading to a better @@ -372,7 +373,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # .. _tunedthresholdclassifier_no_cv: # # Consideration regarding model refitting and cross-validation -# ------------------------------------------------------------ +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # In the above experiment, we use the default setting of the # :class:`~sklearn.model_selection.TunedThresholdClassifier`. In particular, the cut-off @@ -386,7 +387,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # Also, the underlying classifier is not be refitted. Here, we can try to do such # experiment. model.fit(X_train, y_train) -model_tuned.set_params(cv="prefit").fit(X_train, y_train) +tuned_model.set_params(cv="prefit").fit(X_train, y_train) # %% @@ -398,7 +399,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): colors = ("tab:blue", "tab:orange") names = ("Vanilla GBDT", "Tuned GBDT") for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, model_tuned), linestyles, markerstyles, colors, names) + zip((model, tuned_model), linestyles, markerstyles, colors, names) ): decision_threshold = getattr(est, "decision_threshold_", 0.5) PrecisionRecallDisplay.from_estimator( @@ -445,11 +446,11 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): axs[1].legend() axs[2].plot( - model_tuned.decision_thresholds_, model_tuned.objective_scores_, color="tab:orange" + tuned_model.decision_thresholds_, tuned_model.objective_scores_, color="tab:orange" ) axs[2].plot( - model_tuned.decision_threshold_, - model_tuned.objective_score_, + tuned_model.decision_threshold_, + tuned_model.objective_score_, "o", markersize=10, color="tab:orange", @@ -481,7 +482,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # single train-test split by providing a floating number in range `[0, 1]` to the `cv` # parameter. It splits the data into a training and testing set. 
Let's explore this
# option:
-model_tuned.set_params(cv=0.75).fit(X_train, y_train)
+tuned_model.set_params(cv=0.75).fit(X_train, y_train)

 # %%
 fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6))
@@ -491,7 +492,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label):
 colors = ("tab:blue", "tab:orange")
 names = ("Vanilla GBDT", "Tuned GBDT")
 for idx, (est, linestyle, marker, color, name) in enumerate(
-    zip((model, model_tuned), linestyles, markerstyles, colors, names)
+    zip((model, tuned_model), linestyles, markerstyles, colors, names)
 ):
     decision_threshold = getattr(est, "decision_threshold_", 0.5)
     PrecisionRecallDisplay.from_estimator(
@@ -538,11 +539,11 @@ def gain_cost_score(y, y_pred, neg_label, pos_label):
 axs[1].legend()

 axs[2].plot(
-    model_tuned.decision_thresholds_, model_tuned.objective_scores_, color="tab:orange"
+    tuned_model.decision_thresholds_, tuned_model.objective_scores_, color="tab:orange"
 )
 axs[2].plot(
-    model_tuned.decision_threshold_,
-    model_tuned.objective_score_,
+    tuned_model.decision_threshold_,
+    tuned_model.objective_score_,
     "o",
     markersize=10,
     color="tab:orange",
@@ -566,3 +567,190 @@ def gain_cost_score(y, y_pred, neg_label, pos_label):
 # As expected, these curves differ from those of the vanilla model, given that we
 # trained the underlying classifier on a subset of the data provided during fitting and
 # reserved a validation set for tuning the cut-off point.
+#
+# Cost-sensitive learning when gains and costs are not constant
+# -------------------------------------------------------------
+#
+# As stated in [2]_, gains and costs are generally not constant in real-world problems.
+# In this section, we use an example similar to the one in [2]_, based on credit card
+# records.
+#
+# The credit card dataset
+# ^^^^^^^^^^^^^^^^^^^^^^^
+credit_card = fetch_openml(data_id=1597, as_frame=True, parser="pandas")
+credit_card.frame
+
+# %%
+# The dataset contains information about credit card records, of which some are
+# fraudulent and others are legitimate. The goal is therefore to predict whether or
+# not a credit card record is fraudulent.
+#
+# In addition, we have extra information regarding the amount of each card transaction.
+# This information is used to define the business metric later.
+columns_to_drop = ["Class", "Amount"]
+data = credit_card.frame.drop(columns=columns_to_drop)
+target = credit_card.frame["Class"].astype(int)
+amount = credit_card.frame["Amount"].to_numpy()
+
+# %%
+# First, we check the class distribution of the dataset.
+target.value_counts(normalize=True)
+
+# %%
+# The dataset is highly imbalanced, with fraudulent transactions representing only
+# 0.17% of the data. Additionally, we check the distribution of the amount of the
+# fraudulent transactions.
+fraud = target == 1
+amount_fraud = amount[fraud]
+_, ax = plt.subplots()
+ax.hist(amount_fraud, bins=100)
+ax.set_title("Amount of fraud transaction")
+_ = ax.set_xlabel("Amount ($)")
+
+# %%
+# Now, we create the business metric that depends on the amount of each transaction. We
+# define the cost matrix similarly to [2]_. Accepting a legitimate transaction provides
+# a gain of 2% of the amount of the transaction. However, accepting a fraudulent
+# transaction results in a loss of the amount of the transaction. As stated in [2]_, the
+# gain and loss related to refusals (of fraudulent and legitimate transactions) are not
+# trivial to define. 
Here, we define that a refusal of a legitimate transaction is
+# estimated to cost $5 while the refusal of a fraudulent transaction is estimated
+# to yield a gain of $50 plus the amount of the transaction. Therefore, we define the
+# following function to compute the total benefit of a given decision:
+
+
+def business_metric(y_true, y_pred, amount):
+    mask_true_positive = (y_true == 1) & (y_pred == 1)
+    mask_true_negative = (y_true == 0) & (y_pred == 0)
+    mask_false_positive = (y_true == 0) & (y_pred == 1)
+    mask_false_negative = (y_true == 1) & (y_pred == 0)
+    fraudulent_refuse = (mask_true_positive.sum() * 50) + amount[
+        mask_true_positive
+    ].sum()
+    fraudulent_accept = -amount[mask_false_negative].sum()
+    legitimate_refuse = mask_false_positive.sum() * -5
+    legitimate_accept = (amount[mask_true_negative] * 0.02).sum()
+    return fraudulent_refuse + fraudulent_accept + legitimate_refuse + legitimate_accept
+
+
+# %%
+# From this business metric, we create a scikit-learn scorer that, given a fitted
+# classifier and a test set, computes the business metric. To do so, we use
+# the :func:`~sklearn.metrics.make_scorer` factory. The variable `amount` is
+# additional metadata to be passed to the scorer and we need to use
+# :ref:`metadata routing ` to take into account this information.
+sklearn.set_config(enable_metadata_routing=True)
+business_scorer = make_scorer(business_metric).set_score_request(amount=True)
+
+# %%
+# We first train a dummy classifier to get some baseline results.
+from sklearn.model_selection import train_test_split
+
+data_train, data_test, target_train, target_test, amount_train, amount_test = (
+    train_test_split(
+        data, target, amount, stratify=target, test_size=0.5, random_state=42
+    )
+)
+
+# %%
+from sklearn.dummy import DummyClassifier
+
+easy_going_classifier = DummyClassifier(strategy="constant", constant=0)
+easy_going_classifier.fit(data_train, target_train)
+benefit_cost = business_scorer(
+    easy_going_classifier, data_test, target_test, amount=amount_test
+)
+print(f"Benefit/cost of our easy-going classifier: ${benefit_cost:,.2f}")
+
+# %%
+# A classifier that predicts all transactions as legitimate would create a profit of
+# around $220,000. We make the same evaluation for a classifier that predicts all
+# transactions as fraudulent.
+intolerant_classifier = DummyClassifier(strategy="constant", constant=1)
+intolerant_classifier.fit(data_train, target_train)
+benefit_cost = business_scorer(
+    intolerant_classifier, data_test, target_test, amount=amount_test
+)
+print(f"Benefit/cost of our intolerant classifier: ${benefit_cost:,.2f}")
+
+# %%
+# Such a classifier creates a loss of around $670,000. A predictive model should allow
+# us to make a profit larger than $220,000. It is interesting to compare this business
+# metric with another "standard" statistical metric such as the balanced accuracy.
+from sklearn.metrics import get_scorer
+
+balanced_accuracy_scorer = get_scorer("balanced_accuracy")
+print(
+    "Balanced accuracy of our easy-going classifier: "
+    f"{balanced_accuracy_scorer(easy_going_classifier, data_test, target_test):.3f}"
+)
+print(
+    "Balanced accuracy of our intolerant classifier: "
+    f"{balanced_accuracy_scorer(intolerant_classifier, data_test, target_test):.3f}"
+)
+
+# %%
+# It is not a surprise that the balanced accuracy is at 0.5 for both classifiers. 
+# However, we need to be careful in the rest of the evaluation: we could potentially
+# obtain a model with a decent balanced accuracy that does not make any profit.
+# In this case, the model would be useless for our business.
+#
+# Let's now create a predictive model using a logistic regression without tuning the
+# decision threshold.
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler

+model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42)).fit(
+    data_train, target_train
+)
+
+print(
+    "Benefit/cost of our logistic regression: "
+    f"${business_scorer(model, data_test, target_test, amount=amount_test):,.2f}"
+)
+print(
+    "Balanced accuracy of our logistic regression: "
+    f"{balanced_accuracy_scorer(model, data_test, target_test):.3f}"
+)
+
+# %%
+# By observing the balanced accuracy, we see that our predictive model is learning
+# some associations between the features and the target. The business metric also shows
+# that our model is beating the baseline in terms of profit and it would already be
+# beneficial to use it instead of ignoring the fraud detection problem.
+#
+# Now the question is: is our model optimal for the type of decisions that we want to
+# make? So far, we have not optimized the decision threshold. We use the
+# :class:`~sklearn.model_selection.TunedThresholdClassifier` to optimize the decision
+# threshold given our business scorer.
+tuned_model = TunedThresholdClassifier(
+    estimator=model,
+    objective_metric=business_scorer,
+    n_thresholds=100,
+    n_jobs=2,
+)
+
+# %%
+# Since our business scorer requires the amount of each transaction, we need to pass
+# this information in the `fit` method. The
+# :class:`~sklearn.model_selection.TunedThresholdClassifier` is in charge of
+# automatically dispatching this metadata to the underlying scorer.
+tuned_model.fit(data_train, target_train, amount=amount_train)
+
+# %%
+print(
+    "Benefit/cost of our tuned model: "
+    f"${business_scorer(tuned_model, data_test, target_test, amount=amount_test):,.2f}"
+)
+print(
+    "Balanced accuracy of our tuned model: "
+    f"{balanced_accuracy_scorer(tuned_model, data_test, target_test):.3f}"
+)
+
+# %%
+# We observe that tuning the decision threshold increases the profit of our model.
+# Here, the balanced accuracy also increased. Note that this might not always be
+# the case because the statistical metric is not necessarily a surrogate of the
+# business metric. It is therefore important, whenever possible, to optimize the
+# decision threshold with respect to the business metric.
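Editor's note on the example above: the tuned classifier searches, over cross-validated predictions, for the probability threshold that maximizes the metadata-aware business scorer. The standalone sketch below reproduces the gist of that search by hand on synthetic data. It is not part of the patch: the gain/cost values, the simulated `amount` column, and the helper name `toy_business_gain` are illustrative assumptions, and only well-known scikit-learn and NumPy calls are used.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Synthetic imbalanced data standing in for the credit card records; the
# "amount" values are drawn at random purely for the illustration.
X, y = make_classification(n_samples=5_000, weights=[0.98, 0.02], random_state=0)
rng = np.random.RandomState(0)
amount = rng.exponential(scale=100, size=y.shape[0])

X_train, X_test, y_train, y_test, amt_train, amt_test = train_test_split(
    X, y, amount, stratify=y, random_state=0
)


def toy_business_gain(y_true, y_pred, amount):
    # Amount-dependent gain with the same structure as `business_metric`:
    # accepting legitimate transactions earns 2% of the amount, accepting a
    # fraud loses the amount, refusing a legitimate transaction costs $5 and
    # refusing a fraud gains $50 plus the amount.
    accept_legit = ((y_true == 0) & (y_pred == 0)) * 0.02 * amount
    accept_fraud = ((y_true == 1) & (y_pred == 0)) * -amount
    refuse_legit = ((y_true == 0) & (y_pred == 1)) * -5.0
    refuse_fraud = ((y_true == 1) & (y_pred == 1)) * (50.0 + amount)
    return (accept_legit + accept_fraud + refuse_legit + refuse_fraud).sum()


classifier = LogisticRegression(max_iter=1_000).fit(X_train, y_train)
proba_fraud = classifier.predict_proba(X_test)[:, 1]

# Manual sweep over candidate thresholds: conceptually, this is what the
# tuned classifier automates (with cross-validation) for the business scorer.
thresholds = np.linspace(0, 1, num=101)
gains = [
    toy_business_gain(y_test, (proba_fraud >= t).astype(int), amt_test)
    for t in thresholds
]
best = thresholds[int(np.argmax(gains))]
print(f"best threshold: {best:.2f}, gain: {float(max(gains)):,.2f}")

In the example itself, the sweep is performed on out-of-sample predictions obtained by cross-validation, which avoids tuning the threshold on the same data used to train the classifier.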
diff --git a/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py b/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py deleted file mode 100644 index 7adfd65591904..0000000000000 --- a/examples/model_selection/plot_tuned_threshold_classifier_with_metadata_routing.py +++ /dev/null @@ -1,208 +0,0 @@ -# %% -from sklearn.datasets import fetch_openml - -credit_card = fetch_openml(data_id=1597, as_frame=True, parser="pandas") -credit_card.frame - -# %% -data = credit_card.frame.drop(columns=["Class", "Amount"]) -target = credit_card.frame["Class"].astype(int).to_numpy() -amount = credit_card.frame["Amount"].to_numpy() - -# %% -from collections import Counter - -Counter(target) - -# %% -import matplotlib.pyplot as plt - -fraud = target == 1 -amount_fraud = amount[fraud] -_, ax = plt.subplots() -ax.hist(amount_fraud, bins=100) -ax.set_title("Amount of fraud transaction") -_ = ax.set_xlabel("Amount ($)") - -# %% - - -def business_metric(y_true, y_pred, amount): - mask_true_positive = (y_true == 1) & (y_pred == 1) - mask_true_negative = (y_true == 0) & (y_pred == 0) - mask_false_positive = (y_true == 0) & (y_pred == 1) - mask_false_negative = (y_true == 1) & (y_pred == 0) - fraudulent_refuse = (mask_true_positive.sum() * 50) + amount[ - mask_true_positive - ].sum() - fraudulent_accept = -amount[mask_false_negative].sum() - legitimate_refuse = mask_false_positive.sum() * -5 - legitimate_accept = (amount[mask_true_negative] * 0.02).sum() - return fraudulent_refuse + fraudulent_accept + legitimate_refuse + legitimate_accept - - -# %% -import numpy as np - -benefit_cost = business_metric(target, np.zeros_like(target), amount) -print(f"Benefit/cost of not detecting fraud: ${benefit_cost:,.2f}") - -# %% -benefit_cost = business_metric(target, np.ones_like(target), amount) -print(f"Benefit/cost of tagging everything as fraud: ${benefit_cost:,.2f}") - -# %% -from sklearn.model_selection import train_test_split - -data_train, data_test, target_train, target_test, amount_train, amount_test = ( - train_test_split( - data, target, amount, stratify=target, test_size=0.5, random_state=42 - ) -) - -# %% -import sklearn -from sklearn.metrics import make_scorer - -sklearn.set_config(enable_metadata_routing=True) - -business_scorer = make_scorer(business_metric).set_score_request(amount=True) - -# %% -from sklearn.dummy import DummyClassifier - -easy_going_classifier = DummyClassifier(strategy="constant", constant=0) -easy_going_classifier.fit(data_train, target_train) - -# %% -benefit_cost = business_scorer( - easy_going_classifier, data_test, target_test, amount=amount_test -) -print(f"Benefit/cost of our easy-going classifier: ${benefit_cost:,.2f}") - -# %% -intolerant_classifier = DummyClassifier(strategy="constant", constant=1) -intolerant_classifier.fit(data_train, target_train) - -# %% -benefit_cost = business_scorer( - intolerant_classifier, data_test, target_test, amount=amount_test -) -print(f"Benefit/cost of our intolerant classifier: ${benefit_cost:,.2f}") - -# %% -from sklearn.linear_model import LogisticRegression - -logistic_regression = LogisticRegression(max_iter=1_000).fit(data_train, target_train) -benefit_cost = business_scorer( - logistic_regression, data_test, target_test, amount=amount_test -) -print(f"Benefit/cost of our logistic regression: ${benefit_cost:,.2f}") - -# %% -from sklearn.metrics import get_scorer - -balanced_accuracy_scorer = get_scorer("balanced_accuracy") -balanced_accuracy = balanced_accuracy_scorer( - 
logistic_regression, data_test, target_test -) -print(f"Balanced accuracy of our logistic regression: {balanced_accuracy:.2f}") - -# %% -from sklearn.model_selection import TunedThresholdClassifier - -tuned_model = TunedThresholdClassifier( - estimator=logistic_regression, - objective_metric=business_scorer, - n_thresholds=1_000, - n_jobs=-1, -) -tuned_model.fit(data_train, target_train, amount=amount_train) - -# %% -benefit_cost = business_scorer(tuned_model, data_test, target_test, amount=amount_test) -print(f"Benefit/cost of our tuned model: ${benefit_cost:,.2f}") - -# %% -balanced_accuracy = balanced_accuracy_scorer(tuned_model, data_test, target_test) -print(f"Balanced accuracy of our tuned model: {balanced_accuracy:.2f}") - -# %% -tuned_model.set_params(objective_metric="balanced_accuracy").fit( - data_train, target_train -) -balanced_accuracy = balanced_accuracy_scorer(tuned_model, data_test, target_test) -print(f"Balanced accuracy of our tuned model: {balanced_accuracy:.2f}") - -# %% -logistic_regression.set_params(class_weight="balanced").fit(data_train, target_train) -balanced_accuracy = balanced_accuracy_scorer( - logistic_regression, data_test, target_test -) -print(f"Balanced accuracy of our logistic regression: {balanced_accuracy:.2f}") - -# %% -from sklearn.calibration import CalibrationDisplay - -CalibrationDisplay.from_estimator( - logistic_regression, data_test, target_test, n_bins=10 -) - -# %% -CalibrationDisplay.from_estimator(tuned_model, data_test, target_test, n_bins=10) - -# %% -from sklearn.ensemble import RandomForestClassifier - -rf = RandomForestClassifier(n_jobs=-1).fit(data_train, target_train) -# %% -balanced_accuracy = balanced_accuracy_scorer(rf, data_test, target_test) -print(f"Balanced accuracy of our random forest: {balanced_accuracy:.2f}") -benefit_cost = business_scorer(rf, data_test, target_test, amount=amount_test) -print(f"Benefit/cost of our random forest: ${benefit_cost:,.2f}") - -# %% -from imblearn.ensemble import BalancedRandomForestClassifier - -brf = BalancedRandomForestClassifier( - sampling_strategy="all", replacement=True, bootstrap=False, n_jobs=-1 -) -brf.fit(data_train, target_train) - -# %% -balanced_accuracy = balanced_accuracy_scorer(brf, data_test, target_test) -print(f"Balanced accuracy of our balanced random forest: {balanced_accuracy:.2f}") -benefit_cost = business_scorer(brf, data_test, target_test, amount=amount_test) -print(f"Benefit/cost of our balanced random forest: ${benefit_cost:,.2f}") - -# %% -tuned_model = TunedThresholdClassifier( - estimator=brf, - objective_metric=business_scorer, - n_thresholds=1_000, - n_jobs=-1, -) -tuned_model.fit(data_train, target_train, amount=amount_train) - -# %% -balanced_accuracy = balanced_accuracy_scorer(tuned_model, data_test, target_test) -print(f"Balanced accuracy of our balanced random forest: {balanced_accuracy:.2f}") -benefit_cost = business_scorer(tuned_model, data_test, target_test, amount=amount_test) -print(f"Benefit/cost of our balanced random forest: ${benefit_cost:,.2f}") - -# %% -tuned_model = TunedThresholdClassifier( - estimator=rf, - objective_metric=business_scorer, - n_thresholds=1_000, - n_jobs=-1, -) -tuned_model.fit(data_train, target_train, amount=amount_train) - -# %% -balanced_accuracy = balanced_accuracy_scorer(tuned_model, data_test, target_test) -print(f"Balanced accuracy of our balanced random forest: {balanced_accuracy:.2f}") -benefit_cost = business_scorer(tuned_model, data_test, target_test, amount=amount_test) -print(f"Benefit/cost of our balanced 
random forest: ${benefit_cost:,.2f}") - -# %% From ca06717e9504cd0af919ca7036bc30de13206012 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 28 Jul 2023 21:38:45 +0200 Subject: [PATCH 112/194] iter --- sklearn/model_selection/_classification_threshold.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index e6b204c2ba7b2..2b93ee66988cb 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -497,9 +497,7 @@ def fit(self, X, y, **params): else: constraint_value = "highest" - routed_params = process_routing( - obj=self, method="fit", other_params={}, **params - ) + routed_params = process_routing(self, "fit", **params) self._scorer = self._get_scorer() # in the following block, we: From ab8b466e2ad5f5ec29b49d2365b45c2385692cc0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 30 Jul 2023 11:53:51 +0200 Subject: [PATCH 113/194] iter --- doc/modules/classification_threshold.rst | 12 +++++++----- .../plot_cost_sensitive_learning.py | 18 +++++++----------- .../plot_tuned_decision_threshold.py | 4 ++-- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index eead33b621d65..fd2898327899f 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -70,8 +70,8 @@ posterior probability greater than 0.5 while the tuned classifier predicts the c interest for a very low probability (around 0.02). This cut-off point optimizes a utility metric defined by the business (in this case an insurance company). -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_tuned_threshold_classifier_002.png - :target: ../auto_examples/model_selection/plot_tuned_threshold_classifier.html +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cost_sensitive_learning_002.png + :target: ../auto_examples/model_selection/plot_cost_sensitive_learning.html :align: center Options to tune the cut-off point @@ -160,7 +160,9 @@ also possible to manually set the decision threshold in Examples -------- -TODO: add more examples - See the example entitled - :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_threshold_classifier.py`, - to learn about tuning the decision threshold of a classifier. + :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_decision_threshold.py`, + to get insights on the post-tuning of the decision threshold. +- See the example entitled + :ref:`sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py`, + to learn about cost-sensitive learning and decision threshold tuning. diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index f4ac021122e37..80892d2741168 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -58,9 +58,6 @@ # We check the feature types available in `X`. X.info() -# %% -X.head() - # %% # Many features are categorical and usually string-encoded. We need to encode # these categories when we develop our predictive model. Let's check the targets. 
@@ -249,13 +246,12 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): print(f"Business defined metric: {scoring['cost_gain'](model, X_test, y_test)}") # %% -# At this stage we don't know if any other cut-off can lead to a greater gain. -# To find the optimal one, we need to compute the cost-gain using the business -# metric for all possible cut-off points and choose the best. This strategy can -# be quite tedious to implement by hand, but the -# :class:`~sklearn.metrics.TunedThresholdClassifier` class is here to help us. It -# automatically computes the cost-gain for all possible cut-off points and -# optimizes for the `objective_metric`. +# At this stage we don't know if any other cut-off can lead to a greater gain. To find +# the optimal one, we need to compute the cost-gain using the business metric for all +# possible cut-off points and choose the best. This strategy can be quite tedious to +# implement by hand, but the :class:`~sklearn.model_selection.TunedThresholdClassifier` +# class is here to help us. It automatically computes the cost-gain for all possible +# cut-off points and optimizes for the `objective_metric`. # # .. _cost_sensitive_learning_example: # @@ -578,7 +574,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # The credit card dataset # ^^^^^^^^^^^^^^^^^^^^^^^ credit_card = fetch_openml(data_id=1597, as_frame=True, parser="pandas") -credit_card.frame +credit_card.frame.info() # %% # The dataset contains information about credit card records from which some are diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index 4e903c5de3815..61930f37b1b6d 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -176,7 +176,7 @@ # the most appropriate metric for the problem at hand. The choice of the "right" metric # is usually problem-dependent and might require some domain knowledge. Refer to the # example entitled, -# :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_threshold_classifier.py`, +# :ref:`sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py`, # for more details. # # Tuning the decision threshold under constraint @@ -356,7 +356,7 @@ def fpr_score(y, y_pred, neg_label, pos_label): ) axs[1].legend() axs[1].set_title("ROC curves") -disp.plot(ax=axs[0]) +_ = disp.plot(ax=axs[0]) # %% # We start with the right-hand side plot. It depicts the ROC curves as in the previous From 8c4c88d2654df9d02f9cc744d1b3ea66654d34e1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 17 Oct 2023 18:22:20 +0200 Subject: [PATCH 114/194] solve deprecation --- sklearn/metrics/_scorer.py | 12 ++++--- .../_classification_threshold.py | 1 - .../tests/test_classification_threshold.py | 32 +++++++++---------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 7e2a53324d7d7..59442c9a34b79 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -375,7 +375,7 @@ def __init__(self, score_func, sign, kwargs, n_thresholds, response_method): kwargs=kwargs, response_method=response_method, ) - self.n_thresholds = n_thresholds + self._n_thresholds = n_thresholds @classmethod def from_scorer(cls, scorer, response_method, n_thresholds, pos_label): @@ -433,16 +433,18 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): Score function applied to prediction of estimator on X. 
""" pos_label = self._get_pos_label() - y_score = method_caller(estimator, self.response_method, X, pos_label=pos_label) + y_score = method_caller( + estimator, self._response_method, X, pos_label=pos_label + ) scoring_kwargs = {**self._kwargs, **kwargs} # potential_thresholds = np.unique(y_score) - if isinstance(self.n_thresholds, Integral): + if isinstance(self._n_thresholds, Integral): potential_thresholds = np.linspace( - np.min(y_score), np.max(y_score), self.n_thresholds + np.min(y_score), np.max(y_score), self._n_thresholds ) else: - potential_thresholds = np.array(self.n_thresholds, copy=False) + potential_thresholds = np.array(self._n_thresholds, copy=False) score_thresholds = [ self._sign * self._score_func( diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 2b93ee66988cb..1646a11739832 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -761,7 +761,6 @@ def _get_scorer(self): score_func = precision_recall_curve scorer = make_scorer( score_func, - needs_threshold=True, response_method=self._response_method, pos_label=self.pos_label, ) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 63551792c377f..99a5078f41f6d 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -47,19 +47,19 @@ "balanced_accuracy", ), ( - make_scorer(roc_curve, needs_proba=True), + make_scorer(roc_curve, response_method="predict_proba"), "max_tnr_at_tpr_constraint", ), ( - make_scorer(roc_curve, needs_proba=True), + make_scorer(roc_curve, response_method="predict_proba"), "max_tpr_at_tnr_constraint", ), ( - make_scorer(precision_recall_curve, needs_proba=True), + make_scorer(precision_recall_curve, response_method="predict_proba"), "max_precision_at_recall_constraint", ), ( - make_scorer(precision_recall_curve, needs_proba=True), + make_scorer(precision_recall_curve, response_method="predict_proba"), "max_recall_at_precision_constraint", ), ], @@ -109,22 +109,22 @@ def test_fit_and_score_scorers(scorer, score_method): [0.5, 1.0], ), ( - make_scorer(roc_curve, needs_proba=True), + make_scorer(roc_curve, response_method="predict_proba"), "max_tnr_at_tpr_constraint", [[0.0, 1.0], [1.0, 1.0]], ), ( - make_scorer(roc_curve, needs_proba=True), + make_scorer(roc_curve, response_method="predict_proba"), "max_tpr_at_tnr_constraint", [[0.0, 1.0], [1.0, 1.0]], ), ( - make_scorer(precision_recall_curve, needs_proba=True), + make_scorer(precision_recall_curve, response_method="predict_proba"), "max_precision_at_recall_constraint", [[0.5, 1.0], [1.0, 1.0]], ), ( - make_scorer(precision_recall_curve, needs_proba=True), + make_scorer(precision_recall_curve, response_method="predict_proba"), "max_recall_at_precision_constraint", [[0.5, 1.0], [1.0, 1.0]], ), @@ -186,19 +186,19 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): "balanced_accuracy", ), ( - make_scorer(roc_curve, needs_proba=True), + make_scorer(roc_curve, response_method="predict_proba"), "max_tnr_at_tpr_constraint", ), ( - make_scorer(roc_curve, needs_proba=True), + make_scorer(roc_curve, response_method="predict_proba"), "max_tpr_at_tnr_constraint", ), ( - make_scorer(precision_recall_curve, needs_proba=True), + make_scorer(precision_recall_curve, response_method="predict_proba"), 
"max_precision_at_recall_constraint", ), ( - make_scorer(precision_recall_curve, needs_proba=True), + make_scorer(precision_recall_curve, response_method="predict_proba"), "max_recall_at_precision_constraint", ), ], @@ -261,19 +261,19 @@ def test_fit_and_score_sample_weight(scorer, score_method): "balanced_accuracy", ), ( - make_scorer(roc_curve, needs_proba=True), + make_scorer(roc_curve, response_method="predict_proba"), "max_tnr_at_tpr_constraint", ), ( - make_scorer(roc_curve, needs_proba=True), + make_scorer(roc_curve, response_method="predict_proba"), "max_tpr_at_tnr_constraint", ), ( - make_scorer(precision_recall_curve, needs_proba=True), + make_scorer(precision_recall_curve, response_method="predict_proba"), "max_precision_at_recall_constraint", ), ( - make_scorer(precision_recall_curve, needs_proba=True), + make_scorer(precision_recall_curve, response_method="predict_proba"), "max_recall_at_precision_constraint", ), ], From e37f83197ee02cda1e61225badfa6330c20cfe41 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 17 Oct 2023 18:26:49 +0200 Subject: [PATCH 115/194] update changelog --- doc/whats_new/v1.4.rst | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 7136504db62c7..b3acfed24ef9e 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -338,6 +338,9 @@ Changelog for CSR × CSR, Dense × CSR, and CSR × Dense datasets is now 1.5x faster. :pr:`26765` by :user:`Meekail Zain ` +:mod:`sklearn.metrics` +...................... + - |Efficiency| Computing distances via :class:`metrics.DistanceMetric` for CSR × CSR, Dense × CSR, and CSR × Dense now uses ~50% less memory, and outputs distances in the same dtype as the provided data. @@ -381,17 +384,6 @@ Changelog `predict_proba`). Such scorer are specific to classification. :pr:`26840` by :user:`Guillaume Lemaitre `. -:mod:`sklearn.metrics` -...................... - -- |Enhancement| add a parameter `response_method` to define a list of priority - of response methods to use with metrics requiring the option `needs_threshold=True`. - :pr:`26840` by :user:`Guillaume Lemaitre `. - -- |Fix| scorers used with :func:`metrics.get_scorer` handles properly - multilabel-indicator matrix. - :pr:`26840` by :user:`Guillaume Lemaitre `. - :mod:`sklearn.model_selection` .............................. From 383937f8961f302111efe8c264c6acaad25be866 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 17 Oct 2023 18:27:44 +0200 Subject: [PATCH 116/194] whoops --- doc/whats_new/v1.4.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index b3acfed24ef9e..0d339c38c14cc 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -334,13 +334,13 @@ Changelog proportional to the number of coefficients (`n_features * n_classes`). :pr:`27417` by :user:`Christian Lorentzen `. +:mod:`sklearn.metrics` +...................... + - |Efficiency| Computing pairwise distances via :class:`metrics.DistanceMetric` for CSR × CSR, Dense × CSR, and CSR × Dense datasets is now 1.5x faster. :pr:`26765` by :user:`Meekail Zain ` -:mod:`sklearn.metrics` -...................... - - |Efficiency| Computing distances via :class:`metrics.DistanceMetric` for CSR × CSR, Dense × CSR, and CSR × Dense now uses ~50% less memory, and outputs distances in the same dtype as the provided data. 
From d4ce3fb26c3b04f2d759d2a79a4e80eb4fd1a591 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 17 Oct 2023 18:41:26 +0200 Subject: [PATCH 117/194] Update sklearn/metrics/_scorer.py --- sklearn/metrics/_scorer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 59442c9a34b79..4a2fbe10ede90 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -438,7 +438,6 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): ) scoring_kwargs = {**self._kwargs, **kwargs} - # potential_thresholds = np.unique(y_score) if isinstance(self._n_thresholds, Integral): potential_thresholds = np.linspace( np.min(y_score), np.max(y_score), self._n_thresholds From b6b3548b79a5374e26299cd00d43552a5104f52f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 17 Oct 2023 19:34:13 +0200 Subject: [PATCH 118/194] fix doc --- doc/modules/classification_threshold.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index fd2898327899f..10473ed7578f7 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -109,7 +109,7 @@ In this example, we maximize the balanced accuracy. >>> model = TunedThresholdClassifier(base_model, objective_metric=scorer).fit( ... X_train, y_train) >>> scorer(model, X_test, y_test) - 0.82... + 0.79... >>> # compare it with the internal score found by cross-validation >>> model.objective_score_ 0.86... From 759d6805eda742a3a64d8d3253ea967d66ac858c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 18 Oct 2023 17:42:40 +0200 Subject: [PATCH 119/194] remove useless fitted attributes --- sklearn/model_selection/_classification_threshold.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 1646a11739832..a61ef8af4eb47 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -585,7 +585,6 @@ def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): self.objective_scores_ = _mean_interpolated_score( self.decision_thresholds_, cv_thresholds, cv_scores ) - self.cv_thresholds_, self.cv_scores_ = cv_thresholds, cv_scores best_idx = self.objective_scores_.argmax() self.objective_score_ = self.objective_scores_[best_idx] self.decision_threshold_ = self.decision_thresholds_[best_idx] From 6904817c32da07329f1b6bdfd4ea694265423cfe Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 4 Dec 2023 17:24:21 +0100 Subject: [PATCH 120/194] bump pandas to 1.1.5 --- README.rst | 2 +- sklearn/_min_dependencies.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index ea3e088e5f180..a8462ad385aa1 100644 --- a/README.rst +++ b/README.rst @@ -39,7 +39,7 @@ .. |ThreadpoolctlMinVersion| replace:: 2.0.0 .. |MatplotlibMinVersion| replace:: 3.3.4 .. |Scikit-ImageMinVersion| replace:: 0.16.2 -.. |PandasMinVersion| replace:: 1.0.5 +.. |PandasMinVersion| replace:: 1.1.5 .. |SeabornMinVersion| replace:: 0.9.0 .. |PytestMinVersion| replace:: 7.1.2 .. 
|PlotlyMinVersion| replace:: 5.14.0 diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index a4c0ebb3b2d71..455e1f93935d6 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -31,7 +31,7 @@ "cython": (CYTHON_MIN_VERSION, "build"), "matplotlib": ("3.3.4", "benchmark, docs, examples, tests"), "scikit-image": ("0.16.2", "docs, examples, tests"), - "pandas": ("1.0.5", "benchmark, docs, examples, tests"), + "pandas": ("1.1.5", "benchmark, docs, examples, tests"), "seaborn": ("0.9.0", "docs, examples"), "memory_profiler": ("0.57.0", "benchmark, docs"), "pytest": (PYTEST_MIN_VERSION, "tests"), From bee1ebe4cd62e47de053450b4bfe374821f2f9e9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 4 Dec 2023 19:39:42 +0100 Subject: [PATCH 121/194] update lock file --- build_tools/circle/doc_min_dependencies_environment.yml | 2 +- build_tools/circle/doc_min_dependencies_linux-64_conda.lock | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/build_tools/circle/doc_min_dependencies_environment.yml b/build_tools/circle/doc_min_dependencies_environment.yml index 26cda35f6588e..c8c5579376b6a 100644 --- a/build_tools/circle/doc_min_dependencies_environment.yml +++ b/build_tools/circle/doc_min_dependencies_environment.yml @@ -12,7 +12,7 @@ dependencies: - joblib - threadpoolctl - matplotlib=3.3.4 # min - - pandas=1.0.5 # min + - pandas=1.1.5 # min - pyamg - pytest - pytest-xdist=2.5.0 diff --git a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock index 5f4cee440dbc9..6feabc23a4d56 100644 --- a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock +++ b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. 
# platform: linux-64 -# input_hash: aa071bb1fa7968b6df9fd3662b49affc88d4f0648d359f76a96ef677162b92b3 +# input_hash: 039e3c096b52af0094b1a987ab7287b46d257a1a9b4e6ca2fcabdb91bcd346e4 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2023.11.17-hbcca054_0.conda#01ffc8d36f9eba0ce0b3c1955fa780ee @@ -136,6 +136,7 @@ https://conda.anaconda.org/conda-forge/linux-64/mkl-2020.4-h726a3e6_304.tar.bz2# https://conda.anaconda.org/conda-forge/noarch/networkx-3.1-pyhd8ed1ab_0.conda#254f787d5068bc89f578bf63893ce8b4 https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.0-h488ebb8_3.conda#128c25b7fe6a25286a48f3a6a9b5b6f3 https://conda.anaconda.org/conda-forge/noarch/packaging-23.2-pyhd8ed1ab_0.conda#79002079284aa895f883c6b7f3f88fd6 +https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.1.0-pyhd8ed1ab_0.conda#45a5065664da0d1dfa8f8cd2eaf05ab9 https://conda.anaconda.org/conda-forge/noarch/pluggy-1.3.0-pyhd8ed1ab_0.conda#2390bd10bed1f3fdc7a537fb5a447d8d https://conda.anaconda.org/conda-forge/noarch/ply-3.11-py_1.tar.bz2#7205635cd71531943440fbfe3b6b5727 https://conda.anaconda.org/conda-forge/linux-64/psutil-5.9.5-py38h01eb140_1.conda#89cb08bb523adf12fed3829558638d84 @@ -182,7 +183,6 @@ https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhd8ed1ab_ https://conda.anaconda.org/conda-forge/noarch/partd-1.4.1-pyhd8ed1ab_0.conda#acf4b7c0bcd5fa3b0e05801c4d2accd6 https://conda.anaconda.org/conda-forge/linux-64/pillow-10.1.0-py38ha43c96d_0.conda#67ca17c651f86159a3b8ed1132d97c12 https://conda.anaconda.org/conda-forge/noarch/pip-23.3.1-pyhd8ed1ab_0.conda#2400c0b86889f43aa52067161e1fb108 -https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.0.0-pyhd8ed1ab_0.conda#6bb4ee32cd435deaeac72776c001e7ac https://conda.anaconda.org/conda-forge/noarch/plotly-5.14.0-pyhd8ed1ab_0.conda#6a7bcc42ef58dd6cf3da9333ea102433 https://conda.anaconda.org/conda-forge/linux-64/pulseaudio-client-16.1-hb77b528_5.conda#ac902ff3c1c6d750dd0dfc93a974ab74 https://conda.anaconda.org/conda-forge/noarch/pytest-7.4.3-pyhd8ed1ab_0.conda#5bdca0aca30b0ee62bb84854e027eae0 @@ -208,7 +208,7 @@ https://conda.anaconda.org/conda-forge/linux-64/blas-2.20-mkl.tar.bz2#e7d09a07f5 https://conda.anaconda.org/conda-forge/noarch/imageio-2.31.5-pyh8c1a49c_0.conda#6820ccf6a3a27df348f18c85dd89014a https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.3.4-py38h0efea84_0.tar.bz2#9818b095ff2ddceadb7553b0d56d219f https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.2-pyhd8ed1ab_0.tar.bz2#025ad7ca2c7f65007ab6b6f5d93a56eb -https://conda.anaconda.org/conda-forge/linux-64/pandas-1.0.5-py38hcb8c335_0.tar.bz2#1e1b4382170fd26cf722ef008ffb651e +https://conda.anaconda.org/conda-forge/linux-64/pandas-1.1.5-py38h51da96c_0.tar.bz2#d014370308f212a24634955b862b0a8e https://conda.anaconda.org/conda-forge/noarch/patsy-0.5.4-pyhd8ed1ab_0.conda#1184267eddebb57e47f8e1419c225595 https://conda.anaconda.org/conda-forge/linux-64/pywavelets-1.1.1-py38h5c078b8_3.tar.bz2#dafeef887e68bd18ec84681747ca0fd5 https://conda.anaconda.org/conda-forge/linux-64/qt-main-5.15.8-h82b777d_17.conda#4f01e33dbb406085a16a2813ab067e95 From 48fd7cd565c5a1fe760e6735f07cd5e0ed5e82d6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 Jan 2024 16:27:12 +0100 Subject: [PATCH 122/194] update doc-min lock file --- .../doc_min_dependencies_linux-64_conda.lock | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 
deletions(-) diff --git a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock index b0848d8fbea6f..0ba22a492dcf1 100644 --- a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock +++ b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. # platform: linux-64 -# input_hash: a58a98732e5815c15757bc1def8ddc0d87f20f11edcf6e7b408594bf948cbb3e +# input_hash: 46b1818af4901a4b14e79dab7a99627a28da9815d13cdb73c40e4590b2bd6259 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2023.11.17-hbcca054_0.conda#01ffc8d36f9eba0ce0b3c1955fa780ee @@ -48,7 +48,7 @@ https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.3.2-hd590300_0.co https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.4-hcb278e6_0.conda#318b08df404f9c9be5712aaa5a6f0bb0 -https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.3-h59595ed_0.conda#bdadff838d5437aea83607ced8b37f75 +https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.4-h59595ed_0.conda#3f1017b4141e943d9bc8739237f749e8 https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4-h59595ed_2.conda#7dbaa197d7ba6032caf7ae7f32c1efa0 https://conda.anaconda.org/conda-forge/linux-64/nspr-4.35-h27087fc_0.conda#da0ec11a6454ae19bff5b02ed881a2b1 https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.0-hd590300_1.conda#603827b39ea2b835268adb8c821b8570 @@ -105,7 +105,7 @@ https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.0-h8ee46fc_ https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.9-hd590300_1.conda#e995b155d938b6779da6ace6c6b13816 https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.1-h8ee46fc_1.conda#90108a432fb5c6150ccfee3f03388656 https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.7-h8ee46fc_0.conda#49e482d882669206653b095f5206c05b -https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.13-pyhd8ed1ab_0.conda#06006184e203b61d3525f90de394471e +https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.16-pyhd8ed1ab_0.conda#def531a3ac77b7fb8c21d17bb5d0badb https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py39h3d6467e_1.conda#c48418c8b35f1d59ae9ae1174812b40a https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_0.conda#fad1d0a651bf929c6c16fbf1f6ccfa7c https://conda.anaconda.org/conda-forge/noarch/certifi-2023.11.17-pyhd8ed1ab_0.conda#2011bcf45376341dd1d690263fdbc789 @@ -117,7 +117,7 @@ https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_0.conda#5 https://conda.anaconda.org/conda-forge/linux-64/cython-0.29.33-py39h227be39_0.conda#34bab6ef3e8cdf86fe78c46a984d3217 https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-h5008d03_3.tar.bz2#ecfff944ba3960ecb334b9a2663d708d https://conda.anaconda.org/conda-forge/linux-64/docutils-0.19-py39hf3d152e_1.tar.bz2#adb733ec2ee669f6d010758d054da60f -https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_0.conda#f6c211fee3c98229652b60a9a42ef363 +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_2.conda#8d652ea2ee8eaee02ed8dc820bc794aa 
https://conda.anaconda.org/conda-forge/noarch/execnet-2.0.2-pyhd8ed1ab_0.conda#67de0d8241e1060a479e3c37793e26f9 https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.14.2-h14ed4e7_0.conda#0f69b688f52ff6da70bccb7ff7001d1d https://conda.anaconda.org/conda-forge/noarch/fsspec-2023.12.2-pyhca7485f_0.conda#bf40f2a8835b78b1f91083d306b493d2 @@ -175,7 +175,7 @@ https://conda.anaconda.org/conda-forge/linux-64/cytoolz-0.12.2-py39hd1e30aa_1.co https://conda.anaconda.org/conda-forge/linux-64/fortran-compiler-1.7.0-heb67821_0.conda#7ef7c0f111dad1c8006504a0f1ccd820 https://conda.anaconda.org/conda-forge/linux-64/glib-2.78.3-hfc55251_0.conda#e08e51acc7d1ae8dbe13255e7b4c64ac https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-7.0.1-pyha770c72_0.conda#746623a787e06191d80a2133e5daff17 -https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.2-pyhd8ed1ab_1.tar.bz2#c8490ed5c70966d232fdd389d0dbed37 +https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.3-pyhd8ed1ab_0.conda#e7d8df6509ba635247ff9aea31134262 https://conda.anaconda.org/conda-forge/noarch/joblib-1.3.2-pyhd8ed1ab_0.conda#4da50d410f553db77e62ab62ffaa1abc https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-20_linux64_openblas.conda#36d486d72ab64ffea932329a1d3729a3 https://conda.anaconda.org/conda-forge/linux-64/libclang-15.0.7-default_hb11cfb5_4.conda#c90f4cbb57839c98fef8f830e4b9972f @@ -201,7 +201,7 @@ https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-12.12.2-py39h3d6467e_5 https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.5.0-pyhd8ed1ab_0.conda#d5f595da2daead898ca958ac62f0307b https://conda.anaconda.org/conda-forge/noarch/requests-2.31.0-pyhd8ed1ab_0.conda#a30144e4156cdbb236f99ebb49828f8b https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-20_linux64_openblas.conda#9932a1d4e9ecf2d35fb19475446e361e -https://conda.anaconda.org/conda-forge/noarch/dask-core-2023.12.1-pyhd8ed1ab_0.conda#bf6ad72d882bc3f04e6a0fb50fd2cce8 +https://conda.anaconda.org/conda-forge/noarch/dask-core-2024.1.0-pyhd8ed1ab_0.conda#cab4cec272dc1e30086f7d32faa4f130 https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.22.8-h8e1006c_1.conda#3926dab94fe06d88ade0e716d77b8cf8 https://conda.anaconda.org/conda-forge/linux-64/imagecodecs-lite-2019.12.3-py39hd257fcd_5.tar.bz2#32dba66d6abc2b4b5b019c9e54307312 https://conda.anaconda.org/conda-forge/noarch/imageio-2.33.1-pyh8c1a49c_0.conda#1c34d58ac469a34e7e96832861368bce @@ -226,10 +226,10 @@ https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.2-pyhd8ed1ab_0.tar.bz2# https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_0.conda#ac832cc43adc79118cf6e23f1f9b8995 https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.15.0-pyhd8ed1ab_0.conda#1a49ca9515ef9a96edff2eea06143dc6 https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.3.0-py_0.tar.bz2#9363002e2a134a287af4e32ff0f26cdc -https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-1.0.7-pyhd8ed1ab_0.conda#aebfabcb60c33a89c1f9290cab49bc93 -https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-1.0.5-pyhd8ed1ab_0.conda#ebf08f5184d8eaa486697bc060031953 -https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.4-pyhd8ed1ab_0.conda#a9a89000dfd19656ad004b937eeb6828 -https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-1.0.6-pyhd8ed1ab_0.conda#cf5c9649272c677a964a7313279e3a9b +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-1.0.8-pyhd8ed1ab_0.conda#611a35a27914fac3aa37611a6fe40bb5 
+https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-1.0.6-pyhd8ed1ab_0.conda#d7e4954df0d3aea2eacc7835ad12671d +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.5-pyhd8ed1ab_0.conda#7e1e7437273682ada2ed5e9e9714b140 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-1.0.7-pyhd8ed1ab_0.conda#26acae54b06f178681bfb551760f5dd1 https://conda.anaconda.org/conda-forge/noarch/sphinx-6.0.0-pyhd8ed1ab_2.conda#ac1d3b55da1669ee3a56973054fd7efb -https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.9-pyhd8ed1ab_0.conda#0612e497d7860728f2cda421ea2aec09 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_0.conda#e507335cb4ca9cff4c3d0fa9cdab255e # pip sphinxext-opengraph @ https://files.pythonhosted.org/packages/50/ac/c105ed3e0a00b14b28c0aa630935af858fd8a32affeff19574b16e2c6ae8/sphinxext_opengraph-0.4.2-py3-none-any.whl#sha256=a51f2604f9a5b6c0d25d3a88e694d5c02e20812dc0e482adf96c8628f9109357 From 0854cd497e5790f9d746cd071f67a7a8f93b49de Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 Jan 2024 17:20:54 +0100 Subject: [PATCH 123/194] partial reviews --- doc/modules/classification_threshold.rst | 32 +++-- .../plot_cost_sensitive_learning.py | 2 +- sklearn/metrics/_scorer.py | 108 --------------- sklearn/metrics/tests/test_score_objects.py | 88 ------------- .../_classification_threshold.py | 123 +++++++++++++++++- .../tests/test_classification_threshold.py | 93 ++++++++++++- 6 files changed, 224 insertions(+), 222 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 10473ed7578f7..f732c512c1743 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -2,9 +2,9 @@ .. _tunedthresholdclassifier: -======================================================== -Tuning cut-off decision threshold for classes prediction -======================================================== +====================================================== +Tuning cut-off decision threshold for class prediction +====================================================== Classifiers are predictive models: they use statistical learning to predict outcomes. The outcomes of a classifier are scores for each sample in relation to each class and @@ -12,9 +12,9 @@ categorical prediction (class label). Scores are obtained from :term:`predict_pr :term:`decision_function`. The former returns posterior probability estimates for each class, while the latter returns a decision score for each class. The decision score is a measure of how strongly the sample is predicted to belong to the positive class (e.g., -the distance to the decision boundary). A decision rule is then defined by thresholding -the scores, leading to a class label for each sample. Those labels are obtained with -:term:`predict`. +the distance to the decision boundary). In binary classification, a decision rule is +then defined by thresholding the scores, leading to a single class label for each +sample. Those labels are obtained with :term:`predict`. 
For binary classification in scikit-learn, class labels are obtained by associating the positive class with posterior probability estimates greater than 0.5 (obtained with @@ -29,17 +29,15 @@ probability estimates and class labels:: >>> X, y = make_classification(random_state=0) >>> classifier = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y) >>> classifier.predict_proba(X[:4]) - array([[0.94 , 0.06 ], - [0.94 , 0.06 ], - [0.04..., 0.95...], - [0.04..., 0.95...]]) + array([[0.94 , 0.06 ], + [0.94 , 0.06 ], + [0.0416..., 0.9583...], + [0.0416..., 0.9583...]]) >>> classifier.predict(X[:4]) array([0, 0, 1, 1]) While these approaches are reasonable as default behaviors, they are not ideal for -all cases. The context and nature of the use case defines the expected behavior of the -classifier and thus, the strategy to convert soft predictions into hard predictions. We -illustrate this point with an example. +all cases. Let's illustrate with an example. Let's consider a scenario where a predictive model is being deployed to assist medical doctors in detecting tumors. In this setting, doctors will be most likely interested in @@ -115,8 +113,8 @@ In this example, we maximize the balanced accuracy. 0.86... A second strategy aims to maximize one metric while imposing constraints on another -metric. There are four pre-defined options, 2 use the Receiver Operating -Characteristic (ROC) statistics and 2 use the Precision-Recall statistics. +metric. There are four pre-defined options, two use the Receiver Operating +Characteristic (ROC) statistics and two use the Precision-Recall statistics. - `"max_tpr_at_tnr_constraint"`: maximizes the True Positive Rate (TPR) such that the True Negative Rate (TNR) is the closest to a given value. @@ -140,8 +138,8 @@ setting `cv="prefit"` and providing a fitted classifier. In this case, the cut-o is tuned on the data provided to the `fit` method. However, you should be extremely careful when using this option. You should never use -the same data for training the classifier and tuning the cut-off point at the risk of -overfitting. Refer to the following example section for more details (cf. +the same data for training the classifier and tuning the cut-off point due to the risk +of overfitting. Refer to the following example section for more details (cf. :ref:`tunedthresholdclassifier_no_cv`). If you have limited resources, consider using a float number to limit to an internal single train-test split. diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 80892d2741168..1f7f3259532f4 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -3,7 +3,7 @@ Post-tuning decision threshold based on cost-sensitive learning =============================================================== -Once a classifier is trained, the output of the :term:`predict` method output class +Once a classifier is trained, the output of the :term:`predict` method outputs class label predictions corresponding to a thresholding of either the :term:`decision function` or the :term:`predict_proba` output. 
For a binary classifier, the default threshold is defined as a posterior probability estimate of 0.5 or a decision score of diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 869e91a2cf5a0..3e55b627ee08a 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -23,11 +23,8 @@ from collections import Counter from functools import partial from inspect import signature -from numbers import Integral from traceback import format_exc -import numpy as np - from ..base import is_regressor from ..utils import Bunch from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params @@ -353,111 +350,6 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs) -def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): - """Threshold `y_score` and return the associated class labels.""" - if pos_label is None: - map_thresholded_score_to_label = np.array([0, 1]) - else: - pos_label_idx = np.flatnonzero(classes == pos_label)[0] - neg_label_idx = np.flatnonzero(classes != pos_label)[0] - map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) - - return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] - - -class _ContinuousScorer(_BaseScorer): - """Scorer taking a continuous response and output a score for each threshold.""" - - def __init__(self, score_func, sign, kwargs, n_thresholds, response_method): - super().__init__( - score_func=score_func, - sign=sign, - kwargs=kwargs, - response_method=response_method, - ) - self._n_thresholds = n_thresholds - - @classmethod - def from_scorer(cls, scorer, response_method, n_thresholds, pos_label): - """Create a continuous scorer from a normal scorer.""" - # add `pos_label` if requested by the scorer function - scorer_kwargs = {**scorer._kwargs} - signature_scoring_func = signature(scorer._score_func) - if ( - "pos_label" in signature_scoring_func.parameters - and "pos_label" not in scorer_kwargs - ): - if pos_label is None: - # Since the provided `pos_label` is the default, we need to - # use the default value of the scoring function that can be either - # `None` or `1`. - scorer_kwargs["pos_label"] = signature_scoring_func.parameters[ - "pos_label" - ].default - else: - scorer_kwargs["pos_label"] = pos_label - # transform a binary metric into a curve metric for all possible decision - # thresholds - instance = cls( - score_func=scorer._score_func, - sign=scorer._sign, - response_method=response_method, - n_thresholds=n_thresholds, - kwargs=scorer_kwargs, - ) - # transfer the metadata request - instance._metadata_request = scorer._get_metadata_request() - return instance - - def _score(self, method_caller, estimator, X, y_true, **kwargs): - """Evaluate predicted target values for X relative to y_true. - - Parameters - ---------- - estimator : object - Trained estimator to use for scoring. - - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Test data that will be fed to estimator.predict. - - y_true : array-like of shape (n_samples,) - Gold standard target values for X. - - **kwargs : dict - Other parameters passed to the scorer. Refer to - :func:`set_score_request` for more details. - - Returns - ------- - score : float - Score function applied to prediction of estimator on X. 
- """ - pos_label = self._get_pos_label() - y_score = method_caller( - estimator, self._response_method, X, pos_label=pos_label - ) - - scoring_kwargs = {**self._kwargs, **kwargs} - if isinstance(self._n_thresholds, Integral): - potential_thresholds = np.linspace( - np.min(y_score), np.max(y_score), self._n_thresholds - ) - else: - potential_thresholds = np.array(self._n_thresholds, copy=False) - score_thresholds = [ - self._sign - * self._score_func( - y_true, - _threshold_scores_to_class_labels( - y_score, th, estimator.classes_, self._get_pos_label() - ), - **scoring_kwargs, - ) - for th in potential_thresholds - ] - return potential_thresholds, np.array(score_thresholds) - - @validate_params( { "scoring": [str, callable, None], diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index eed6c1eabbc97..6db20bff58fc3 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -46,7 +46,6 @@ from sklearn.metrics import cluster as cluster_module from sklearn.metrics._scorer import ( _check_multimetric_scoring, - _ContinuousScorer, _MultimetricScorer, _PassthroughScorer, _Scorer, @@ -1207,93 +1206,6 @@ def test_scorer_no_op_multiclass_select_proba(): scorer(lr, X_test, y_test) -def test_continuous_scorer(): - """Check the behaviour of the `_ContinuousScorer` class.""" - X, y = make_classification(random_state=0) - estimator = LogisticRegression().fit(X, y) - scorer = _ContinuousScorer( - balanced_accuracy_score, - sign=1, - response_method="predict_proba", - n_thresholds=10, - kwargs={}, - ) - thresholds, scores = scorer(estimator, X, y) - - assert thresholds.shape == scores.shape - # check that the thresholds are probability with extreme values close to 0 and 1 - assert 0 <= thresholds.min() <= 0.01 - assert 0.99 <= thresholds.max() <= 1 - # balanced accuracy should be between 0.5 and 1 when it is not adjusted - assert 0.5 <= scores.min() <= 1 - - # check that passing kwargs to the scorer works - scorer = _ContinuousScorer( - balanced_accuracy_score, - sign=1, - response_method="predict_proba", - n_thresholds=10, - kwargs={"adjusted": True}, - ) - thresholds, scores = scorer(estimator, X, y) - - # balanced accuracy should be between 0.5 and 1 when it is not adjusted - assert 0 <= scores.min() <= 0.5 - - # check that we can inverse the sign of the score when dealing with `neg_*` scorer - scorer = _ContinuousScorer( - balanced_accuracy_score, - sign=-1, - response_method="predict_proba", - n_thresholds=10, - kwargs={"adjusted": True}, - ) - thresholds, scores = scorer(estimator, X, y) - - assert all(scores <= 0) - - -def test_continuous_scorer_pos_label(global_random_seed): - """Check that we propagate properly the `pos_label` parameter to the scorer.""" - n_samples = 30 - X, y = make_classification( - n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed - ) - estimator = LogisticRegression().fit(X, y) - - scorer = _ContinuousScorer( - recall_score, - sign=1, - response_method="predict_proba", - n_thresholds=1000, - kwargs={"pos_label": 1}, - ) - thresholds_pos_label_1, scores_pos_label_1 = scorer(estimator, X, y) - - scorer = _ContinuousScorer( - recall_score, - sign=1, - response_method="predict_proba", - n_thresholds=1000, - kwargs={"pos_label": 0}, - ) - thresholds_pos_label_0, scores_pos_label_0 = scorer(estimator, X, y) - - # If `pos_label` is not forwarded to the scorer, the thresholds will be equal. - # Make sure that this is not the case. 
- # assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() - # Since we have an imbalanced problem, the thresholds should represent higher - # probabilities level when `pos_label=0` than with `pos_label=1`. - assert np.sum(thresholds_pos_label_1 < 0.15) > 2 / 3 * n_samples - assert np.sum(thresholds_pos_label_0 > 0.85) > 2 / 3 * n_samples - - # The recall cannot be negative and `pos_label=1` should have a higher recall - # since there is less samples to be considered. - assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() - assert scores_pos_label_0.max() == pytest.approx(1.0) - assert scores_pos_label_1.max() == pytest.approx(1.0) - - @pytest.mark.parametrize("name", get_scorer_names()) def test_scorer_set_score_request_raises(name): """Test that set_score_request is only available when feature flag is on.""" diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index a61ef8af4eb47..c0b782e32a7ef 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -1,4 +1,5 @@ from collections.abc import MutableMapping +from inspect import signature from numbers import Integral, Real import numpy as np @@ -18,7 +19,7 @@ precision_recall_curve, roc_curve, ) -from ..metrics._scorer import _ContinuousScorer, _threshold_scores_to_class_labels +from ..metrics._scorer import _BaseScorer from ..utils import _safe_indexing from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._response import _get_response_values_binary @@ -41,17 +42,127 @@ from ._split import StratifiedShuffleSplit, check_cv +def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): + """Threshold `y_score` and return the associated class labels.""" + if pos_label is None: + map_thresholded_score_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(classes == pos_label)[0] + neg_label_idx = np.flatnonzero(classes != pos_label)[0] + map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) + + return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] + + +class _ContinuousScorer(_BaseScorer): + """Scorer taking a continuous response and output a score for each threshold.""" + + def __init__(self, score_func, sign, kwargs, n_thresholds, response_method): + super().__init__( + score_func=score_func, + sign=sign, + kwargs=kwargs, + response_method=response_method, + ) + self._n_thresholds = n_thresholds + + @classmethod + def from_scorer(cls, scorer, response_method, n_thresholds, pos_label): + """Create a continuous scorer from a normal scorer.""" + # add `pos_label` if requested by the scorer function + scorer_kwargs = {**scorer._kwargs} + signature_scoring_func = signature(scorer._score_func) + if ( + "pos_label" in signature_scoring_func.parameters + and "pos_label" not in scorer_kwargs + ): + if pos_label is None: + # Since the provided `pos_label` is the default, we need to + # use the default value of the scoring function that can be either + # `None` or `1`. 
+ scorer_kwargs["pos_label"] = signature_scoring_func.parameters[ + "pos_label" + ].default + else: + scorer_kwargs["pos_label"] = pos_label + # transform a binary metric into a curve metric for all possible decision + # thresholds + instance = cls( + score_func=scorer._score_func, + sign=scorer._sign, + response_method=response_method, + n_thresholds=n_thresholds, + kwargs=scorer_kwargs, + ) + # transfer the metadata request + instance._metadata_request = scorer._get_metadata_request() + return instance + + def _score(self, method_caller, estimator, X, y_true, **kwargs): + """Evaluate predicted target values for X relative to y_true. + + Parameters + ---------- + estimator : object + Trained estimator to use for scoring. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test data that will be fed to estimator.predict. + + y_true : array-like of shape (n_samples,) + Gold standard target values for X. + + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + pos_label = self._get_pos_label() + y_score = method_caller( + estimator, self._response_method, X, pos_label=pos_label + ) + + scoring_kwargs = {**self._kwargs, **kwargs} + if isinstance(self._n_thresholds, Integral): + potential_thresholds = np.linspace( + np.min(y_score), np.max(y_score), self._n_thresholds + ) + else: + potential_thresholds = np.array(self._n_thresholds, copy=False) + score_thresholds = [ + self._sign + * self._score_func( + y_true, + _threshold_scores_to_class_labels( + y_score, th, estimator.classes_, self._get_pos_label() + ), + **scoring_kwargs, + ) + for th in potential_thresholds + ] + return potential_thresholds, np.array(score_thresholds) + + def _estimator_has(attr): """Check if we can delegate a method to the underlying estimator. First, we check the first fitted estimator if available, otherwise we check the unfitted estimator. 
""" - return lambda self: ( - hasattr(self.estimator_, attr) - if hasattr(self, "estimator_") - else hasattr(self.estimator, attr) - ) + + def check(self): + if hasattr(self, "estimator_"): + getattr(self.estimator_, attr) + return True + else: + getattr(self.estimator, attr) + return True + + return check def _fit_and_score( diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 99a5078f41f6d..841999706ed7c 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -18,9 +18,11 @@ recall_score, roc_curve, ) -from sklearn.metrics._scorer import _ContinuousScorer from sklearn.model_selection import TunedThresholdClassifier -from sklearn.model_selection._classification_threshold import _fit_and_score +from sklearn.model_selection._classification_threshold import ( + _ContinuousScorer, + _fit_and_score, +) from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC @@ -33,6 +35,93 @@ ) +def test_continuous_scorer(): + """Check the behaviour of the `_ContinuousScorer` class.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression().fit(X, y) + scorer = _ContinuousScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + n_thresholds=10, + kwargs={}, + ) + thresholds, scores = scorer(estimator, X, y) + + assert thresholds.shape == scores.shape + # check that the thresholds are probability with extreme values close to 0 and 1 + assert 0 <= thresholds.min() <= 0.01 + assert 0.99 <= thresholds.max() <= 1 + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0.5 <= scores.min() <= 1 + + # check that passing kwargs to the scorer works + scorer = _ContinuousScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + n_thresholds=10, + kwargs={"adjusted": True}, + ) + thresholds, scores = scorer(estimator, X, y) + + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0 <= scores.min() <= 0.5 + + # check that we can inverse the sign of the score when dealing with `neg_*` scorer + scorer = _ContinuousScorer( + balanced_accuracy_score, + sign=-1, + response_method="predict_proba", + n_thresholds=10, + kwargs={"adjusted": True}, + ) + thresholds, scores = scorer(estimator, X, y) + + assert all(scores <= 0) + + +def test_continuous_scorer_pos_label(global_random_seed): + """Check that we propagate properly the `pos_label` parameter to the scorer.""" + n_samples = 30 + X, y = make_classification( + n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed + ) + estimator = LogisticRegression().fit(X, y) + + scorer = _ContinuousScorer( + recall_score, + sign=1, + response_method="predict_proba", + n_thresholds=1000, + kwargs={"pos_label": 1}, + ) + thresholds_pos_label_1, scores_pos_label_1 = scorer(estimator, X, y) + + scorer = _ContinuousScorer( + recall_score, + sign=1, + response_method="predict_proba", + n_thresholds=1000, + kwargs={"pos_label": 0}, + ) + thresholds_pos_label_0, scores_pos_label_0 = scorer(estimator, X, y) + + # If `pos_label` is not forwarded to the scorer, the thresholds will be equal. + # Make sure that this is not the case. 
+ # assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() + # Since we have an imbalanced problem, the thresholds should represent higher + # probabilities level when `pos_label=0` than with `pos_label=1`. + assert np.sum(thresholds_pos_label_1 < 0.15) > 2 / 3 * n_samples + assert np.sum(thresholds_pos_label_0 > 0.85) > 2 / 3 * n_samples + + # The recall cannot be negative and `pos_label=1` should have a higher recall + # since there is less samples to be considered. + assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() + assert scores_pos_label_0.max() == pytest.approx(1.0) + assert scores_pos_label_1.max() == pytest.approx(1.0) + + @pytest.mark.parametrize( "scorer, score_method", [ From 2df616e0b28dbcf9c25b99839e59f5ab517be70b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Mar 2024 17:43:37 +0100 Subject: [PATCH 124/194] Apply suggestions from code review Co-authored-by: Andreas Mueller Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. Fan Co-authored-by: Joel Nothman --- doc/modules/classification_threshold.rst | 2 +- doc/whats_new/v1.5.rst | 2 +- .../_classification_threshold.py | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index f732c512c1743..7d1a51e3e7b8f 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -141,7 +141,7 @@ However, you should be extremely careful when using this option. You should neve the same data for training the classifier and tuning the cut-off point due to the risk of overfitting. Refer to the following example section for more details (cf. :ref:`tunedthresholdclassifier_no_cv`). If you have limited resources, consider using a -float number to limit to an internal single train-test split. +float number for `cv` to limit to an internal single train-test split. The option `cv="prefit"` should only be used when the provided classifier was already trained, and you just want to find the best cut-off using a new validation set. diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 5470600009d99..0d85a8dba2dfb 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -34,7 +34,7 @@ Changelog :mod:`sklearn.model_selection` .............................. -- |MajorFeature| :class:`model_selection.TunedThresholdClassifier` calibrates +- |MajorFeature| :class:`model_selection.TunedThresholdClassifier` adjusts the decision threshold function of a binary classifier by maximizing a classification metric through cross-validation. :pr:`26120` by :user:`Guillaume Lemaitre `. diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index c0b782e32a7ef..749b973c234ec 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -132,7 +132,7 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): np.min(y_score), np.max(y_score), self._n_thresholds ) else: - potential_thresholds = np.array(self._n_thresholds, copy=False) + potential_thresholds = np.asarray(self._n_thresholds) score_thresholds = [ self._sign * self._score_func( @@ -219,7 +219,7 @@ def _fit_and_score( The decision thresholds used to compute the scores. They are returned in ascending order. 
- scores : ndarray of shape (n_thresholds,) or tuple os such arrays + scores : ndarray of shape (n_thresholds,) or tuple of such arrays The scores computed for each decision threshold. When TPR/TNR or precision/ recall are computed, `scores` is a tuple of two arrays. """ @@ -269,7 +269,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato Read more in the :ref:`User Guide `. - .. versionadded:: 1.4 + .. versionadded:: 1.5 Parameters ---------- @@ -618,7 +618,7 @@ def fit(self, X, y, **params): if cv == "prefit": self.estimator_ = self.estimator classifier = self.estimator_ - splits = ([None, range(_num_samples(X))],) + splits = [(None, range(_num_samples(X)))] else: self.estimator_ = clone(self.estimator) classifier = clone(self.estimator) @@ -674,20 +674,20 @@ def fit(self, X, y, **params): ) # find the global min and max thresholds across all folds - min_threshold = np.min([th.min() for th in cv_thresholds]) + min_threshold = min(split_thresholds.min() for split_thresholds in cv_thresholds) max_threshold = np.max([th.max() for th in cv_thresholds]) if isinstance(self.n_thresholds, Integral): self.decision_thresholds_ = np.linspace( min_threshold, max_threshold, num=self.n_thresholds ) else: - self.decision_thresholds_ = np.array(self.n_thresholds, copy=False) + self.decision_thresholds_ = np.asarray(self.n_thresholds) def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): return np.mean( [ - np.interp(threshold_interpolated, th, sc) - for th, sc in zip(cv_thresholds, cv_scores) + np.interp(target_thresholds, split_thresholds, split_score) + for split_thresholds, c in zip(cv_thresholds, cv_scores) ], axis=0, ) @@ -720,7 +720,7 @@ def _get_best_idx(constrained_score, maximized_score): indices = np.arange(len(constrained_score)) mask = constrained_score >= constraint_value mask_idx = maximized_score[mask].argmax() - return indices[mask][mask_idx] + return np.flatnonzero(mask)[mask_idx] if self.objective_metric == "max_tpr_at_tnr_constraint": constrained_score, maximized_score = mean_tnr, mean_tpr From b14225c7735d490049e76490344941b923ba1212 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Mar 2024 17:44:26 +0100 Subject: [PATCH 125/194] update lock files --- .../azure/pylatest_conda_forge_mkl_linux-64_conda.lock | 2 +- build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock | 2 +- .../azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock | 4 ++-- .../azure/pylatest_pip_openblas_pandas_linux-64_conda.lock | 4 ++-- build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock | 4 ++-- .../azure/pymin_conda_defaults_openblas_linux-64_conda.lock | 4 ++-- build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock | 2 +- ...pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock | 2 +- build_tools/azure/pypy3_linux-64_conda.lock | 2 +- build_tools/circle/doc_linux-64_conda.lock | 2 +- build_tools/circle/doc_min_dependencies_linux-64_conda.lock | 2 +- build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock | 2 +- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock index 3f948012a3f94..0ec2d0dd363b0 100644 --- a/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock +++ b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. 
# platform: linux-64 -# input_hash: 0ef2318a417ecd9806c39a466da49a53e8dab8b199cceb5bffcdd59c0a293907 +# input_hash: af75cac3c9536f0e474e89e9154ddf689946049bdab5cb7e9bdd4c5618a68189 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef diff --git a/build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock b/build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock index fb1268d7fb700..8221e0e9c1b36 100644 --- a/build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock +++ b/build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. # platform: osx-64 -# input_hash: 8d19b3cb048dd1e254e00f21d81841feddd52c98a15661153cb472e9903b5cb3 +# input_hash: 1a426ea210e386d35f7d10d1994232053aaddcffe015b7c418298385f796c6e5 @EXPLICIT https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h10d778d_5.conda#6097a6ca9ada32699b5fc4312dd6ef18 https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.2.2-h8857fd0_0.conda#f2eacee8c33c43692f1ccfd33d0f50b1 diff --git a/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock b/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock index 1ceec6607714f..94be7e091e64d 100644 --- a/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock +++ b/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock @@ -1,10 +1,10 @@ # Generated by conda-lock. # platform: osx-64 -# input_hash: 7dcf380a01bb97d3b32f38a0a9e6fcb11d8a71356477798b9a61987d26f479dd +# input_hash: 0852937217d7f245972202bbf4d45e87bae0b554b334e0a6a351c65ba033ae17 @EXPLICIT https://repo.anaconda.com/pkgs/main/osx-64/blas-1.0-mkl.conda#cb2c87e85ac8e0ceae776d26d4214c8a https://repo.anaconda.com/pkgs/main/osx-64/bzip2-1.0.8-h6c40b1e_5.conda#0f51dde96c82dcf58a788787fed4c5b9 -https://repo.anaconda.com/pkgs/main/osx-64/ca-certificates-2023.12.12-hecd8cb5_0.conda#1f885715539fba0c408ab58d1bda6c8e +https://repo.anaconda.com/pkgs/main/osx-64/ca-certificates-2024.3.11-hecd8cb5_0.conda#a2e29a11940c66baf9942912096fad5f https://repo.anaconda.com/pkgs/main/osx-64/jpeg-9e-h6c40b1e_1.conda#fc3e61fa41309946c9283fe8737d7f41 https://repo.anaconda.com/pkgs/main/osx-64/libbrotlicommon-1.0.9-hca72f7f_7.conda#6c865b9e76fa2fad0c8ac32aa0f01f75 https://repo.anaconda.com/pkgs/main/osx-64/libcxx-14.0.6-h9765a3e_0.conda#387757bb354ae9042370452cd0fb5627 diff --git a/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock b/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock index ca1b9dfd29e60..dafac93789945 100644 --- a/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock +++ b/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock @@ -1,9 +1,9 @@ # Generated by conda-lock. 
# platform: linux-64 -# input_hash: 7284a2a441d970d915f9dc9dfe38bba34f361cd709c273ada02f9c57391aebf7 +# input_hash: 34348bee3155c403a50611a490fcb9a3ee64efade8f5e6225a0ffdcbc6abcb6c @EXPLICIT https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9 -https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2023.12.12-h06a4308_0.conda#12bf7315c3f5ca50300e8b48d1b4ef2e +https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2024.3.11-h06a4308_0.conda#08529eb3504712baabcbda266a19feb7 https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.38-h1181459_1.conda#68eedfd9c06f2b0e6888d8db345b7f5b https://repo.anaconda.com/pkgs/main/noarch/tzdata-2024a-h04d1e81_0.conda#452af53adae0a5b06eb5d05c707b2f25 https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda#b372c0eea9b60732fdae4b817a63c8cd diff --git a/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock b/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock index 2d15f7c39de6e..5160e70110827 100644 --- a/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock +++ b/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock @@ -1,9 +1,9 @@ # Generated by conda-lock. # platform: linux-64 -# input_hash: 6e54a62104ad2c034299ce38740a48f06380b633c92a06f3c7da39b2f8e7e160 +# input_hash: 8e0b47b0e224c00dd9d806d48c4b9d5bc816d4a8edd28c04a423d6c7027a5d61 @EXPLICIT https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9 -https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2023.12.12-h06a4308_0.conda#12bf7315c3f5ca50300e8b48d1b4ef2e +https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2024.3.11-h06a4308_0.conda#08529eb3504712baabcbda266a19feb7 https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.38-h1181459_1.conda#68eedfd9c06f2b0e6888d8db345b7f5b https://repo.anaconda.com/pkgs/main/noarch/tzdata-2024a-h04d1e81_0.conda#452af53adae0a5b06eb5d05c707b2f25 https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda#b372c0eea9b60732fdae4b817a63c8cd diff --git a/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock b/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock index 9d8efe98592d5..9879bf3aff539 100644 --- a/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock +++ b/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock @@ -1,10 +1,10 @@ # Generated by conda-lock. 
# platform: linux-64 -# input_hash: 41020bafedd0621811b60cfe152e0ae26c980938a6eb40a66f25016f398c374e +# input_hash: 499e99747d5cb2ab23ae102a3e478a65a81f5cd794889f22a16e905a538d3f83 @EXPLICIT https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9 https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-openblas.conda#9ddfcaef10d79366c90128f5dc444be8 -https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2023.12.12-h06a4308_0.conda#12bf7315c3f5ca50300e8b48d1b4ef2e +https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2024.3.11-h06a4308_0.conda#08529eb3504712baabcbda266a19feb7 https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.38-h1181459_1.conda#68eedfd9c06f2b0e6888d8db345b7f5b https://repo.anaconda.com/pkgs/main/linux-64/libgfortran5-11.2.0-h1234567_1.conda#36a01a8c30e0cadf0d3e842c50b73f3b https://repo.anaconda.com/pkgs/main/noarch/tzdata-2024a-h04d1e81_0.conda#452af53adae0a5b06eb5d05c707b2f25 diff --git a/build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock b/build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock index be78170e97a8f..8d58e1aed3705 100644 --- a/build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock +++ b/build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. # platform: win-64 -# input_hash: 2f4b1d16d553e6307f97867b796d97131fd60899af1e29931840dbbc1b00d7b9 +# input_hash: 816da54e5588e59c1421ecbadecee72df09985ec71ba8aa60722041f52705984 @EXPLICIT https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.2.2-h56e8100_0.conda#63da060240ab8087b60d1357051ea7d6 https://conda.anaconda.org/conda-forge/win-64/intel-openmp-2024.0.0-h57928b3_49841.conda#e3255c8cdaf1d52f15816d1970f9c77a diff --git a/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock b/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock index d6da030a4e079..3c635178f850d 100644 --- a/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock +++ b/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. # platform: linux-64 -# input_hash: b935d29d6295a279263217c782c762611b3f18c4767864bc2ea18d05239cc284 +# input_hash: ce10cdc2684e55fdb1296afa7ad859cd0aa576013c5510bed730caf08da20080 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef diff --git a/build_tools/azure/pypy3_linux-64_conda.lock b/build_tools/azure/pypy3_linux-64_conda.lock index a76db5a171f79..a0ee6a358f342 100644 --- a/build_tools/azure/pypy3_linux-64_conda.lock +++ b/build_tools/azure/pypy3_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. 
# platform: linux-64 -# input_hash: 83e71733908600b8c80fd6c0a959a78c7be7ee4fc286c64fc4fba4f8514700a5 +# input_hash: 275d8624a80fb4b400ce81d68ad13e4f246441e8a2249012b203ed00e647b4df @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef diff --git a/build_tools/circle/doc_linux-64_conda.lock b/build_tools/circle/doc_linux-64_conda.lock index f672ff464d415..8cb65362dc337 100644 --- a/build_tools/circle/doc_linux-64_conda.lock +++ b/build_tools/circle/doc_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. # platform: linux-64 -# input_hash: 86bdb5e5bcb07ed5d295cba118b2a0e1956c192244222f788ae47cf911561090 +# input_hash: 671c47e02e4c9614522ee2a492c1871b06f5b8aaaabbf5a94ca528d414c59a18 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef diff --git a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock index ae5f5129c5cb8..be37da4392cd1 100644 --- a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock +++ b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. # platform: linux-64 -# input_hash: 7ce9c9426465ee803d8ca86182dd1fcf7274f9c9fdb8706ea972b10c0afd0495 +# input_hash: 62191e41483376d606e0c7209a869b015b0d83dee49a1000bf7ab85839f31622 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef diff --git a/build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock b/build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock index 6b012514137b1..dcc3a42490b8e 100644 --- a/build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock +++ b/build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock @@ -1,6 +1,6 @@ # Generated by conda-lock. 
# platform: linux-aarch64 -# input_hash: b463bb504fdbdd8b9e845c0a0fac8af73690a5cf4d41c56794a5ba2fd1caa406 +# input_hash: 9e532e1a40216d527e696ee83365cae90b77260bb0196418edda3b75110be526 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-aarch64/ca-certificates-2024.2.2-hcefe29a_0.conda#57c226edb90c4e973b9b7503537dd339 https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.40-h2d8c526_0.conda#16246d69e945d0b1969a6099e7c5d457 From 7e3d7aa1e8018141468e520b82b0c4d08b3e02e5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Mar 2024 17:53:51 +0100 Subject: [PATCH 126/194] iter --- .../_classification_threshold.py | 17 ++++++----- .../tests/test_classification_threshold.py | 28 ++++++++++--------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 749b973c234ec..063025e5dc5c9 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -165,7 +165,7 @@ def check(self): return check -def _fit_and_score( +def _fit_and_score_over_thresholds( classifier, X, y, @@ -652,7 +652,7 @@ def fit(self, X, y, **params): cv_thresholds, cv_scores = zip( *Parallel(n_jobs=self.n_jobs)( - delayed(_fit_and_score)( + delayed(_fit_and_score_over_thresholds)( classifier, X, y, @@ -674,8 +674,12 @@ def fit(self, X, y, **params): ) # find the global min and max thresholds across all folds - min_threshold = min(split_thresholds.min() for split_thresholds in cv_thresholds) - max_threshold = np.max([th.max() for th in cv_thresholds]) + min_threshold = min( + split_thresholds.min() for split_thresholds in cv_thresholds + ) + max_threshold = max( + split_thresholds.max() for split_thresholds in cv_thresholds + ) if isinstance(self.n_thresholds, Integral): self.decision_thresholds_ = np.linspace( min_threshold, max_threshold, num=self.n_thresholds @@ -683,11 +687,11 @@ def fit(self, X, y, **params): else: self.decision_thresholds_ = np.asarray(self.n_thresholds) - def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): + def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): return np.mean( [ np.interp(target_thresholds, split_thresholds, split_score) - for split_thresholds, c in zip(cv_thresholds, cv_scores) + for split_thresholds, split_score in zip(cv_thresholds, cv_scores) ], axis=0, ) @@ -717,7 +721,6 @@ def _mean_interpolated_score(threshold_interpolated, cv_thresholds, cv_scores): def _get_best_idx(constrained_score, maximized_score): """Find the index of the best score constrained by another score.""" - indices = np.arange(len(constrained_score)) mask = constrained_score >= constraint_value mask_idx = maximized_score[mask].argmax() return np.flatnonzero(mask)[mask_idx] diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 841999706ed7c..e7debce236b10 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -21,7 +21,7 @@ from sklearn.model_selection import TunedThresholdClassifier from sklearn.model_selection._classification_threshold import ( _ContinuousScorer, - _fit_and_score, + _fit_and_score_over_thresholds, ) from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler @@ -153,14 +153,14 @@ def 
test_continuous_scorer_pos_label(global_random_seed): ), ], ) -def test_fit_and_score_scorers(scorer, score_method): - """Check that `_fit_and_score` returns thresholds in ascending order for the - different accepted scorers.""" +def test_fit_and_score_over_thresholds_scorers(scorer, score_method): + """Check that `_fit_and_score_over_thresholds` returns thresholds in ascending order + for the different accepted scorers.""" X, y = make_classification(n_samples=100, random_state=0) train_idx, val_idx = np.arange(50), np.arange(50, 100) classifier = LogisticRegression() - thresholds, scores = _fit_and_score( + thresholds, scores = _fit_and_score_over_thresholds( classifier, X, y, @@ -219,7 +219,7 @@ def test_fit_and_score_scorers(scorer, score_method): ), ], ) -def test_fit_and_score_prefit(scorer, score_method, expected_score): +def test_fit_and_score_over_thresholds_prefit(scorer, score_method, expected_score): """Check the behaviour with a prefit classifier.""" X, y = make_classification(n_samples=100, random_state=0) @@ -228,7 +228,7 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): classifier = DecisionTreeClassifier(random_state=0) with pytest.raises(NotFittedError): - _fit_and_score( + _fit_and_score_over_thresholds( classifier, X, y, @@ -245,7 +245,7 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): # we get perfect predictions and thus match the expected score assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) - thresholds, scores = _fit_and_score( + thresholds, scores = _fit_and_score_over_thresholds( classifier, X, y, @@ -292,7 +292,7 @@ def test_fit_and_score_prefit(scorer, score_method, expected_score): ), ], ) -def test_fit_and_score_sample_weight(scorer, score_method): +def test_fit_and_score_over_thresholds_sample_weight(scorer, score_method): """Check that we dispatch the sample-weight to fit and score the classifier.""" X, y = load_iris(return_X_y=True) X, y = X[:100], y[:100] # only 2 classes @@ -306,7 +306,7 @@ def test_fit_and_score_sample_weight(scorer, score_method): classifier = LogisticRegression() train_repeated_idx = np.arange(X_repeated.shape[0]) val_repeated_idx = np.arange(X_repeated.shape[0]) - thresholds_repeated, scores_repeated = _fit_and_score( + thresholds_repeated, scores_repeated = _fit_and_score_over_thresholds( classifier, X_repeated, y_repeated, @@ -319,7 +319,7 @@ def test_fit_and_score_sample_weight(scorer, score_method): ) train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) - thresholds, scores = _fit_and_score( + thresholds, scores = _fit_and_score_over_thresholds( classifier.set_fit_request(sample_weight=True), X, y, @@ -368,7 +368,9 @@ def test_fit_and_score_sample_weight(scorer, score_method): ], ) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) -def test_fit_and_score_fit_params(scorer, score_method, fit_params_type): +def test_fit_and_score_over_thresholds_fit_params( + scorer, score_method, fit_params_type +): """Check that we pass `fit_params` to the classifier when calling `fit`.""" X, y = make_classification(n_samples=100, random_state=0) fit_params = { @@ -380,7 +382,7 @@ def test_fit_and_score_fit_params(scorer, score_method, fit_params_type): classifier.set_fit_request(a=True, b=True) train_idx, val_idx = np.arange(50), np.arange(50, 100) - _fit_and_score( + _fit_and_score_over_thresholds( classifier, X, y, From b958bb0c14fb5931646fbb0d209dae1266b3e167 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Mar 2024 12:32:52 +0100 
Subject: [PATCH 127/194] simplify refit and do not allow cv == 1 --- .../plot_cost_sensitive_learning.py | 6 ++-- .../_classification_threshold.py | 28 +++++++------------ .../tests/test_classification_threshold.py | 3 ++ 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 1f7f3259532f4..971e4e357361f 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -380,10 +380,10 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # These two strategies can be changed by providing the `refit` and `cv` parameters. # For instance, one could provide a fitted `estimator` and set `cv="prefit"`, in which # case the cut-off point is found on the entire dataset provided at fitting time. -# Also, the underlying classifier is not be refitted. Here, we can try to do such -# experiment. +# Also, the underlying classifier is not be refitted by setting `refit=False`. Here, we +# can try to do such experiment. model.fit(X_train, y_train) -tuned_model.set_params(cv="prefit").fit(X_train, y_train) +tuned_model.set_params(cv="prefit", refit=False).fit(X_train, y_train) # %% diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 063025e5dc5c9..8538e6aeb560f 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -157,10 +157,9 @@ def _estimator_has(attr): def check(self): if hasattr(self, "estimator_"): getattr(self.estimator_, attr) - return True else: getattr(self.estimator, attr) - return True + return True return check @@ -358,12 +357,10 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato different from the one used to tune the cut-off point (by calling :meth:`TunedThresholdClassifier.fit`). - refit : "auto" or bool, default="auto" + refit : bool, default=True Whether or not to refit the classifier on the entire training set once - the decision threshold has been found. By default, `refit="auto"` is - equivalent to `refit=False` when `cv` is a float number using a single - shuffle split or `cv="prefit"` otherwise `refit=True` in all other - cases. Note that forcing `refit=False` on cross-validation having more + the decision threshold has been found. + Note that forcing `refit=False` on cross-validation having more than a single split will raise an error. Similarly, `refit=True` in conjunction with `cv="prefit"` will raise an error. @@ -491,9 +488,9 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato "cv": [ "cv_object", StrOptions({"prefit"}), - Interval(RealNotInt, 0.0, 1.0, closed="right"), + Interval(RealNotInt, 0.0, 1.0, closed="neither"), ], - "refit": ["boolean", StrOptions({"auto"})], + "refit": ["boolean"], "n_jobs": [Integral, None], "random_state": ["random_state"], } @@ -510,7 +507,7 @@ def __init__( response_method="auto", n_thresholds=100, cv=None, - refit="auto", + refit=True, n_jobs=None, random_state=None, ): @@ -562,11 +559,10 @@ def fit(self, X, y, **params): f"Only binary classification is supported. 
Unknown label type: {y_type}" ) - if isinstance(self.cv, Real) and 0 < self.cv <= 1: + if isinstance(self.cv, Real) and 0 < self.cv < 1: cv = StratifiedShuffleSplit( n_splits=1, test_size=self.cv, random_state=self.random_state ) - refit = False if self.refit == "auto" else self.refit elif self.cv == "prefit": if self.refit is True: raise ValueError("When cv='prefit', refit cannot be True.") @@ -576,15 +572,11 @@ def fit(self, X, y, **params): raise NotFittedError( """When cv='prefit', `estimator` must be fitted.""" ) from exc - cv, refit = self.cv, False + cv = self.cv else: cv = check_cv(self.cv, y=y, classifier=True) if self.refit is False and cv.get_n_splits() > 1: raise ValueError("When cv has several folds, refit cannot be False.") - if self.refit == "auto": - refit = cv.get_n_splits() > 1 - else: - refit = self.refit if self.response_method == "auto": self._response_method = ["predict_proba", "decision_function"] @@ -624,7 +616,7 @@ def fit(self, X, y, **params): classifier = clone(self.estimator) splits = cv.split(X, y, **routed_params.splitter.split) - if refit: + if self.refit: # train on the whole dataset X_train, y_train, fit_params_train = X, y, routed_params.estimator.fit else: diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index e7debce236b10..1ab8f24e21937 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -778,6 +778,7 @@ def test_tunedthresholdclassifier_pos_label_precision_recall( objective_metric=objective_metric, constraint_value=constraint_value, cv="prefit", + refit=False, pos_label=pos_label, ).fit(X, y) @@ -811,6 +812,7 @@ def test_tunedthresholdclassifier_pos_label_tnr_tpr(objective_metric, pos_label) objective_metric=objective_metric, constraint_value=constraint_value, cv="prefit", + refit=False, pos_label=pos_label, ).fit(X, y) @@ -859,6 +861,7 @@ def test_tunedthresholdclassifier_pos_label_single_metric(pos_label, metric_type estimator, objective_metric=objective_metric, cv="prefit", + refit=False, pos_label=pos_label, n_thresholds=500, ).fit(X, y) From 98a1db8c79d061eebbe72daadb600add05ffc262 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Mar 2024 12:44:09 +0100 Subject: [PATCH 128/194] check raise for multilabel --- .../tests/test_classification_threshold.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 1ab8f24e21937..a21825fb48ab1 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -2,7 +2,12 @@ import pytest from sklearn.base import clone -from sklearn.datasets import load_breast_cancer, load_iris, make_classification +from sklearn.datasets import ( + load_breast_cancer, + load_iris, + make_classification, + make_multilabel_classification, +) from sklearn.dummy import DummyClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.exceptions import NotFittedError @@ -395,12 +400,18 @@ def test_fit_and_score_over_thresholds_fit_params( ) -def test_tunedthresholdclassifier_no_binary(): +@pytest.mark.parametrize( + "data", + [ + make_classification(n_classes=3, n_clusters_per_class=1, random_state=0), + make_multilabel_classification(random_state=0), + ], +) +def 
test_tunedthresholdclassifier_no_binary(data): """Check that we raise an informative error message for non-binary problem.""" - X, y = make_classification(n_classes=3, n_clusters_per_class=1) err_msg = "Only binary classification is supported." with pytest.raises(ValueError, match=err_msg): - TunedThresholdClassifier(LogisticRegression()).fit(X, y) + TunedThresholdClassifier(LogisticRegression()).fit(*data) @pytest.mark.parametrize( From c28a3e1f3fd1927891b6250c7766a564f13d67fb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Mar 2024 12:45:37 +0100 Subject: [PATCH 129/194] fix test name --- sklearn/model_selection/tests/test_classification_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index a21825fb48ab1..063f60172d505 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -480,7 +480,7 @@ def test_tunedthresholdclassifier_estimator_response_methods( @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) -def test_tunedthresholdclassifier_with_constraint_value(response_method): +def test_tunedthresholdclassifier_without_constraint_value(response_method): """Check that `TunedThresholdClassifier` is optimizing a given objective metric.""" X, y = load_breast_cancer(return_X_y=True) # remove feature to degrade performances From 076fd29b05e8c29157320bcaee3e83035a5e91ca Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Mar 2024 12:49:45 +0100 Subject: [PATCH 130/194] test another to check if beta is forwarded --- .../tests/test_classification_threshold.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 063f60172d505..b75162698f7fd 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -535,20 +535,27 @@ def test_tunedthresholdclassifier_limit_metric_tradeoff(metrics): def test_tunedthresholdclassifier_metric_with_parameter(): """Check that we can pass a metric with a parameter in addition check that - `f_beta with beta=1` is equivalent to `f1`. + `f_beta` with `beta=1` is equivalent to `f1` and different from `f_beta` with + `beta=2`. 
""" X, y = load_breast_cancer(return_X_y=True) lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) - model_fbeta = TunedThresholdClassifier( + model_fbeta_1 = TunedThresholdClassifier( estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1) ).fit(X, y) + model_fbeta_2 = TunedThresholdClassifier( + estimator=lr, objective_metric=make_scorer(fbeta_score, beta=2) + ).fit(X, y) model_f1 = TunedThresholdClassifier( estimator=lr, objective_metric=make_scorer(f1_score) ).fit(X, y) - assert model_fbeta.decision_threshold_ == pytest.approx( + assert model_fbeta_1.decision_threshold_ == pytest.approx( model_f1.decision_threshold_ ) + assert model_fbeta_1.decision_threshold_ != pytest.approx( + model_fbeta_2.decision_threshold_ + ) @pytest.mark.parametrize( From e728f1d6710f117f7e120764c0445101abfe8e37 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 20 Mar 2024 19:19:09 +0100 Subject: [PATCH 131/194] iter --- .../tests/test_classification_threshold.py | 90 +++++++++++++++---- 1 file changed, 71 insertions(+), 19 deletions(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index b75162698f7fd..514d290f48c53 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -23,7 +23,7 @@ recall_score, roc_curve, ) -from sklearn.model_selection import TunedThresholdClassifier +from sklearn.model_selection import StratifiedShuffleSplit, TunedThresholdClassifier from sklearn.model_selection._classification_threshold import ( _ContinuousScorer, _fit_and_score_over_thresholds, @@ -98,7 +98,7 @@ def test_continuous_scorer_pos_label(global_random_seed): recall_score, sign=1, response_method="predict_proba", - n_thresholds=1000, + n_thresholds=1_000, kwargs={"pos_label": 1}, ) thresholds_pos_label_1, scores_pos_label_1 = scorer(estimator, X, y) @@ -107,7 +107,7 @@ def test_continuous_scorer_pos_label(global_random_seed): recall_score, sign=1, response_method="predict_proba", - n_thresholds=1000, + n_thresholds=1_000, kwargs={"pos_label": 0}, ) thresholds_pos_label_0, scores_pos_label_0 = scorer(estimator, X, y) @@ -407,7 +407,7 @@ def test_fit_and_score_over_thresholds_fit_params( make_multilabel_classification(random_state=0), ], ) -def test_tunedthresholdclassifier_no_binary(data): +def test_tuned_threshold_classifier_no_binary(data): """Check that we raise an informative error message for non-binary problem.""" err_msg = "Only binary classification is supported." 
with pytest.raises(ValueError, match=err_msg): @@ -435,7 +435,7 @@ def test_tunedthresholdclassifier_no_binary(data): ], ) @pytest.mark.parametrize("strategy", ["optimum", "constant"]) -def test_tunedthresholdclassifier_conflict_cv_refit( +def test_tuned_threshold_classifier_conflict_cv_refit( strategy, params, err_type, err_msg ): """Check that we raise an informative error message when `cv` and `refit` @@ -456,7 +456,7 @@ def test_tunedthresholdclassifier_conflict_cv_refit( "response_method", ["predict_proba", "predict_log_proba", "decision_function"] ) @pytest.mark.parametrize("strategy", ["optimum", "constant"]) -def test_tunedthresholdclassifier_estimator_response_methods( +def test_tuned_threshold_classifier_estimator_response_methods( estimator, strategy, response_method ): """Check that `TunedThresholdClassifier` exposes the same response methods as the @@ -480,7 +480,7 @@ def test_tunedthresholdclassifier_estimator_response_methods( @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) -def test_tunedthresholdclassifier_without_constraint_value(response_method): +def test_tuned_threshold_classifier_without_constraint_value(response_method): """Check that `TunedThresholdClassifier` is optimizing a given objective metric.""" X, y = load_breast_cancer(return_X_y=True) # remove feature to degrade performances @@ -516,7 +516,7 @@ def test_tunedthresholdclassifier_without_constraint_value(response_method): ("max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"), ], ) -def test_tunedthresholdclassifier_limit_metric_tradeoff(metrics): +def test_tuned_threshold_classifier_limit_metric_tradeoff(metrics): """Check that an objective value of 0 give opposite predictions with tnr/tpr and precision/recall. """ @@ -533,7 +533,7 @@ def test_tunedthresholdclassifier_limit_metric_tradeoff(metrics): assert np.mean(y_pred_1 == y_pred_2) > 0.98 -def test_tunedthresholdclassifier_metric_with_parameter(): +def test_tuned_threshold_classifier_metric_with_parameter(): """Check that we can pass a metric with a parameter in addition check that `f_beta` with `beta=1` is equivalent to `f1` and different from `f_beta` with `beta=2`. @@ -572,7 +572,7 @@ def test_tunedthresholdclassifier_metric_with_parameter(): make_scorer(f1_score, pos_label="cancer"), ], ) -def test_tunedthresholdclassifier_with_string_targets(response_method, metric): +def test_tuned_threshold_classifier_with_string_targets(response_method, metric): """Check that targets represented by str are properly managed. Also, check with several metrics to be sure that `pos_label` is properly dispatched. 
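For context, a minimal sketch of the `pos_label` dispatch exercised by the test
above (assuming the API in this patch series; the string labels are illustrative
only)::

    >>> import numpy as np
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.metrics import f1_score, make_scorer
    >>> from sklearn.model_selection import TunedThresholdClassifier
    >>> X, y = make_classification(random_state=0)
    >>> y = np.array(["healthy", "cancer"], dtype=object)[y]
    >>> model = TunedThresholdClassifier(
    ...     LogisticRegression(),
    ...     objective_metric=make_scorer(f1_score, pos_label="cancer"),
    ...     pos_label="cancer",
    ... ).fit(X, y)
    >>> y_pred = model.predict(X)  # predictions are again "healthy"/"cancer" strings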
@@ -599,7 +599,7 @@ def test_tunedthresholdclassifier_with_string_targets(response_method, metric): @pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize("strategy", ["optimum", "constant"]) @pytest.mark.parametrize("with_sample_weight", [True, False]) -def test_tunedthresholdclassifier_refit( +def test_tuned_threshold_classifier_refit( strategy, with_sample_weight, global_random_seed ): """Check the behaviour of the `refit` parameter.""" @@ -663,7 +663,7 @@ def test_tunedthresholdclassifier_refit( ], ) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) -def test_tunedthresholdclassifier_fit_params(objective_metric, fit_params_type): +def test_tuned_threshold_classifier_fit_params(objective_metric, fit_params_type): """Check that we pass `fit_params` to the classifier when calling `fit`.""" X, y = make_classification(n_samples=100, random_state=0) fit_params = { @@ -691,7 +691,7 @@ def test_tunedthresholdclassifier_fit_params(objective_metric, fit_params_type): @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) -def test_tunedthresholdclassifier_response_method_scorer_with_constraint_metric( +def test_tuned_threshold_classifier_response_method_scorer_with_constraint_metric( objective_metric, constraint_value, response_method, global_random_seed ): """Check that we use the proper scorer and forwarding the requested response method @@ -736,7 +736,7 @@ def test_tunedthresholdclassifier_response_method_scorer_with_constraint_metric( @pytest.mark.usefixtures("enable_slep006") -def test_tunedthresholdclassifier_cv_zeros_sample_weights_equivalence(): +def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): """Check that passing removing some sample from the dataset `X` is equivalent to passing a `sample_weight` with a factor 0.""" X, y = load_iris(return_X_y=True) @@ -765,7 +765,7 @@ def test_tunedthresholdclassifier_cv_zeros_sample_weights_equivalence(): assert_allclose(y_pred_with_weights, y_pred_without_weights) -def test_tunedthresholdclassifier_error_constant_learner(): +def test_tuned_threshold_classifier_error_constant_learner(): """Check that we raise an error message when providing an estimator that predicts only a single class.""" X, y = make_classification(random_state=0) @@ -780,7 +780,7 @@ def test_tunedthresholdclassifier_error_constant_learner(): ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], ) @pytest.mark.parametrize("pos_label", [0, 1]) -def test_tunedthresholdclassifier_pos_label_precision_recall( +def test_tuned_threshold_classifier_pos_label_precision_recall( objective_metric, pos_label ): """Check that `pos_label` is dispatched correctly by checking the precision and @@ -816,7 +816,7 @@ def test_tunedthresholdclassifier_pos_label_precision_recall( "objective_metric", ["max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"] ) @pytest.mark.parametrize("pos_label", [0, 1]) -def test_tunedthresholdclassifier_pos_label_tnr_tpr(objective_metric, pos_label): +def test_tuned_threshold_classifier_pos_label_tnr_tpr(objective_metric, pos_label): """Check that `pos_label` is dispatched correctly by checking the TNR and TPR score found during the optimization and the one found at `predict` time.""" X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) @@ -858,7 +858,7 @@ def tnr_tpr_score(y_true, y_pred, pos_label=pos_label): ["string", "scorer_without_pos_label", "scorer_with_pos_label"], ) @pytest.mark.parametrize("pos_label", [0, 1]) -def 
test_tunedthresholdclassifier_pos_label_single_metric(pos_label, metric_type): +def test_tuned_threshold_classifier_pos_label_single_metric(pos_label, metric_type): """Check that `pos_label` is dispatched correctly when getting a scorer linked to a known metric. By default, the scorer in scikit-learn only have a default value for `pos_label` which is 1. @@ -892,7 +892,7 @@ def test_tunedthresholdclassifier_pos_label_single_metric(pos_label, metric_type "predict_method", ["predict", "predict_proba", "decision_function", "predict_log_proba"], ) -def test_tunedthresholdclassifier_constant_strategy(predict_method): +def test_tuned_threshold_classifier_constant_strategy(predict_method): """Check the behavior when `strategy='contant'.""" X, y = make_classification(n_samples=100, weights=[0.6, 0.4], random_state=42) @@ -910,3 +910,55 @@ def test_tunedthresholdclassifier_constant_strategy(predict_method): assert_allclose( getattr(tuned_model, predict_method)(X), getattr(estimator, predict_method)(X) ) + + +def test_tuned_threshold_classifier_n_thresholds_array(): + """Check that we can pass an array to `n_thresholds` and it is used as candidate + threshold internally.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + n_thresholds = np.linspace(0, 1, 11) + tuned_model = TunedThresholdClassifier( + estimator, n_thresholds=n_thresholds, response_method="predict_proba" + ).fit(X, y) + assert_allclose(tuned_model.decision_thresholds_, n_thresholds) + + +def test_tuned_threshold_classifier_cv_float(): + """Check the behaviour when `cv` is set to a float.""" + X, y = make_classification(random_state=0) + + # case where `refit=False` and cv is a float: the underlying estimator will be fit + # on the training set given by a ShuffleSplit. We check that we get the same model + # coefficients. 
+ test_size = 0.3 + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifier( + estimator, cv=test_size, refit=False, random_state=0 + ).fit(X, y) + tuned_model.fit(X, y) + + cv = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0) + train_idx, val_idx = next(cv.split(X, y)) + cloned_estimator = clone(estimator).fit(X[train_idx], y[train_idx]) + + assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_) + + +@pytest.mark.parametrize( + "objective_metric", + [ + "max_tpr_at_tnr_constraint", + "max_tnr_at_tpr_constraint", + "max_precision_at_recall_constraint", + "max_recall_at_precision_constraint", + ], +) +def test_tuned_threshold_classifier_error_missing_constraint(objective_metric): + """Check that we raise an informative error when using a objective metric requested + a constraint but no `constraint_value` is provided.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifier(estimator, objective_metric=objective_metric) + with pytest.raises(ValueError, match="`constraint_value` must be provided"): + tuned_model.fit(X, y) From c73b20582d07c17b6819ca3011771baee0c7e0f6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 20 Mar 2024 19:27:48 +0100 Subject: [PATCH 132/194] refit=True and cv is float --- .../model_selection/tests/test_classification_threshold.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 514d290f48c53..06c025ed4e18a 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -944,6 +944,13 @@ def test_tuned_threshold_classifier_cv_float(): assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_) + # case where `refit=True`, then the underlying estimator is fitted on the full + # dataset. + tuned_model.set_params(refit=True).fit(X, y) + cloned_estimator = clone(estimator).fit(X, y) + + assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_) + @pytest.mark.parametrize( "objective_metric", From a4890dffbe171ac57b8eb6daae26898d6c6878f8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 22 Mar 2024 19:33:13 +0100 Subject: [PATCH 133/194] rename scorer to curve scorer internally --- .../_classification_threshold.py | 57 ++++++++------ .../tests/test_classification_threshold.py | 76 ++++++++++--------- 2 files changed, 71 insertions(+), 62 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 8538e6aeb560f..d949a4a32a7ca 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -54,7 +54,7 @@ def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] -class _ContinuousScorer(_BaseScorer): +class _CurveScorer(_BaseScorer): """Scorer taking a continuous response and output a score for each threshold.""" def __init__(self, score_func, sign, kwargs, n_thresholds, response_method): @@ -118,8 +118,11 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): Returns ------- - score : float - Score function applied to prediction of estimator on X. + scores : ndarray of shape (n_thresholds,) + The scores associated to each threshold. 
+ + potential_thresholds : ndarray of shape (n_thresholds,) + The potential thresholds used to compute the scores. """ pos_label = self._get_pos_label() y_score = method_caller( @@ -144,7 +147,7 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): ) for th in potential_thresholds ] - return potential_thresholds, np.array(score_thresholds) + return np.array(score_thresholds), potential_thresholds def _estimator_has(attr): @@ -172,11 +175,12 @@ def _fit_and_score_over_thresholds( fit_params, train_idx, val_idx, - scorer, + curve_scorer, score_method, score_params, ): - """Fit a classifier and compute the scores for different decision thresholds. + """Fit a classifier and compute the scores for different decision thresholds + representing a curve. Parameters ---------- @@ -201,9 +205,10 @@ def _fit_and_score_over_thresholds( The indices of the validation set used to score `classifier`. If `train_idx`, the entire set will be used. - scorer : scorer instance + curve_scorer : scorer instance The scorer taking `classifier` and the validation set as input and outputting - decision thresholds and scores. + decision thresholds and scores as a curve. Note that this is different from + the usual scorer that output a single score value. score_method : str or callable The scoring method to use. Used to detect if we compute TPR/TNR or precision/ @@ -214,7 +219,7 @@ def _fit_and_score_over_thresholds( Returns ------- - thresholds : ndarray of shape (n_thresholds,) + potential_thresholds : ndarray of shape (n_thresholds,) The decision thresholds used to compute the scores. They are returned in ascending order. @@ -236,7 +241,7 @@ def _fit_and_score_over_thresholds( if isinstance(score_method, str): if score_method in {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"}: - fpr, tpr, potential_thresholds = scorer( + fpr, tpr, potential_thresholds = curve_scorer( classifier, X_val, y_val, **score_params_val ) # For fpr=0/tpr=0, the threshold is set to `np.inf`. We need to remove it. 
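For readers following the refactoring, the behaviour of such a curve scorer can be approximated with plain public scikit-learn calls. The following sketch is illustrative only: it assumes `predict_proba` as the response method and an evenly spaced grid of candidate thresholds, and it ignores the `pos_label` and `sign` handling performed by the private `_CurveScorer` helper:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import balanced_accuracy_score

    X, y = make_classification(random_state=0)
    estimator = LogisticRegression().fit(X, y)

    # Continuous output of the classifier for the positive class.
    y_score = estimator.predict_proba(X)[:, 1]

    # Candidate decision thresholds spanning the observed scores.
    thresholds = np.linspace(y_score.min(), y_score.max(), num=10)

    # One metric value per candidate threshold: turn the continuous scores into
    # hard class labels and evaluate the metric for each thresholding.
    scores = np.array(
        [balanced_accuracy_score(y, (y_score >= th).astype(int)) for th in thresholds]
    )
    print(scores.shape, thresholds.shape)  # (10,) (10,)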
@@ -247,14 +252,17 @@ def _fit_and_score_over_thresholds( "max_precision_at_recall_constraint", "max_recall_at_precision_constraint", }: - precision, recall, potential_thresholds = scorer( + precision, recall, potential_thresholds = curve_scorer( classifier, X_val, y_val, **score_params_val ) # thresholds are in increasing order # the last element of the precision and recall is not associated with any # threshold and should be discarded return potential_thresholds, (precision[:-1], recall[:-1]) - return scorer(classifier, X_val, y_val, **score_params_val) + scores, potential_thresholds = curve_scorer( + classifier, X_val, y_val, **score_params_val + ) + return potential_thresholds, scores class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): @@ -601,7 +609,7 @@ def fit(self, X, y, **params): constraint_value = "highest" routed_params = process_routing(self, "fit", **params) - self._scorer = self._get_scorer() + self._curve_scorer = self._get_curve_scorer() # in the following block, we: # - define the final classifier `self.estimator_` and train it if necessary @@ -651,7 +659,7 @@ def fit(self, X, y, **params): fit_params=routed_params.estimator.fit, train_idx=train_idx, val_idx=val_idx, - scorer=self._scorer, + curve_scorer=self._curve_scorer, score_method=self.objective_metric, score_params=routed_params.scorer.score, ) @@ -757,7 +765,7 @@ def predict(self, X): check_is_fitted(self, "estimator_") if self.strategy == "optimum": # `pos_label` has been validated and is stored in the scorer - pos_label = self._scorer._get_pos_label() + pos_label = self._curve_scorer._get_pos_label() else: pos_label = self.pos_label y_score, _ = _get_response_values_binary( @@ -836,7 +844,6 @@ def get_metadata_routing(self): """ router = ( MetadataRouter(owner=self.__class__.__name__) - .add_self_request(self) .add( estimator=self.estimator, method_mapping=MethodMapping().add(callee="fit", caller="fit"), @@ -846,14 +853,14 @@ def get_metadata_routing(self): method_mapping=MethodMapping().add(callee="split", caller="fit"), ) .add( - scorer=self._get_scorer(), + scorer=self._get_curve_scorer(), method_mapping=MethodMapping().add(callee="score", caller="fit"), ) ) return router - def _get_scorer(self): - """Get the scorer based on the objective metric used.""" + def _get_curve_scorer(self): + """Get the curve scorer based on the objective metric used.""" if self.objective_metric in { "max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint", @@ -861,20 +868,20 @@ def _get_scorer(self): "max_recall_at_precision_constraint", }: if "tpr" in self.objective_metric: # tpr/tnr - score_func = roc_curve + score_curve_func = roc_curve else: # precision/recall - score_func = precision_recall_curve - scorer = make_scorer( - score_func, + score_curve_func = precision_recall_curve + curve_scorer = make_scorer( + score_curve_func, response_method=self._response_method, pos_label=self.pos_label, ) else: scoring = check_scoring(self.estimator, scoring=self.objective_metric) - scorer = _ContinuousScorer.from_scorer( + curve_scorer = _CurveScorer.from_scorer( scoring, self._response_method, self.n_thresholds, self.pos_label ) - return scorer + return curve_scorer def _more_tags(self): return { diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 06c025ed4e18a..ce423f82b5654 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ 
b/sklearn/model_selection/tests/test_classification_threshold.py @@ -25,7 +25,7 @@ ) from sklearn.model_selection import StratifiedShuffleSplit, TunedThresholdClassifier from sklearn.model_selection._classification_threshold import ( - _ContinuousScorer, + _CurveScorer, _fit_and_score_over_thresholds, ) from sklearn.pipeline import make_pipeline @@ -40,18 +40,18 @@ ) -def test_continuous_scorer(): - """Check the behaviour of the `_ContinuousScorer` class.""" +def test_curve_scorer(): + """Check the behaviour of the `_CurveScorer` class.""" X, y = make_classification(random_state=0) estimator = LogisticRegression().fit(X, y) - scorer = _ContinuousScorer( + curve_scorer = _CurveScorer( balanced_accuracy_score, sign=1, response_method="predict_proba", n_thresholds=10, kwargs={}, ) - thresholds, scores = scorer(estimator, X, y) + scores, thresholds = curve_scorer(estimator, X, y) assert thresholds.shape == scores.shape # check that the thresholds are probability with extreme values close to 0 and 1 @@ -61,32 +61,32 @@ def test_continuous_scorer(): assert 0.5 <= scores.min() <= 1 # check that passing kwargs to the scorer works - scorer = _ContinuousScorer( + curve_scorer = _CurveScorer( balanced_accuracy_score, sign=1, response_method="predict_proba", n_thresholds=10, kwargs={"adjusted": True}, ) - thresholds, scores = scorer(estimator, X, y) + scores, thresholds = curve_scorer(estimator, X, y) # balanced accuracy should be between 0.5 and 1 when it is not adjusted assert 0 <= scores.min() <= 0.5 # check that we can inverse the sign of the score when dealing with `neg_*` scorer - scorer = _ContinuousScorer( + curve_scorer = _CurveScorer( balanced_accuracy_score, sign=-1, response_method="predict_proba", n_thresholds=10, kwargs={"adjusted": True}, ) - thresholds, scores = scorer(estimator, X, y) + scores, thresholds = curve_scorer(estimator, X, y) assert all(scores <= 0) -def test_continuous_scorer_pos_label(global_random_seed): +def test_curve_scorer_pos_label(global_random_seed): """Check that we propagate properly the `pos_label` parameter to the scorer.""" n_samples = 30 X, y = make_classification( @@ -94,25 +94,25 @@ def test_continuous_scorer_pos_label(global_random_seed): ) estimator = LogisticRegression().fit(X, y) - scorer = _ContinuousScorer( + curve_scorer = _CurveScorer( recall_score, sign=1, response_method="predict_proba", n_thresholds=1_000, kwargs={"pos_label": 1}, ) - thresholds_pos_label_1, scores_pos_label_1 = scorer(estimator, X, y) + scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y) - scorer = _ContinuousScorer( + curve_scorer = _CurveScorer( recall_score, sign=1, response_method="predict_proba", n_thresholds=1_000, kwargs={"pos_label": 0}, ) - thresholds_pos_label_0, scores_pos_label_0 = scorer(estimator, X, y) + scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) - # If `pos_label` is not forwarded to the scorer, the thresholds will be equal. + # If `pos_label` is not forwarded to the curve_scorer, the thresholds will be equal. # Make sure that this is not the case. 
# assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() # Since we have an imbalanced problem, the thresholds should represent higher @@ -128,10 +128,10 @@ def test_continuous_scorer_pos_label(global_random_seed): @pytest.mark.parametrize( - "scorer, score_method", + "curve_scorer, score_method", [ ( - _ContinuousScorer( + _CurveScorer( score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", @@ -158,9 +158,9 @@ def test_continuous_scorer_pos_label(global_random_seed): ), ], ) -def test_fit_and_score_over_thresholds_scorers(scorer, score_method): +def test_fit_and_score_over_thresholds_curve_scorers(curve_scorer, score_method): """Check that `_fit_and_score_over_thresholds` returns thresholds in ascending order - for the different accepted scorers.""" + for the different accepted curve scorers.""" X, y = make_classification(n_samples=100, random_state=0) train_idx, val_idx = np.arange(50), np.arange(50, 100) classifier = LogisticRegression() @@ -172,7 +172,7 @@ def test_fit_and_score_over_thresholds_scorers(scorer, score_method): fit_params={}, train_idx=train_idx, val_idx=val_idx, - scorer=scorer, + curve_scorer=curve_scorer, score_method=score_method, score_params={}, ) @@ -189,10 +189,10 @@ def test_fit_and_score_over_thresholds_scorers(scorer, score_method): @pytest.mark.parametrize( - "scorer, score_method, expected_score", + "curve_scorer, score_method, expected_score", [ ( - _ContinuousScorer( + _CurveScorer( score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", @@ -224,7 +224,9 @@ def test_fit_and_score_over_thresholds_scorers(scorer, score_method): ), ], ) -def test_fit_and_score_over_thresholds_prefit(scorer, score_method, expected_score): +def test_fit_and_score_over_thresholds_prefit( + curve_scorer, score_method, expected_score +): """Check the behaviour with a prefit classifier.""" X, y = make_classification(n_samples=100, random_state=0) @@ -240,7 +242,7 @@ def test_fit_and_score_over_thresholds_prefit(scorer, score_method, expected_sco fit_params={}, train_idx=train_idx, val_idx=val_idx, - scorer=scorer, + curve_scorer=curve_scorer, score_method=score_method, score_params={}, ) @@ -257,7 +259,7 @@ def test_fit_and_score_over_thresholds_prefit(scorer, score_method, expected_sco fit_params={}, train_idx=train_idx, val_idx=val_idx, - scorer=scorer, + curve_scorer=curve_scorer, score_method=score_method, score_params={}, ) @@ -267,10 +269,10 @@ def test_fit_and_score_over_thresholds_prefit(scorer, score_method, expected_sco @pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( - "scorer, score_method", + "curve_scorer, score_method", [ ( - _ContinuousScorer( + _CurveScorer( score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", @@ -297,7 +299,7 @@ def test_fit_and_score_over_thresholds_prefit(scorer, score_method, expected_sco ), ], ) -def test_fit_and_score_over_thresholds_sample_weight(scorer, score_method): +def test_fit_and_score_over_thresholds_sample_weight(curve_scorer, score_method): """Check that we dispatch the sample-weight to fit and score the classifier.""" X, y = load_iris(return_X_y=True) X, y = X[:100], y[:100] # only 2 classes @@ -318,7 +320,7 @@ def test_fit_and_score_over_thresholds_sample_weight(scorer, score_method): fit_params={}, train_idx=train_repeated_idx, val_idx=val_repeated_idx, - scorer=scorer, + curve_scorer=curve_scorer, score_method=score_method, score_params={}, ) @@ -331,7 +333,7 @@ def test_fit_and_score_over_thresholds_sample_weight(scorer, 
score_method): fit_params={"sample_weight": sample_weight}, train_idx=train_idx, val_idx=val_idx, - scorer=scorer.set_score_request(sample_weight=True), + curve_scorer=curve_scorer.set_score_request(sample_weight=True), score_method=score_method, score_params={"sample_weight": sample_weight}, ) @@ -342,10 +344,10 @@ def test_fit_and_score_over_thresholds_sample_weight(scorer, score_method): @pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( - "scorer, score_method", + "curve_scorer, score_method", [ ( - _ContinuousScorer( + _CurveScorer( score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", @@ -374,7 +376,7 @@ def test_fit_and_score_over_thresholds_sample_weight(scorer, score_method): ) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) def test_fit_and_score_over_thresholds_fit_params( - scorer, score_method, fit_params_type + curve_scorer, score_method, fit_params_type ): """Check that we pass `fit_params` to the classifier when calling `fit`.""" X, y = make_classification(n_samples=100, random_state=0) @@ -394,7 +396,7 @@ def test_fit_and_score_over_thresholds_fit_params( fit_params=fit_params, train_idx=train_idx, val_idx=val_idx, - scorer=scorer, + curve_scorer=curve_scorer, score_method=score_method, score_params={}, ) @@ -691,11 +693,11 @@ def test_tuned_threshold_classifier_fit_params(objective_metric, fit_params_type @pytest.mark.parametrize( "response_method", ["auto", "decision_function", "predict_proba"] ) -def test_tuned_threshold_classifier_response_method_scorer_with_constraint_metric( +def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint_metric( objective_metric, constraint_value, response_method, global_random_seed ): - """Check that we use the proper scorer and forwarding the requested response method - for TNR/TPR and precision/recall metrics. + """Check that we use the proper curve scorer and forwarding the requested + response method for TNR/TPR and precision/recall metrics. """ X, y = make_classification(n_samples=100, random_state=global_random_seed) classifier = LogisticRegression() From f8a5a793f4e4411f93106c87ad0938849e739b3d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 22 Mar 2024 19:39:10 +0100 Subject: [PATCH 134/194] add a note regarding the abuse of the scorer API --- sklearn/model_selection/_classification_threshold.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index d949a4a32a7ca..4b74e1a0df032 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -860,7 +860,14 @@ def get_metadata_routing(self): return router def _get_curve_scorer(self): - """Get the curve scorer based on the objective metric used.""" + """Get the curve scorer based on the objective metric used. + + Here, we reuse the conventional "scorer API" via `make_scorer` or + `_CurveScorer`. Note that the use here is unconventional because `make_scorer` + or the "scorer API" is expected to return a single score value when calling + `scorer(estimator, X, y)`. Here the score function used are both returning + scores and thresholds representing a curve. 
+ """ if self.objective_metric in { "max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint", From 5dfa43505ececf5cba99b9325285dfcf12915240 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 22 Mar 2024 19:46:16 +0100 Subject: [PATCH 135/194] use None instead of highest --- sklearn/model_selection/_classification_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 4b74e1a0df032..6fdb4b11c00a0 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -606,7 +606,7 @@ def fit(self, X, y, **params): ) constraint_value = self.constraint_value else: - constraint_value = "highest" + constraint_value = None # ignore the constraint value routed_params = process_routing(self, "fit", **params) self._curve_scorer = self._get_curve_scorer() @@ -696,7 +696,7 @@ def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): axis=0, ) - if constraint_value == "highest": # find best score + if constraint_value is None: # find best score that is the highest value self.objective_scores_ = _mean_interpolated_score( self.decision_thresholds_, cv_thresholds, cv_scores ) From d45a71baf6741fb7b501a9ab193150f68b93c686 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 23 Mar 2024 12:20:48 +0100 Subject: [PATCH 136/194] use a closer CV API --- .../_classification_threshold.py | 113 +++++++++++------- .../tests/test_classification_threshold.py | 56 ++++----- 2 files changed, 96 insertions(+), 73 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 6fdb4b11c00a0..2a942fb2cddfd 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -383,30 +383,42 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato Controls the randomness of cross-validation when `cv` is a float. See :term:`Glossary `. + store_cv_results : bool, default=False + Whether to store all scores and thresholds computed during the cross-validation + process. + Attributes ---------- estimator_ : estimator instance The fitted classifier used when predicting. - decision_threshold_ : float + best_threshold_ : float The new decision threshold. - decision_thresholds_ : ndarray of shape (n_thresholds,) or None - All decision thresholds that were evaluated. If `strategy="constant"`, - `decision_thresholds_` is None. + best_score_ : float or None + The score of the objective metric maximized associated with the decision + threshold found. If `strategy="constant"`, `best_score_` is None. - objective_score_ : float or tuple of floats - The score of the objective metric associated with the decision threshold found. + constrained_score_ : float or None When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, - `"max_recall_at_precision_constraint"`, it will corresponds to a tuple of - two float values: the first one is the score of the metric which is constrained - and the second one is the score of the maximized metric. If - `strategy="constant"`, `objective_score_` is None. - - objective_scores_ : ndarray of shape (n_thresholds,) - The scores of the objective metric associated with the decision thresholds. 
- If `strategy="constant"`, `objective_scores_` is None. + `"max_recall_at_precision_constraint"`, it will corresponds to the score of the + metric which is constrained. It should be close to `constraint_value`. If + `objective_metric` is not one of the above or when `strategy="constant", + `constrained_score_` is None. + + cv_results_ : dict or None + A dictionary containing the scores and thresholds computed during the + cross-validation process. Only exist if `store_cv_results=True`. + The keys are different depending on the `objective_metric` and `strategy` used: + + * when `strategy="constant"`, `cv_results_` is None; + * when `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, + `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, + `"max_recall_at_precision_constraint"`, the keys are `"thresholds"`, + `"constrained_scores"`, and `"maximized_scores"`; + * otherwise, for score computing a single values, the keys are `"thresholds"` + and `"scores"`. classes_ : ndarray of shape (n_classes,) The class labels. @@ -452,9 +464,9 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato ... constraint_value=0.7, ... ).fit(X_train, y_train) >>> print( - ... f"Cut-off point found at {classifier_tuned.decision_threshold_:.3f} for a " - ... f"recall of {classifier_tuned.objective_score_[0]:.3f} and a precision of " - ... f"{classifier_tuned.objective_score_[1]:.3f}." + ... f"Cut-off point found at {classifier_tuned.best_threshold_:.3f} for a " + ... f"recall of {classifier_tuned.best_score_[0]:.3f} and a precision of " + ... f"{classifier_tuned.best_score_[1]:.3f}." ... ) Cut-off point found at 0.3... for a recall of 0.7... and a precision of 0.7... >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) @@ -501,6 +513,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato "refit": ["boolean"], "n_jobs": [Integral, None], "random_state": ["random_state"], + "store_cv_results": ["boolean"], } def __init__( @@ -518,6 +531,7 @@ def __init__( refit=True, n_jobs=None, random_state=None, + store_cv_results=False, ): self.estimator = estimator self.strategy = strategy @@ -531,6 +545,7 @@ def __init__( self.refit = refit self.n_jobs = n_jobs self.random_state = random_state + self.store_cv_results = store_cv_results @_fit_context( # estimators in TunedThresholdClassifier.estimator is not validated yet @@ -645,9 +660,10 @@ def fit(self, X, y, **params): if self.strategy == "constant": # early exit when we don't need to find the optimal threshold - self.decision_threshold_ = self.constant_threshold - self.decision_thresholds_ = None - self.objective_score_, self.objective_scores_ = None, None + self.best_threshold_ = self.constant_threshold + self.best_score_, self.constrained_score_ = None, None + if self.store_cv_results: + self.cv_results_ = None return self cv_thresholds, cv_scores = zip( @@ -681,11 +697,11 @@ def fit(self, X, y, **params): split_thresholds.max() for split_thresholds in cv_thresholds ) if isinstance(self.n_thresholds, Integral): - self.decision_thresholds_ = np.linspace( + decision_thresholds = np.linspace( min_threshold, max_threshold, num=self.n_thresholds ) else: - self.decision_thresholds_ = np.asarray(self.n_thresholds) + decision_thresholds = np.asarray(self.n_thresholds) def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): return np.mean( @@ -697,25 +713,27 @@ def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): ) if constraint_value 
is None: # find best score that is the highest value - self.objective_scores_ = _mean_interpolated_score( - self.decision_thresholds_, cv_thresholds, cv_scores + objective_scores = _mean_interpolated_score( + decision_thresholds, cv_thresholds, cv_scores ) - best_idx = self.objective_scores_.argmax() - self.objective_score_ = self.objective_scores_[best_idx] - self.decision_threshold_ = self.decision_thresholds_[best_idx] + best_idx = objective_scores.argmax() + self.best_score_ = objective_scores[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] + self.constrained_score_ = None + if self.store_cv_results: + self.cv_results_ = { + "thresholds": decision_thresholds, + "scores": objective_scores, + } else: if "tpr" in self.objective_metric: # tpr/tnr mean_tnr, mean_tpr = [ - _mean_interpolated_score( - self.decision_thresholds_, cv_thresholds, sc - ) + _mean_interpolated_score(decision_thresholds, cv_thresholds, sc) for sc in zip(*cv_scores) ] else: # precision/recall mean_precision, mean_recall = [ - _mean_interpolated_score( - self.decision_thresholds_, cv_thresholds, sc - ) + _mean_interpolated_score(decision_thresholds, cv_thresholds, sc) for sc in zip(*cv_scores) ] @@ -726,21 +744,24 @@ def _get_best_idx(constrained_score, maximized_score): return np.flatnonzero(mask)[mask_idx] if self.objective_metric == "max_tpr_at_tnr_constraint": - constrained_score, maximized_score = mean_tnr, mean_tpr + constrained_scores, maximized_scores = mean_tnr, mean_tpr elif self.objective_metric == "max_tnr_at_tpr_constraint": - constrained_score, maximized_score = mean_tpr, mean_tnr + constrained_scores, maximized_scores = mean_tpr, mean_tnr elif self.objective_metric == "max_precision_at_recall_constraint": - constrained_score, maximized_score = mean_recall, mean_precision + constrained_scores, maximized_scores = mean_recall, mean_precision else: # max_recall_at_precision_constraint - constrained_score, maximized_score = mean_precision, mean_recall - - self.objective_scores_ = (constrained_score, maximized_score) - best_idx = _get_best_idx(constrained_score, maximized_score) - self.objective_score_ = ( - constrained_score[best_idx], - maximized_score[best_idx], - ) - self.decision_threshold_ = self.decision_thresholds_[best_idx] + constrained_scores, maximized_scores = mean_precision, mean_recall + + best_idx = _get_best_idx(constrained_scores, maximized_scores) + self.best_score_ = maximized_scores[best_idx] + self.constrained_score_ = constrained_scores[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] + if self.store_cv_results: + self.cv_results_ = { + "thresholds": decision_thresholds, + "constrained_scores": constrained_scores, + "maximized_scores": maximized_scores, + } return self @@ -773,7 +794,7 @@ def predict(self, X): ) return _threshold_scores_to_class_labels( - y_score, self.decision_threshold_, self.classes_, pos_label + y_score, self.best_threshold_, self.classes_, pos_label ) @available_if(_estimator_has("predict_proba")) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index ce423f82b5654..233a1e5813d59 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -503,12 +503,13 @@ def test_tuned_threshold_classifier_without_constraint_value(response_method): objective_metric="balanced_accuracy", response_method=response_method, n_thresholds=n_thresholds, + store_cv_results=True, ) 
score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) score_baseline = balanced_accuracy_score(y, lr.predict(X)) assert score_optimized > score_baseline - assert model.decision_thresholds_.shape == (n_thresholds,) - assert model.objective_scores_.shape == (n_thresholds,) + assert model.cv_results_["thresholds"].shape == (n_thresholds,) + assert model.cv_results_["scores"].shape == (n_thresholds,) @pytest.mark.parametrize( @@ -552,12 +553,8 @@ def test_tuned_threshold_classifier_metric_with_parameter(): estimator=lr, objective_metric=make_scorer(f1_score) ).fit(X, y) - assert model_fbeta_1.decision_threshold_ == pytest.approx( - model_f1.decision_threshold_ - ) - assert model_fbeta_1.decision_threshold_ != pytest.approx( - model_fbeta_2.decision_threshold_ - ) + assert model_fbeta_1.best_threshold_ == pytest.approx(model_f1.best_threshold_) + assert model_fbeta_1.best_threshold_ != pytest.approx(model_fbeta_2.best_threshold_) @pytest.mark.parametrize( @@ -709,10 +706,12 @@ def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint constraint_value=constraint_value, response_method=response_method, n_thresholds=n_thresholds, + store_cv_results=True, ) model.fit(X, y) - assert model.decision_thresholds_.shape == (n_thresholds,) - assert all(score.shape == (n_thresholds,) for score in model.objective_scores_) + assert model.cv_results_["thresholds"].shape == (n_thresholds,) + assert model.cv_results_["constrained_scores"].shape == (n_thresholds,) + assert model.cv_results_["maximized_scores"].shape == (n_thresholds,) if response_method in ("auto", "predict_proba"): # "auto" will fall back in priority on `predict_proba` if `estimator` @@ -722,9 +721,9 @@ def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint "max_tnr_at_tpr_constraint", "max_precision_at_recall_constraint", ): - assert 0.5 <= model.decision_threshold_ <= 1 + assert 0.5 <= model.best_threshold_ <= 1 else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" - assert 0 <= model.decision_threshold_ <= 0.5 + assert 0 <= model.best_threshold_ <= 0.5 else: # "decision_function" # we expect the decision function to be centered in 0.0 and to be larger than # -1 and 1. 
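To make the `store_cv_results` flag and the renamed attributes concrete, here is a minimal usage sketch written against the API as introduced in the diffs above (parameter and attribute names are the ones used at this point of the series, not necessarily a final public API):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import TunedThresholdClassifier

    X, y = make_classification(n_samples=1_000, weights=[0.8, 0.2], random_state=0)

    tuned_model = TunedThresholdClassifier(
        LogisticRegression(),
        objective_metric="balanced_accuracy",
        response_method="predict_proba",
        n_thresholds=100,
        store_cv_results=True,
    ).fit(X, y)

    # Best decision threshold and its cross-validated score.
    print(f"{tuned_model.best_threshold_=:.3f}")
    print(f"{tuned_model.best_score_=:.3f}")

    # The full curve used to select the threshold is kept in `cv_results_`.
    assert tuned_model.cv_results_["thresholds"].shape == (100,)
    assert tuned_model.cv_results_["scores"].shape == (100,)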
@@ -732,9 +731,9 @@ def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint "max_tnr_at_tpr_constraint", "max_precision_at_recall_constraint", ): - assert 0 < model.decision_threshold_ < 20 + assert 0 < model.best_threshold_ < 20 else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" - assert -20 < model.decision_threshold_ < 0 + assert -20 < model.best_threshold_ < 0 @pytest.mark.usefixtures("enable_slep006") @@ -807,11 +806,11 @@ def test_tuned_threshold_classifier_pos_label_precision_recall( # due to internal interpolation, the scores will vary slightly if objective_metric == "max_precision_at_recall_constraint": - assert recall == pytest.approx(model.objective_score_[0], abs=1e-3) - assert precision == pytest.approx(model.objective_score_[1], abs=1e-3) + assert precision == pytest.approx(model.best_score_, abs=1e-3) + assert recall == pytest.approx(model.constrained_score_, abs=1e-3) else: - assert precision == pytest.approx(model.objective_score_[0], abs=1e-3) - assert recall == pytest.approx(model.objective_score_[1], abs=1e-3) + assert recall == pytest.approx(model.best_score_, abs=1e-3) + assert precision == pytest.approx(model.constrained_score_, abs=1e-3) @pytest.mark.parametrize( @@ -848,11 +847,11 @@ def tnr_tpr_score(y_true, y_pred, pos_label=pos_label): tnr, tpr = tnr_tpr_score(y, model.predict(X), pos_label=pos_label) # due to internal interpolation, the scores will vary slightly if objective_metric == "max_tnr_at_tpr_constraint": - assert tpr == pytest.approx(model.objective_score_[0], abs=0.05) - assert tnr == pytest.approx(model.objective_score_[1], abs=0.05) + assert tnr == pytest.approx(model.best_score_, abs=0.05) + assert tpr == pytest.approx(model.constrained_score_, abs=0.05) else: - assert tnr == pytest.approx(model.objective_score_[0], abs=0.05) - assert tpr == pytest.approx(model.objective_score_[1], abs=0.05) + assert tpr == pytest.approx(model.best_score_, abs=0.05) + assert tnr == pytest.approx(model.constrained_score_, abs=0.05) @pytest.mark.parametrize( @@ -887,7 +886,7 @@ def test_tuned_threshold_classifier_pos_label_single_metric(pos_label, metric_ty ).fit(X, y) precision = precision_score(y, model.predict(X), pos_label=pos_label) - assert precision == pytest.approx(model.objective_score_, abs=1e-3) + assert precision == pytest.approx(model.best_score_, abs=1e-3) @pytest.mark.parametrize( @@ -905,8 +904,8 @@ def test_tuned_threshold_classifier_constant_strategy(predict_method): tuned_model = TunedThresholdClassifier( estimator, strategy="constant", constant_threshold=constant_threshold ).fit(X, y) - assert tuned_model.decision_threshold_ == pytest.approx(constant_threshold) - for attribute in ("decision_thresholds_", "objective_score_", "objective_scores_"): + assert tuned_model.best_threshold_ == pytest.approx(constant_threshold) + for attribute in ("best_score_", "constrained_score_"): assert getattr(tuned_model, attribute) is None assert_allclose( @@ -921,9 +920,12 @@ def test_tuned_threshold_classifier_n_thresholds_array(): estimator = LogisticRegression() n_thresholds = np.linspace(0, 1, 11) tuned_model = TunedThresholdClassifier( - estimator, n_thresholds=n_thresholds, response_method="predict_proba" + estimator, + n_thresholds=n_thresholds, + response_method="predict_proba", + store_cv_results=True, ).fit(X, y) - assert_allclose(tuned_model.decision_thresholds_, n_thresholds) + assert_allclose(tuned_model.cv_results_["thresholds"], n_thresholds) def test_tuned_threshold_classifier_cv_float(): From 
a32c151e0d20fcdfed043a5904a4896cbd3b66ba Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 23 Mar 2024 12:36:14 +0100 Subject: [PATCH 137/194] fix example --- .../plot_cost_sensitive_learning.py | 27 ++++++++++++------- .../plot_tuned_decision_threshold.py | 21 ++++++++------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 971e4e357361f..6f62caab51fd5 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -269,6 +269,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): estimator=model, pos_label=pos_label, objective_metric=scoring["cost_gain"], + store_cv_results=True, # necessary to inspect all results ) tuned_model.fit(X_train, y_train) @@ -329,11 +330,13 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): axs[1].legend() axs[2].plot( - tuned_model.decision_thresholds_, tuned_model.objective_scores_, color="tab:orange" + tuned_model.cv_results_["thresholds"], + tuned_model.cv_results_["scores"], + color="tab:orange", ) axs[2].plot( - tuned_model.decision_threshold_, - tuned_model.objective_score_, + tuned_model.best_threshold_, + tuned_model.best_score_, "o", markersize=10, color="tab:orange", @@ -442,11 +445,13 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): axs[1].legend() axs[2].plot( - tuned_model.decision_thresholds_, tuned_model.objective_scores_, color="tab:orange" + tuned_model.cv_results_["thresholds"], + tuned_model.cv_results_["scores"], + color="tab:orange", ) axs[2].plot( - tuned_model.decision_threshold_, - tuned_model.objective_score_, + tuned_model.best_threshold_, + tuned_model.best_score_, "o", markersize=10, color="tab:orange", @@ -490,7 +495,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): for idx, (est, linestyle, marker, color, name) in enumerate( zip((model, tuned_model), linestyles, markerstyles, colors, names) ): - decision_threshold = getattr(est, "decision_threshold_", 0.5) + decision_threshold = getattr(est, "best_threshold_", 0.5) PrecisionRecallDisplay.from_estimator( est, X_test, @@ -535,11 +540,13 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): axs[1].legend() axs[2].plot( - tuned_model.decision_thresholds_, tuned_model.objective_scores_, color="tab:orange" + tuned_model.cv_results_["thresholds"], + tuned_model.cv_results_["scores"], + color="tab:orange", ) axs[2].plot( - tuned_model.decision_threshold_, - tuned_model.objective_score_, + tuned_model.best_threshold_, + tuned_model.best_score_, "o", markersize=10, color="tab:orange", diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index 61930f37b1b6d..dba14ebabd1f1 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -152,7 +152,7 @@ # %% # Only the decision threshold of each model was changed during the cross-validation. 
decision_threshold = pd.Series( - [est.decision_threshold_ for est in cv_results_tuned_model["estimator"]], + [est.best_threshold_ for est in cv_results_tuned_model["estimator"]], ) ax = decision_threshold.plot.kde() ax.axvline( @@ -262,7 +262,7 @@ def fpr_score(y, y_pred, neg_label, pos_label): marker=">", markersize=10, color="tab:orange", - label=f"Cut-off point at probability of {tuned_model.decision_threshold_:.2f}", + label=f"Cut-off point at probability of {tuned_model.best_threshold_:.2f}", ) disp.ax_.legend() _ = disp.ax_.set_title("ROC curves") @@ -293,6 +293,7 @@ def fpr_score(y, y_pred, neg_label, pos_label): objective_metric="max_tpr_at_tnr_constraint", constraint_value=constraint_value, pos_label=pos_label, + store_cv_results=True, ) tuned_model.fit(data_train, target_train) @@ -303,18 +304,18 @@ def fpr_score(y, y_pred, neg_label, pos_label): _, axs = plt.subplots(ncols=2, figsize=(12, 5)) disp = RocCurveDisplay( - fpr=1 - tuned_model.objective_scores_[0], - tpr=tuned_model.objective_scores_[1], + fpr=1 - tuned_model.cv_results_["constrained_scores"], + tpr=tuned_model.cv_results_["maximized_scores"], estimator_name="ROC of the tuned model", pos_label=pos_label, ) axs[0].plot( - 1 - tuned_model.objective_score_[0], - tuned_model.objective_score_[1], + 1 - tuned_model.constrained_score_, + tuned_model.best_score_, marker="o", markersize=10, color="tab:blue", - label=f"Cut-off point at probability of {tuned_model.decision_threshold_:.2f}", + label=f"Cut-off point at probability of {tuned_model.best_threshold_:.2f}", ) axs[0].axvline( 1 - constraint_value, 0, 1, color="tab:blue", linestyle="--", label="FPR constraint" @@ -347,12 +348,12 @@ def fpr_score(y, y_pred, neg_label, pos_label): label="Default cut-off point at a probability of 0.5", ) axs[1].plot( - 1 - tuned_model.objective_score_[0], - tuned_model.objective_score_[1], + 1 - tuned_model.constrained_score_, + tuned_model.best_score_, marker="^", markersize=10, color="tab:orange", - label=f"Cut-off point at probability of {tuned_model.decision_threshold_:.2f}", + label=f"Cut-off point at probability of {tuned_model.best_threshold_:.2f}", ) axs[1].legend() axs[1].set_title("ROC curves") From 759243734b973dc6f3869d7d8eee7ef4bb097b59 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 23 Mar 2024 12:44:31 +0100 Subject: [PATCH 138/194] simplify model --- .../plot_cost_sensitive_learning.py | 42 +++---------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 6f62caab51fd5..98667581d2431 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -145,45 +145,13 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # Vanilla predictive model # ^^^^^^^^^^^^^^^^^^^^^^^^ # -# We first design our predictive model consisting of a -# :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. We encode the categorical -# features with an :class:`~sklearn.preprocessing.OrdinalEncoder` but the numerical -# features are kept as they are. To identify the categorical columns, we use the helper -# function :func:`~sklearn.compose.make_column_selector` and the fact that the -# categorical features are stored as `category` dtype. 
-from sklearn.compose import ColumnTransformer -from sklearn.compose import make_column_selector as selector +# We use :class:`~sklearn.ensemble.HistGradientBoostingClassifier` as a predictive model +# that natively handles categorical features and missing values. from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OrdinalEncoder - -categorical_columns = selector(dtype_include="category")(X) -numerical_columns = selector(dtype_exclude="category")(X) - -preprocessor = ColumnTransformer( - [ - ( - "categorical", - OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), - categorical_columns, - ), - ("numerical", "passthrough", numerical_columns), - ], - verbose_feature_names_out=False, -) -model = Pipeline( - [ - ("preprocessor", preprocessor), - ( - "classifier", - HistGradientBoostingClassifier( - categorical_features=categorical_columns, random_state=0 - ), - ), - ] -) -model.fit(X_train, y_train) +model = HistGradientBoostingClassifier( + categorical_features="from_dtype", random_state=0 +).fit(X_train, y_train) # %% # We evaluate the performance of our predictive model using the ROC and Precision-Recall From dc5346b482a00aa4c83b8c2128d1447bfcec18b9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 23 Mar 2024 12:45:28 +0100 Subject: [PATCH 139/194] fix --- examples/model_selection/plot_cost_sensitive_learning.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 98667581d2431..d559db5cf0c9b 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -152,6 +152,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): model = HistGradientBoostingClassifier( categorical_features="from_dtype", random_state=0 ).fit(X_train, y_train) +model # %% # We evaluate the performance of our predictive model using the ROC and Precision-Recall From 843ca04991f5c4ad4ed53a9621fb0b8ebcd555dd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 23 Mar 2024 12:48:38 +0100 Subject: [PATCH 140/194] fix docstring --- doc/modules/classification_threshold.rst | 2 +- sklearn/model_selection/_classification_threshold.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 7d1a51e3e7b8f..7ef5432aec501 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -109,7 +109,7 @@ In this example, we maximize the balanced accuracy. >>> scorer(model, X_test, y_test) 0.79... >>> # compare it with the internal score found by cross-validation - >>> model.objective_score_ + >>> model.best_score_ 0.86... A second strategy aims to maximize one metric while imposing constraints on another diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 2a942fb2cddfd..1395de8956d2c 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -465,8 +465,8 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato ... ).fit(X_train, y_train) >>> print( ... f"Cut-off point found at {classifier_tuned.best_threshold_:.3f} for a " - ... f"recall of {classifier_tuned.best_score_[0]:.3f} and a precision of " - ... 
f"{classifier_tuned.best_score_[1]:.3f}." + ... f"recall of {classifier_tuned.constrained_score_:.3f} and a precision of " + ... f"{classifier_tuned.best_score_:.3f}." ... ) Cut-off point found at 0.3... for a recall of 0.7... and a precision of 0.7... >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) From 51ed9a8d782dcebad8a093b4cb37502fddfa054c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Mar 2024 14:01:50 +0100 Subject: [PATCH 141/194] Apply suggestions from code review Co-authored-by: Olivier Grisel --- .../plot_cost_sensitive_learning.py | 124 ++++++++++++------ 1 file changed, 85 insertions(+), 39 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index d559db5cf0c9b..e34e2beba8d15 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -1,7 +1,7 @@ """ -=============================================================== -Post-tuning decision threshold based on cost-sensitive learning -=============================================================== +============================================================== +Post-tuning the decision threshold for cost-sensitive learning +============================================================== Once a classifier is trained, the output of the :term:`predict` method outputs class label predictions corresponding to a thresholding of either the :term:`decision @@ -14,12 +14,15 @@ In this dataset, the task is to predict whether a person has a "good" or "bad" credit. In addition, a cost-matrix is provided that specifies the cost of misclassification. Specifically, misclassifying a "bad" credit as "good" is five -times more costly than misclassifying a "good" credit as "bad". +times more costly on average than misclassifying a "good" credit as "bad". We use the :class:`~sklearn.model_selection.TunedThresholdClassifier` to select the cut-off point of the decision function that minimizes the provided business cost. +In the second part of the example, we further extend this approach by +considering the problem of fraud detection in credit card transactions: in this +case, the business metric depends on the amount of each individual transaction. .. topic:: References .. [1] "Statlog (German Credit Data) Data Set", UCI Machine Learning Repository, @@ -29,7 +32,7 @@ .. [2] `Charles Elkan, "The Foundations of Cost-Sensitive Learning", International joint conference on artificial intelligence. Vol. 17. No. 1. Lawrence Erlbaum Associates Ltd, 2001. - `_ + `_ """ # %% @@ -89,9 +92,10 @@ # In this section, we define a set of metrics that we use later. To see # the effect of tuning the cut-off point, we evaluate the predictive model using # the Receiver Operating Characteristic (ROC) curve and the Precision-Recall curve. -# The values reported on these plots are therefore the true positive rate (TPR) and -# the false positive rate (FPR) for the ROC curve and the precision and recall for the -# Precision-Recall curve. +# The values reported on these plots are therefore the true positive rate (TPR), +# also known as the recall or the sensitivity, and the false positive rate (FPR), +# also known as the specificity, for the ROC curve and the precision and recall for +# the Precision-Recall curve. # # From these four metrics, scikit-learn does not provide a scorer for the FPR. We # therefore need to define a small custom function to compute it. 
@@ -125,17 +129,42 @@ def fpr_score(y, y_pred, neg_label, pos_label):
 }
 
 # %%
-# In addition, the original research [1]_ defines a business metric. They provide a
-# cost-matrix which encodes that predicting a "bad" credit as "good" is 5 times more
-# costly than the opposite. We define a python function that weight the confusion
-# matrix and return the overall cost.
+# In addition, the original research [1]_ defines a custom business metric. We
+# call a "business metric" any metric function that aims at quantifying how the
+# predictions (correct or wrong) might impact the business value of deploying a
+# given machine learning model in a specific application context. For our
+# credit prediction task, the authors provide a custom cost-matrix which
+# encodes that classifying a "bad" credit as "good" is 5 times more costly on
+# average than the opposite: it is less costly for the financing institution to
+# not grant a credit to a potential customer that will not default (and
+# therefore miss a good customer that would have otherwise both reimbursed the
+# credit and paid interest) than to grant a credit to a customer that will
+# default.
+#
+# We define a Python function that weights the confusion matrix and returns the
+# overall cost.
 import numpy as np
 
 
 def gain_cost_score(y, y_pred, neg_label, pos_label):
     cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label])
-    cost_matrix = np.array([[0, -1], [-5, 0]])
-    return np.sum(cm * cost_matrix)
+    # The rows of the confusion matrix hold the counts of observed classes
+    # while the columns hold counts of predicted classes. Recall that here
+    # we consider "bad" as the positive class (second row and column).
+    # Scikit-learn model selection tools expect that we follow a convention
+    # that "higher" means "better", hence the following gain matrix assigns
+    # negative gains (costs) to the two kinds of prediction errors:
+    # - a gain of -1 for each false positive ("good" credit labeled as "bad"),
+    # - a gain of -5 for each false negative ("bad" credit labeled as "good").
+    # The true positives and true negatives are assigned null gains in this
+    # metric.
+    gain_matrix = np.array(
+        [
+            [0, -1],  # -1 gain for false positives
+            [-5, 0],  # -5 gain for false negatives
+        ]
+    )
+    return np.sum(cm * gain_matrix)
 
 
 scoring["cost_gain"] = make_scorer(
@@ -206,12 +235,13 @@ def gain_cost_score(y, y_pred, neg_label, pos_label):
 #
 # Here, the different cut-off points correspond to different levels of posterior
 # probability estimates ranging between 0 and 1. By default, `model.predict` uses a
-# cut-off point at a probability estimate of 0.5. The metrics for such cut-off point are
-# reported with the blue dot on the curves: it corresponds to the statistical
+# cut-off point at a probability estimate of 0.5. The metrics for such a cut-off point
+# are reported with the blue dot on the curves: it corresponds to the statistical
 # performance of the model when using `model.predict`.
 #
 # However, we recall that the original aim was to minimize the cost (or maximize the
-# gain) by the business metric. We can compute the value of the business metric:
+# gain) as defined by the business metric. 
We can compute the value of the business +# metric: print(f"Business defined metric: {scoring['cost_gain'](model, X_test, y_test)}") # %% @@ -241,6 +271,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): store_cv_results=True, # necessary to inspect all results ) tuned_model.fit(X_train, y_train) +print(f"{tuned_model.best_threshold_=:0.2f}") # %% # We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. @@ -254,7 +285,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): for idx, (est, linestyle, marker, color, name) in enumerate( zip((model, tuned_model), linestyles, markerstyles, colors, names) ): - decision_threshold = getattr(est, "decision_threshold_", 0.5) + decision_threshold = getattr(est, "best_threshold_", 0.5) PrecisionRecallDisplay.from_estimator( est, X_test, @@ -328,22 +359,25 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # different. To understand why the tuned model has chosen this cut-off point, we can # look at the right-hand side plot that plots the objective score that is our exactly # the same as our business metric. We see that the optimum threshold corresponds to the -# maximum of the objective score. +# maximum of the objective score. This maximum is reached for a decision threshold +# much lower than 0.5: the tuned model enjoys a much higher recall at the cost of +# of significantly lower precision: the tuned model is much more eager to +# predict the "bad" class label to larger fraction of individuals. # # We can now check if choosing this cut-off point leads to a better score on the testing # set: print(f"Business defined metric: {scoring['cost_gain'](tuned_model, X_test, y_test)}") # %% -# We observe that the decision generalized on the testing set leading to a better -# business score. +# We observe that tuning the decision threshold almost improves our business gains +# by factor of 2. # # .. _tunedthresholdclassifier_no_cv: # # Consideration regarding model refitting and cross-validation # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # -# In the above experiment, we use the default setting of the +# In the above experiment, we used the default setting of the # :class:`~sklearn.model_selection.TunedThresholdClassifier`. In particular, the cut-off # point is tuned using a 5-fold stratified cross-validation. Also, the # underlying predictive model is refitted on the entire training data once the @@ -356,7 +390,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): # can try to do such experiment. model.fit(X_train, y_train) tuned_model.set_params(cv="prefit", refit=False).fit(X_train, y_train) - +print(f"{tuned_model.best_threshold_=:0.2f}") # %% # Then, we evaluate our model with the same approach as before: @@ -369,7 +403,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): for idx, (est, linestyle, marker, color, name) in enumerate( zip((model, tuned_model), linestyles, markerstyles, colors, names) ): - decision_threshold = getattr(est, "decision_threshold_", 0.5) + decision_threshold = getattr(est, "best_threshold_", 0.5) PrecisionRecallDisplay.from_estimator( est, X_test, @@ -434,21 +468,22 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): _ = fig.suptitle("Tuned GBDT model without refitting and using the entire dataset") # %% -# We observe the that the optimum cut-off point is different than in the previous -# experiment. If we look at the right-hand side plot, we observe that the objective -# score has large plateau with a minimum cost (around 0). 
This behavior is symptomatic
-# of an overfitting. Because we disable cross-validation, we tuned the cut-off point on
-# the same set as the model was trained on, and this is the reason for the observed
-# overfitting.
+# We observe that the optimum cut-off point is different from the one found
+# in the previous experiment. If we look at the right-hand side plot, we
+# observe that the business gain has a large plateau of near-optimal gain (close
+# to 0) for a large span of decision thresholds. This behavior is symptomatic of
+# overfitting. Because we disable cross-validation, we tuned the cut-off point
+# on the same set as the model was trained on, and this is the reason for the
+# observed overfitting.
 #
 # This option should therefore be used with caution. One needs to make sure that the
-# data providing at fitting time to the
+# data provided at fitting time to the
 # :class:`~sklearn.model_selection.TunedThresholdClassifier` is not the same as the data
 # used to train the underlying classifier. This could happen sometimes when the idea is
 # just to tune the predictive model on a completely new validation set without a costly
 # complete refit.
 #
-# In the case that cross-validation is too costly, a potential alternative is to use a
+# When cross-validation is too costly, a potential alternative is to use a
 # single train-test split by providing a floating number in range `[0, 1]` to the `cv`
 # parameter. It splits the data into a training and testing set. Let's explore this
 # option:
@@ -544,8 +579,8 @@ def gain_cost_score(y, y_pred, neg_label, pos_label):
 # -------------------------------------------------------------
 #
 # As stated in [2]_, gains and costs are generally not constant in real-world problems.
-# In this section, we use a similar example as in [2]_ by using credit cards
-# records.
+# In this section, we use a similar example as in [2]_ for the problem of
+# detecting fraud in credit card transaction records.
 #
 # The credit card dataset
 # ^^^^^^^^^^^^^^^^^^^^^^^
@@ -669,13 +704,16 @@ def business_metric(y_true, y_pred, amount):
 #
 # Let's now create a predictive model using a logistic regression without tuning the
 # decision threshold.
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegressionCV
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 
-model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42)).fit(
-    data_train, target_train
-)
+model = make_pipeline(
+    StandardScaler(),
+    LogisticRegressionCV(
+        random_state=42, Cs=np.logspace(-6, 6, 13), scoring="roc_auc"
+    ),
+).fit(data_train, target_train)
 
 print(
     "Benefit/cost of our logistic regression: "
@@ -721,8 +759,16 @@ def business_metric(y_true, y_pred, amount):
 )
 
 # %%
-# We observe that tuning the decision threshold increases the profit of our model.
+# We observe that tuning the decision threshold increases the expected profit of
+# deploying our model as estimated by the business metric.
 # Eventually, the balanced accuracy also increased. Note that it might not always be
 # the case because the statistical metric is not necessarily a surrogate of the
 # business metric. It is therefore important, whenever possible, optimize the decision
 # threshold with respect to the business metric.
+#
+# Finally, the estimate of the business metric itself can be unreliable, in
+# particular when the number of data points in the minority class is very small.
+# Any business impact estimated by cross-validation of a business metric on +# historical data (offline evaluation) should ideally be confirmed by A/B testing +# on live data (online evaluation). Note however that A/B testing models is +# beyond the scope of the scikit-learn library itself. From 3c89ab3ac860d66e296124c9fa9984b1186abf6a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 Apr 2024 13:47:44 +0200 Subject: [PATCH 142/194] Apply suggestions from code review Co-authored-by: Christian Lorentzen --- doc/modules/classification_threshold.rst | 48 +++++++++---------- doc/whats_new/v1.5.rst | 4 +- .../plot_cost_sensitive_learning.py | 4 +- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 7ef5432aec501..c8bcfe7ed036a 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -2,23 +2,23 @@ .. _tunedthresholdclassifier: -====================================================== -Tuning cut-off decision threshold for class prediction -====================================================== +================================================== +Tuning the decision threshold for class prediction +================================================== -Classifiers are predictive models: they use statistical learning to predict outcomes. -The outcomes of a classifier are scores for each sample in relation to each class and -categorical prediction (class label). Scores are obtained from :term:`predict_proba` or +Classifiers are predictive models: they use statistical learning to predict categorical outcomes. +The predictions of a classifier are, ideally, the probabilities of the class labels or, more generally, scores for each sample with a higher score meaning higher probability for the corresponding class. +Scores are obtained from :term:`predict_proba` or :term:`decision_function`. The former returns posterior probability estimates for each -class, while the latter returns a decision score for each class. The decision score is a -measure of how strongly the sample is predicted to belong to the positive class (e.g., -the distance to the decision boundary). In binary classification, a decision rule is -then defined by thresholding the scores, leading to a single class label for each +class, while the latter returns a decision score for each class. + +In binary classification, a decision rule or action is +then defined by thresholding the scores, leading to the prediction of a single class label for each sample. Those labels are obtained with :term:`predict`. -For binary classification in scikit-learn, class labels are obtained by associating the -positive class with posterior probability estimates greater than 0.5 (obtained with -:term:`predict_proba`) or decision scores greater than 0 (obtained with +For binary classification in scikit-learn, class labels predictions are obtained by hard-coded cut-off rules: +a positive class is predicted when the posterior probability is greater than 0.5 (obtained with +:term:`predict_proba`) or if the decision score is greater than 0 (obtained with :term:`decision_function`). Here, we show an example that illustrates the relation between posterior @@ -36,19 +36,17 @@ probability estimates and class labels:: >>> classifier.predict(X[:4]) array([0, 0, 1, 1]) -While these approaches are reasonable as default behaviors, they are not ideal for -all cases. Let's illustrate with an example. 
- -Let's consider a scenario where a predictive model is being deployed to assist medical -doctors in detecting tumors. In this setting, doctors will be most likely interested in -correctly identifying all patients with cancer so that they can provide them with the -right treatment. In other words, doctors prioritize achieving a high recall rate, -meaning they want to identify all cases of cancer without missing any patients who have -it. This emphasis on recall comes, of course, with the trade-off of potentially more -false-positive predictions, reducing the precision of the model, but that is a risk -doctors are willing to take. Consequently, when it comes to deciding whether to classify +While these hard-coded rules might at first seem reasonable as default behavior, they are most certainly +not ideal for most use cases. Let's illustrate with an example. + +Let's consider a scenario where a predictive model is being deployed to assist physicians +in detecting tumors. In this setting, physicians will be most likely interested in +identifying all patients with cancer and not missing anyone with cancer so that they can provide them with the +right treatment. In other words, physicians prioritize achieving a high recall rate. This emphasis on recall comes, of course, with the trade-off of potentially more +false-positive predictions, reducing the precision of the model. That is a risk +physicians are willing to take because the cost of a missed cancer is much higher than the cost of further diagnostic tests. Consequently, when it comes to deciding whether to classify a patient as having cancer or not, it may be more beneficial to classify them as -positive for cancer when the posterior probability estimate is lower than 0.5. +positive for cancer when the posterior probability estimate is much lower than 0.5. Post-tuning the decision threshold ================================== diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 44249a5ad8ce2..30573f1bd87ac 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -258,8 +258,8 @@ Changelog :mod:`sklearn.model_selection` .............................. -- |MajorFeature| :class:`model_selection.TunedThresholdClassifier` calibrates - the decision threshold function of a binary classifier by maximizing a +- |MajorFeature| :class:`model_selection.TunedThresholdClassifier` finds + the decision threshold of a binary classifier that maximizes a classification metric through cross-validation. :pr:`26120` by :user:`Guillaume Lemaitre `. diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index e34e2beba8d15..dd996053fddfa 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -9,7 +9,7 @@ threshold is defined as a posterior probability estimate of 0.5 or a decision score of 0.0. -However, this default strategy may not be optimal for the task at hand. +However, this default strategy is most likely not optimal for the task at hand. Here, we use the "Statlog" German credit dataset [1]_ to illustrate a use case. In this dataset, the task is to predict whether a person has a "good" or "bad" credit. In addition, a cost-matrix is provided that specifies the cost of @@ -699,7 +699,7 @@ def business_metric(y_true, y_pred, amount): # %% # This is not a surprise that the balanced accuracy is at 0.5 for both classifiers. 
# However, we need to be careful in the rest of the evaluation: we potentially can -# obtain a model with a decent balanced accuracy but that does not make any profit. +# obtain a model with a decent balanced accuracy that does not make any profit. # In this case, the model would be useless for our business. # # Let's now create a predictive model using a logistic regression without tuning the From 8cd5582db810cec4d5975c2fa6668cd3d23b6412 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 Apr 2024 13:50:13 +0200 Subject: [PATCH 143/194] pep8 --- doc/modules/classification_threshold.rst | 54 ++++++++++--------- .../plot_cost_sensitive_learning.py | 6 +-- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index c8bcfe7ed036a..e21e300a36063 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -6,20 +6,21 @@ Tuning the decision threshold for class prediction ================================================== -Classifiers are predictive models: they use statistical learning to predict categorical outcomes. -The predictions of a classifier are, ideally, the probabilities of the class labels or, more generally, scores for each sample with a higher score meaning higher probability for the corresponding class. -Scores are obtained from :term:`predict_proba` or -:term:`decision_function`. The former returns posterior probability estimates for each -class, while the latter returns a decision score for each class. - -In binary classification, a decision rule or action is -then defined by thresholding the scores, leading to the prediction of a single class label for each -sample. Those labels are obtained with :term:`predict`. - -For binary classification in scikit-learn, class labels predictions are obtained by hard-coded cut-off rules: -a positive class is predicted when the posterior probability is greater than 0.5 (obtained with -:term:`predict_proba`) or if the decision score is greater than 0 (obtained with -:term:`decision_function`). +Classifiers are predictive models: they use statistical learning to predict categorical +outcomes. The predictions of a classifier are, ideally, the probabilities of the class +labels or, more generally, scores for each sample with a higher score meaning higher +probability for the corresponding class. Scores are obtained from :term:`predict_proba` +or :term:`decision_function`. The former returns posterior probability estimates for +each class, while the latter returns a decision score for each class. + +In binary classification, a decision rule or action is then defined by thresholding the +scores, leading to the prediction of a single class label for each sample. Those labels +are obtained with :term:`predict`. + +For binary classification in scikit-learn, class labels predictions are obtained by +hard-coded cut-off rules: a positive class is predicted when the posterior probability +is greater than 0.5 (obtained with :term:`predict_proba`) or if the decision score is +greater than 0 (obtained with :term:`decision_function`). Here, we show an example that illustrates the relation between posterior probability estimates and class labels:: @@ -36,17 +37,20 @@ probability estimates and class labels:: >>> classifier.predict(X[:4]) array([0, 0, 1, 1]) -While these hard-coded rules might at first seem reasonable as default behavior, they are most certainly -not ideal for most use cases. 
Let's illustrate with an example. - -Let's consider a scenario where a predictive model is being deployed to assist physicians -in detecting tumors. In this setting, physicians will be most likely interested in -identifying all patients with cancer and not missing anyone with cancer so that they can provide them with the -right treatment. In other words, physicians prioritize achieving a high recall rate. This emphasis on recall comes, of course, with the trade-off of potentially more -false-positive predictions, reducing the precision of the model. That is a risk -physicians are willing to take because the cost of a missed cancer is much higher than the cost of further diagnostic tests. Consequently, when it comes to deciding whether to classify -a patient as having cancer or not, it may be more beneficial to classify them as -positive for cancer when the posterior probability estimate is much lower than 0.5. +While these hard-coded rules might at first seem reasonable as default behavior, they +are most certainly not ideal for most use cases. Let's illustrate with an example. + +Let's consider a scenario where a predictive model is being deployed to assist +physicians in detecting tumors. In this setting, physicians will be most likely +interested in identifying all patients with cancer and not missing anyone with cancer so +that they can provide them with the right treatment. In other words, physicians +prioritize achieving a high recall rate. This emphasis on recall comes, of course, with +the trade-off of potentially more false-positive predictions, reducing the precision of +the model. That is a risk physicians are willing to take because the cost of a missed +cancer is much higher than the cost of further diagnostic tests. Consequently, when it +comes to deciding whether to classify a patient as having cancer or not, it may be more +beneficial to classify them as positive for cancer when the posterior probability +estimate is much lower than 0.5. Post-tuning the decision threshold ================================== diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index dd996053fddfa..bb2ebbf74fe50 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -710,9 +710,7 @@ def business_metric(y_true, y_pred, amount): model = make_pipeline( StandardScaler(), - LogisticRegressionCV( - random_state=42, Cs=np.logspace(-6, 6, 13), scoring="roc_auc" - ), + LogisticRegressionCV(random_state=42, Cs=np.logspace(-6, 6, 13), scoring="roc_auc"), ).fit(data_train, target_train) print( @@ -760,7 +758,7 @@ def business_metric(y_true, y_pred, amount): # %% # We observe that tuning the decision threshold increases the expected profit of -# deploying our model as estimated by the business metric. +# deploying our model as estimated by the business metric. # Eventually, the balanced accuracy also increased. Note that it might not always be # the case because the statistical metric is not necessarily a surrogate of the # business metric. 
It is therefore important, whenever possible, optimize the decision From a48487cbbf99581601371e6072eca69834c39321 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 Apr 2024 15:27:19 +0200 Subject: [PATCH 144/194] rephrase suggestions --- doc/modules/classification_threshold.rst | 36 +++++++++++-------- .../plot_cost_sensitive_learning.py | 2 +- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index e21e300a36063..39d09bf0e2bbe 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -6,21 +6,27 @@ Tuning the decision threshold for class prediction ================================================== -Classifiers are predictive models: they use statistical learning to predict categorical -outcomes. The predictions of a classifier are, ideally, the probabilities of the class -labels or, more generally, scores for each sample with a higher score meaning higher -probability for the corresponding class. Scores are obtained from :term:`predict_proba` -or :term:`decision_function`. The former returns posterior probability estimates for -each class, while the latter returns a decision score for each class. - -In binary classification, a decision rule or action is then defined by thresholding the -scores, leading to the prediction of a single class label for each sample. Those labels -are obtained with :term:`predict`. - -For binary classification in scikit-learn, class labels predictions are obtained by -hard-coded cut-off rules: a positive class is predicted when the posterior probability -is greater than 0.5 (obtained with :term:`predict_proba`) or if the decision score is -greater than 0 (obtained with :term:`decision_function`). +Classification is best divided into two parts: + +* the statistical problem of learning a model to predict, ideally, class probabilities; +* the decision problem to take concrete action based on those probability predictions. + +Let's take a straightforward example related weather forecasting: the first point is +related to answering "what is the chance of rain tomorrow?" while the second point is +related to answering "should I take an umbrella tomorrow?". + +When it comes to the scikit-learn API, the first point is addressed providing scores +using :term:`predict_proba` or :term:`decision_function`. The former returns posterior +probability estimates for each class, while the latter returns a decision score for each +class. + +The decision corresponding to the labels are obtained with :term:`predict`. In binary +classification, a decision rule or action is then defined by thresholding the scores, +leading to the prediction of a single class label for each sample. For binary +classification in scikit-learn, class labels predictions are obtained by hard-coded +cut-off rules: a positive class is predicted when the posterior probability is greater +than 0.5 (obtained with :term:`predict_proba`) or if the decision score is greater than +0 (obtained with :term:`decision_function`). 
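As a complement to the doctest shown next in the documentation page, the hedged, self-contained sketch below (editorial addition, not part of the patch) makes the hard-coded rule explicit for a probabilistic binary classifier: thresholding the positive-class probability at 0.5 reproduces `predict`.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=0)
classifier = LogisticRegression().fit(X, y)

# Positive class is classifier.classes_[1]; its probability is column 1.
proba_positive = classifier.predict_proba(X)[:, 1]
manual_labels = np.where(
    proba_positive > 0.5, classifier.classes_[1], classifier.classes_[0]
)
print(np.array_equal(manual_labels, classifier.predict(X)))  # expected: True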
Here, we show an example that illustrates the relation between posterior probability estimates and class labels:: diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index bb2ebbf74fe50..e024435790bcf 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -700,7 +700,7 @@ def business_metric(y_true, y_pred, amount): # This is not a surprise that the balanced accuracy is at 0.5 for both classifiers. # However, we need to be careful in the rest of the evaluation: we potentially can # obtain a model with a decent balanced accuracy that does not make any profit. -# In this case, the model would be useless for our business. +# In this case, the model would be harmful for our business. # # Let's now create a predictive model using a logistic regression without tuning the # decision threshold. From 27515caf05bc4d796dbbb34a30305f2a7c03379a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Apr 2024 10:08:10 +0200 Subject: [PATCH 145/194] fix --- examples/model_selection/plot_cost_sensitive_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index e024435790bcf..f5f3566b6609c 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -146,7 +146,7 @@ def fpr_score(y, y_pred, neg_label, pos_label): import numpy as np -def gain_cost_score(y, y_pred, neg_label, pos_label): +def credit_gain_score(y, y_pred, neg_label, pos_label): cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label]) # The rows of the confusion matrix hold the counts of observed classes # while the columns hold counts of predicted classes. Recall that here @@ -168,7 +168,7 @@ def gain_cost_score(y, y_pred, neg_label, pos_label): scoring["cost_gain"] = make_scorer( - gain_cost_score, neg_label=neg_label, pos_label=pos_label + credit_gain_score, neg_label=neg_label, pos_label=pos_label ) # %% # Vanilla predictive model From 811dec92e212972d88d553a4e667930f896bb052 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Apr 2024 14:33:12 +0200 Subject: [PATCH 146/194] include and discuss more about amount --- .../plot_cost_sensitive_learning.py | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index f5f3566b6609c..f986faee4cf89 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -591,13 +591,9 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # The dataset contains information about credit card records from which some are # fraudulent and others are legitimate. The goal is therefore to predict whether or # not a credit card record is fraudulent. -# -# In addition, we have extra information regarding the amount of each card transaction. -# This information is used to define the business metric later. -columns_to_drop = ["Class", "Amount"] +columns_to_drop = ["Class"] data = credit_card.frame.drop(columns=columns_to_drop) target = credit_card.frame["Class"].astype(int) -amount = credit_card.frame["Amount"].to_numpy() # %% # First, we check the class distribution of the datasets. 
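The class-distribution check announced above boils down to `value_counts`. The self-contained sketch below (editorial addition, not part of the patch) uses a synthetic target with roughly the same prevalence; the real example inspects the OpenML credit-card `Class` column instead.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
target_toy = pd.Series(rng.binomial(n=1, p=0.0017, size=284_807), name="Class")

print(target_toy.value_counts())                # absolute counts per class
print(target_toy.value_counts(normalize=True))  # proportions: roughly 0.17% positives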
@@ -605,10 +601,17 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # %% # The dataset is highly imbalanced with fraudulent transaction representing only 0.17% -# of the data. Additionally, we check the distribution of the amount of the fraudulent -# transactions. +# of the data. Since we are interested in training a machine learning model, we should +# also make sure that we have enough samples in the minority class to train the model. +target.value_counts() + +# %% +# We observe that we have around 500 samples that is on the low end of the number of +# samples required to train a machine learning model. In addition of the target +# distribution, we check the distribution of the amount of the +# fraudulent transactions. fraud = target == 1 -amount_fraud = amount[fraud] +amount_fraud = data["Amount"][fraud] _, ax = plt.subplots() ax.hist(amount_fraud, bins=100) ax.set_title("Amount of fraud transaction") @@ -649,6 +652,16 @@ def business_metric(y_true, y_pred, amount): sklearn.set_config(enable_metadata_routing=True) business_scorer = make_scorer(business_metric).set_score_request(amount=True) +# %% +# So at this stage, we observe that the amount of the transaction is used twice: once +# as a feature to train our predictive model and once as a metadata to compute the +# the business metric and thus the statistical performance of our model. When used as a +# feature, we are only required to have a column in `data` that contains the amount of +# each transaction. To use this information as metadata, we need to have an external +# variable that we can pass to the scorer or the model that internally routes this +# metadata to the scorer. So let's create this variable. +amount = credit_card.frame["Amount"].to_numpy() + # %% # We first start to train a dummy classifier to have some baseline results. from sklearn.model_selection import train_test_split From c83b4e1a53c69cf756c22a200af423191f3f232b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Apr 2024 14:40:46 +0200 Subject: [PATCH 147/194] iter --- examples/model_selection/plot_cost_sensitive_learning.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index f986faee4cf89..7defd695a25d0 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -149,8 +149,8 @@ def fpr_score(y, y_pred, neg_label, pos_label): def credit_gain_score(y, y_pred, neg_label, pos_label): cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label]) # The rows of the confusion matrix hold the counts of observed classes - # while the columns hold counts of predicted classes. Recall that here - # we consider "bad" as the positive class (second row and column). + # while the columns hold counts of predicted classes. Recall that here we + # consider "bad" as the positive class (second row and column). # Scikit-learn model selection tools expect that we follow a convention # that "higher" means "better", hence the following gain matrix assigns # negative gains (costs) to the two kinds of prediction errors: @@ -158,6 +158,11 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # - a gain of -5 for each false negative ("bad" credit labeled as "good"), # The true positives and true negatives are assigned null gains in this # metric. 
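    # (Editorial worked example, not part of the patch.) With the gain matrix
    # defined below and a toy confusion matrix
    #     cm = [[90, 10],
    #           [ 5, 20]]
    # (rows: true "good"/"bad"; columns: predicted "good"/"bad"), the score is
    # the sum of the element-wise product of `cm` and the gain matrix:
    # 0 * 90 + (-1) * 10 + (-5) * 5 + 0 * 20 = -35.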
+ # + # Note that theoretically, given that our model is calibrated and our data + # set representative and large enough, we do not need to tune the + # threshold, but can safely set it to the cost ration 1/5, as stated by Eq. + # (2) in Elkan paper [2]_. gain_matrix = np.array( [ [0, -1], # -1 gain for false positives From d4e232f0961297f9ab07a0a336dbed3fc6c54a79 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Apr 2024 10:38:28 +0200 Subject: [PATCH 148/194] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- doc/modules/classification_threshold.rst | 8 +-- .../plot_tuned_decision_threshold.py | 59 +++++++++---------- .../_classification_threshold.py | 1 - 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 39d09bf0e2bbe..d2e66a33d5519 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -11,8 +11,8 @@ Classification is best divided into two parts: * the statistical problem of learning a model to predict, ideally, class probabilities; * the decision problem to take concrete action based on those probability predictions. -Let's take a straightforward example related weather forecasting: the first point is -related to answering "what is the chance of rain tomorrow?" while the second point is +Let's take a straightforward example related to weather forecasting: the first point is +related to answering "what is the chance that it will rain tomorrow?" while the second point is related to answering "should I take an umbrella tomorrow?". When it comes to the scikit-learn API, the first point is addressed providing scores @@ -46,8 +46,8 @@ probability estimates and class labels:: While these hard-coded rules might at first seem reasonable as default behavior, they are most certainly not ideal for most use cases. Let's illustrate with an example. -Let's consider a scenario where a predictive model is being deployed to assist -physicians in detecting tumors. In this setting, physicians will be most likely +Consider a scenario where a predictive model is being deployed to assist +physicians in detecting tumors. In this setting, physicians will most likely be interested in identifying all patients with cancer and not missing anyone with cancer so that they can provide them with the right treatment. In other words, physicians prioritize achieving a high recall rate. This emphasis on recall comes, of course, with diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index dba14ebabd1f1..8be9257d98554 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -3,15 +3,14 @@ Post-tuning the cut-off point of decision function ================================================== -Once a classifier is trained, the output of the :term:`predict` method output class -label predictions corresponding to a thresholding of either the :term:`decision -function` or the :term:`predict_proba` output. For a binary classifier, the default +Once a binary classifier is trained, the :term:`predict` method outputs class +label predictions corresponding to a thresholding of either the :term:`decision_function` or the :term:`predict_proba` output. 
The default threshold is defined as a posterior probability estimate of 0.5 or a decision score of 0.0. However, this default strategy may not be optimal for the task at hand. This example shows how to use the -:class:`~sklearn.model_selection.TunedThresholdClassifier` to tune the decision function -threshold, depending on a metric of interest as well as under a specific constraint. +:class:`~sklearn.model_selection.TunedThresholdClassifier` to tune the decision +threshold, depending on a metric of interest as well as under a specific constraints. """ # %% @@ -31,9 +30,9 @@ target.value_counts() # %% -# We see that we are dealing with a binary classification problem. Since the labels are -# not encoded as 0 and 1, we will store which label we considered the negative class -# and which one we considered the positive class: "tested_negative" will be considered +# We can see that we are dealing with a binary classification problem. Since the labels are +# not encoded as 0 and 1, we will store which label we consider to be the negative class +# and which one we consider to be the positive class: "tested_negative" will be considered # the negative class and "tested_positive" the positive class. # # We also observed that this binary problem is slightly imbalanced where we have around @@ -85,23 +84,23 @@ cv_results_vanilla_model[cv_scores].aggregate(["mean", "std"]).T # %% -# Our predictive model succeed to grasp relationship between the data and the target. +# Our predictive model succeeds to grasp the relationship between the data and the target. # The training and testing scores are close to each other, meaning that our predictive -# model is not overfitting. We also observe that the balanced accuracy is lower than -# the accuracy, due to the class imbalanced previously mentioned. +# model is not overfitting. We can also observe that the balanced accuracy is lower than +# the accuracy, due to the class imbalance previously mentioned. # -# For this classifier, we used a decision threshold of 0.5 to convert the probability -# of the positive class into a class prediction. However, this threshold might not be +# For this classifier, we let the decision threshold, used convert the probability of the positive +# class into a class prediction, to its default value: 0.5. However, this threshold might not be # optimal. If our interest is to maximize the balanced accuracy, we should select # another threshold that would maximize this metric. # -# The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to tune the +# The :class:`~sklearn.model_selection.TunedThresholdClassifier` meta-estimator allows to tune the # decision threshold of a classifier given a metric of interest. # # Tuning the decision threshold # ----------------------------- # -# We create a :class:`~sklearn.model_selection.TunedThresholdClassifier` and we +# We create a :class:`~sklearn.model_selection.TunedThresholdClassifier` and # configure it to maximize the balanced accuracy. We evaluate the model using the same # cross-validation strategy as previously. from sklearn.model_selection import TunedThresholdClassifier @@ -129,7 +128,7 @@ # negative class. # # However, it is important to note that this tuned predictive model is internally the -# same model as the vanilla model. +# same model as the vanilla model: they have the same fitted coefficients. 
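The hedged sketch below (editorial addition, not part of the patch) illustrates the same point from another angle: since only the cut-off changes, the tuned model's hard predictions can be reproduced by thresholding its own `predict_proba` output at `best_threshold_`. Class and parameter names follow this stage of the PR (the class is later renamed to `TunedThresholdClassifierCV`); the data is synthetic.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TunedThresholdClassifier

X, y = make_classification(n_samples=1_000, weights=[0.7, 0.3], random_state=0)
tuned = TunedThresholdClassifier(
    LogisticRegression(), objective_metric="balanced_accuracy"
).fit(X, y)

proba_positive = tuned.predict_proba(X)[:, 1]
manual_labels = (proba_positive >= tuned.best_threshold_).astype(int)
# agreement should be 1.0, up to tie handling exactly at the threshold
print((manual_labels == tuned.predict(X)).mean())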
import matplotlib.pyplot as plt vanilla_model_coef = pd.DataFrame( @@ -168,9 +167,9 @@ ) # %% -# In average, a decision threshold around 0.32 is maximizing the balanced accuracy. It -# is thus different from the default decision threshold of 0.5. Tuning the decision -# threshold is thus particularly important when the output of the predictive model +# In average, a decision threshold around 0.32 maximizes the balanced accuracy, which is +# different from the default decision threshold of 0.5. Thus tuning the decision +# threshold is particularly important when the output of the predictive model # is used to make decisions. Besides, the metric used to tune the decision threshold # should be chosen carefully. Here, we used the balanced accuracy but it might not be # the most appropriate metric for the problem at hand. The choice of the "right" metric @@ -182,12 +181,12 @@ # Tuning the decision threshold under constraint # ---------------------------------------------- # -# In some cases, we do not want only to maximize a specific metric but instead maximize +# In some cases, we do not want to only maximize a given metric but instead to maximize # a metric while satisfying a constraint on another metric. In the current example, we # could imagine that the decision of our predictive model will be reviewed by a medical -# doctor. In this case, this doctor will only accept a ratio of false positive. -# Therefore, we are interesting at maximizing the true positive rate while having a -# a false positive rate lower than a given threshold. +# doctor. In this case, this doctor will only accept a ratio of false positive lower than a given value. +# Therefore, we are interested in maximizing the true positive rate while having a +# a false positive rate lower than this value. # # The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to tune the # decision threshold with such specification. We should how to proceed using a single @@ -211,7 +210,7 @@ # %% # To show the benefit on optimizing a metric under constraint, we will evaluate the -# models using the ROC curves statistics: the true positive rate (TPR) and the false +# models using the ROC curve statistics: the true positive rate (TPR) and the false # positive rate (FPR). # # The FPR is not defined in scikit-learn and we define it below: @@ -268,17 +267,17 @@ def fpr_score(y, y_pred, neg_label, pos_label): _ = disp.ax_.set_title("ROC curves") # %% -# We observe that both models have the same ROC curves. This is expected since the tuned -# model is only a post-processing step of the vanilla model. The tuning is only -# changing the decision threshold threshold as displayed by the markers blue and orange. -# To optimize the balanced accuracy, the tuned model moved the decision threshold is -# moved from 0.5 to 0.22. By shifting this point, we increase the FPR while increasing +# As expected, both models have the same ROC curves since the tuned +# model is only a post-processing step of the vanilla model. The tuning step is only +# changing the decision threshold, as displayed by the blue and orange markers. +# To optimize the balanced accuracy, the tuned model moved the decision threshold +# from 0.5 to 0.22. By shifting this point, we increase the FPR while increasing # the TPR: in short we make more false positive but also more true positive. This is # exactly what we concluded in the previous section when looking at the balanced # accuracy score. # # However, this decision threshold might not be acceptable for our medical doctor. 
He -# might be instead interested to have a low FPR, let say lower than 5%. For this level +# might be interested to have a low FPR instead, let say lower than 5%. For this level # of FPR, he would like our predictive model to maximize the TPR. # # The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to specify such diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 1395de8956d2c..54dd1e91abb0a 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -573,7 +573,6 @@ def fit(self, X, y, **params): """ _raise_for_params(params, self, None) - self._validate_params() X, y = indexable(X, y) y_type = type_of_target(y, input_name="y") From 92f6e05745ef3d9815a111c46c84f5279c055805 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Apr 2024 10:43:27 +0200 Subject: [PATCH 149/194] Update examples/model_selection/plot_tuned_decision_threshold.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- examples/model_selection/plot_tuned_decision_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index 8be9257d98554..171a954950598 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -35,7 +35,7 @@ # and which one we consider to be the positive class: "tested_negative" will be considered # the negative class and "tested_positive" the positive class. # -# We also observed that this binary problem is slightly imbalanced where we have around +# We can also observe that this binary problem is slightly imbalanced where we have around # twice more samples from the negative class than from the positive class. When it # comes to evaluation, we should consider this aspect to interpret the results. neg_label, pos_label = target.value_counts().index From 1c5c3f4ffae5967f4395248f7fed90f721e5d77e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Apr 2024 10:58:48 +0200 Subject: [PATCH 150/194] iter --- .../plot_tuned_decision_threshold.py | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index 171a954950598..d7be021298404 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -3,10 +3,11 @@ Post-tuning the cut-off point of decision function ================================================== -Once a binary classifier is trained, the :term:`predict` method outputs class -label predictions corresponding to a thresholding of either the :term:`decision_function` or the :term:`predict_proba` output. The default -threshold is defined as a posterior probability estimate of 0.5 or a decision score of -0.0. However, this default strategy may not be optimal for the task at hand. +Once a binary classifier is trained, the :term:`predict` method outputs class label +predictions corresponding to a thresholding of either the :term:`decision_function` or +the :term:`predict_proba` output. The default threshold is defined as a posterior +probability estimate of 0.5 or a decision score of 0.0. 
However, this default strategy +may not be optimal for the task at hand. This example shows how to use the :class:`~sklearn.model_selection.TunedThresholdClassifier` to tune the decision @@ -30,10 +31,10 @@ target.value_counts() # %% -# We can see that we are dealing with a binary classification problem. Since the labels are -# not encoded as 0 and 1, we will store which label we consider to be the negative class -# and which one we consider to be the positive class: "tested_negative" will be considered -# the negative class and "tested_positive" the positive class. +# We can see that we are dealing with a binary classification problem. Since the labels +# are not encoded as 0 and 1, we will store which label we consider to be the negative +# class and which one we consider to be the positive class: "tested_negative" will be +# considered the negative class and "tested_positive" the positive class. # # We can also observe that this binary problem is slightly imbalanced where we have around # twice more samples from the negative class than from the positive class. When it @@ -84,18 +85,18 @@ cv_results_vanilla_model[cv_scores].aggregate(["mean", "std"]).T # %% -# Our predictive model succeeds to grasp the relationship between the data and the target. -# The training and testing scores are close to each other, meaning that our predictive -# model is not overfitting. We can also observe that the balanced accuracy is lower than -# the accuracy, due to the class imbalance previously mentioned. +# Our predictive model succeeds to grasp the relationship between the data and the +# target. The training and testing scores are close to each other, meaning that our +# predictive model is not overfitting. We can also observe that the balanced accuracy is +# lower than the accuracy, due to the class imbalance previously mentioned. # -# For this classifier, we let the decision threshold, used convert the probability of the positive -# class into a class prediction, to its default value: 0.5. However, this threshold might not be -# optimal. If our interest is to maximize the balanced accuracy, we should select -# another threshold that would maximize this metric. +# For this classifier, we let the decision threshold, used convert the probability of +# the positive class into a class prediction, to its default value: 0.5. However, this +# threshold might not be optimal. If our interest is to maximize the balanced accuracy, +# we should select another threshold that would maximize this metric. # -# The :class:`~sklearn.model_selection.TunedThresholdClassifier` meta-estimator allows to tune the -# decision threshold of a classifier given a metric of interest. +# The :class:`~sklearn.model_selection.TunedThresholdClassifier` meta-estimator allows +# to tune the decision threshold of a classifier given a metric of interest. # # Tuning the decision threshold # ----------------------------- @@ -184,9 +185,9 @@ # In some cases, we do not want to only maximize a given metric but instead to maximize # a metric while satisfying a constraint on another metric. In the current example, we # could imagine that the decision of our predictive model will be reviewed by a medical -# doctor. In this case, this doctor will only accept a ratio of false positive lower than a given value. -# Therefore, we are interested in maximizing the true positive rate while having a -# a false positive rate lower than this value. +# doctor. In this case, this doctor will only accept a ratio of false positive lower +# than a given value. 
Therefore, we are interested in maximizing the true positive rate +# while having a a false positive rate lower than this value. # # The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to tune the # decision threshold with such specification. We should how to proceed using a single From 94160badd6605a359fb2e878f870ab14e7383a50 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Apr 2024 12:05:54 +0200 Subject: [PATCH 151/194] iter --- examples/model_selection/plot_tuned_decision_threshold.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index d7be021298404..df29ca42ed181 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -36,9 +36,9 @@ # class and which one we consider to be the positive class: "tested_negative" will be # considered the negative class and "tested_positive" the positive class. # -# We can also observe that this binary problem is slightly imbalanced where we have around -# twice more samples from the negative class than from the positive class. When it -# comes to evaluation, we should consider this aspect to interpret the results. +# We can also observe that this binary problem is slightly imbalanced where we have +# around twice more samples from the negative class than from the positive class. When +# it comes to evaluation, we should consider this aspect to interpret the results. neg_label, pos_label = target.value_counts().index # %% From 85c848498852a3b902f5246a373e4bce4f2ba94d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Apr 2024 12:43:55 +0200 Subject: [PATCH 152/194] other comment --- .../plot_tuned_decision_threshold.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index df29ca42ed181..886e99a120200 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -35,13 +35,13 @@ # are not encoded as 0 and 1, we will store which label we consider to be the negative # class and which one we consider to be the positive class: "tested_negative" will be # considered the negative class and "tested_positive" the positive class. -# -# We can also observe that this binary problem is slightly imbalanced where we have -# around twice more samples from the negative class than from the positive class. When -# it comes to evaluation, we should consider this aspect to interpret the results. neg_label, pos_label = target.value_counts().index # %% +# We can also observe that this binary problem is slightly imbalanced where we have +# around twice more samples from the negative class than from the positive class. When +# it comes to evaluation, we should consider this aspect to interpret the results. +# # Our vanilla classifier # ---------------------- # @@ -59,6 +59,12 @@ # accuracy to report the performance of our model. The balanced accuracy is a metric # that is less sensitive to class imbalance and will allow us to put the accuracy # score in perspective. +# +# Cross-validation allows us to study the variance of the decision threshold across +# different splits of the data. 
However, the dataset is rather small and it would be +# detrimental to use more than 5 folds to evaluate the dispersion. Therefore, we use +# a :class:`~sklearn.model_selection.RepeatedStratifiedKFold` where we apply several +# repetitions of 5-fold cross-validation. import pandas as pd from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate @@ -187,12 +193,12 @@ # could imagine that the decision of our predictive model will be reviewed by a medical # doctor. In this case, this doctor will only accept a ratio of false positive lower # than a given value. Therefore, we are interested in maximizing the true positive rate -# while having a a false positive rate lower than this value. +# while having a false positive rate lower than this value. # # The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to tune the -# decision threshold with such specification. We should how to proceed using a single -# cross-validation split to display the Receiver Operating Characteristic (ROC) curves -# to get intuition on the problem. +# decision threshold with such specification. We illustrate this strategy together with +# a single train-test split split to display the Receiver Operating Characteristic (ROC) +# curves to get better intuitions. # # First, we split the data into a training and testing set. From 4f86e9d74a81998676bb954a11f14d9c85d5ba48 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Apr 2024 18:40:26 +0200 Subject: [PATCH 153/194] addressed comments --- doc/modules/classification_threshold.rst | 53 +++++++++---------- .../plot_tuned_decision_threshold.py | 6 +-- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index d2e66a33d5519..03a5600e0186d 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -64,31 +64,31 @@ Post-tuning the decision threshold One solution to address the problem stated in the introduction is to tune the decision threshold of the classifier once the model has been trained. The :class:`~sklearn.model_selection.TunedThresholdClassifier` tunes this threshold using an -internal cross-validation. The optimum threshold is chosen to maximize a given metric -with or without constraints. +internal cross-validation. The optimum threshold is chosen to maximize a given metric. -The following image illustrates the tuning of the cut-off point for a gradient boosting -classifier. While the vanilla and tuned classifiers provide the same Receiver Operating -Characteristic (ROC) and Precision-Recall curves, and thus the same -:term:`predict_proba` outputs, the class label predictions differ because of the tuned +The following image illustrates the tuning of the decision threshold for a gradient +boosting classifier. While the vanilla and tuned classifiers provide the same +:term:`predict_proba` outputs and thus the same Receiver Operating Characteristic (ROC) +and Precision-Recall curves, the class label predictions differ because of the tuned decision threshold. The vanilla classifier predicts the class of interest for a posterior probability greater than 0.5 while the tuned classifier predicts the class of -interest for a very low probability (around 0.02). This cut-off point optimizes a +interest for a very low probability (around 0.02). This decision threshold optimizes a utility metric defined by the business (in this case an insurance company). .. 
figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cost_sensitive_learning_002.png :target: ../auto_examples/model_selection/plot_cost_sensitive_learning.html :align: center -Options to tune the cut-off point ---------------------------------- +Options to tune the decision threshold +-------------------------------------- -The cut-off point can be tuned through different strategies controlled by the parameter -`objective_metric`. +The decision threshold can be tuned through different strategies controlled by the +parameter `objective_metric`. One way to tune the threshold is by maximizing a pre-defined scikit-learn metric. These metrics can be found by calling the function :func:`~sklearn.metrics.get_scorer_names`. -In this example, we maximize the balanced accuracy. +By default, the balanced accuracy is the metric used but be aware that one should choose +a meaningful metric for their use case. .. note:: @@ -99,23 +99,19 @@ In this example, we maximize the balanced accuracy. :func:`~sklearn.metrics.make_scorer`. Refer to :ref:`scoring` to get information to define your own scoring function. For instance, we show how to pass the information to the scorer that the label of interest is `0` when maximizing the - :func:`~sklearn.metrics.f1_score`: + :func:`~sklearn.metrics.f1_score`:: >>> from sklearn.linear_model import LogisticRegression - >>> from sklearn.model_selection import ( - ... TunedThresholdClassifier, train_test_split - ... ) + >>> from sklearn.model_selection import TunedThresholdClassifier >>> from sklearn.metrics import make_scorer, f1_score >>> X, y = make_classification( - ... n_samples=1_000, weights=[0.1, 0.9], random_state=0) - >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + ... n_samples=1_000, weights=[0.1, 0.9], random_state=0) >>> pos_label = 0 >>> scorer = make_scorer(f1_score, pos_label=pos_label) >>> base_model = LogisticRegression() - >>> model = TunedThresholdClassifier(base_model, objective_metric=scorer).fit( - ... X_train, y_train) - >>> scorer(model, X_test, y_test) - 0.79... + >>> model = TunedThresholdClassifier(base_model, objective_metric=scorer) + >>> scorer(model.fit(X, y), X, y) + 0.88... >>> # compare it with the internal score found by cross-validation >>> model.best_score_ 0.86... @@ -140,19 +136,20 @@ Important notes regarding the internal cross-validation ------------------------------------------------------- By default :class:`~sklearn.model_selection.TunedThresholdClassifier` uses a 5-fold -stratified cross-validation to tune the cut-off point. The parameter `cv` allows to +stratified cross-validation to tune the decision threshold. The parameter `cv` allows to control the cross-validation strategy. It is possible to bypass cross-validation by -setting `cv="prefit"` and providing a fitted classifier. In this case, the cut-off point -is tuned on the data provided to the `fit` method. +setting `cv="prefit"` and providing a fitted classifier. In this case, the decision +threshold is tuned on the data provided to the `fit` method. However, you should be extremely careful when using this option. You should never use -the same data for training the classifier and tuning the cut-off point due to the risk -of overfitting. Refer to the following example section for more details (cf. +the same data for training the classifier and tuning the decision threshold due to the +risk of overfitting. Refer to the following example section for more details (cf. :ref:`tunedthresholdclassifier_no_cv`). 
If you have limited resources, consider using a float number for `cv` to limit to an internal single train-test split. The option `cv="prefit"` should only be used when the provided classifier was already -trained, and you just want to find the best cut-off using a new validation set. +trained, and you just want to find the best decision threshold using a new validation +set. Manually setting the decision threshold --------------------------------------- diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index 886e99a120200..31c1a76b18b40 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -1,7 +1,7 @@ """ -================================================== -Post-tuning the cut-off point of decision function -================================================== +====================================================== +Post-hoc tuning the cut-off point of decision function +====================================================== Once a binary classifier is trained, the :term:`predict` method outputs class label predictions corresponding to a thresholding of either the :term:`decision_function` or From d747098b45ee454f3c2f92616fced2e42e01df0b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 19:12:51 +0200 Subject: [PATCH 154/194] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- .../model_selection/_classification_threshold.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 54dd1e91abb0a..45e92fff39510 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -153,7 +153,7 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): def _estimator_has(attr): """Check if we can delegate a method to the underlying estimator. - First, we check the first fitted estimator if available, otherwise we + First, we check the fitted estimator if available, otherwise we check the unfitted estimator. """ @@ -179,8 +179,7 @@ def _fit_and_score_over_thresholds( score_method, score_params, ): - """Fit a classifier and compute the scores for different decision thresholds - representing a curve. + """Fit a classifier and compute the scores for different decision thresholds. Parameters ---------- @@ -271,7 +270,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato This estimator post-tunes the decision threshold (cut-off point) that is used for converting posterior probability estimates (i.e. output of `predict_proba`) or decision scores (i.e. output of `decision_function`) - into a class label. The tuning is done by maximizing a binary metric, + into a class label. The tuning is done by optimizing a binary metric, potentially constrained by a another metric. Read more in the :ref:`User Guide `. @@ -396,8 +395,8 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato The new decision threshold. best_score_ : float or None - The score of the objective metric maximized associated with the decision - threshold found. If `strategy="constant"`, `best_score_` is None. 
+ The optimal score of the objective metric, evaluated at `best_threshold_`. + If `strategy="constant"`, `best_score_` is None. constrained_score_ : float or None When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, @@ -548,8 +547,8 @@ def __init__( self.store_cv_results = store_cv_results @_fit_context( - # estimators in TunedThresholdClassifier.estimator is not validated yet - prefer_skip_nested_validation=True + # TunedThresholdClassifier.estimator is not validated yet + prefer_skip_nested_validation=False ) def fit(self, X, y, **params): """Fit the classifier and post-tune the decision threshold. From 6d0f4181809b1b9d7566d4d81d4ec46840dd47cd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 19:37:03 +0200 Subject: [PATCH 155/194] rename TunedThresholdClassifier to TunedThresholdClassifierCV --- doc/modules/classes.rst | 2 +- doc/modules/classification_threshold.rst | 14 ++--- doc/whats_new/v1.5.rst | 2 +- .../plot_cost_sensitive_learning.py | 51 ++++++++-------- .../plot_tuned_decision_threshold.py | 22 +++---- sklearn/model_selection/__init__.py | 4 +- .../_classification_threshold.py | 16 ++--- .../tests/test_classification_threshold.py | 59 ++++++++++--------- 8 files changed, 87 insertions(+), 83 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index dc1671682e64a..bcae843f1dae8 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1257,7 +1257,7 @@ Model post-fit tuning :toctree: generated/ :template: class.rst - model_selection.TunedThresholdClassifier + model_selection.TunedThresholdClassifierCV Model validation ---------------- diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 03a5600e0186d..cd8946bfc28f1 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -1,6 +1,6 @@ .. currentmodule:: sklearn.model_selection -.. _tunedthresholdclassifier: +.. _TunedThresholdClassifierCV: ================================================== Tuning the decision threshold for class prediction @@ -63,7 +63,7 @@ Post-tuning the decision threshold One solution to address the problem stated in the introduction is to tune the decision threshold of the classifier once the model has been trained. The -:class:`~sklearn.model_selection.TunedThresholdClassifier` tunes this threshold using an +:class:`~sklearn.model_selection.TunedThresholdClassifierCV` tunes this threshold using an internal cross-validation. The optimum threshold is chosen to maximize a given metric. The following image illustrates the tuning of the decision threshold for a gradient @@ -102,14 +102,14 @@ a meaningful metric for their use case. :func:`~sklearn.metrics.f1_score`:: >>> from sklearn.linear_model import LogisticRegression - >>> from sklearn.model_selection import TunedThresholdClassifier + >>> from sklearn.model_selection import TunedThresholdClassifierCV >>> from sklearn.metrics import make_scorer, f1_score >>> X, y = make_classification( ... n_samples=1_000, weights=[0.1, 0.9], random_state=0) >>> pos_label = 0 >>> scorer = make_scorer(f1_score, pos_label=pos_label) >>> base_model = LogisticRegression() - >>> model = TunedThresholdClassifier(base_model, objective_metric=scorer) + >>> model = TunedThresholdClassifierCV(base_model, objective_metric=scorer) >>> scorer(model.fit(X, y), X, y) 0.88... 
>>> # compare it with the internal score found by cross-validation @@ -135,7 +135,7 @@ you can use the `pos_label` parameter to indicate the label of the class of inte Important notes regarding the internal cross-validation ------------------------------------------------------- -By default :class:`~sklearn.model_selection.TunedThresholdClassifier` uses a 5-fold +By default :class:`~sklearn.model_selection.TunedThresholdClassifierCV` uses a 5-fold stratified cross-validation to tune the decision threshold. The parameter `cv` allows to control the cross-validation strategy. It is possible to bypass cross-validation by setting `cv="prefit"` and providing a fitted classifier. In this case, the decision @@ -144,7 +144,7 @@ threshold is tuned on the data provided to the `fit` method. However, you should be extremely careful when using this option. You should never use the same data for training the classifier and tuning the decision threshold due to the risk of overfitting. Refer to the following example section for more details (cf. -:ref:`tunedthresholdclassifier_no_cv`). If you have limited resources, consider using a +:ref:`TunedThresholdClassifierCV_no_cv`). If you have limited resources, consider using a float number for `cv` to limit to an internal single train-test split. The option `cv="prefit"` should only be used when the provided classifier was already @@ -156,7 +156,7 @@ Manually setting the decision threshold The previous sections discussed strategies to find an optimal decision threshold. It is also possible to manually set the decision threshold in -:class`~sklearn.model_selection.TunedThresholdClassifier` by setting the parameter +:class`~sklearn.model_selection.TunedThresholdClassifierCV` by setting the parameter `strategy` to `"constant"` and providing the desired threshold using the parameter `constant_threshold`. diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 95c472fd79fef..d7df771406bec 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -304,7 +304,7 @@ Changelog :mod:`sklearn.model_selection` .............................. -- |MajorFeature| :class:`model_selection.TunedThresholdClassifier` finds +- |MajorFeature| :class:`model_selection.TunedThresholdClassifierCV` finds the decision threshold of a binary classifier that maximizes a classification metric through cross-validation. :pr:`26120` by :user:`Guillaume Lemaitre `. diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 7defd695a25d0..e85b2f7d14222 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -16,7 +16,7 @@ misclassification. Specifically, misclassifying a "bad" credit as "good" is five times more costly on average than misclassifying a "good" credit as "bad". -We use the :class:`~sklearn.model_selection.TunedThresholdClassifier` to select the +We use the :class:`~sklearn.model_selection.TunedThresholdClassifierCV` to select the cut-off point of the decision function that minimizes the provided business cost. @@ -40,7 +40,7 @@ # ----------------------------------------------------- # # In this first section, we illustrate the use of the -# :class:`~sklearn.model_selection.TunedThresholdClassifier` in a setting of +# :class:`~sklearn.model_selection.TunedThresholdClassifierCV` in a setting of # cost-sensitive learning when the gains and costs associated to each entry of the # confusion matrix are constant. 
We use the problematic presented in [2]_ using the # "Statlog" German credit dataset [1]_. @@ -253,23 +253,24 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # At this stage we don't know if any other cut-off can lead to a greater gain. To find # the optimal one, we need to compute the cost-gain using the business metric for all # possible cut-off points and choose the best. This strategy can be quite tedious to -# implement by hand, but the :class:`~sklearn.model_selection.TunedThresholdClassifier` -# class is here to help us. It automatically computes the cost-gain for all possible -# cut-off points and optimizes for the `objective_metric`. +# implement by hand, but the +# :class:`~sklearn.model_selection.TunedThresholdClassifierCV` class is here to help us. +# It automatically computes the cost-gain for all possible cut-off points and optimizes +# for the `objective_metric`. # # .. _cost_sensitive_learning_example: # # Tuning the cut-off point # ^^^^^^^^^^^^^^^^^^^^^^^^ # -# We use :class:`~sklearn.model_selection.TunedThresholdClassifier` to tune the cut-off -# point. We need to provide the business metric to optimize as well as the -# positive label. Internally, the optimum cut-off point is chosen such that it -# maximizes the business metric via cross-validation. By default a 5-fold -# stratified cross-validation is used. -from sklearn.model_selection import TunedThresholdClassifier - -tuned_model = TunedThresholdClassifier( +# We use :class:`~sklearn.model_selection.TunedThresholdClassifierCV` to tune the +# cut-off point. We need to provide the business metric to optimize as well as the +# positive label. Internally, the optimum cut-off point is chosen such that it maximizes +# the business metric via cross-validation. By default a 5-fold stratified +# cross-validation is used. +from sklearn.model_selection import TunedThresholdClassifierCV + +tuned_model = TunedThresholdClassifierCV( estimator=model, pos_label=pos_label, objective_metric=scoring["cost_gain"], @@ -377,16 +378,16 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # We observe that tuning the decision threshold almost improves our business gains # by factor of 2. # -# .. _tunedthresholdclassifier_no_cv: +# .. _TunedThresholdClassifierCV_no_cv: # # Consideration regarding model refitting and cross-validation # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # In the above experiment, we used the default setting of the -# :class:`~sklearn.model_selection.TunedThresholdClassifier`. In particular, the cut-off -# point is tuned using a 5-fold stratified cross-validation. Also, the -# underlying predictive model is refitted on the entire training data once the -# cut-off point is chosen. +# :class:`~sklearn.model_selection.TunedThresholdClassifierCV`. In particular, the +# cut-off point is tuned using a 5-fold stratified cross-validation. Also, the +# underlying predictive model is refitted on the entire training data once the cut-off +# point is chosen. # # These two strategies can be changed by providing the `refit` and `cv` parameters. # For instance, one could provide a fitted `estimator` and set `cv="prefit"`, in which @@ -483,10 +484,10 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # # This option should therefore be used with caution. One needs to make sure that the # data provided at fitting time to the -# :class:`~sklearn.model_selection.TunedThresholdClassifier` is not the same as the data -# used to train the underlying classifier. 
This could happen sometimes when the idea is -# just to tune the predictive model on a completely new validation set without a costly -# complete refit. +# :class:`~sklearn.model_selection.TunedThresholdClassifierCV` is not the same as the +# data used to train the underlying classifier. This could happen sometimes when the +# idea is just to tune the predictive model on a completely new validation set without a +# costly complete refit. # # When cross-validation is too costly, a potential alternative is to use a # single train-test split by providing a floating number in range `[0, 1]` to the `cv` @@ -748,9 +749,9 @@ def business_metric(y_true, y_pred, amount): # # Now the question is: is our model optimum for the type of decision that we want to do? # Up to now, we did not optimize the decision threshold. We use the -# :class:`~sklearn.model_selection.TunedThresholdClassifier` to optimize the decision +# :class:`~sklearn.model_selection.TunedThresholdClassifierCV` to optimize the decision # given our business scorer. -tuned_model = TunedThresholdClassifier( +tuned_model = TunedThresholdClassifierCV( estimator=model, objective_metric=business_scorer, n_thresholds=100, @@ -760,7 +761,7 @@ def business_metric(y_true, y_pred, amount): # %% # Since our business scorer requires the amount of each transaction, we need to pass # this information in the `fit` method. The -# :class:`~sklearn.model_selection.TunedThresholdClassifier` is in charge of +# :class:`~sklearn.model_selection.TunedThresholdClassifierCV` is in charge of # automatically dispatching this metadata to the underlying scorer. tuned_model.fit(data_train, target_train, amount=amount_train) diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index 31c1a76b18b40..aa323b1f33fac 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -10,7 +10,7 @@ may not be optimal for the task at hand. This example shows how to use the -:class:`~sklearn.model_selection.TunedThresholdClassifier` to tune the decision +:class:`~sklearn.model_selection.TunedThresholdClassifierCV` to tune the decision threshold, depending on a metric of interest as well as under a specific constraints. """ @@ -101,18 +101,18 @@ # threshold might not be optimal. If our interest is to maximize the balanced accuracy, # we should select another threshold that would maximize this metric. # -# The :class:`~sklearn.model_selection.TunedThresholdClassifier` meta-estimator allows +# The :class:`~sklearn.model_selection.TunedThresholdClassifierCV` meta-estimator allows # to tune the decision threshold of a classifier given a metric of interest. # # Tuning the decision threshold # ----------------------------- # -# We create a :class:`~sklearn.model_selection.TunedThresholdClassifier` and +# We create a :class:`~sklearn.model_selection.TunedThresholdClassifierCV` and # configure it to maximize the balanced accuracy. We evaluate the model using the same # cross-validation strategy as previously. -from sklearn.model_selection import TunedThresholdClassifier +from sklearn.model_selection import TunedThresholdClassifierCV -tuned_model = TunedThresholdClassifier( +tuned_model = TunedThresholdClassifierCV( estimator=model, objective_metric="balanced_accuracy" ) cv_results_tuned_model = pd.DataFrame( @@ -195,7 +195,7 @@ # than a given value. 
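# A hand-rolled sketch (not part of this example) of the constrained search that
# `objective_metric="max_tpr_at_tnr_constraint"` automates: keep the thresholds whose
# FPR stays under a budget and pick the one maximizing the TPR. The toy dataset, the
# 5% FPR budget, and scoring on the training data are assumptions made for brevity.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

X_toy, y_toy = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
classifier = LogisticRegression().fit(X_toy, y_toy)
fpr, tpr, thresholds = roc_curve(y_toy, classifier.predict_proba(X_toy)[:, 1])
# drop the first point (fpr=0/tpr=0), whose threshold is set to `np.inf`
fpr, tpr, thresholds = fpr[1:], tpr[1:], thresholds[1:]
within_budget = fpr <= 0.05  # i.e. a TNR of at least 0.95
best_threshold = thresholds[within_budget][np.argmax(tpr[within_budget])]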
Therefore, we are interested in maximizing the true positive rate # while having a false positive rate lower than this value. # -# The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to tune the +# The :class:`~sklearn.model_selection.TunedThresholdClassifierCV` allows to tune the # decision threshold with such specification. We illustrate this strategy together with # a single train-test split split to display the Receiver Operating Characteristic (ROC) # curves to get better intuitions. @@ -287,11 +287,11 @@ def fpr_score(y, y_pred, neg_label, pos_label): # might be interested to have a low FPR instead, let say lower than 5%. For this level # of FPR, he would like our predictive model to maximize the TPR. # -# The :class:`~sklearn.model_selection.TunedThresholdClassifier` allows to specify such -# constraint by providing the name of the metric and the constraint value. Here, we use -# `max_tpr_at_tnr_constraint` which is exactly what we want. Since the true negative -# rate (TNR) is equal to 1 - FPR, we can rewrite the constraint value as -# `1 - 0.05 = 0.95`. +# The :class:`~sklearn.model_selection.TunedThresholdClassifierCV` allows to specify +# such constraint by providing the name of the metric and the constraint value. Here, we +# use `max_tpr_at_tnr_constraint` which is exactly what we want. Since the true negative +# rate (TNR) is equal to 1 - FPR, we can rewrite the constraint value as `1 - 0.05 = +# 0.95`. # %% constraint_value = 0.95 diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 5facb793e3922..f5ee5fea18560 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -1,6 +1,6 @@ import typing -from ._classification_threshold import TunedThresholdClassifier +from ._classification_threshold import TunedThresholdClassifierCV from ._plot import LearningCurveDisplay, ValidationCurveDisplay from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV from ._split import ( @@ -64,7 +64,7 @@ "StratifiedKFold", "StratifiedGroupKFold", "StratifiedShuffleSplit", - "TunedThresholdClassifier", + "TunedThresholdClassifierCV", "check_cv", "cross_val_predict", "cross_val_score", diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 45e92fff39510..f63cc25a732e5 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -264,7 +264,7 @@ def _fit_and_score_over_thresholds( return potential_thresholds, scores -class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): +class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Decision threshold tuning for binary classification. This estimator post-tunes the decision threshold (cut-off point) that is @@ -273,7 +273,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato into a class label. The tuning is done by optimizing a binary metric, potentially constrained by a another metric. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. .. versionadded:: 1.5 @@ -358,11 +358,11 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato .. warning:: Using `cv="prefit"` and passing the same dataset for fitting `estimator` and tuning the cut-off point is subject to undesired overfitting. You can - refer to :ref:`tunedthresholdclassifier_no_cv` for an example. 
+ refer to :ref:`TunedThresholdClassifierCV_no_cv` for an example. This option should only be used when the set used to fit `estimator` is different from the one used to tune the cut-off point (by calling - :meth:`TunedThresholdClassifier.fit`). + :meth:`TunedThresholdClassifierCV.fit`). refit : bool, default=True Whether or not to refit the classifier on the entire training set once @@ -440,7 +440,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato >>> from sklearn.datasets import make_classification >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.metrics import classification_report - >>> from sklearn.model_selection import TunedThresholdClassifier, train_test_split + >>> from sklearn.model_selection import TunedThresholdClassifierCV, train_test_split >>> X, y = make_classification( ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 ... ) @@ -458,7 +458,7 @@ class TunedThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimato macro avg 0.87 0.72 0.77 250 weighted avg 0.93 0.93 0.92 250 - >>> classifier_tuned = TunedThresholdClassifier( + >>> classifier_tuned = TunedThresholdClassifierCV( ... classifier, objective_metric="max_precision_at_recall_constraint", ... constraint_value=0.7, ... ).fit(X_train, y_train) @@ -547,7 +547,7 @@ def __init__( self.store_cv_results = store_cv_results @_fit_context( - # TunedThresholdClassifier.estimator is not validated yet + # TunedThresholdClassifierCV.estimator is not validated yet prefer_skip_nested_validation=False ) def fit(self, X, y, **params): @@ -917,7 +917,7 @@ def _more_tags(self): "check_sample_weights_invariance": ( "Due to the cross-validation and sample ordering, removing a sample" " is not strictly equal to putting is weight to zero. Specific unit" - " tests are added for TunedThresholdClassifier specifically." + " tests are added for TunedThresholdClassifierCV specifically." ), }, } diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 233a1e5813d59..3a60c00154e2e 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -23,7 +23,7 @@ recall_score, roc_curve, ) -from sklearn.model_selection import StratifiedShuffleSplit, TunedThresholdClassifier +from sklearn.model_selection import StratifiedShuffleSplit, TunedThresholdClassifierCV from sklearn.model_selection._classification_threshold import ( _CurveScorer, _fit_and_score_over_thresholds, @@ -413,7 +413,7 @@ def test_tuned_threshold_classifier_no_binary(data): """Check that we raise an informative error message for non-binary problem.""" err_msg = "Only binary classification is supported." 
with pytest.raises(ValueError, match=err_msg): - TunedThresholdClassifier(LogisticRegression()).fit(*data) + TunedThresholdClassifierCV(LogisticRegression()).fit(*data) @pytest.mark.parametrize( @@ -445,9 +445,9 @@ def test_tuned_threshold_classifier_conflict_cv_refit( """ X, y = make_classification(n_samples=100, random_state=0) with pytest.raises(err_type, match=err_msg): - TunedThresholdClassifier(LogisticRegression(), strategy=strategy, **params).fit( - X, y - ) + TunedThresholdClassifierCV( + LogisticRegression(), strategy=strategy, **params + ).fit(X, y) @pytest.mark.parametrize( @@ -461,12 +461,12 @@ def test_tuned_threshold_classifier_conflict_cv_refit( def test_tuned_threshold_classifier_estimator_response_methods( estimator, strategy, response_method ): - """Check that `TunedThresholdClassifier` exposes the same response methods as the + """Check that `TunedThresholdClassifierCV` exposes the same response methods as the underlying estimator. """ X, y = make_classification(n_samples=100, random_state=0) - model = TunedThresholdClassifier(estimator, strategy=strategy) + model = TunedThresholdClassifierCV(estimator, strategy=strategy) assert hasattr(model, response_method) == hasattr(estimator, response_method) model.fit(X, y) @@ -483,7 +483,8 @@ def test_tuned_threshold_classifier_estimator_response_methods( "response_method", ["auto", "decision_function", "predict_proba"] ) def test_tuned_threshold_classifier_without_constraint_value(response_method): - """Check that `TunedThresholdClassifier` is optimizing a given objective metric.""" + """Check that `TunedThresholdClassifierCV` is optimizing a given objective + metric.""" X, y = load_breast_cancer(return_X_y=True) # remove feature to degrade performances X = X[:, :5] @@ -498,7 +499,7 @@ def test_tuned_threshold_classifier_without_constraint_value(response_method): lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) n_thresholds = 100 - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( estimator=lr, objective_metric="balanced_accuracy", response_method=response_method, @@ -525,7 +526,7 @@ def test_tuned_threshold_classifier_limit_metric_tradeoff(metrics): """ X, y = load_breast_cancer(return_X_y=True) estimator = make_pipeline(StandardScaler(), LogisticRegression()) - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( estimator=estimator, objective_metric=metrics[0], constraint_value=0, @@ -543,13 +544,13 @@ def test_tuned_threshold_classifier_metric_with_parameter(): """ X, y = load_breast_cancer(return_X_y=True) lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) - model_fbeta_1 = TunedThresholdClassifier( + model_fbeta_1 = TunedThresholdClassifierCV( estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1) ).fit(X, y) - model_fbeta_2 = TunedThresholdClassifier( + model_fbeta_2 = TunedThresholdClassifierCV( estimator=lr, objective_metric=make_scorer(fbeta_score, beta=2) ).fit(X, y) - model_f1 = TunedThresholdClassifier( + model_f1 = TunedThresholdClassifierCV( estimator=lr, objective_metric=make_scorer(f1_score) ).fit(X, y) @@ -582,7 +583,7 @@ def test_tuned_threshold_classifier_with_string_targets(response_method, metric) # encoded as 0. 
classes = np.array(["cancer", "healthy"], dtype=object) y = classes[y] - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( estimator=make_pipeline(StandardScaler(), LogisticRegression()), objective_metric=metric, constraint_value=0.9, @@ -612,7 +613,7 @@ def test_tuned_threshold_classifier_refit( # check that `estimator_` if fitted on the full dataset when `refit=True` estimator = LogisticRegression().set_fit_request(sample_weight=True) - model = TunedThresholdClassifier(estimator, strategy=strategy, refit=True).fit( + model = TunedThresholdClassifierCV(estimator, strategy=strategy, refit=True).fit( X, y, sample_weight=sample_weight ) @@ -625,7 +626,7 @@ def test_tuned_threshold_classifier_refit( estimator = LogisticRegression().set_fit_request(sample_weight=True) estimator.fit(X, y, sample_weight=sample_weight) coef = estimator.coef_.copy() - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( estimator, strategy=strategy, cv="prefit", refit=False ).fit(X, y, sample_weight=sample_weight) @@ -637,7 +638,7 @@ def test_tuned_threshold_classifier_refit( cv = [ (np.arange(50), np.arange(50, 100)), ] # single split - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( estimator, strategy=strategy, cv=cv, refit=False ).fit(X, y, sample_weight=sample_weight) @@ -672,7 +673,7 @@ def test_tuned_threshold_classifier_fit_params(objective_metric, fit_params_type classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) classifier.set_fit_request(a=True, b=True) - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( classifier, objective_metric=objective_metric, constraint_value=0.5 ) model.fit(X, y, **fit_params) @@ -700,7 +701,7 @@ def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint classifier = LogisticRegression() n_thresholds = 100 - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( classifier, objective_metric=objective_metric, constraint_value=constraint_value, @@ -751,7 +752,7 @@ def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): sample_weight[::2] = 1 estimator = LogisticRegression().set_fit_request(sample_weight=True) - model_without_weights = TunedThresholdClassifier(estimator, cv=2) + model_without_weights = TunedThresholdClassifierCV(estimator, cv=2) model_with_weights = clone(model_without_weights) model_with_weights.fit(X, y, sample_weight=sample_weight) @@ -773,7 +774,7 @@ def test_tuned_threshold_classifier_error_constant_learner(): estimator = DummyClassifier(strategy="constant", constant=1) err_msg = "The provided estimator makes constant predictions." 
with pytest.raises(ValueError, match=err_msg): - TunedThresholdClassifier(estimator).fit(X, y) + TunedThresholdClassifierCV(estimator).fit(X, y) @pytest.mark.parametrize( @@ -792,7 +793,7 @@ def test_tuned_threshold_classifier_pos_label_precision_recall( estimator = LogisticRegression().fit(X, y) constraint_value = 0.7 - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( estimator, objective_metric=objective_metric, constraint_value=constraint_value, @@ -826,7 +827,7 @@ def test_tuned_threshold_classifier_pos_label_tnr_tpr(objective_metric, pos_labe estimator = LogisticRegression().fit(X, y) constraint_value = 0.7 - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( estimator, objective_metric=objective_metric, constraint_value=constraint_value, @@ -876,7 +877,7 @@ def test_tuned_threshold_classifier_pos_label_single_metric(pos_label, metric_ty else: # metric_type == "scorer_with_pos_label" objective_metric = make_scorer(precision_score, pos_label=pos_label) - model = TunedThresholdClassifier( + model = TunedThresholdClassifierCV( estimator, objective_metric=objective_metric, cv="prefit", @@ -901,7 +902,7 @@ def test_tuned_threshold_classifier_constant_strategy(predict_method): # original model estimator = LogisticRegression().fit(X, y) constant_threshold = 0.5 - tuned_model = TunedThresholdClassifier( + tuned_model = TunedThresholdClassifierCV( estimator, strategy="constant", constant_threshold=constant_threshold ).fit(X, y) assert tuned_model.best_threshold_ == pytest.approx(constant_threshold) @@ -919,7 +920,7 @@ def test_tuned_threshold_classifier_n_thresholds_array(): X, y = make_classification(random_state=0) estimator = LogisticRegression() n_thresholds = np.linspace(0, 1, 11) - tuned_model = TunedThresholdClassifier( + tuned_model = TunedThresholdClassifierCV( estimator, n_thresholds=n_thresholds, response_method="predict_proba", @@ -937,7 +938,7 @@ def test_tuned_threshold_classifier_cv_float(): # coefficients. 
test_size = 0.3 estimator = LogisticRegression() - tuned_model = TunedThresholdClassifier( + tuned_model = TunedThresholdClassifierCV( estimator, cv=test_size, refit=False, random_state=0 ).fit(X, y) tuned_model.fit(X, y) @@ -970,6 +971,8 @@ def test_tuned_threshold_classifier_error_missing_constraint(objective_metric): a constraint but no `constraint_value` is provided.""" X, y = make_classification(random_state=0) estimator = LogisticRegression() - tuned_model = TunedThresholdClassifier(estimator, objective_metric=objective_metric) + tuned_model = TunedThresholdClassifierCV( + estimator, objective_metric=objective_metric + ) with pytest.raises(ValueError, match="`constraint_value` must be provided"): tuned_model.fit(X, y) From 5671dd68846173b6d3d6670d3b15670c53342596 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 20:16:39 +0200 Subject: [PATCH 156/194] use meaningful values for check the thresholds values depending on pos_label --- .../_classification_threshold.py | 2 +- .../tests/test_classification_threshold.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index f63cc25a732e5..036a81e751595 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -141,7 +141,7 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): * self._score_func( y_true, _threshold_scores_to_class_labels( - y_score, th, estimator.classes_, self._get_pos_label() + y_score, th, estimator.classes_, pos_label ), **scoring_kwargs, ) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 3a60c00154e2e..d12333ac13eea 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -98,7 +98,7 @@ def test_curve_scorer_pos_label(global_random_seed): recall_score, sign=1, response_method="predict_proba", - n_thresholds=1_000, + n_thresholds=10, kwargs={"pos_label": 1}, ) scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y) @@ -107,18 +107,21 @@ def test_curve_scorer_pos_label(global_random_seed): recall_score, sign=1, response_method="predict_proba", - n_thresholds=1_000, + n_thresholds=10, kwargs={"pos_label": 0}, ) scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) # If `pos_label` is not forwarded to the curve_scorer, the thresholds will be equal. # Make sure that this is not the case. - # assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() - # Since we have an imbalanced problem, the thresholds should represent higher - # probabilities level when `pos_label=0` than with `pos_label=1`. - assert np.sum(thresholds_pos_label_1 < 0.15) > 2 / 3 * n_samples - assert np.sum(thresholds_pos_label_0 > 0.85) > 2 / 3 * n_samples + assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() + # The min-max range for the thresholds is defined by the probabilities of the + # `pos_label` class (the column of `predict_proba`). 
+ y_pred = estimator.predict_proba(X) + assert thresholds_pos_label_0.min() == pytest.approx(y_pred.min(axis=0)[0]) + assert thresholds_pos_label_0.max() == pytest.approx(y_pred.max(axis=0)[0]) + assert thresholds_pos_label_1.min() == pytest.approx(y_pred.min(axis=0)[1]) + assert thresholds_pos_label_1.max() == pytest.approx(y_pred.max(axis=0)[1]) # The recall cannot be negative and `pos_label=1` should have a higher recall # since there is less samples to be considered. From f04085d5e71eddc346867c38376da3ee8b8e6739 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 20:20:38 +0200 Subject: [PATCH 157/194] TST add more info regarding why not exactly 0 and 1 --- .../model_selection/tests/test_classification_threshold.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index d12333ac13eea..716fefd956a85 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -54,7 +54,9 @@ def test_curve_scorer(): scores, thresholds = curve_scorer(estimator, X, y) assert thresholds.shape == scores.shape - # check that the thresholds are probability with extreme values close to 0 and 1 + # check that the thresholds are probability with extreme values close to 0 and 1. + # they are not exactly 0 and 1 because they are the extremum of the + # `estimator.predict_proba(X)` values. assert 0 <= thresholds.min() <= 0.01 assert 0.99 <= thresholds.max() <= 1 # balanced accuracy should be between 0.5 and 1 when it is not adjusted From d179b5fb2e2bec2a414a5776cf3c2066e05dae9b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 22:18:44 +0200 Subject: [PATCH 158/194] DOC add documentation for base scorer --- sklearn/metrics/_scorer.py | 22 ++++++++++++++ .../_classification_threshold.py | 29 ++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 7c5321b04730f..39464eef41a86 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -192,6 +192,28 @@ def get_metadata_routing(self): class _BaseScorer(_MetadataRequester): + """Base scorer that is used as `scorer(estimator, X, y_true)`. + + Parameters + ---------- + score_func : callable + The score function to use. It will be called as + `score_func(y_true, y_pred, **kwargs)`. + + sign : int + Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. + Thus, `sign` defined if higher scores are better or worse. + + n_thresholds : int or array-like + Related to the number of decision thresholds for which we want to compute the + score. If an integer, it will be used to generate `n_thresholds` thresholds + uniformly distributed between the minimum and maximum predicted scores. If an + array-like, it will be used as the thresholds. + + kwargs : dict + Additional parameters to pass to the score function. 
+ """ + def __init__(self, score_func, sign, kwargs, response_method="predict"): self._score_func = score_func self._sign = sign diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 036a81e751595..cbc897b0a13d7 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -55,7 +55,30 @@ def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): class _CurveScorer(_BaseScorer): - """Scorer taking a continuous response and output a score for each threshold.""" + """Scorer taking a continuous response and output a score for each threshold. + + Parameters + ---------- + score_func : callable + The score function to use. It will be called as + `score_func(y_true, y_pred, **kwargs)`. + + sign : int + Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. + Thus, `sign` defined if higher scores are better or worse. + + n_thresholds : int or array-like + Related to the number of decision thresholds for which we want to compute the + score. If an integer, it will be used to generate `n_thresholds` thresholds + uniformly distributed between the minimum and maximum predicted scores. If an + array-like, it will be used as the thresholds. + + kwargs : dict + Additional parameters to pass to the score function. + + response_method : str + The method to call on the estimator to get the response values. + """ def __init__(self, score_func, sign, kwargs, n_thresholds, response_method): super().__init__( @@ -103,6 +126,10 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): Parameters ---------- + method_caller : callable + Returns predictions given an estimator, method name, and other + arguments, potentially caching results. + estimator : object Trained estimator to use for scoring. From 2c375f8a06a22e3286140d4b006eb29953fbdebe Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 22:24:09 +0200 Subject: [PATCH 159/194] DOC add more details regarding the curve scorer --- sklearn/model_selection/_classification_threshold.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index cbc897b0a13d7..bb7128cf96554 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -234,7 +234,13 @@ def _fit_and_score_over_thresholds( curve_scorer : scorer instance The scorer taking `classifier` and the validation set as input and outputting decision thresholds and scores as a curve. Note that this is different from - the usual scorer that output a single score value. + the usual scorer that output a single score value: + + * when `score_method` is one of the four constraint metrics, the curve scorer + will output a curve of two scores parametrized by the decision threshold, e.g. + TPR/TNR or precision/recall curves for each threshold; + * otherwise, the curve scorer will output a single score value for each + threshold. score_method : str or callable The scoring method to use. 
Used to detect if we compute TPR/TNR or precision/ From 66ba8da2cf37b0e4c272c38685b3e092ba27ef3d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 22:48:17 +0200 Subject: [PATCH 160/194] directly test curve_scorer instead to look for function anem --- .../_classification_threshold.py | 54 +++++------ .../tests/test_classification_threshold.py | 97 +++++-------------- 2 files changed, 51 insertions(+), 100 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index bb7128cf96554..0a76a26979069 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -203,7 +203,6 @@ def _fit_and_score_over_thresholds( train_idx, val_idx, curve_scorer, - score_method, score_params, ): """Fit a classifier and compute the scores for different decision thresholds. @@ -242,10 +241,6 @@ def _fit_and_score_over_thresholds( * otherwise, the curve scorer will output a single score value for each threshold. - score_method : str or callable - The scoring method to use. Used to detect if we compute TPR/TNR or precision/ - recall. - score_params : dict Parameters to pass to the `score` method of the underlying scorer. @@ -271,29 +266,31 @@ def _fit_and_score_over_thresholds( X_val, y_val, score_params_val = X, y, score_params check_is_fitted(classifier, "classes_") - if isinstance(score_method, str): - if score_method in {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"}: - fpr, tpr, potential_thresholds = curve_scorer( - classifier, X_val, y_val, **score_params_val - ) - # For fpr=0/tpr=0, the threshold is set to `np.inf`. We need to remove it. - fpr, tpr, potential_thresholds = fpr[1:], tpr[1:], potential_thresholds[1:] - # thresholds are in decreasing order - return potential_thresholds[::-1], ((1 - fpr)[::-1], tpr[::-1]) - elif score_method in { - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - }: - precision, recall, potential_thresholds = curve_scorer( - classifier, X_val, y_val, **score_params_val - ) - # thresholds are in increasing order - # the last element of the precision and recall is not associated with any - # threshold and should be discarded - return potential_thresholds, (precision[:-1], recall[:-1]) - scores, potential_thresholds = curve_scorer( - classifier, X_val, y_val, **score_params_val - ) + if curve_scorer is roc_curve or ( + isinstance(curve_scorer, _BaseScorer) and curve_scorer._score_func is roc_curve + ): + fpr, tpr, potential_thresholds = curve_scorer( + classifier, X_val, y_val, **score_params_val + ) + # For fpr=0/tpr=0, the threshold is set to `np.inf`. We need to remove it. 
+ fpr, tpr, potential_thresholds = fpr[1:], tpr[1:], potential_thresholds[1:] + # thresholds are in decreasing order + return potential_thresholds[::-1], ((1 - fpr)[::-1], tpr[::-1]) + elif curve_scorer is precision_recall_curve or ( + isinstance(curve_scorer, _BaseScorer) + and curve_scorer._score_func is precision_recall_curve + ): + precision, recall, potential_thresholds = curve_scorer( + classifier, X_val, y_val, **score_params_val + ) + # thresholds are in increasing order + # the last element of the precision and recall is not associated with any + # threshold and should be discarded + return potential_thresholds, (precision[:-1], recall[:-1]) + else: + scores, potential_thresholds = curve_scorer( + classifier, X_val, y_val, **score_params_val + ) return potential_thresholds, scores @@ -707,7 +704,6 @@ def fit(self, X, y, **params): train_idx=train_idx, val_idx=val_idx, curve_scorer=self._curve_scorer, - score_method=self.objective_metric, score_params=routed_params.scorer.score, ) for train_idx, val_idx in splits diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 716fefd956a85..63cf3351f9ea1 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -178,7 +178,6 @@ def test_fit_and_score_over_thresholds_curve_scorers(curve_scorer, score_method) train_idx=train_idx, val_idx=val_idx, curve_scorer=curve_scorer, - score_method=score_method, score_params={}, ) @@ -194,7 +193,7 @@ def test_fit_and_score_over_thresholds_curve_scorers(curve_scorer, score_method) @pytest.mark.parametrize( - "curve_scorer, score_method, expected_score", + "curve_scorer, expected_score", [ ( _CurveScorer( @@ -204,34 +203,27 @@ def test_fit_and_score_over_thresholds_curve_scorers(curve_scorer, score_method) n_thresholds=2, kwargs={}, ), - "balanced_accuracy", [0.5, 1.0], ), ( make_scorer(roc_curve, response_method="predict_proba"), - "max_tnr_at_tpr_constraint", [[0.0, 1.0], [1.0, 1.0]], ), ( make_scorer(roc_curve, response_method="predict_proba"), - "max_tpr_at_tnr_constraint", [[0.0, 1.0], [1.0, 1.0]], ), ( make_scorer(precision_recall_curve, response_method="predict_proba"), - "max_precision_at_recall_constraint", [[0.5, 1.0], [1.0, 1.0]], ), ( make_scorer(precision_recall_curve, response_method="predict_proba"), - "max_recall_at_precision_constraint", [[0.5, 1.0], [1.0, 1.0]], ), ], ) -def test_fit_and_score_over_thresholds_prefit( - curve_scorer, score_method, expected_score -): +def test_fit_and_score_over_thresholds_prefit(curve_scorer, expected_score): """Check the behaviour with a prefit classifier.""" X, y = make_classification(n_samples=100, random_state=0) @@ -248,7 +240,6 @@ def test_fit_and_score_over_thresholds_prefit( train_idx=train_idx, val_idx=val_idx, curve_scorer=curve_scorer, - score_method=score_method, score_params={}, ) @@ -265,7 +256,6 @@ def test_fit_and_score_over_thresholds_prefit( train_idx=train_idx, val_idx=val_idx, curve_scorer=curve_scorer, - score_method=score_method, score_params={}, ) assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) @@ -274,37 +264,22 @@ def test_fit_and_score_over_thresholds_prefit( @pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( - "curve_scorer, score_method", + "curve_scorer", [ - ( - _CurveScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - n_thresholds=10, - kwargs={}, - ), - 
"balanced_accuracy", - ), - ( - make_scorer(roc_curve, response_method="predict_proba"), - "max_tnr_at_tpr_constraint", - ), - ( - make_scorer(roc_curve, response_method="predict_proba"), - "max_tpr_at_tnr_constraint", - ), - ( - make_scorer(precision_recall_curve, response_method="predict_proba"), - "max_precision_at_recall_constraint", - ), - ( - make_scorer(precision_recall_curve, response_method="predict_proba"), - "max_recall_at_precision_constraint", + _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + n_thresholds=10, + kwargs={}, ), + make_scorer(roc_curve, response_method="predict_proba"), + make_scorer(roc_curve, response_method="predict_proba"), + make_scorer(precision_recall_curve, response_method="predict_proba"), + make_scorer(precision_recall_curve, response_method="predict_proba"), ], ) -def test_fit_and_score_over_thresholds_sample_weight(curve_scorer, score_method): +def test_fit_and_score_over_thresholds_sample_weight(curve_scorer): """Check that we dispatch the sample-weight to fit and score the classifier.""" X, y = load_iris(return_X_y=True) X, y = X[:100], y[:100] # only 2 classes @@ -326,7 +301,6 @@ def test_fit_and_score_over_thresholds_sample_weight(curve_scorer, score_method) train_idx=train_repeated_idx, val_idx=val_repeated_idx, curve_scorer=curve_scorer, - score_method=score_method, score_params={}, ) @@ -339,7 +313,6 @@ def test_fit_and_score_over_thresholds_sample_weight(curve_scorer, score_method) train_idx=train_idx, val_idx=val_idx, curve_scorer=curve_scorer.set_score_request(sample_weight=True), - score_method=score_method, score_params={"sample_weight": sample_weight}, ) @@ -349,40 +322,23 @@ def test_fit_and_score_over_thresholds_sample_weight(curve_scorer, score_method) @pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( - "curve_scorer, score_method", + "curve_scorer", [ - ( - _CurveScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - n_thresholds=10, - kwargs={}, - ), - "balanced_accuracy", - ), - ( - make_scorer(roc_curve, response_method="predict_proba"), - "max_tnr_at_tpr_constraint", - ), - ( - make_scorer(roc_curve, response_method="predict_proba"), - "max_tpr_at_tnr_constraint", - ), - ( - make_scorer(precision_recall_curve, response_method="predict_proba"), - "max_precision_at_recall_constraint", - ), - ( - make_scorer(precision_recall_curve, response_method="predict_proba"), - "max_recall_at_precision_constraint", + _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + n_thresholds=10, + kwargs={}, ), + make_scorer(roc_curve, response_method="predict_proba"), + make_scorer(roc_curve, response_method="predict_proba"), + make_scorer(precision_recall_curve, response_method="predict_proba"), + make_scorer(precision_recall_curve, response_method="predict_proba"), ], ) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) -def test_fit_and_score_over_thresholds_fit_params( - curve_scorer, score_method, fit_params_type -): +def test_fit_and_score_over_thresholds_fit_params(curve_scorer, fit_params_type): """Check that we pass `fit_params` to the classifier when calling `fit`.""" X, y = make_classification(n_samples=100, random_state=0) fit_params = { @@ -402,7 +358,6 @@ def test_fit_and_score_over_thresholds_fit_params( train_idx=train_idx, val_idx=val_idx, curve_scorer=curve_scorer, - score_method=score_method, score_params={}, ) From a6b19c127ec3ba9c6a4bc67440c207850d2b8725 Mon Sep 17 
00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 22:50:14 +0200 Subject: [PATCH 161/194] add required arguments --- sklearn/model_selection/_classification_threshold.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 0a76a26979069..441262c73299b 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -510,6 +510,7 @@ class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstima """ + _required_parameters = ["estimator"] _parameter_constraints: dict = { "estimator": [ HasMethods(["fit", "predict_proba"]), From b3b99ff6f5a5f7d1318d348fe83d5ae200a29757 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 27 Apr 2024 22:55:18 +0200 Subject: [PATCH 162/194] DOC add docstring for interpolated score --- .../_classification_threshold.py | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 441262c73299b..d24c631bb0468 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -294,6 +294,34 @@ def _fit_and_score_over_thresholds( return potential_thresholds, scores +def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): + """Compute the mean interpolated score across folds by defining common thresholds. + + Parameters + ---------- + target_thresholds : ndarray of shape (n_thresholds,) + The thresholds to use to compute the mean score. + + cv_thresholds : ndarray of shape (n_folds, n_thresholds_fold) + The thresholds used to compute the scores for each fold. + + cv_scores : ndarray of shape (n_folds, n_thresholds_fold) + The scores computed for each threshold for each fold. + + Returns + ------- + mean_score : ndarray of shape (n_thresholds,) + The mean score across all folds for each target threshold. + """ + return np.mean( + [ + np.interp(target_thresholds, split_thresholds, split_score) + for split_thresholds, split_score in zip(cv_thresholds, cv_scores) + ], + axis=0, + ) + + class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Decision threshold tuning for binary classification. 
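# A tiny, self-contained illustration (not part of this patch) of the interpolation
# performed by `_mean_interpolated_score`: per-fold (thresholds, scores) curves are
# re-sampled with `np.interp` onto a common grid of thresholds before averaging.
# All numbers below are made up.
import numpy as np

fold_thresholds = [np.array([0.1, 0.5, 0.9]), np.array([0.2, 0.6, 0.8])]
fold_scores = [np.array([0.40, 0.70, 0.60]), np.array([0.50, 0.80, 0.65])]
target_thresholds = np.linspace(0.1, 0.9, num=5)

mean_score = np.mean(
    [
        np.interp(target_thresholds, thresholds, scores)
        for thresholds, scores in zip(fold_thresholds, fold_scores)
    ],
    axis=0,
)
# `mean_score` holds one cross-validated score per entry of `target_thresholds`.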
@@ -731,15 +759,6 @@ def fit(self, X, y, **params): else: decision_thresholds = np.asarray(self.n_thresholds) - def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): - return np.mean( - [ - np.interp(target_thresholds, split_thresholds, split_score) - for split_thresholds, split_score in zip(cv_thresholds, cv_scores) - ], - axis=0, - ) - if constraint_value is None: # find best score that is the highest value objective_scores = _mean_interpolated_score( decision_thresholds, cv_thresholds, cv_scores From dda0d2ca729952d38a5a79b3d59c9740a7030895 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 15:28:14 +0200 Subject: [PATCH 163/194] Update sklearn/model_selection/tests/test_classification_threshold.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- .../model_selection/tests/test_classification_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 63cf3351f9ea1..fcf29bd1e53e1 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -181,13 +181,13 @@ def test_fit_and_score_over_thresholds_curve_scorers(curve_scorer, score_method) score_params={}, ) + assert np.all(thresholds[:-1] <= thresholds[1:]) + if score_method.startswith("max_"): - assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) assert isinstance(scores, tuple) and len(scores) == 2 for sc in scores: assert np.logical_and(sc >= 0, sc <= 1).all() else: - assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) assert isinstance(scores, np.ndarray) assert np.logical_and(scores >= 0, scores <= 1).all() From 48e782988a9a947f48ceaf36031b4625bc1e3a75 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 15:28:29 +0200 Subject: [PATCH 164/194] Update sklearn/model_selection/tests/test_classification_threshold.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/model_selection/tests/test_classification_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index fcf29bd1e53e1..76395b855b33f 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -927,7 +927,7 @@ def test_tuned_threshold_classifier_cv_float(): ], ) def test_tuned_threshold_classifier_error_missing_constraint(objective_metric): - """Check that we raise an informative error when using a objective metric requested + """Check that we raise an informative error when using a objective metric requesting a constraint but no `constraint_value` is provided.""" X, y = make_classification(random_state=0) estimator = LogisticRegression() From 17839e833feadd39a5893b7206f7fbac1248eba6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 15:32:53 +0200 Subject: [PATCH 165/194] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- .../model_selection/tests/test_classification_threshold.py | 7 
+++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 76395b855b33f..ec6f8f079bc2d 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -54,7 +54,7 @@ def test_curve_scorer(): scores, thresholds = curve_scorer(estimator, X, y) assert thresholds.shape == scores.shape - # check that the thresholds are probability with extreme values close to 0 and 1. + # check that the thresholds are probabilities with extreme values close to 0 and 1. # they are not exactly 0 and 1 because they are the extremum of the # `estimator.predict_proba(X)` values. assert 0 <= thresholds.min() <= 0.01 @@ -114,8 +114,7 @@ def test_curve_scorer_pos_label(global_random_seed): ) scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) - # If `pos_label` is not forwarded to the curve_scorer, the thresholds will be equal. - # Make sure that this is not the case. + # Since `pos_label` is forwarded to the curve_scorer, the thresholds are not equal. assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() # The min-max range for the thresholds is defined by the probabilities of the # `pos_label` class (the column of `predict_proba`). @@ -258,7 +257,7 @@ def test_fit_and_score_over_thresholds_prefit(curve_scorer, expected_score): curve_scorer=curve_scorer, score_params={}, ) - assert_array_equal(np.argsort(thresholds), np.arange(len(thresholds))) + assert np.all(thresholds[:-1] <= thresholds[1:]) assert_allclose(scores, expected_score) From 3f02bc3486a41ffff3877b1bfa997e5a4c57b4bd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 22:19:18 +0200 Subject: [PATCH 166/194] remove duplicated check --- .../model_selection/_classification_threshold.py | 3 --- .../tests/test_classification_threshold.py | 16 +--------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index d24c631bb0468..b6b1beb3093f3 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -35,7 +35,6 @@ from ..utils.validation import ( _check_method_params, _num_samples, - check_consistent_length, check_is_fitted, indexable, ) @@ -254,7 +253,6 @@ def _fit_and_score_over_thresholds( The scores computed for each decision threshold. When TPR/TNR or precision/ recall are computed, `scores` is a tuple of two arrays. 
""" - check_consistent_length(X, y) if train_idx is not None: X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) @@ -264,7 +262,6 @@ def _fit_and_score_over_thresholds( classifier.fit(X_train, y_train, **fit_params_train) else: # prefit estimator, only a validation set is provided X_val, y_val, score_params_val = X, y, score_params - check_is_fitted(classifier, "classes_") if curve_scorer is roc_curve or ( isinstance(curve_scorer, _BaseScorer) and curve_scorer._score_func is roc_curve diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index ec6f8f079bc2d..5a012a74a0c4d 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -228,21 +228,7 @@ def test_fit_and_score_over_thresholds_prefit(curve_scorer, expected_score): # `train_idx is None` to indicate that the classifier is prefit train_idx, val_idx = None, np.arange(50, 100) - classifier = DecisionTreeClassifier(random_state=0) - - with pytest.raises(NotFittedError): - _fit_and_score_over_thresholds( - classifier, - X, - y, - fit_params={}, - train_idx=train_idx, - val_idx=val_idx, - curve_scorer=curve_scorer, - score_params={}, - ) - - classifier.fit(X, y) + classifier = DecisionTreeClassifier(random_state=0).fit(X, y) # make sure that the classifier memorized the full dataset such that # we get perfect predictions and thus match the expected score assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) From 553cfce2cb016ae705399fb72afe59798e1cd2ca Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 22:21:19 +0200 Subject: [PATCH 167/194] remove duplicated check --- sklearn/model_selection/_classification_threshold.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index b6b1beb3093f3..3e9176c408580 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -673,9 +673,9 @@ def fit(self, X, y, **params): " or 'max_recall_at_precision_constraint', `constraint_value` must " "be provided. Got None instead." 
) - constraint_value = self.constraint_value + constrained_metric = True else: - constraint_value = None # ignore the constraint value + constrained_metric = False routed_params = process_routing(self, "fit", **params) self._curve_scorer = self._get_curve_scorer() @@ -756,7 +756,7 @@ def fit(self, X, y, **params): else: decision_thresholds = np.asarray(self.n_thresholds) - if constraint_value is None: # find best score that is the highest value + if not constrained_metric: # find best score that is the highest value objective_scores = _mean_interpolated_score( decision_thresholds, cv_thresholds, cv_scores ) @@ -783,7 +783,7 @@ def fit(self, X, y, **params): def _get_best_idx(constrained_score, maximized_score): """Find the index of the best score constrained by another score.""" - mask = constrained_score >= constraint_value + mask = constrained_score >= self.constraint_value mask_idx = maximized_score[mask].argmax() return np.flatnonzero(mask)[mask_idx] From 6ae6d27cfbf3931d18528613ca7b41264677a50b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 22:38:34 +0200 Subject: [PATCH 168/194] check cv_results_ API --- .../tests/test_classification_threshold.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 5a012a74a0c4d..7452c1809fd13 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -874,6 +874,39 @@ def test_tuned_threshold_classifier_n_thresholds_array(): assert_allclose(tuned_model.cv_results_["thresholds"], n_thresholds) +@pytest.mark.parametrize( + "params", + [ + {"strategy": "constant", "constant_threshold": 0.5}, + {"strategy": "optimum"}, + {"objective_metric": "max_tpr_at_tnr_constraint", "constraint_value": 0.5}, + {"objective_metric": "max_tnr_at_tpr_constraint", "constraint_value": 0.5}, + { + "objective_metric": "max_precision_at_recall_constraint", + "constraint_value": 0.5, + }, + { + "objective_metric": "max_recall_at_precision_constraint", + "constraint_value": 0.5, + }, + ], +) +@pytest.mark.parametrize("store_cv_results", [True, False]) +def test_tuned_threshold_classifier_store_cv_results(params, store_cv_results): + """Check that if `cv_results_` exists depending on `store_cv_results`.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifierCV( + estimator, + store_cv_results=store_cv_results, + **params, + ).fit(X, y) + if store_cv_results: + assert hasattr(tuned_model, "cv_results_") + else: + assert not hasattr(tuned_model, "cv_results_") + + def test_tuned_threshold_classifier_cv_float(): """Check the behaviour when `cv` is set to a float.""" X, y = make_classification(random_state=0) From d0100961c6668bdf36b7ea2b66f7a70435a9cc34 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 22:50:21 +0200 Subject: [PATCH 169/194] clone classifier --- sklearn/model_selection/_classification_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 3e9176c408580..305e0b8b2f849 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -723,7 +723,7 @@ def fit(self, X, y, **params): cv_thresholds, cv_scores = zip( 
*Parallel(n_jobs=self.n_jobs)( delayed(_fit_and_score_over_thresholds)( - classifier, + clone(classifier) if cv != "prefit" else classifier, X, y, fit_params=routed_params.estimator.fit, From 8bb8ca69935edac6a63d113ead0de6d38cff8d61 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 23:10:42 +0200 Subject: [PATCH 170/194] TST better comments --- .../tests/test_classification_threshold.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 7452c1809fd13..93240fddfc612 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -458,28 +458,22 @@ def test_tuned_threshold_classifier_without_constraint_value(response_method): assert model.cv_results_["scores"].shape == (n_thresholds,) -@pytest.mark.parametrize( - "metrics", - [ - ("max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint"), - ("max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"), - ], -) -def test_tuned_threshold_classifier_limit_metric_tradeoff(metrics): - """Check that an objective value of 0 give opposite predictions with tnr/tpr and - precision/recall. +def test_tuned_threshold_classifier_limit_metric_tradeoff(): + """Check that max TPR lead to opposite prediction of max TNR when constraint is + set to 0.0. """ X, y = load_breast_cancer(return_X_y=True) estimator = make_pipeline(StandardScaler(), LogisticRegression()) model = TunedThresholdClassifierCV( estimator=estimator, - objective_metric=metrics[0], + objective_metric="max_tpr_at_tnr_constraint", constraint_value=0, ) y_pred_1 = model.fit(X, y).predict(X) - model.set_params(objective_metric=metrics[1]) + model.set_params(objective_metric="max_tnr_at_tpr_constraint") y_pred_2 = (~model.fit(X, y).predict(X).astype(bool)).astype(int) - assert np.mean(y_pred_1 == y_pred_2) > 0.98 + # check that we have opposite predictions with a slight tolerance + assert np.mean(y_pred_1 == y_pred_2) > 0.99 def test_tuned_threshold_classifier_metric_with_parameter(): From fd971c7955a3bcb50f0eef98fd5ae40ba9d53685 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Apr 2024 23:15:49 +0200 Subject: [PATCH 171/194] iter --- .../tests/test_classification_threshold.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 93240fddfc612..7f898d73642ee 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -654,9 +654,8 @@ def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint assert model.cv_results_["maximized_scores"].shape == (n_thresholds,) if response_method in ("auto", "predict_proba"): - # "auto" will fall back in priority on `predict_proba` if `estimator` - # supports it. - # we expect the decision threshold to be in [0, 1] + # "auto" will fall back in priority on `predict_proba` if `estimator` + # supports it. 
We expect the decision threshold to be in [0, 1] if objective_metric in ( "max_tnr_at_tpr_constraint", "max_precision_at_recall_constraint", @@ -665,8 +664,9 @@ def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" assert 0 <= model.best_threshold_ <= 0.5 else: # "decision_function" - # we expect the decision function to be centered in 0.0 and to be larger than - # -1 and 1. + # We expect the decision function to be centered in 0.0 and to be larger than + # -1 and 1. We therefore check that the threshold is positive in one case and + # negative in the other. if objective_metric in ( "max_tnr_at_tpr_constraint", "max_precision_at_recall_constraint", From bf57dac71234d64c9a8a0046e85f4bad4c8aaf11 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 16:35:38 +0200 Subject: [PATCH 172/194] FEA add a ConstantThresholdClassifier instead of strategy="constant" (#16) --- doc/modules/classes.rst | 1 + doc/modules/classification_threshold.rst | 21 +- doc/whats_new/v1.5.rst | 2 + sklearn/model_selection/__init__.py | 6 +- .../_classification_threshold.py | 546 ++++++++++++------ .../tests/test_classification_threshold.py | 141 +++-- sklearn/utils/_response.py | 18 +- 7 files changed, 500 insertions(+), 235 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index bcae843f1dae8..e57bdbdec111c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1257,6 +1257,7 @@ Model post-fit tuning :toctree: generated/ :template: class.rst + model_selection.FixedThresholdClassifier model_selection.TunedThresholdClassifierCV Model validation diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index cd8946bfc28f1..1ef78b7a761db 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -12,8 +12,8 @@ Classification is best divided into two parts: * the decision problem to take concrete action based on those probability predictions. Let's take a straightforward example related to weather forecasting: the first point is -related to answering "what is the chance that it will rain tomorrow?" while the second point is -related to answering "should I take an umbrella tomorrow?". +related to answering "what is the chance that it will rain tomorrow?" while the second +point is related to answering "should I take an umbrella tomorrow?". When it comes to the scikit-learn API, the first point is addressed providing scores using :term:`predict_proba` or :term:`decision_function`. The former returns posterior @@ -63,8 +63,9 @@ Post-tuning the decision threshold One solution to address the problem stated in the introduction is to tune the decision threshold of the classifier once the model has been trained. The -:class:`~sklearn.model_selection.TunedThresholdClassifierCV` tunes this threshold using an -internal cross-validation. The optimum threshold is chosen to maximize a given metric. +:class:`~sklearn.model_selection.TunedThresholdClassifierCV` tunes this threshold using +an internal cross-validation. The optimum threshold is chosen to maximize a given +metric. The following image illustrates the tuning of the decision threshold for a gradient boosting classifier. While the vanilla and tuned classifiers provide the same @@ -144,21 +145,21 @@ threshold is tuned on the data provided to the `fit` method. However, you should be extremely careful when using this option. 
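A minimal sketch of the safer `cv="prefit"` workflow (the dataset, the split and the
hyper-parameters below are arbitrary and only serve as an illustration)::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import TunedThresholdClassifierCV, train_test_split

    X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=0)

    # the classifier is trained on the training set only
    classifier = LogisticRegression().fit(X_train, y_train)
    # the decision threshold is then tuned on a separate validation set
    tuned = TunedThresholdClassifierCV(classifier, cv="prefit", refit=False)
    tuned.fit(X_val, y_val)
    print(tuned.best_threshold_)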
You should never use the same data for training the classifier and tuning the decision threshold due to the risk of overfitting. Refer to the following example section for more details (cf. -:ref:`TunedThresholdClassifierCV_no_cv`). If you have limited resources, consider using a -float number for `cv` to limit to an internal single train-test split. +:ref:`TunedThresholdClassifierCV_no_cv`). If you have limited resources, consider using +a float number for `cv` to limit to an internal single train-test split. The option `cv="prefit"` should only be used when the provided classifier was already trained, and you just want to find the best decision threshold using a new validation set. +.. _FixedThresholdClassifier: + Manually setting the decision threshold --------------------------------------- The previous sections discussed strategies to find an optimal decision threshold. It is -also possible to manually set the decision threshold in -:class`~sklearn.model_selection.TunedThresholdClassifierCV` by setting the parameter -`strategy` to `"constant"` and providing the desired threshold using the parameter -`constant_threshold`. +also possible to manually set the decision threshold using the class +:class:`~sklearn.model_selection.FixedThresholdClassifier`. Examples -------- diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index d7df771406bec..07d5e8b74dffd 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -307,6 +307,8 @@ Changelog - |MajorFeature| :class:`model_selection.TunedThresholdClassifierCV` finds the decision threshold of a binary classifier that maximizes a classification metric through cross-validation. + :class:`model_selection.FixedThresholdClassifier` is an alternative when one wants + to use a fixed decision threshold without any tuning scheme. :pr:`26120` by :user:`Guillaume Lemaitre `. - |Enhancement| :term:`CV splitters ` that ignores the group parameter now diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index f5ee5fea18560..c97d48f4b20b7 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -1,6 +1,9 @@ import typing -from ._classification_threshold import TunedThresholdClassifierCV +from ._classification_threshold import ( + FixedThresholdClassifier, + TunedThresholdClassifierCV, +) from ._plot import LearningCurveDisplay, ValidationCurveDisplay from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV from ._split import ( @@ -64,6 +67,7 @@ "StratifiedKFold", "StratifiedGroupKFold", "StratifiedShuffleSplit", + "FixedThresholdClassifier", "TunedThresholdClassifierCV", "check_cv", "cross_val_predict", diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 305e0b8b2f849..dbf19cfe0c536 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -41,6 +41,194 @@ from ._split import StratifiedShuffleSplit, check_cv +def _estimator_has(attr): + """Check if we can delegate a method to the underlying estimator. + + First, we check the fitted estimator if available, otherwise we + check the unfitted estimator. 
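For illustration, this helper is meant to be paired with `available_if` so that a
meta-estimator only exposes a method that its underlying estimator actually
implements. A minimal sketch (the `Wrapper` class is hypothetical and simply reuses
the `_estimator_has` defined here)::

    from sklearn.base import BaseEstimator, ClassifierMixin, clone
    from sklearn.linear_model import LogisticRegression, SGDClassifier
    from sklearn.utils.metaestimators import available_if

    class Wrapper(ClassifierMixin, BaseEstimator):
        def __init__(self, estimator):
            self.estimator = estimator

        def fit(self, X, y):
            self.estimator_ = clone(self.estimator).fit(X, y)
            return self

        # exposed only when the wrapped estimator implements `predict_proba`
        @available_if(_estimator_has("predict_proba"))
        def predict_proba(self, X):
            return self.estimator_.predict_proba(X)

    hasattr(Wrapper(LogisticRegression()), "predict_proba")         # True
    hasattr(Wrapper(SGDClassifier(loss="hinge")), "predict_proba")  # False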
+ """ + + def check(self): + if hasattr(self, "estimator_"): + getattr(self.estimator_, attr) + else: + getattr(self.estimator, attr) + return True + + return check + + +class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Base class for classifiers that set a non-default decision threshold. + + In this base class, we define the following interface: + + - the validation of common parameters in `fit`; + - the different prediction methods that can be used with the classifier. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + pos_label : int, float, bool or str, default=None + The label of the positive class. Used when `objective_metric` is + `"max_tnr_at_tpr_constraint"`"`, `"max_tpr_at_tnr_constraint"`, or a dictionary. + When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`, + `pos_label` is set to 1, otherwise an error will be raised. When using a + scorer, `pos_label` can be passed as a keyword argument to + :func:`~sklearn.metrics.make_scorer`. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `base_estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + """ + + _required_parameters = ["estimator"] + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "pos_label": [Real, str, "boolean", None], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + } + + def __init__(self, estimator, *, pos_label=None, response_method="auto"): + self.estimator = estimator + self.pos_label = pos_label + self.response_method = response_method + + @_fit_context( + # *ThresholdClassifier*.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier and to the `objective_metric` scorer. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(params, self, None) + + X, y = indexable(X, y) + + y_type = type_of_target(y, input_name="y") + if y_type != "binary": + raise ValueError( + f"Only binary classification is supported. 
Unknown label type: {y_type}" + ) + + if self.response_method == "auto": + self._response_method = ["predict_proba", "decision_function"] + else: + self._response_method = self.response_method + + self._fit(X, y, **params) + + if hasattr(self.estimator_, "n_features_in_"): + self.n_features_in_ = self.estimator_.n_features_in_ + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + + return self + + @property + def classes_(self): + """Classes labels.""" + return self.estimator_.classes_ + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + check_is_fitted(self, "estimator_") + return self.estimator_.predict_proba(X) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict logarithm class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + log_probabilities : ndarray of shape (n_samples, n_classes) + The logarithm class probabilities of the input samples. + """ + check_is_fitted(self, "estimator_") + return self.estimator_.predict_log_proba(X) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Decision function for samples in `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,) + The decision function computed the fitted estimator. + """ + check_is_fitted(self, "estimator_") + return self.estimator_.decision_function(X) + + def _more_tags(self): + return { + "binary_only": True, + "_xfail_checks": { + "check_classifiers_train": "Threshold at probability 0.5 does not hold", + "check_sample_weights_invariance": ( + "Due to the cross-validation and sample ordering, removing a sample" + " is not strictly equal to putting is weight to zero. Specific unit" + " tests are added for TunedThresholdClassifierCV specifically." + ), + }, + } + + def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): """Threshold `y_score` and return the associated class labels.""" if pos_label is None: @@ -53,6 +241,185 @@ def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] +class FixedThresholdClassifier(BaseThresholdClassifier): + """Classifier that manually sets the decision threshold. + + This classifier allows to change the default decision threshold used for + converting posterior probability estimates (i.e. output of `predict_proba`) or + decision scores (i.e. output of `decision_function`) into a class label. + + Here, the threshold is not optimized and is set to a constant value. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + threshold : {"auto"} or float, default="auto" + The decision threshold to use when converting posterior probability estimates + (i.e. output of `predict_proba`) or decision scores (i.e. output of + `decision_function`) into a class label. When `"auto"`, the threshold is set + to 0.5 if `predict_proba` is used as `response_method`, otherwise it is set to + 0 (i.e. the default threshold for `decision_function`). + + pos_label : int, float, bool or str, default=None + The label of the positive class. Used to process the output of the + `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or + `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke `"predict_proba"` or `"decision_function"` + in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.TunedThresholdClassifierCV : Classifier that post-tunes + the decision threshold based on some metrics and using cross-validation. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.model_selection import FixedThresholdClassifier, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> classifier = LogisticRegression(random_state=0).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier.predict(X_test))) + [[217 7] + [ 19 7]] + >>> classifier_other_threshold = FixedThresholdClassifier( + ... classifier, threshold=0.1, response_method="predict_proba" + ... ).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier_other_threshold.predict(X_test))) + [[184 40] + [ 6 20]] + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "threshold": [Real], + } + + def __init__( + self, + estimator, + *, + threshold=0.5, + pos_label=None, + response_method="auto", + ): + super().__init__( + estimator=estimator, pos_label=pos_label, response_method=response_method + ) + self.threshold = threshold + + def _fit(self, X, y, **params): + """Fit the classifier. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier and to the `objective_metric` scorer. + + Returns + ------- + self : object + Returns an instance of self. + """ + self.estimator_ = clone(self.estimator).fit(X, y, **params) + return self + + def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + check_is_fitted(self, "estimator_") + y_score, _, response_method_used = _get_response_values_binary( + self.estimator_, + X, + self._response_method, + pos_label=self.pos_label, + return_response_method_used=True, + ) + + if self.threshold == "auto": + decision_threshold = 0.5 if response_method_used == "predict_proba" else 0.0 + else: + decision_threshold = self.threshold + + return _threshold_scores_to_class_labels( + y_score, decision_threshold, self.classes_, self.pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router + + class _CurveScorer(_BaseScorer): """Scorer taking a continuous response and output a score for each threshold. @@ -176,23 +543,6 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): return np.array(score_thresholds), potential_thresholds -def _estimator_has(attr): - """Check if we can delegate a method to the underlying estimator. - - First, we check the fitted estimator if available, otherwise we - check the unfitted estimator. - """ - - def check(self): - if hasattr(self, "estimator_"): - getattr(self.estimator_, attr) - else: - getattr(self.estimator, attr) - return True - - return check - - def _fit_and_score_over_thresholds( classifier, X, @@ -319,8 +669,8 @@ def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): ) -class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): - """Decision threshold tuning for binary classification. +class TunedThresholdClassifierCV(BaseThresholdClassifier): + """Classifier that post-tunes the decision threshold using cross-validation. This estimator post-tunes the decision threshold (cut-off point) that is used for converting posterior probability estimates (i.e. output of @@ -338,13 +688,6 @@ class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstima The classifier, fitted or not, for which we want to optimize the decision threshold used during `predict`. - strategy : {"optimum", "constant"}, default="optimum" - The strategy to use for tuning the decision threshold: - - * `"optimum"`: the decision threshold is tuned to optimize the objective - metric; - * `"constant"`: the decision threshold is set to `constant_value`. 
- objective_metric : {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint", \ "max_precision_at_recall_constraint, "max_recall_at_precision_constraint"} \ , str, dict or callable, default="balanced_accuracy" @@ -369,9 +712,6 @@ class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstima `"max_precision_at_recall_constraint"`, or `"max_recall_at_precision_constraint"`. - constant_threshold : float, default=0.5 - The constant threshold to use when `strategy` is `"constant"`. - pos_label : int, float, bool or str, default=None The label of the positive class. Used when `objective_metric` is `"max_tnr_at_tpr_constraint"`"`, `"max_tpr_at_tnr_constraint"`, or a dictionary. @@ -451,22 +791,19 @@ class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstima best_score_ : float or None The optimal score of the objective metric, evaluated at `best_threshold_`. - If `strategy="constant"`, `best_score_` is None. constrained_score_ : float or None When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, `"max_recall_at_precision_constraint"`, it will corresponds to the score of the metric which is constrained. It should be close to `constraint_value`. If - `objective_metric` is not one of the above or when `strategy="constant", - `constrained_score_` is None. + `objective_metric` is not one of the above, `constrained_score_` is None. cv_results_ : dict or None A dictionary containing the scores and thresholds computed during the cross-validation process. Only exist if `store_cv_results=True`. - The keys are different depending on the `objective_metric` and `strategy` used: + The keys are different depending on the `objective_metric` used: - * when `strategy="constant"`, `cv_results_` is None; * when `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, `"max_recall_at_precision_constraint"`, the keys are `"thresholds"`, @@ -487,6 +824,8 @@ class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstima See Also -------- + sklearn.model_selection.FixedThresholdClassifier : Classifier that uses a + constant threshold. sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates probabilities. 
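For concreteness, a minimal sketch of the constrained objective metrics documented
above, using the parameter names as they stand at this point in the series (the
dataset and the constraint value of 0.5 are arbitrary):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import TunedThresholdClassifierCV

    X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
    model = TunedThresholdClassifierCV(
        LogisticRegression(),
        objective_metric="max_precision_at_recall_constraint",
        constraint_value=0.5,  # recall must remain above 0.5
        store_cv_results=True,
    ).fit(X, y)

    # precision maximized under the recall constraint
    print(model.best_threshold_, model.best_score_, model.constrained_score_)
    # per-threshold curves are kept because store_cv_results=True
    print(sorted(model.cv_results_))  # constrained_scores, maximized_scores, thresholds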
@@ -535,13 +874,8 @@ class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstima """ - _required_parameters = ["estimator"] _parameter_constraints: dict = { - "estimator": [ - HasMethods(["fit", "predict_proba"]), - HasMethods(["fit", "decision_function"]), - ], - "strategy": [StrOptions({"optimum", "constant"})], + **BaseThresholdClassifier._parameter_constraints, "objective_metric": [ StrOptions( set(get_scorer_names()) @@ -556,9 +890,6 @@ class TunedThresholdClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstima MutableMapping, ], "constraint_value": [Real, None], - "constant_threshold": [Real], - "pos_label": [Real, str, "boolean", None], - "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], "n_thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], "cv": [ "cv_object", @@ -575,10 +906,8 @@ def __init__( self, estimator, *, - strategy="optimum", objective_metric="balanced_accuracy", constraint_value=None, - constant_threshold=0.5, pos_label=None, response_method="auto", n_thresholds=100, @@ -588,13 +917,11 @@ def __init__( random_state=None, store_cv_results=False, ): - self.estimator = estimator - self.strategy = strategy + super().__init__( + estimator=estimator, response_method=response_method, pos_label=pos_label + ) self.objective_metric = objective_metric self.constraint_value = constraint_value - self.constant_threshold = constant_threshold - self.pos_label = pos_label - self.response_method = response_method self.n_thresholds = n_thresholds self.cv = cv self.refit = refit @@ -602,11 +929,7 @@ def __init__( self.random_state = random_state self.store_cv_results = store_cv_results - @_fit_context( - # TunedThresholdClassifierCV.estimator is not validated yet - prefer_skip_nested_validation=False - ) - def fit(self, X, y, **params): + def _fit(self, X, y, **params): """Fit the classifier and post-tune the decision threshold. Parameters @@ -626,16 +949,6 @@ def fit(self, X, y, **params): self : object Returns an instance of self. """ - _raise_for_params(params, self, None) - - X, y = indexable(X, y) - - y_type = type_of_target(y, input_name="y") - if y_type != "binary": - raise ValueError( - f"Only binary classification is supported. 
Unknown label type: {y_type}" - ) - if isinstance(self.cv, Real) and 0 < self.cv < 1: cv = StratifiedShuffleSplit( n_splits=1, test_size=self.cv, random_state=self.random_state @@ -655,11 +968,6 @@ def fit(self, X, y, **params): if self.refit is False and cv.get_n_splits() > 1: raise ValueError("When cv has several folds, refit cannot be False.") - if self.response_method == "auto": - self._response_method = ["predict_proba", "decision_function"] - else: - self._response_method = self.response_method - if isinstance(self.objective_metric, str) and self.objective_metric in { "max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint", @@ -707,19 +1015,6 @@ def fit(self, X, y, **params): self.estimator_.fit(X_train, y_train, **fit_params_train) - if hasattr(self.estimator_, "n_features_in_"): - self.n_features_in_ = self.estimator_.n_features_in_ - if hasattr(self.estimator_, "feature_names_in_"): - self.feature_names_in_ = self.estimator_.feature_names_in_ - - if self.strategy == "constant": - # early exit when we don't need to find the optimal threshold - self.best_threshold_ = self.constant_threshold - self.best_score_, self.constrained_score_ = None, None - if self.store_cv_results: - self.cv_results_ = None - return self - cv_thresholds, cv_scores = zip( *Parallel(n_jobs=self.n_jobs)( delayed(_fit_and_score_over_thresholds)( @@ -809,11 +1104,6 @@ def _get_best_idx(constrained_score, maximized_score): return self - @property - def classes_(self): - """Classes labels.""" - return self.estimator_.classes_ - def predict(self, X): """Predict the target of new samples. @@ -824,77 +1114,22 @@ def predict(self, X): Returns ------- - C : ndarray of shape (n_samples,) + class_labels : ndarray of shape (n_samples,) The predicted class. """ check_is_fitted(self, "estimator_") - if self.strategy == "optimum": - # `pos_label` has been validated and is stored in the scorer - pos_label = self._curve_scorer._get_pos_label() - else: - pos_label = self.pos_label + pos_label = self._curve_scorer._get_pos_label() y_score, _ = _get_response_values_binary( - self.estimator_, X, self._response_method, pos_label=pos_label + self.estimator_, + X, + self._response_method, + pos_label=pos_label, ) return _threshold_scores_to_class_labels( y_score, self.best_threshold_, self.classes_, pos_label ) - @available_if(_estimator_has("predict_proba")) - def predict_proba(self, X): - """Predict class probabilities for `X` using the fitted estimator. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training vectors, where `n_samples` is the number of samples and - `n_features` is the number of features. - - Returns - ------- - probabilities : ndarray of shape (n_samples, n_classes) - The class probabilities of the input samples. - """ - check_is_fitted(self, "estimator_") - return self.estimator_.predict_proba(X) - - @available_if(_estimator_has("predict_log_proba")) - def predict_log_proba(self, X): - """Predict logarithm class probabilities for `X` using the fitted estimator. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training vectors, where `n_samples` is the number of samples and - `n_features` is the number of features. - - Returns - ------- - log_probabilities : ndarray of shape (n_samples, n_classes) - The logarithm class probabilities of the input samples. 
- """ - check_is_fitted(self, "estimator_") - return self.estimator_.predict_log_proba(X) - - @available_if(_estimator_has("decision_function")) - def decision_function(self, X): - """Decision function for samples in `X` using the fitted estimator. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training vectors, where `n_samples` is the number of samples and - `n_features` is the number of features. - - Returns - ------- - decisions : ndarray of shape (n_samples,) - The decision function computed the fitted estimator. - """ - check_is_fitted(self, "estimator_") - return self.estimator_.decision_function(X) - def get_metadata_routing(self): """Get metadata routing of this object. @@ -954,16 +1189,3 @@ def _get_curve_scorer(self): scoring, self._response_method, self.n_thresholds, self.pos_label ) return curve_scorer - - def _more_tags(self): - return { - "binary_only": True, - "_xfail_checks": { - "check_classifiers_train": "Threshold at probability 0.5 does not hold", - "check_sample_weights_invariance": ( - "Due to the cross-validation and sample ordering, removing a sample" - " is not strictly equal to putting is weight to zero. Specific unit" - " tests are added for TunedThresholdClassifierCV specifically." - ), - }, - } diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 7f898d73642ee..fc04dd798058b 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -8,7 +8,6 @@ make_classification, make_multilabel_classification, ) -from sklearn.dummy import DummyClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.exceptions import NotFittedError from sklearn.linear_model import LogisticRegression @@ -23,7 +22,11 @@ recall_score, roc_curve, ) -from sklearn.model_selection import StratifiedShuffleSplit, TunedThresholdClassifierCV +from sklearn.model_selection import ( + FixedThresholdClassifier, + StratifiedShuffleSplit, + TunedThresholdClassifierCV, +) from sklearn.model_selection._classification_threshold import ( _CurveScorer, _fit_and_score_over_thresholds, @@ -381,18 +384,13 @@ def test_tuned_threshold_classifier_no_binary(data): ), ], ) -@pytest.mark.parametrize("strategy", ["optimum", "constant"]) -def test_tuned_threshold_classifier_conflict_cv_refit( - strategy, params, err_type, err_msg -): +def test_tuned_threshold_classifier_conflict_cv_refit(params, err_type, err_msg): """Check that we raise an informative error message when `cv` and `refit` cannot be used together. 
""" X, y = make_classification(n_samples=100, random_state=0) with pytest.raises(err_type, match=err_msg): - TunedThresholdClassifierCV( - LogisticRegression(), strategy=strategy, **params - ).fit(X, y) + TunedThresholdClassifierCV(LogisticRegression(), **params).fit(X, y) @pytest.mark.parametrize( @@ -402,16 +400,18 @@ def test_tuned_threshold_classifier_conflict_cv_refit( @pytest.mark.parametrize( "response_method", ["predict_proba", "predict_log_proba", "decision_function"] ) -@pytest.mark.parametrize("strategy", ["optimum", "constant"]) -def test_tuned_threshold_classifier_estimator_response_methods( - estimator, strategy, response_method +@pytest.mark.parametrize( + "ThresholdClassifier", [FixedThresholdClassifier, TunedThresholdClassifierCV] +) +def test_threshold_classifier_estimator_response_methods( + ThresholdClassifier, estimator, response_method ): """Check that `TunedThresholdClassifierCV` exposes the same response methods as the underlying estimator. """ X, y = make_classification(n_samples=100, random_state=0) - model = TunedThresholdClassifierCV(estimator, strategy=strategy) + model = ThresholdClassifier(estimator=estimator) assert hasattr(model, response_method) == hasattr(estimator, response_method) model.fit(X, y) @@ -532,15 +532,12 @@ def test_tuned_threshold_classifier_with_string_targets(response_method, metric) ).fit(X, y) assert_array_equal(model.classes_, np.sort(classes)) y_pred = model.predict(X) - assert_array_equal(np.sort(np.unique(y_pred)), np.sort(classes)) + assert_array_equal(np.unique(y_pred), np.sort(classes)) @pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize("strategy", ["optimum", "constant"]) @pytest.mark.parametrize("with_sample_weight", [True, False]) -def test_tuned_threshold_classifier_refit( - strategy, with_sample_weight, global_random_seed -): +def test_tuned_threshold_classifier_refit(with_sample_weight, global_random_seed): """Check the behaviour of the `refit` parameter.""" rng = np.random.RandomState(global_random_seed) X, y = make_classification(n_samples=100, random_state=0) @@ -552,7 +549,7 @@ def test_tuned_threshold_classifier_refit( # check that `estimator_` if fitted on the full dataset when `refit=True` estimator = LogisticRegression().set_fit_request(sample_weight=True) - model = TunedThresholdClassifierCV(estimator, strategy=strategy, refit=True).fit( + model = TunedThresholdClassifierCV(estimator, refit=True).fit( X, y, sample_weight=sample_weight ) @@ -565,9 +562,9 @@ def test_tuned_threshold_classifier_refit( estimator = LogisticRegression().set_fit_request(sample_weight=True) estimator.fit(X, y, sample_weight=sample_weight) coef = estimator.coef_.copy() - model = TunedThresholdClassifierCV( - estimator, strategy=strategy, cv="prefit", refit=False - ).fit(X, y, sample_weight=sample_weight) + model = TunedThresholdClassifierCV(estimator, cv="prefit", refit=False).fit( + X, y, sample_weight=sample_weight + ) assert model.estimator_ is estimator assert_allclose(model.estimator_.coef_, coef) @@ -577,9 +574,9 @@ def test_tuned_threshold_classifier_refit( cv = [ (np.arange(50), np.arange(50, 100)), ] # single split - model = TunedThresholdClassifierCV( - estimator, strategy=strategy, cv=cv, refit=False - ).fit(X, y, sample_weight=sample_weight) + model = TunedThresholdClassifierCV(estimator, cv=cv, refit=False).fit( + X, y, sample_weight=sample_weight + ) assert model.estimator_ is not estimator if with_sample_weight: @@ -706,16 +703,6 @@ def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): 
assert_allclose(y_pred_with_weights, y_pred_without_weights) -def test_tuned_threshold_classifier_error_constant_learner(): - """Check that we raise an error message when providing an estimator that predicts - only a single class.""" - X, y = make_classification(random_state=0) - estimator = DummyClassifier(strategy="constant", constant=1) - err_msg = "The provided estimator makes constant predictions." - with pytest.raises(ValueError, match=err_msg): - TunedThresholdClassifierCV(estimator).fit(X, y) - - @pytest.mark.parametrize( "objective_metric", ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], @@ -829,30 +816,6 @@ def test_tuned_threshold_classifier_pos_label_single_metric(pos_label, metric_ty assert precision == pytest.approx(model.best_score_, abs=1e-3) -@pytest.mark.parametrize( - "predict_method", - ["predict", "predict_proba", "decision_function", "predict_log_proba"], -) -def test_tuned_threshold_classifier_constant_strategy(predict_method): - """Check the behavior when `strategy='contant'.""" - X, y = make_classification(n_samples=100, weights=[0.6, 0.4], random_state=42) - - # With a constant strategy and a threshold at 0.5, we should get the same than the - # original model - estimator = LogisticRegression().fit(X, y) - constant_threshold = 0.5 - tuned_model = TunedThresholdClassifierCV( - estimator, strategy="constant", constant_threshold=constant_threshold - ).fit(X, y) - assert tuned_model.best_threshold_ == pytest.approx(constant_threshold) - for attribute in ("best_score_", "constrained_score_"): - assert getattr(tuned_model, attribute) is None - - assert_allclose( - getattr(tuned_model, predict_method)(X), getattr(estimator, predict_method)(X) - ) - - def test_tuned_threshold_classifier_n_thresholds_array(): """Check that we can pass an array to `n_thresholds` and it is used as candidate threshold internally.""" @@ -871,8 +834,7 @@ def test_tuned_threshold_classifier_n_thresholds_array(): @pytest.mark.parametrize( "params", [ - {"strategy": "constant", "constant_threshold": 0.5}, - {"strategy": "optimum"}, + {"objective_metric": "balanced_accuracy"}, {"objective_metric": "max_tpr_at_tnr_constraint", "constraint_value": 0.5}, {"objective_metric": "max_tnr_at_tpr_constraint", "constraint_value": 0.5}, { @@ -948,3 +910,60 @@ def test_tuned_threshold_classifier_error_missing_constraint(objective_metric): ) with pytest.raises(ValueError, match="`constraint_value` must be provided"): tuned_model.fit(X, y) + + +@pytest.mark.parametrize( + "response_method", ["auto", "predict_proba", "decision_function"] +) +def test_fixed_threshold_classifier_equivalence_default(response_method): + """Check that `FixedThresholdClassifier` has the same behaviour as the vanilla + classifier. + """ + X, y = make_classification(random_state=0) + classifier = LogisticRegression().fit(X, y) + classifier_default_threshold = FixedThresholdClassifier(estimator=clone(classifier)) + classifier_default_threshold.fit(X, y) + + assert_allclose(classifier.predict(X), classifier_default_threshold.predict(X)) + + +@pytest.mark.parametrize( + "response_method, threshold", [("predict_proba", 0.7), ("decision_function", 2.0)] +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_fixed_threshold_classifier(response_method, threshold, pos_label): + """Check that applying `predict` lead to the same prediction as applying the + threshold to the output of the response method. 
+ """ + X, y = make_classification(n_samples=50, random_state=0) + logistic_regression = LogisticRegression().fit(X, y) + model = FixedThresholdClassifier( + estimator=clone(logistic_regression), + threshold=threshold, + response_method=response_method, + pos_label=pos_label, + ).fit(X, y) + + # check that the underlying estimator is the same + assert_allclose(model.estimator_.coef_, logistic_regression.coef_) + + # emulate the response method that should take into account the `pos_label` + if response_method == "predict_proba": + y_score = model.predict_proba(X)[:, pos_label] + else: # response_method == "decision_function" + y_score = model.decision_function(X) + y_score = y_score if pos_label == 1 else -y_score + + # create a mapping from boolean values to class labels + map_to_label = np.array([0, 1]) if pos_label == 1 else np.array([1, 0]) + y_pred_lr = map_to_label[(y_score >= threshold).astype(int)] + assert_allclose(model.predict(X), y_pred_lr) + + for method in ("predict_proba", "predict_log_proba", "decision_function"): + assert_allclose( + getattr(model, method)(X), getattr(logistic_regression, method)(X) + ) + assert_allclose( + getattr(model.estimator_, method)(X), + getattr(logistic_regression, method)(X), + ) diff --git a/sklearn/utils/_response.py b/sklearn/utils/_response.py index e647ba3a4f009..0381c872a94b0 100644 --- a/sklearn/utils/_response.py +++ b/sklearn/utils/_response.py @@ -2,6 +2,7 @@ It allows to make uniform checks and validation. """ + import numpy as np from ..base import is_classifier @@ -242,7 +243,9 @@ def _get_response_values( return y_pred, pos_label -def _get_response_values_binary(estimator, X, response_method, pos_label=None): +def _get_response_values_binary( + estimator, X, response_method, pos_label=None, return_response_method_used=False +): """Compute the response values of a binary classifier. Parameters @@ -265,6 +268,12 @@ def _get_response_values_binary(estimator, X, response_method, pos_label=None): the metrics. By default, `estimators.classes_[1]` is considered as the positive class. + return_response_method_used : bool, default=False + Whether to return the response method used to compute the response + values. + + .. versionadded:: 1.5 + Returns ------- y_pred : ndarray of shape (n_samples,) @@ -274,6 +283,12 @@ def _get_response_values_binary(estimator, X, response_method, pos_label=None): pos_label : int, float, bool or str The class considered as the positive class when computing the metrics. + + response_method_used : str + The response method used to compute the response values. Only returned + if `return_response_method_used` is `True`. + + .. versionadded:: 1.5 """ classification_error = "Expected 'estimator' to be a binary classifier." 
@@ -295,4 +310,5 @@ def _get_response_values_binary(estimator, X, response_method, pos_label=None): X, response_method, pos_label=pos_label, + return_response_method_used=return_response_method_used, ) From 04099326a80df37eb0fcad4fdaa9103414b14908 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 17:04:53 +0200 Subject: [PATCH 173/194] make FixedThresholdClassifier appear in example --- .../plot_cost_sensitive_learning.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index e85b2f7d14222..da10279fda102 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -624,6 +624,9 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): _ = ax.set_xlabel("Amount ($)") # %% +# Addressing the problem with a business metric +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# # Now, we create the business metric that depends on the amount of each transaction. We # define the cost matrix similarly to [2]_. Accepting a legitimate transaction provides # a gain of 2% of the amount of the transaction. However, accepting a fraudulent @@ -747,6 +750,9 @@ def business_metric(y_true, y_pred, amount): # that our model is beating the baseline in terms of profit and it would be already # beneficial to use it instead of ignoring the fraud detection problem. # +# Tuning the decision threshold +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# # Now the question is: is our model optimum for the type of decision that we want to do? # Up to now, we did not optimize the decision threshold. We use the # :class:`~sklearn.model_selection.TunedThresholdClassifierCV` to optimize the decision @@ -789,3 +795,37 @@ def business_metric(y_true, y_pred, amount): # historical data (offline evaluation) should ideally be confirmed by A/B testing # on live data (online evaluation). Note however that A/B testing models is # beyond the scope of the scikit-learn library itself. +# +# Manually setting the decision threshold instead of tuning it +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# In the previous example, we used the +# :class:`~sklearn.model_selection.TunedThresholdClassifierCV` to find the optimal +# decision threshold. However, in some cases, we might have some prior knowledge about +# the problem at hand and we might be happy to set the decision threshold manually. +# +# The class :class:`~sklearn.model_selection.FixedThresholdClassifier` allows us to +# manually set the decision threshold. At prediction time, it behave as the previous +# tuned model but no search is performed during the fitting process. +# +# Here, we will reuse the decision threshold found in the previous section to create a +# new model and check that it gives the same results. 
+from sklearn.model_selection import FixedThresholdClassifier + +model_fixed_threshold = FixedThresholdClassifier( + estimator=model, threshold=tuned_model.best_threshold_ +).fit(data_train, target_train) + +# %% +business_score = business_scorer( + model_fixed_threshold, data_test, target_test, amount=amount_test +) +print(f"Benefit/cost of our logistic regression: ${business_score:,.2f}") +print( + "Balanced accuracy of our logistic regression: " + f"{balanced_accuracy_scorer(model_fixed_threshold, data_test, target_test):.3f}" +) + +# %% +# We observe that we obtained the exact same results but the fitting process was much +# faster since we did not perform any search. From 66ea575bd3f4087b9c4993bf687563fd237a1d80 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 17:07:22 +0200 Subject: [PATCH 174/194] iter --- sklearn/model_selection/_classification_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index dbf19cfe0c536..cf72684848305 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -693,8 +693,8 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): , str, dict or callable, default="balanced_accuracy" The objective metric to be optimized. Can be one of: - * a string associated to a scoring function (see model evaluation - documentation); + * a string associated to a scoring function for binary classification + (see model evaluation documentation); * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`; * `"max_tnr_at_tpr_constraint"`: find the decision threshold for a true positive ratio (TPR) of `constraint_value`; From 1c97dd4e79fcd1f8806ba994f411b6362bd6b285 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 18:50:37 +0200 Subject: [PATCH 175/194] Update doc/modules/classes.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- doc/modules/classes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index e57bdbdec111c..804546eababef 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1248,7 +1248,7 @@ Hyper-parameter optimizers model_selection.RandomizedSearchCV model_selection.HalvingRandomSearchCV -Model post-fit tuning +Post-fit model tuning --------------------- .. currentmodule:: sklearn From c8c1d0c749cfd09458fa812727d1753d61fe6f92 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 19:11:56 +0200 Subject: [PATCH 176/194] Update sklearn/model_selection/_classification_threshold.py Co-authored-by: Adrin Jalali --- sklearn/model_selection/_classification_threshold.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index cf72684848305..50bd5eaed2743 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -367,7 +367,8 @@ def _fit(self, X, y, **params): self : object Returns an instance of self. 
""" - self.estimator_ = clone(self.estimator).fit(X, y, **params) + routed_params = process_routing(self, "fit", **params) + self.estimator_ = clone(self.estimator).fit(X, y, **routed_params.estimator.fit) return self def predict(self, X): From 8a52bc6fef31bb3e26bb480803b6b0d1f688f47e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 19:11:44 +0200 Subject: [PATCH 177/194] TST and fix default parameter --- sklearn/metrics/_scorer.py | 9 ++-- .../_classification_threshold.py | 47 +++++++++---------- .../tests/test_classification_threshold.py | 15 +++++- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 39464eef41a86..07487856d4972 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -204,14 +204,11 @@ class _BaseScorer(_MetadataRequester): Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. Thus, `sign` defined if higher scores are better or worse. - n_thresholds : int or array-like - Related to the number of decision thresholds for which we want to compute the - score. If an integer, it will be used to generate `n_thresholds` thresholds - uniformly distributed between the minimum and maximum predicted scores. If an - array-like, it will be used as the thresholds. - kwargs : dict Additional parameters to pass to the score function. + + response_method : str + The method to call on the estimator to get the response values. """ def __init__(self, score_func, sign, kwargs, response_method="predict"): diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 50bd5eaed2743..6387344df4648 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -58,6 +58,18 @@ def check(self): return check +def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): + """Threshold `y_score` and return the associated class labels.""" + if pos_label is None: + map_thresholded_score_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(classes == pos_label)[0] + neg_label_idx = np.flatnonzero(classes != pos_label)[0] + map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) + + return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] + + class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Base class for classifiers that set a non-default decision threshold. @@ -75,15 +87,12 @@ class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator the decision threshold used during `predict`. pos_label : int, float, bool or str, default=None - The label of the positive class. Used when `objective_metric` is - `"max_tnr_at_tpr_constraint"`"`, `"max_tpr_at_tnr_constraint"`, or a dictionary. - When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`, - `pos_label` is set to 1, otherwise an error will be raised. When using a - scorer, `pos_label` can be passed as a keyword argument to - :func:`~sklearn.metrics.make_scorer`. + The label of the positive class. Used to process the output of the + `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or + `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. 
response_method : {"auto", "decision_function", "predict_proba"}, default="auto" - Methods by the classifier `base_estimator` corresponding to the + Methods by the classifier `estimator` corresponding to the decision function for which we want to find a threshold. It can be: * if `"auto"`, it will try to invoke, for each classifier, @@ -229,18 +238,6 @@ def _more_tags(self): } -def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): - """Threshold `y_score` and return the associated class labels.""" - if pos_label is None: - map_thresholded_score_to_label = np.array([0, 1]) - else: - pos_label_idx = np.flatnonzero(classes == pos_label)[0] - neg_label_idx = np.flatnonzero(classes != pos_label)[0] - map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) - - return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] - - class FixedThresholdClassifier(BaseThresholdClassifier): """Classifier that manually sets the decision threshold. @@ -331,14 +328,14 @@ class FixedThresholdClassifier(BaseThresholdClassifier): _parameter_constraints: dict = { **BaseThresholdClassifier._parameter_constraints, - "threshold": [Real], + "threshold": [StrOptions({"auto"}), Real], } def __init__( self, estimator, *, - threshold=0.5, + threshold="auto", pos_label=None, response_method="auto", ): @@ -434,15 +431,15 @@ class _CurveScorer(_BaseScorer): Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. Thus, `sign` defined if higher scores are better or worse. + kwargs : dict + Additional parameters to pass to the score function. + n_thresholds : int or array-like Related to the number of decision thresholds for which we want to compute the score. If an integer, it will be used to generate `n_thresholds` thresholds uniformly distributed between the minimum and maximum predicted scores. If an array-like, it will be used as the thresholds. - kwargs : dict - Additional parameters to pass to the score function. - response_method : str The method to call on the estimator to get the response values. """ @@ -722,7 +719,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): :func:`~sklearn.metrics.make_scorer`. response_method : {"auto", "decision_function", "predict_proba"}, default="auto" - Methods by the classifier `base_estimator` corresponding to the + Methods by the classifier `estimator` corresponding to the decision function for which we want to find a threshold. 
It can be: * if `"auto"`, it will try to invoke, for each classifier, diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index fc04dd798058b..74cf8770a391b 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -921,10 +921,21 @@ def test_fixed_threshold_classifier_equivalence_default(response_method): """ X, y = make_classification(random_state=0) classifier = LogisticRegression().fit(X, y) - classifier_default_threshold = FixedThresholdClassifier(estimator=clone(classifier)) + classifier_default_threshold = FixedThresholdClassifier( + estimator=clone(classifier), response_method=response_method + ) classifier_default_threshold.fit(X, y) - assert_allclose(classifier.predict(X), classifier_default_threshold.predict(X)) + # emulate the response method that should take into account the `pos_label` + if response_method in ("auto", "predict_proba"): + y_score = classifier_default_threshold.predict_proba(X)[:, 1] + threshold = 0.5 + else: # response_method == "decision_function" + y_score = classifier_default_threshold.decision_function(X) + threshold = 0.0 + + y_pred_lr = (y_score >= threshold).astype(int) + assert_allclose(classifier_default_threshold.predict(X), y_pred_lr) @pytest.mark.parametrize( From fdbf68e90592c6984349b6a41f7a20517b94e8ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 19:34:01 +0200 Subject: [PATCH 178/194] TST metadarouting FixedThresholdClassifier --- .../tests/test_classification_threshold.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 74cf8770a391b..9f202919c8d96 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -978,3 +978,16 @@ def test_fixed_threshold_classifier(response_method, threshold, pos_label): getattr(model.estimator_, method)(X), getattr(logistic_regression, method)(X), ) + + +@pytest.mark.usefixtures("enable_slep006") +def test_fixed_threshold_classifier_metadata_routing(): + """Check that everything works with metadata routing.""" + X, y = make_classification(random_state=0) + sample_weight = np.ones_like(y) + sample_weight[::2] = 2 + classifier = LogisticRegression().set_fit_request(sample_weight=True) + classifier.fit(X, y, sample_weight=sample_weight) + classifier_default_threshold = FixedThresholdClassifier(estimator=clone(classifier)) + classifier_default_threshold.fit(X, y, sample_weight=sample_weight) + assert_allclose(classifier_default_threshold.estimator_.coef_, classifier.coef_) From 9c0c13d9e96243d70ca86daf327f8e35dac23558 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 20:02:34 +0200 Subject: [PATCH 179/194] rename n_thresholds to thresholds --- .../plot_cost_sensitive_learning.py | 2 +- .../_classification_threshold.py | 66 +++++++++---------- .../tests/test_classification_threshold.py | 50 +++++++------- 3 files changed, 59 insertions(+), 59 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index da10279fda102..5da7569c83e38 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -760,7 
+760,7 @@ def business_metric(y_true, y_pred, amount): tuned_model = TunedThresholdClassifierCV( estimator=model, objective_metric=business_scorer, - n_thresholds=100, + thresholds=100, n_jobs=2, ) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 6387344df4648..81ba3afa5dc31 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -434,9 +434,9 @@ class _CurveScorer(_BaseScorer): kwargs : dict Additional parameters to pass to the score function. - n_thresholds : int or array-like + thresholds : int or array-like Related to the number of decision thresholds for which we want to compute the - score. If an integer, it will be used to generate `n_thresholds` thresholds + score. If an integer, it will be used to generate `thresholds` thresholds uniformly distributed between the minimum and maximum predicted scores. If an array-like, it will be used as the thresholds. @@ -444,17 +444,17 @@ class _CurveScorer(_BaseScorer): The method to call on the estimator to get the response values. """ - def __init__(self, score_func, sign, kwargs, n_thresholds, response_method): + def __init__(self, score_func, sign, kwargs, thresholds, response_method): super().__init__( score_func=score_func, sign=sign, kwargs=kwargs, response_method=response_method, ) - self._n_thresholds = n_thresholds + self._thresholds = thresholds @classmethod - def from_scorer(cls, scorer, response_method, n_thresholds, pos_label): + def from_scorer(cls, scorer, response_method, thresholds, pos_label): """Create a continuous scorer from a normal scorer.""" # add `pos_label` if requested by the scorer function scorer_kwargs = {**scorer._kwargs} @@ -478,7 +478,7 @@ def from_scorer(cls, scorer, response_method, n_thresholds, pos_label): score_func=scorer._score_func, sign=scorer._sign, response_method=response_method, - n_thresholds=n_thresholds, + thresholds=thresholds, kwargs=scorer_kwargs, ) # transfer the metadata request @@ -509,10 +509,10 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): Returns ------- - scores : ndarray of shape (n_thresholds,) + scores : ndarray of shape (thresholds,) The scores associated to each threshold. - potential_thresholds : ndarray of shape (n_thresholds,) + potential_thresholds : ndarray of shape (thresholds,) The potential thresholds used to compute the scores. """ pos_label = self._get_pos_label() @@ -521,12 +521,12 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs): ) scoring_kwargs = {**self._kwargs, **kwargs} - if isinstance(self._n_thresholds, Integral): + if isinstance(self._thresholds, Integral): potential_thresholds = np.linspace( - np.min(y_score), np.max(y_score), self._n_thresholds + np.min(y_score), np.max(y_score), self._thresholds ) else: - potential_thresholds = np.asarray(self._n_thresholds) + potential_thresholds = np.asarray(self._thresholds) score_thresholds = [ self._sign * self._score_func( @@ -593,11 +593,11 @@ def _fit_and_score_over_thresholds( Returns ------- - potential_thresholds : ndarray of shape (n_thresholds,) + potential_thresholds : ndarray of shape (thresholds,) The decision thresholds used to compute the scores. They are returned in ascending order. - scores : ndarray of shape (n_thresholds,) or tuple of such arrays + scores : ndarray of shape (thresholds,) or tuple of such arrays The scores computed for each decision threshold. 
When TPR/TNR or precision/ recall are computed, `scores` is a tuple of two arrays. """ @@ -644,18 +644,18 @@ def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): Parameters ---------- - target_thresholds : ndarray of shape (n_thresholds,) + target_thresholds : ndarray of shape (thresholds,) The thresholds to use to compute the mean score. - cv_thresholds : ndarray of shape (n_folds, n_thresholds_fold) + cv_thresholds : ndarray of shape (n_folds, thresholds_fold) The thresholds used to compute the scores for each fold. - cv_scores : ndarray of shape (n_folds, n_thresholds_fold) + cv_scores : ndarray of shape (n_folds, thresholds_fold) The scores computed for each threshold for each fold. Returns ------- - mean_score : ndarray of shape (n_thresholds,) + mean_score : ndarray of shape (thresholds,) The mean score across all folds for each target threshold. """ return np.mean( @@ -728,7 +728,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): If the method is not implemented by the classifier, it will raise an error. - n_thresholds : int or array-like, default=100 + thresholds : int or array-like, default=100 The number of decision threshold to use when discretizing the output of the classifier `method`. Pass an array-like to manually specify the thresholds to use. @@ -888,7 +888,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): MutableMapping, ], "constraint_value": [Real, None], - "n_thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], + "thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], "cv": [ "cv_object", StrOptions({"prefit"}), @@ -908,7 +908,7 @@ def __init__( constraint_value=None, pos_label=None, response_method="auto", - n_thresholds=100, + thresholds=100, cv=None, refit=True, n_jobs=None, @@ -920,7 +920,7 @@ def __init__( ) self.objective_metric = objective_metric self.constraint_value = constraint_value - self.n_thresholds = n_thresholds + self.thresholds = thresholds self.cv = cv self.refit = refit self.n_jobs = n_jobs @@ -1042,35 +1042,35 @@ def _fit(self, X, y, **params): max_threshold = max( split_thresholds.max() for split_thresholds in cv_thresholds ) - if isinstance(self.n_thresholds, Integral): - decision_thresholds = np.linspace( - min_threshold, max_threshold, num=self.n_thresholds + if isinstance(self.thresholds, Integral): + decisiothresholds = np.linspace( + min_threshold, max_threshold, num=self.thresholds ) else: - decision_thresholds = np.asarray(self.n_thresholds) + decisiothresholds = np.asarray(self.thresholds) if not constrained_metric: # find best score that is the highest value objective_scores = _mean_interpolated_score( - decision_thresholds, cv_thresholds, cv_scores + decisiothresholds, cv_thresholds, cv_scores ) best_idx = objective_scores.argmax() self.best_score_ = objective_scores[best_idx] - self.best_threshold_ = decision_thresholds[best_idx] + self.best_threshold_ = decisiothresholds[best_idx] self.constrained_score_ = None if self.store_cv_results: self.cv_results_ = { - "thresholds": decision_thresholds, + "thresholds": decisiothresholds, "scores": objective_scores, } else: if "tpr" in self.objective_metric: # tpr/tnr mean_tnr, mean_tpr = [ - _mean_interpolated_score(decision_thresholds, cv_thresholds, sc) + _mean_interpolated_score(decisiothresholds, cv_thresholds, sc) for sc in zip(*cv_scores) ] else: # precision/recall mean_precision, mean_recall = [ - _mean_interpolated_score(decision_thresholds, cv_thresholds, sc) + 
_mean_interpolated_score(decisiothresholds, cv_thresholds, sc) for sc in zip(*cv_scores) ] @@ -1092,10 +1092,10 @@ def _get_best_idx(constrained_score, maximized_score): best_idx = _get_best_idx(constrained_scores, maximized_scores) self.best_score_ = maximized_scores[best_idx] self.constrained_score_ = constrained_scores[best_idx] - self.best_threshold_ = decision_thresholds[best_idx] + self.best_threshold_ = decisiothresholds[best_idx] if self.store_cv_results: self.cv_results_ = { - "thresholds": decision_thresholds, + "thresholds": decisiothresholds, "constrained_scores": constrained_scores, "maximized_scores": maximized_scores, } @@ -1184,6 +1184,6 @@ def _get_curve_scorer(self): else: scoring = check_scoring(self.estimator, scoring=self.objective_metric) curve_scorer = _CurveScorer.from_scorer( - scoring, self._response_method, self.n_thresholds, self.pos_label + scoring, self._response_method, self.thresholds, self.pos_label ) return curve_scorer diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 9f202919c8d96..a5a11f466c25b 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -51,7 +51,7 @@ def test_curve_scorer(): balanced_accuracy_score, sign=1, response_method="predict_proba", - n_thresholds=10, + thresholds=10, kwargs={}, ) scores, thresholds = curve_scorer(estimator, X, y) @@ -70,7 +70,7 @@ def test_curve_scorer(): balanced_accuracy_score, sign=1, response_method="predict_proba", - n_thresholds=10, + thresholds=10, kwargs={"adjusted": True}, ) scores, thresholds = curve_scorer(estimator, X, y) @@ -83,7 +83,7 @@ def test_curve_scorer(): balanced_accuracy_score, sign=-1, response_method="predict_proba", - n_thresholds=10, + thresholds=10, kwargs={"adjusted": True}, ) scores, thresholds = curve_scorer(estimator, X, y) @@ -103,7 +103,7 @@ def test_curve_scorer_pos_label(global_random_seed): recall_score, sign=1, response_method="predict_proba", - n_thresholds=10, + thresholds=10, kwargs={"pos_label": 1}, ) scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y) @@ -112,7 +112,7 @@ def test_curve_scorer_pos_label(global_random_seed): recall_score, sign=1, response_method="predict_proba", - n_thresholds=10, + thresholds=10, kwargs={"pos_label": 0}, ) scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) @@ -142,7 +142,7 @@ def test_curve_scorer_pos_label(global_random_seed): score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", - n_thresholds=10, + thresholds=10, kwargs={}, ), "balanced_accuracy", @@ -202,7 +202,7 @@ def test_fit_and_score_over_thresholds_curve_scorers(curve_scorer, score_method) score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", - n_thresholds=2, + thresholds=2, kwargs={}, ), [0.5, 1.0], @@ -258,7 +258,7 @@ def test_fit_and_score_over_thresholds_prefit(curve_scorer, expected_score): score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", - n_thresholds=10, + thresholds=10, kwargs={}, ), make_scorer(roc_curve, response_method="predict_proba"), @@ -316,7 +316,7 @@ def test_fit_and_score_over_thresholds_sample_weight(curve_scorer): score_func=balanced_accuracy_score, sign=1, response_method="predict_proba", - n_thresholds=10, + thresholds=10, kwargs={}, ), make_scorer(roc_curve, response_method="predict_proba"), @@ -443,19 +443,19 @@ def 
test_tuned_threshold_classifier_without_constraint_value(response_method): y = np.hstack([y[indices_neg], y[indices_pos]]) lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) - n_thresholds = 100 + thresholds = 100 model = TunedThresholdClassifierCV( estimator=lr, objective_metric="balanced_accuracy", response_method=response_method, - n_thresholds=n_thresholds, + thresholds=thresholds, store_cv_results=True, ) score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) score_baseline = balanced_accuracy_score(y, lr.predict(X)) assert score_optimized > score_baseline - assert model.cv_results_["thresholds"].shape == (n_thresholds,) - assert model.cv_results_["scores"].shape == (n_thresholds,) + assert model.cv_results_["thresholds"].shape == (thresholds,) + assert model.cv_results_["scores"].shape == (thresholds,) def test_tuned_threshold_classifier_limit_metric_tradeoff(): @@ -528,7 +528,7 @@ def test_tuned_threshold_classifier_with_string_targets(response_method, metric) constraint_value=0.9, pos_label="cancer", response_method=response_method, - n_thresholds=100, + thresholds=100, ).fit(X, y) assert_array_equal(model.classes_, np.sort(classes)) y_pred = model.predict(X) @@ -636,19 +636,19 @@ def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint X, y = make_classification(n_samples=100, random_state=global_random_seed) classifier = LogisticRegression() - n_thresholds = 100 + thresholds = 100 model = TunedThresholdClassifierCV( classifier, objective_metric=objective_metric, constraint_value=constraint_value, response_method=response_method, - n_thresholds=n_thresholds, + thresholds=thresholds, store_cv_results=True, ) model.fit(X, y) - assert model.cv_results_["thresholds"].shape == (n_thresholds,) - assert model.cv_results_["constrained_scores"].shape == (n_thresholds,) - assert model.cv_results_["maximized_scores"].shape == (n_thresholds,) + assert model.cv_results_["thresholds"].shape == (thresholds,) + assert model.cv_results_["constrained_scores"].shape == (thresholds,) + assert model.cv_results_["maximized_scores"].shape == (thresholds,) if response_method in ("auto", "predict_proba"): # "auto" will fall back in priority on `predict_proba` if `estimator` @@ -809,26 +809,26 @@ def test_tuned_threshold_classifier_pos_label_single_metric(pos_label, metric_ty cv="prefit", refit=False, pos_label=pos_label, - n_thresholds=500, + thresholds=500, ).fit(X, y) precision = precision_score(y, model.predict(X), pos_label=pos_label) assert precision == pytest.approx(model.best_score_, abs=1e-3) -def test_tuned_threshold_classifier_n_thresholds_array(): - """Check that we can pass an array to `n_thresholds` and it is used as candidate +def test_tuned_threshold_classifier_thresholds_array(): + """Check that we can pass an array to `thresholds` and it is used as candidate threshold internally.""" X, y = make_classification(random_state=0) estimator = LogisticRegression() - n_thresholds = np.linspace(0, 1, 11) + thresholds = np.linspace(0, 1, 11) tuned_model = TunedThresholdClassifierCV( estimator, - n_thresholds=n_thresholds, + thresholds=thresholds, response_method="predict_proba", store_cv_results=True, ).fit(X, y) - assert_allclose(tuned_model.cv_results_["thresholds"], n_thresholds) + assert_allclose(tuned_model.cv_results_["thresholds"], thresholds) @pytest.mark.parametrize( From f4193714dd2a4d892720c3a2e19de995328e7f85 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 20:10:00 +0200 Subject: [PATCH 180/194] cover 
constant predictor error --- .../tests/test_classification_threshold.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index a5a11f466c25b..792a37d8f50f9 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -8,6 +8,7 @@ make_classification, make_multilabel_classification, ) +from sklearn.dummy import DummyClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.exceptions import NotFittedError from sklearn.linear_model import LogisticRegression @@ -912,6 +913,18 @@ def test_tuned_threshold_classifier_error_missing_constraint(objective_metric): tuned_model.fit(X, y) +def test_tuned_threshold_classifier_error_constant_predictor(): + """Check that we raise a ValueError if the underlying classifier returns constant + probabilities such that we cannot find any threshold. + """ + X, y = make_classification(random_state=0) + estimator = DummyClassifier(strategy="constant", constant=1) + tuned_model = TunedThresholdClassifierCV(estimator, response_method="predict_proba") + err_msg = "The provided estimator makes constant predictions" + with pytest.raises(ValueError, match=err_msg): + tuned_model.fit(X, y) + + @pytest.mark.parametrize( "response_method", ["auto", "predict_proba", "decision_function"] ) From 42eafe51cb3ad22a87d05920f8f019d3dddb909a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Apr 2024 20:15:05 +0200 Subject: [PATCH 181/194] TST some tests for get_response_values_binary --- sklearn/utils/tests/test_response.py | 60 +++++++++++++++++++--------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/sklearn/utils/tests/test_response.py b/sklearn/utils/tests/test_response.py index c84bf6030336a..858c16cca4df1 100644 --- a/sklearn/utils/tests/test_response.py +++ b/sklearn/utils/tests/test_response.py @@ -240,36 +240,60 @@ def test_get_response_error(estimator, X, y, err_msg, params): _get_response_values_binary(estimator, X, **params) -def test_get_response_predict_proba(): +@pytest.mark.parametrize("return_response_method_used", [True, False]) +def test_get_response_predict_proba(return_response_method_used): """Check the behaviour of `_get_response_values_binary` using `predict_proba`.""" classifier = DecisionTreeClassifier().fit(X_binary, y_binary) - y_proba, pos_label = _get_response_values_binary( - classifier, X_binary, response_method="predict_proba" + results = _get_response_values_binary( + classifier, + X_binary, + response_method="predict_proba", + return_response_method_used=return_response_method_used, ) - assert_allclose(y_proba, classifier.predict_proba(X_binary)[:, 1]) - assert pos_label == 1 + assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 1]) + assert results[1] == 1 + if return_response_method_used: + assert results[2] == "predict_proba" - y_proba, pos_label = _get_response_values_binary( - classifier, X_binary, response_method="predict_proba", pos_label=0 + results = _get_response_values_binary( + classifier, + X_binary, + response_method="predict_proba", + pos_label=0, + return_response_method_used=return_response_method_used, ) - assert_allclose(y_proba, classifier.predict_proba(X_binary)[:, 0]) - assert pos_label == 0 + assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 0]) + assert results[1] == 0 + if return_response_method_used: + assert 
results[2] == "predict_proba" -def test_get_response_decision_function(): +@pytest.mark.parametrize("return_response_method_used", [True, False]) +def test_get_response_decision_function(return_response_method_used): """Check the behaviour of `_get_response_values_binary` using decision_function.""" classifier = LogisticRegression().fit(X_binary, y_binary) - y_score, pos_label = _get_response_values_binary( - classifier, X_binary, response_method="decision_function" + results = _get_response_values_binary( + classifier, + X_binary, + response_method="decision_function", + return_response_method_used=return_response_method_used, ) - assert_allclose(y_score, classifier.decision_function(X_binary)) - assert pos_label == 1 + assert_allclose(results[0], classifier.decision_function(X_binary)) + assert results[1] == 1 + if return_response_method_used: + assert results[2] == "decision_function" - y_score, pos_label = _get_response_values_binary( - classifier, X_binary, response_method="decision_function", pos_label=0 + results = _get_response_values_binary( + classifier, + X_binary, + response_method="decision_function", + pos_label=0, + return_response_method_used=return_response_method_used, ) - assert_allclose(y_score, classifier.decision_function(X_binary) * -1) - assert pos_label == 0 + assert_allclose(results[0], classifier.decision_function(X_binary) * -1) + assert results[1] == 0 + if return_response_method_used: + assert results[2] == "decision_function" @pytest.mark.parametrize( From 581133fb1c5a3a76074e29bde1aeb054a693f982 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 May 2024 15:32:33 +0200 Subject: [PATCH 182/194] use conditional p(y|X) instead of posterior --- doc/modules/classification_threshold.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 1ef78b7a761db..db65873bd17f3 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -16,20 +16,20 @@ related to answering "what is the chance that it will rain tomorrow?" while the point is related to answering "should I take an umbrella tomorrow?". When it comes to the scikit-learn API, the first point is addressed providing scores -using :term:`predict_proba` or :term:`decision_function`. The former returns posterior -probability estimates for each class, while the latter returns a decision score for each -class. +using :term:`predict_proba` or :term:`decision_function`. The former returns conditional +probability estimates :math:`P(y|X)` for each class, while the latter returns a decision +score for each class. The decision corresponding to the labels are obtained with :term:`predict`. In binary classification, a decision rule or action is then defined by thresholding the scores, leading to the prediction of a single class label for each sample. For binary classification in scikit-learn, class labels predictions are obtained by hard-coded -cut-off rules: a positive class is predicted when the posterior probability is greater -than 0.5 (obtained with :term:`predict_proba`) or if the decision score is greater than -0 (obtained with :term:`decision_function`). +cut-off rules: a positive class is predicted when the conditional probability +:math:`P(y|X)` is greater than 0.5 (obtained with :term:`predict_proba`) or if the +decision score is greater than 0 (obtained with :term:`decision_function`). 
-Here, we show an example that illustrates the relation between posterior -probability estimates and class labels:: +Here, we show an example that illustrates the relation between conditional +probability estimates :math:`P(y|X)` and class labels:: >>> from sklearn.datasets import make_classification >>> from sklearn.tree import DecisionTreeClassifier @@ -55,7 +55,7 @@ the trade-off of potentially more false-positive predictions, reducing the preci the model. That is a risk physicians are willing to take because the cost of a missed cancer is much higher than the cost of further diagnostic tests. Consequently, when it comes to deciding whether to classify a patient as having cancer or not, it may be more -beneficial to classify them as positive for cancer when the posterior probability +beneficial to classify them as positive for cancer when the conditional probability estimate is much lower than 0.5. Post-tuning the decision threshold @@ -72,9 +72,9 @@ boosting classifier. While the vanilla and tuned classifiers provide the same :term:`predict_proba` outputs and thus the same Receiver Operating Characteristic (ROC) and Precision-Recall curves, the class label predictions differ because of the tuned decision threshold. The vanilla classifier predicts the class of interest for a -posterior probability greater than 0.5 while the tuned classifier predicts the class of -interest for a very low probability (around 0.02). This decision threshold optimizes a -utility metric defined by the business (in this case an insurance company). +conditional probability greater than 0.5 while the tuned classifier predicts the class +of interest for a very low probability (around 0.02). This decision threshold optimizes +a utility metric defined by the business (in this case an insurance company). .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cost_sensitive_learning_002.png :target: ../auto_examples/model_selection/plot_cost_sensitive_learning.html From 0f803d9a4e27580b6856a573df684fad8b76c4c2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 May 2024 15:35:08 +0200 Subject: [PATCH 183/194] be more explicit that strings need to be provided to objective_metric --- doc/modules/classification_threshold.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index db65873bd17f3..2ffaee91a3566 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -118,8 +118,9 @@ a meaningful metric for their use case. 0.86... A second strategy aims to maximize one metric while imposing constraints on another -metric. There are four pre-defined options, two use the Receiver Operating -Characteristic (ROC) statistics and two use the Precision-Recall statistics. +metric. There are four pre-defined options that can be provided to `objective_metric` +parameter, two use the Receiver Operating Characteristic (ROC) statistics and two use +the Precision-Recall statistics. - `"max_tpr_at_tnr_constraint"`: maximizes the True Positive Rate (TPR) such that the True Negative Rate (TNR) is the closest to a given value. 
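A minimal usage sketch of the tuned-threshold API as documented above, assuming the state of the series at this point (scorer names passed as strings to `objective_metric`, and the `thresholds` parameter renamed from `n_thresholds` in PATCH 179); the dataset and classifier below are illustrative and not taken from the patches:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TunedThresholdClassifierCV

# Illustrative imbalanced binary problem (assumed for the example only).
X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)

# `objective_metric` receives a scorer name as a string; `thresholds` is the
# number of candidate decision thresholds evaluated during cross-validation.
tuned_model = TunedThresholdClassifierCV(
    estimator=LogisticRegression(),
    objective_metric="balanced_accuracy",
    response_method="predict_proba",
    thresholds=100,
    store_cv_results=True,
).fit(X, y)

print(f"best threshold: {tuned_model.best_threshold_:.3f}")
print(f"cross-validated score at that threshold: {tuned_model.best_score_:.3f}")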
From eb0defcda9c13dea4a8407ca64e516a45828a323 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 May 2024 15:46:19 +0200 Subject: [PATCH 184/194] factorize plotting into a function --- .../plot_cost_sensitive_learning.py | 292 +++++------------- 1 file changed, 80 insertions(+), 212 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 5da7569c83e38..06a56ea768f04 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -281,79 +281,87 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # %% # We plot the ROC and Precision-Recall curves for the vanilla model and the tuned model. -# Also we plot the cut-off points that would be used by each model. -fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) - -linestyles = ("dashed", "dotted") -markerstyles = ("o", ">") -colors = ("tab:blue", "tab:orange") -names = ("Vanilla GBDT", "Tuned GBDT") -for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, tuned_model), linestyles, markerstyles, colors, names) -): - decision_threshold = getattr(est, "best_threshold_", 0.5) - PrecisionRecallDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[0], - name=name, +# Also we plot the cut-off points that would be used by each model. Because, we are +# reusing the same code later, we define a function that generates the plots. + + +def plot_roc_pr_curves(vanilla_model, tuned_model, *, title): + fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) + + linestyles = ("dashed", "dotted") + markerstyles = ("o", ">") + colors = ("tab:blue", "tab:orange") + names = ("Vanilla GBDT", "Tuned GBDT") + for idx, (est, linestyle, marker, color, name) in enumerate( + zip((vanilla_model, tuned_model), linestyles, markerstyles, colors, names) + ): + decision_threshold = getattr(est, "best_threshold_", 0.5) + PrecisionRecallDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[0], + name=name, + ) + axs[0].plot( + scoring["recall"](est, X_test, y_test), + scoring["precision"](est, X_test, y_test), + marker, + markersize=10, + color=color, + label=f"Cut-off point at probability of {decision_threshold:.2f}", + ) + RocCurveDisplay.from_estimator( + est, + X_test, + y_test, + pos_label=pos_label, + linestyle=linestyle, + color=color, + ax=axs[1], + name=name, + plot_chance_level=idx == 1, + ) + axs[1].plot( + scoring["fpr"](est, X_test, y_test), + scoring["tpr"](est, X_test, y_test), + marker, + markersize=10, + color=color, + label=f"Cut-off point at probability of {decision_threshold:.2f}", + ) + + axs[0].set_title("Precision-Recall curve") + axs[0].legend() + axs[1].set_title("ROC curve") + axs[1].legend() + + axs[2].plot( + tuned_model.cv_results_["thresholds"], + tuned_model.cv_results_["scores"], + color="tab:orange", ) - axs[0].plot( - scoring["recall"](est, X_test, y_test), - scoring["precision"](est, X_test, y_test), - marker, + axs[2].plot( + tuned_model.best_threshold_, + tuned_model.best_score_, + "o", markersize=10, - color=color, - label=f"Cut-off point at probability of {decision_threshold:.2f}", - ) - RocCurveDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[1], - name=name, - plot_chance_level=idx == 1, - ) - axs[1].plot( - scoring["fpr"](est, 
X_test, y_test), - scoring["tpr"](est, X_test, y_test), - marker, - markersize=10, - color=color, - label=f"Cut-off point at probability of {decision_threshold:.2f}", + color="tab:orange", + label="Optimal cut-off point for the business metric", ) + axs[2].legend() + axs[2].set_xlabel("Decision threshold (probability)") + axs[2].set_ylabel("Objective score (using cost-matrix)") + axs[2].set_title("Objective score as a function of the decision threshold") + fig.suptitle(title) -axs[0].set_title("Precision-Recall curve") -axs[0].legend() -axs[1].set_title("ROC curve") -axs[1].legend() - -axs[2].plot( - tuned_model.cv_results_["thresholds"], - tuned_model.cv_results_["scores"], - color="tab:orange", -) -axs[2].plot( - tuned_model.best_threshold_, - tuned_model.best_score_, - "o", - markersize=10, - color="tab:orange", - label="Optimal cut-off point for the business metric", -) -axs[2].legend() -axs[2].set_xlabel("Decision threshold (probability)") -axs[2].set_ylabel("Objective score (using cost-matrix)") -axs[2].set_title("Objective score as a function of the decision threshold") -_ = fig.suptitle("Comparison of the cut-off point for the vanilla and tuned GBDT model") +# %% +title = "Comparison of the cut-off point for the vanilla and tuned GBDT model" +plot_roc_pr_curves(model, tuned_model, title=title) # %% # The first remark is that both classifiers have exactly the same ROC and @@ -400,78 +408,8 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # %% # Then, we evaluate our model with the same approach as before: -fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) - -linestyles = ("dashed", "dotted") -markerstyles = ("o", ">") -colors = ("tab:blue", "tab:orange") -names = ("Vanilla GBDT", "Tuned GBDT") -for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, tuned_model), linestyles, markerstyles, colors, names) -): - decision_threshold = getattr(est, "best_threshold_", 0.5) - PrecisionRecallDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[0], - name=name, - ) - axs[0].plot( - scoring["recall"](est, X_test, y_test), - scoring["precision"](est, X_test, y_test), - marker, - markersize=10, - color=color, - label=f"Cut-off point at probability of {decision_threshold:.2f}", - ) - RocCurveDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[1], - name=name, - plot_chance_level=idx == 1, - ) - axs[1].plot( - scoring["fpr"](est, X_test, y_test), - scoring["tpr"](est, X_test, y_test), - marker, - markersize=10, - color=color, - label=f"Cut-off point at probability of {decision_threshold:.2f}", - ) - -axs[0].set_title("Precision-Recall curve") -axs[0].legend() -axs[1].set_title("ROC curve") -axs[1].legend() - -axs[2].plot( - tuned_model.cv_results_["thresholds"], - tuned_model.cv_results_["scores"], - color="tab:orange", -) -axs[2].plot( - tuned_model.best_threshold_, - tuned_model.best_score_, - "o", - markersize=10, - color="tab:orange", - label="Optimal cut-off point for the business metric", -) -axs[2].legend() -axs[2].set_xlabel("Decision threshold (probability)") -axs[2].set_ylabel("Objective score (using cost-matrix)") -axs[2].set_title("Objective score as a function of the decision threshold") - -_ = fig.suptitle("Tuned GBDT model without refitting and using the entire dataset") +title = "Tuned GBDT model without refitting and using the entire dataset" +plot_roc_pr_curves(model, tuned_model, title=title) # %% # We 
observe the that the optimum cut-off point is different from the one found @@ -496,78 +434,8 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): tuned_model.set_params(cv=0.75).fit(X_train, y_train) # %% -fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(21, 6)) - -linestyles = ("dashed", "dotted") -markerstyles = ("o", ">") -colors = ("tab:blue", "tab:orange") -names = ("Vanilla GBDT", "Tuned GBDT") -for idx, (est, linestyle, marker, color, name) in enumerate( - zip((model, tuned_model), linestyles, markerstyles, colors, names) -): - decision_threshold = getattr(est, "best_threshold_", 0.5) - PrecisionRecallDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[0], - name=name, - ) - axs[0].plot( - scoring["recall"](est, X_test, y_test), - scoring["precision"](est, X_test, y_test), - marker, - markersize=10, - color=color, - label=f"Cut-off point at probability of {decision_threshold:.2f}", - ) - RocCurveDisplay.from_estimator( - est, - X_test, - y_test, - pos_label=pos_label, - linestyle=linestyle, - color=color, - ax=axs[1], - name=name, - plot_chance_level=idx == 1, - ) - axs[1].plot( - scoring["fpr"](est, X_test, y_test), - scoring["tpr"](est, X_test, y_test), - marker, - markersize=10, - color=color, - label=f"Cut-off point at probability of {decision_threshold:.2f}", - ) - -axs[0].set_title("Precision-Recall curve") -axs[0].legend() -axs[1].set_title("ROC curve") -axs[1].legend() - -axs[2].plot( - tuned_model.cv_results_["thresholds"], - tuned_model.cv_results_["scores"], - color="tab:orange", -) -axs[2].plot( - tuned_model.best_threshold_, - tuned_model.best_score_, - "o", - markersize=10, - color="tab:orange", - label="Optimal cut-off point for the business metric", -) -axs[2].legend() -axs[2].set_xlabel("Decision threshold (probability)") -axs[2].set_ylabel("Objective score (using cost-matrix)") -axs[2].set_title("Objective score as a function of the decision threshold") - -_ = fig.suptitle("Tuned GBDT model without refitting and using the entire dataset") +title = "Tuned GBDT model without refitting and using the entire dataset" +plot_roc_pr_curves(model, tuned_model, title=title) # %% # Regarding the cut-off point, we observe that the optimum is similar to the multiple From ffd5669e61db17e22f82a92b299dc73e29b9f25a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 May 2024 15:55:26 +0200 Subject: [PATCH 185/194] fix typo in code --- .../_classification_threshold.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 81ba3afa5dc31..9228bf162dac0 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -1043,34 +1043,34 @@ def _fit(self, X, y, **params): split_thresholds.max() for split_thresholds in cv_thresholds ) if isinstance(self.thresholds, Integral): - decisiothresholds = np.linspace( + decision_thresholds = np.linspace( min_threshold, max_threshold, num=self.thresholds ) else: - decisiothresholds = np.asarray(self.thresholds) + decision_thresholds = np.asarray(self.thresholds) if not constrained_metric: # find best score that is the highest value objective_scores = _mean_interpolated_score( - decisiothresholds, cv_thresholds, cv_scores + decision_thresholds, cv_thresholds, cv_scores ) best_idx = objective_scores.argmax() self.best_score_ = objective_scores[best_idx] - 
self.best_threshold_ = decisiothresholds[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] self.constrained_score_ = None if self.store_cv_results: self.cv_results_ = { - "thresholds": decisiothresholds, + "thresholds": decision_thresholds, "scores": objective_scores, } else: if "tpr" in self.objective_metric: # tpr/tnr mean_tnr, mean_tpr = [ - _mean_interpolated_score(decisiothresholds, cv_thresholds, sc) + _mean_interpolated_score(decision_thresholds, cv_thresholds, sc) for sc in zip(*cv_scores) ] else: # precision/recall mean_precision, mean_recall = [ - _mean_interpolated_score(decisiothresholds, cv_thresholds, sc) + _mean_interpolated_score(decision_thresholds, cv_thresholds, sc) for sc in zip(*cv_scores) ] @@ -1092,10 +1092,10 @@ def _get_best_idx(constrained_score, maximized_score): best_idx = _get_best_idx(constrained_scores, maximized_scores) self.best_score_ = maximized_scores[best_idx] self.constrained_score_ = constrained_scores[best_idx] - self.best_threshold_ = decisiothresholds[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] if self.store_cv_results: self.cv_results_ = { - "thresholds": decisiothresholds, + "thresholds": decision_thresholds, "constrained_scores": constrained_scores, "maximized_scores": maximized_scores, } From 18abafe6e43087f44fcfc7455407317ada35f723 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 May 2024 17:55:30 +0200 Subject: [PATCH 186/194] use proper scoring rule and robust estimator to scale --- examples/model_selection/plot_cost_sensitive_learning.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 06a56ea768f04..0322502baf301 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -596,11 +596,13 @@ def business_metric(y_true, y_pred, amount): # decision threshold. from sklearn.linear_model import LogisticRegressionCV from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import RobustScaler model = make_pipeline( - StandardScaler(), - LogisticRegressionCV(random_state=42, Cs=np.logspace(-6, 6, 13), scoring="roc_auc"), + RobustScaler(quantile_range=(5, 95)), + LogisticRegressionCV( + random_state=42, Cs=np.logspace(-6, 6, 13), scoring="neg_log_loss" + ), ).fit(data_train, target_train) print( From ce9464c7006e0865e040edc3fe1542bdbc5f8929 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 May 2024 18:11:20 +0200 Subject: [PATCH 187/194] improve narrative --- .../plot_cost_sensitive_learning.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 0322502baf301..ba646e6a6f053 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -594,6 +594,16 @@ def business_metric(y_true, y_pred, amount): # # Let's now create a predictive model using a logistic regression without tuning the # decision threshold. +# +# .. note:: +# By using :class:`~sklearn.linear_model.LogisticRegressionCV`, the hyperparameter +# search introduced a data split within the logistic regression itself. 
Therefore, +# a data leak is introduced since we scaled the entire training set before to pass +# it to the logistic regression model. +# +# To alleviate the effect of the data leak that is a potential distribution shift +# due to the presence of outliers, we use a scaler based on robust statistics by +# using :class:`~sklearn.preprocessing.RobustScaler`. from sklearn.linear_model import LogisticRegressionCV from sklearn.pipeline import make_pipeline from sklearn.preprocessing import RobustScaler @@ -626,7 +636,14 @@ def business_metric(y_true, y_pred, amount): # Now the question is: is our model optimum for the type of decision that we want to do? # Up to now, we did not optimize the decision threshold. We use the # :class:`~sklearn.model_selection.TunedThresholdClassifierCV` to optimize the decision -# given our business scorer. +# given our business scorer. To avoid a nested cross-validation, we will use the `C` +# parameter found in the previous logistic regression model while tuning. +from sklearn.linear_model import LogisticRegression + +# Reuse the best parameter C found in the previous grid-search +model = make_pipeline( + RobustScaler(quantile_range=(5, 95)), LogisticRegression(C=model[-1].C_[0]) +) tuned_model = TunedThresholdClassifierCV( estimator=model, objective_metric=business_scorer, From 89d67cfeb5a8f8db17b9f1c822bd20c8a8bbdabd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 May 2024 19:03:00 +0200 Subject: [PATCH 188/194] use grid-search --- .../plot_cost_sensitive_learning.py | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index ba646e6a6f053..cc691931cbfa9 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -594,26 +594,16 @@ def business_metric(y_true, y_pred, amount): # # Let's now create a predictive model using a logistic regression without tuning the # decision threshold. -# -# .. note:: -# By using :class:`~sklearn.linear_model.LogisticRegressionCV`, the hyperparameter -# search introduced a data split within the logistic regression itself. Therefore, -# a data leak is introduced since we scaled the entire training set before to pass -# it to the logistic regression model. -# -# To alleviate the effect of the data leak that is a potential distribution shift -# due to the presence of outliers, we use a scaler based on robust statistics by -# using :class:`~sklearn.preprocessing.RobustScaler`. 
-from sklearn.linear_model import LogisticRegressionCV +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import StandardScaler -model = make_pipeline( - RobustScaler(quantile_range=(5, 95)), - LogisticRegressionCV( - random_state=42, Cs=np.logspace(-6, 6, 13), scoring="neg_log_loss" - ), -).fit(data_train, target_train) +logistic_regression = make_pipeline(StandardScaler(), LogisticRegression()) +param_grid = {"logisticregression__C": np.logspace(-6, 6, 13)} +model = GridSearchCV(logistic_regression, param_grid, scoring="neg_log_loss").fit( + data_train, target_train +) print( "Benefit/cost of our logistic regression: " @@ -636,16 +626,10 @@ def business_metric(y_true, y_pred, amount): # Now the question is: is our model optimum for the type of decision that we want to do? # Up to now, we did not optimize the decision threshold. We use the # :class:`~sklearn.model_selection.TunedThresholdClassifierCV` to optimize the decision -# given our business scorer. To avoid a nested cross-validation, we will use the `C` -# parameter found in the previous logistic regression model while tuning. -from sklearn.linear_model import LogisticRegression - -# Reuse the best parameter C found in the previous grid-search -model = make_pipeline( - RobustScaler(quantile_range=(5, 95)), LogisticRegression(C=model[-1].C_[0]) -) +# given our business scorer. To avoid a nested cross-validation, we will use the +# best estimator found during the previous grid-search. tuned_model = TunedThresholdClassifierCV( - estimator=model, + estimator=model.best_estimator_, objective_metric=business_scorer, thresholds=100, n_jobs=2, From db3360bc5dab97705a122964aae46cfbb4037bb5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 May 2024 23:20:34 +0200 Subject: [PATCH 189/194] Apply suggestions from code review Co-authored-by: Olivier Grisel --- examples/model_selection/plot_tuned_decision_threshold.py | 8 ++++---- sklearn/model_selection/_classification_threshold.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index aa323b1f33fac..056a7d19557de 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -31,10 +31,10 @@ target.value_counts() # %% -# We can see that we are dealing with a binary classification problem. Since the labels -# are not encoded as 0 and 1, we will store which label we consider to be the negative -# class and which one we consider to be the positive class: "tested_negative" will be -# considered the negative class and "tested_positive" the positive class. +# We can see that we are dealing with a binary classification problem. 
Since the + labels are not encoded as 0 and 1, we make it explicit that we consider the class +# labeled "tested_negative" as the negative class (which is also the most frequent) +# and the class labeled "tested_positive" as the positive class: neg_label, pos_label = target.value_counts().index # %% diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 9228bf162dac0..d6c6ecb8fff65 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -71,7 +71,7 @@ def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): - """Base class for classifiers that set a non-default decision threshold. + """Base class for binary classifiers that set a non-default decision threshold. In this base class, we define the following interface: @@ -83,7 +83,7 @@ class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator Parameters ---------- estimator : estimator instance - The classifier, fitted or not, for which we want to optimize + The binary classifier, fitted or not, for which we want to optimize the decision threshold used during `predict`. pos_label : int, float, bool or str, default=None @@ -239,7 +239,7 @@ def _more_tags(self): class FixedThresholdClassifier(BaseThresholdClassifier): - """Classifier that manually sets the decision threshold. + """Binary classifier that manually sets the decision threshold. This classifier allows to change the default decision threshold used for converting posterior probability estimates (i.e. output of `predict_proba`) or @@ -254,7 +254,7 @@ class FixedThresholdClassifier(BaseThresholdClassifier): Parameters ---------- estimator : estimator instance - The classifier, fitted or not, for which we want to optimize + The binary classifier, fitted or not, for which we want to optimize the decision threshold used during `predict`. threshold : {"auto"} or float, default="auto" From 1789cc0e7037fb3989847978d95909800e94371e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 3 May 2024 15:24:59 +0200 Subject: [PATCH 190/194] remove constrained metrics option --- doc/modules/classification_threshold.rst | 17 - .../plot_tuned_decision_threshold.py | 200 +--------- .../_classification_threshold.py | 216 ++--------- .../tests/test_classification_threshold.py | 363 ++---------------- 4 files changed, 74 insertions(+), 722 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 2ffaee91a3566..7a774d80c5128 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -117,23 +117,6 @@ a meaningful metric for their use case. >>> model.best_score_ 0.86... -A second strategy aims to maximize one metric while imposing constraints on another -metric. There are four pre-defined options that can be provided to `objective_metric` -parameter, two use the Receiver Operating Characteristic (ROC) statistics and two use -the Precision-Recall statistics.
-- `"max_precision_at_recall_constraint"`: maximizes the precision such that the recall - is the closest to a given value. -- `"max_recall_at_precision_constraint"`: maximizes the recall such that the precision - is the closest to a given value. - -For these options, the `constraint_value` parameter needs to be defined. In addition, -you can use the `pos_label` parameter to indicate the label of the class of interest. - Important notes regarding the internal cross-validation ------------------------------------------------------- diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index 056a7d19557de..ab6b8c27b2f07 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -11,7 +11,7 @@ This example shows how to use the :class:`~sklearn.model_selection.TunedThresholdClassifierCV` to tune the decision -threshold, depending on a metric of interest as well as under a specific constraints. +threshold, depending on a metric of interest. """ # %% @@ -184,201 +184,3 @@ # example entitled, # :ref:`sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py`, # for more details. -# -# Tuning the decision threshold under constraint -# ---------------------------------------------- -# -# In some cases, we do not want to only maximize a given metric but instead to maximize -# a metric while satisfying a constraint on another metric. In the current example, we -# could imagine that the decision of our predictive model will be reviewed by a medical -# doctor. In this case, this doctor will only accept a ratio of false positive lower -# than a given value. Therefore, we are interested in maximizing the true positive rate -# while having a false positive rate lower than this value. -# -# The :class:`~sklearn.model_selection.TunedThresholdClassifierCV` allows to tune the -# decision threshold with such specification. We illustrate this strategy together with -# a single train-test split split to display the Receiver Operating Characteristic (ROC) -# curves to get better intuitions. -# -# First, we split the data into a training and testing set. - -# %% -from sklearn.model_selection import train_test_split - -data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=42 -) - -# %% -# Now, we will train both the vanilla and tuned model on the training set. We recall -# that the tuned model is internally maximizing the balanced accuracy for the moment. -model.fit(data_train, target_train) -tuned_model.fit(data_train, target_train) - -# %% -# To show the benefit on optimizing a metric under constraint, we will evaluate the -# models using the ROC curve statistics: the true positive rate (TPR) and the false -# positive rate (FPR). -# -# The FPR is not defined in scikit-learn and we define it below: -from sklearn.metrics import confusion_matrix, make_scorer, recall_score - - -def fpr_score(y, y_pred, neg_label, pos_label): - cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label]) - tn, fp, _, _ = cm.ravel() - tnr = tn / (tn + fp) - return 1 - tnr - - -tpr_score = recall_score # TPR and recall are the same metric -scoring = { - "fpr": make_scorer(fpr_score, neg_label=neg_label, pos_label=pos_label), - "tpr": make_scorer(tpr_score, pos_label=pos_label), -} - -# %% -# Now, we plot the ROC curve of both models and the FPR and TPR statistics for the -# decision thresholds of both models. 
-from sklearn.metrics import RocCurveDisplay - -disp = RocCurveDisplay.from_estimator( - model, data_test, target_test, name="Vanilla model", linestyle="--", alpha=0.5 -) -RocCurveDisplay.from_estimator( - tuned_model, - data_test, - target_test, - name="Tuned model", - linestyle="-.", - alpha=0.5, - ax=disp.ax_, -) -disp.ax_.plot( - scoring["fpr"](model, data_test, target_test), - scoring["tpr"](model, data_test, target_test), - marker="o", - markersize=10, - color="tab:blue", - label="Default cut-off point at a probability of 0.5", -) -disp.ax_.plot( - scoring["fpr"](tuned_model, data_test, target_test), - scoring["tpr"](tuned_model, data_test, target_test), - marker=">", - markersize=10, - color="tab:orange", - label=f"Cut-off point at probability of {tuned_model.best_threshold_:.2f}", -) -disp.ax_.legend() -_ = disp.ax_.set_title("ROC curves") - -# %% -# As expected, both models have the same ROC curves since the tuned -# model is only a post-processing step of the vanilla model. The tuning step is only -# changing the decision threshold, as displayed by the blue and orange markers. -# To optimize the balanced accuracy, the tuned model moved the decision threshold -# from 0.5 to 0.22. By shifting this point, we increase the FPR while increasing -# the TPR: in short we make more false positive but also more true positive. This is -# exactly what we concluded in the previous section when looking at the balanced -# accuracy score. -# -# However, this decision threshold might not be acceptable for our medical doctor. He -# might be interested to have a low FPR instead, let say lower than 5%. For this level -# of FPR, he would like our predictive model to maximize the TPR. -# -# The :class:`~sklearn.model_selection.TunedThresholdClassifierCV` allows to specify -# such constraint by providing the name of the metric and the constraint value. Here, we -# use `max_tpr_at_tnr_constraint` which is exactly what we want. Since the true negative -# rate (TNR) is equal to 1 - FPR, we can rewrite the constraint value as `1 - 0.05 = -# 0.95`. - -# %% -constraint_value = 0.95 -tuned_model.set_params( - objective_metric="max_tpr_at_tnr_constraint", - constraint_value=constraint_value, - pos_label=pos_label, - store_cv_results=True, -) -tuned_model.fit(data_train, target_train) - -# %% -# Now, we can plot the ROC curves and analyse the results. 
-import matplotlib.pyplot as plt - -_, axs = plt.subplots(ncols=2, figsize=(12, 5)) - -disp = RocCurveDisplay( - fpr=1 - tuned_model.cv_results_["constrained_scores"], - tpr=tuned_model.cv_results_["maximized_scores"], - estimator_name="ROC of the tuned model", - pos_label=pos_label, -) -axs[0].plot( - 1 - tuned_model.constrained_score_, - tuned_model.best_score_, - marker="o", - markersize=10, - color="tab:blue", - label=f"Cut-off point at probability of {tuned_model.best_threshold_:.2f}", -) -axs[0].axvline( - 1 - constraint_value, 0, 1, color="tab:blue", linestyle="--", label="FPR constraint" -) -axs[0].set_title("Average ROC curve for the tuned model\nacross CV folds") -RocCurveDisplay.from_estimator( - model, - data_test, - target_test, - name="Vanilla model", - linestyle="--", - alpha=0.5, - ax=axs[1], -) -RocCurveDisplay.from_estimator( - tuned_model, - data_test, - target_test, - name="Tuned model", - linestyle="-.", - alpha=0.5, - ax=axs[1], -) -axs[1].plot( - scoring["fpr"](model, data_test, target_test), - scoring["tpr"](model, data_test, target_test), - marker="o", - markersize=10, - color="tab:blue", - label="Default cut-off point at a probability of 0.5", -) -axs[1].plot( - 1 - tuned_model.constrained_score_, - tuned_model.best_score_, - marker="^", - markersize=10, - color="tab:orange", - label=f"Cut-off point at probability of {tuned_model.best_threshold_:.2f}", -) -axs[1].legend() -axs[1].set_title("ROC curves") -_ = disp.plot(ax=axs[0]) - -# %% -# We start with the right-hand side plot. It depicts the ROC curves as in the previous -# section. We observe that the control point of the tuned model moved to a low FPR -# that was defined by our constraint. To achieve this low FPR, the decision threshold -# was moved to a probability of 0.72. -# -# The left-hand side plot shows the averaged ROC curve on the internal validation set -# across the different cross-validation folds. This curve is used to define the decision -# threshold. The vertical dashed line represents the FPR constraint that we defined. -# The decision threshold corresponds to the maximum TPR on the left of this dashed line -# and is represented by a blue marker. -# -# An important point to note is that the decision threshold is defined on averaged -# statistics on an internal validation set. It means that the constraint is respected -# on the train/validation dataset but not necessarily on the test set, in case the -# statistical performance of the model differ from the train/validation set to the test -# set (i.e. overfitting). diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index d6c6ecb8fff65..365a87f289222 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -15,9 +15,6 @@ from ..metrics import ( check_scoring, get_scorer_names, - make_scorer, - precision_recall_curve, - roc_curve, ) from ..metrics._scorer import _BaseScorer from ..utils import _safe_indexing @@ -611,31 +608,9 @@ def _fit_and_score_over_thresholds( else: # prefit estimator, only a validation set is provided X_val, y_val, score_params_val = X, y, score_params - if curve_scorer is roc_curve or ( - isinstance(curve_scorer, _BaseScorer) and curve_scorer._score_func is roc_curve - ): - fpr, tpr, potential_thresholds = curve_scorer( - classifier, X_val, y_val, **score_params_val - ) - # For fpr=0/tpr=0, the threshold is set to `np.inf`. We need to remove it. 
- fpr, tpr, potential_thresholds = fpr[1:], tpr[1:], potential_thresholds[1:] - # thresholds are in decreasing order - return potential_thresholds[::-1], ((1 - fpr)[::-1], tpr[::-1]) - elif curve_scorer is precision_recall_curve or ( - isinstance(curve_scorer, _BaseScorer) - and curve_scorer._score_func is precision_recall_curve - ): - precision, recall, potential_thresholds = curve_scorer( - classifier, X_val, y_val, **score_params_val - ) - # thresholds are in increasing order - # the last element of the precision and recall is not associated with any - # threshold and should be discarded - return potential_thresholds, (precision[:-1], recall[:-1]) - else: - scores, potential_thresholds = curve_scorer( - classifier, X_val, y_val, **score_params_val - ) + scores, potential_thresholds = curve_scorer( + classifier, X_val, y_val, **score_params_val + ) return potential_thresholds, scores @@ -686,37 +661,17 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): The classifier, fitted or not, for which we want to optimize the decision threshold used during `predict`. - objective_metric : {"max_tpr_at_tnr_constraint", "max_tnr_at_tpr_constraint", \ - "max_precision_at_recall_constraint, "max_recall_at_precision_constraint"} \ - , str, dict or callable, default="balanced_accuracy" + objective_metric : str, dict or callable, default="balanced_accuracy" The objective metric to be optimized. Can be one of: * a string associated to a scoring function for binary classification (see model evaluation documentation); * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`; - * `"max_tnr_at_tpr_constraint"`: find the decision threshold for a true - positive ratio (TPR) of `constraint_value`; - * `"max_tpr_at_tnr_constraint"`: find the decision threshold for a true - negative ratio (TNR) of `constraint_value`. - * `"max_precision_at_recall_constraint"`: find the decision threshold for a - recall of `constraint_value`; - * `"max_recall_at_precision_constraint"`: find the decision threshold for a - precision of `constraint_value`. - - constraint_value : float, default=None - The value associated with the `objective_metric` metric for which we - want to find the decision threshold when `objective_metric` is either - `"max_tnr_at_tpr_constraint"`, `"max_tpr_at_tnr_constraint"`, - `"max_precision_at_recall_constraint"`, or - `"max_recall_at_precision_constraint"`. pos_label : int, float, bool or str, default=None - The label of the positive class. Used when `objective_metric` is - `"max_tnr_at_tpr_constraint"`"`, `"max_tpr_at_tnr_constraint"`, or a dictionary. - When `pos_label=None`, if `y_true` is in `{-1, 1}` or `{0, 1}`, - `pos_label` is set to 1, otherwise an error will be raised. When using a - scorer, `pos_label` can be passed as a keyword argument to - :func:`~sklearn.metrics.make_scorer`. + The label of the positive class. Used to process the output of the + `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or + `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. response_method : {"auto", "decision_function", "predict_proba"}, default="auto" Methods by the classifier `estimator` corresponding to the @@ -790,24 +745,10 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): best_score_ : float or None The optimal score of the objective metric, evaluated at `best_threshold_`. 
- constrained_score_ : float or None - When `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, - `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, - `"max_recall_at_precision_constraint"`, it will corresponds to the score of the - metric which is constrained. It should be close to `constraint_value`. If - `objective_metric` is not one of the above, `constrained_score_` is None. - cv_results_ : dict or None A dictionary containing the scores and thresholds computed during the - cross-validation process. Only exist if `store_cv_results=True`. - The keys are different depending on the `objective_metric` used: - - * when `objective_metric` is one of `"max_tpr_at_tnr_constraint"`, - `"max_tnr_at_tpr_constraint"`, `"max_precision_at_recall_constraint"`, - `"max_recall_at_precision_constraint"`, the keys are `"thresholds"`, - `"constrained_scores"`, and `"maximized_scores"`; - * otherwise, for score computing a single values, the keys are `"thresholds"` - and `"scores"`. + cross-validation process. Only exist if `store_cv_results=True`. The + keys are `"thresholds"` and `"scores"`. classes_ : ndarray of shape (n_classes,) The class labels. @@ -851,43 +792,31 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): weighted avg 0.93 0.93 0.92 250 >>> classifier_tuned = TunedThresholdClassifierCV( - ... classifier, objective_metric="max_precision_at_recall_constraint", - ... constraint_value=0.7, + ... classifier, objective_metric="balanced_accuracy" ... ).fit(X_train, y_train) >>> print( - ... f"Cut-off point found at {classifier_tuned.best_threshold_:.3f} for a " - ... f"recall of {classifier_tuned.constrained_score_:.3f} and a precision of " - ... f"{classifier_tuned.best_score_:.3f}." + ... f"Cut-off point found at {classifier_tuned.best_threshold_:.3f}" ... ) - Cut-off point found at 0.3... for a recall of 0.7... and a precision of 0.7... 
+ Cut-off point found at 0.342 >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) precision recall f1-score support - 0 0.96 0.96 0.96 224 - 1 0.68 0.65 0.67 26 + 0 0.96 0.95 0.96 224 + 1 0.61 0.65 0.63 26 - accuracy 0.93 250 - macro avg 0.82 0.81 0.81 250 - weighted avg 0.93 0.93 0.93 250 + accuracy 0.92 250 + macro avg 0.78 0.80 0.79 250 + weighted avg 0.92 0.92 0.92 250 """ _parameter_constraints: dict = { **BaseThresholdClassifier._parameter_constraints, "objective_metric": [ - StrOptions( - set(get_scorer_names()) - | { - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - } - ), + StrOptions(set(get_scorer_names())), callable, MutableMapping, ], - "constraint_value": [Real, None], "thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], "cv": [ "cv_object", @@ -905,7 +834,6 @@ def __init__( estimator, *, objective_metric="balanced_accuracy", - constraint_value=None, pos_label=None, response_method="auto", thresholds=100, @@ -919,7 +847,6 @@ def __init__( estimator=estimator, response_method=response_method, pos_label=pos_label ) self.objective_metric = objective_metric - self.constraint_value = constraint_value self.thresholds = thresholds self.cv = cv self.refit = refit @@ -966,23 +893,6 @@ def _fit(self, X, y, **params): if self.refit is False and cv.get_n_splits() > 1: raise ValueError("When cv has several folds, refit cannot be False.") - if isinstance(self.objective_metric, str) and self.objective_metric in { - "max_tpr_at_tnr_constraint", - "max_tnr_at_tpr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - }: - if self.constraint_value is None: - raise ValueError( - "When `objective_metric` is 'max_tpr_at_tnr_constraint', " - "'max_tnr_at_tpr_constraint', 'max_precision_at_recall_constraint'," - " or 'max_recall_at_precision_constraint', `constraint_value` must " - "be provided. Got None instead." 
- ) - constrained_metric = True - else: - constrained_metric = False - routed_params = process_routing(self, "fit", **params) self._curve_scorer = self._get_curve_scorer() @@ -1049,56 +959,17 @@ def _fit(self, X, y, **params): else: decision_thresholds = np.asarray(self.thresholds) - if not constrained_metric: # find best score that is the highest value - objective_scores = _mean_interpolated_score( - decision_thresholds, cv_thresholds, cv_scores - ) - best_idx = objective_scores.argmax() - self.best_score_ = objective_scores[best_idx] - self.best_threshold_ = decision_thresholds[best_idx] - self.constrained_score_ = None - if self.store_cv_results: - self.cv_results_ = { - "thresholds": decision_thresholds, - "scores": objective_scores, - } - else: - if "tpr" in self.objective_metric: # tpr/tnr - mean_tnr, mean_tpr = [ - _mean_interpolated_score(decision_thresholds, cv_thresholds, sc) - for sc in zip(*cv_scores) - ] - else: # precision/recall - mean_precision, mean_recall = [ - _mean_interpolated_score(decision_thresholds, cv_thresholds, sc) - for sc in zip(*cv_scores) - ] - - def _get_best_idx(constrained_score, maximized_score): - """Find the index of the best score constrained by another score.""" - mask = constrained_score >= self.constraint_value - mask_idx = maximized_score[mask].argmax() - return np.flatnonzero(mask)[mask_idx] - - if self.objective_metric == "max_tpr_at_tnr_constraint": - constrained_scores, maximized_scores = mean_tnr, mean_tpr - elif self.objective_metric == "max_tnr_at_tpr_constraint": - constrained_scores, maximized_scores = mean_tpr, mean_tnr - elif self.objective_metric == "max_precision_at_recall_constraint": - constrained_scores, maximized_scores = mean_recall, mean_precision - else: # max_recall_at_precision_constraint - constrained_scores, maximized_scores = mean_precision, mean_recall - - best_idx = _get_best_idx(constrained_scores, maximized_scores) - self.best_score_ = maximized_scores[best_idx] - self.constrained_score_ = constrained_scores[best_idx] - self.best_threshold_ = decision_thresholds[best_idx] - if self.store_cv_results: - self.cv_results_ = { - "thresholds": decision_thresholds, - "constrained_scores": constrained_scores, - "maximized_scores": maximized_scores, - } + objective_scores = _mean_interpolated_score( + decision_thresholds, cv_thresholds, cv_scores + ) + best_idx = objective_scores.argmax() + self.best_score_ = objective_scores[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] + if self.store_cv_results: + self.cv_results_ = { + "thresholds": decision_thresholds, + "scores": objective_scores, + } return self @@ -1161,29 +1032,10 @@ def _get_curve_scorer(self): """Get the curve scorer based on the objective metric used. Here, we reuse the conventional "scorer API" via `make_scorer` or - `_CurveScorer`. Note that the use here is unconventional because `make_scorer` - or the "scorer API" is expected to return a single score value when calling - `scorer(estimator, X, y)`. Here the score function used are both returning - scores and thresholds representing a curve. + `_CurveScorer`. 
""" - if self.objective_metric in { - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - }: - if "tpr" in self.objective_metric: # tpr/tnr - score_curve_func = roc_curve - else: # precision/recall - score_curve_func = precision_recall_curve - curve_scorer = make_scorer( - score_curve_func, - response_method=self._response_method, - pos_label=self.pos_label, - ) - else: - scoring = check_scoring(self.estimator, scoring=self.objective_metric) - curve_scorer = _CurveScorer.from_scorer( - scoring, self._response_method, self.thresholds, self.pos_label - ) + scoring = check_scoring(self.estimator, scoring=self.objective_metric) + curve_scorer = _CurveScorer.from_scorer( + scoring, self._response_method, self.thresholds, self.pos_label + ) return curve_scorer diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 792a37d8f50f9..038efef4c6762 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -14,14 +14,11 @@ from sklearn.linear_model import LogisticRegression from sklearn.metrics import ( balanced_accuracy_score, - confusion_matrix, f1_score, fbeta_score, make_scorer, - precision_recall_curve, precision_score, recall_score, - roc_curve, ) from sklearn.model_selection import ( FixedThresholdClassifier, @@ -135,44 +132,20 @@ def test_curve_scorer_pos_label(global_random_seed): assert scores_pos_label_1.max() == pytest.approx(1.0) -@pytest.mark.parametrize( - "curve_scorer, score_method", - [ - ( - _CurveScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={}, - ), - "balanced_accuracy", - ), - ( - make_scorer(roc_curve, response_method="predict_proba"), - "max_tnr_at_tpr_constraint", - ), - ( - make_scorer(roc_curve, response_method="predict_proba"), - "max_tpr_at_tnr_constraint", - ), - ( - make_scorer(precision_recall_curve, response_method="predict_proba"), - "max_precision_at_recall_constraint", - ), - ( - make_scorer(precision_recall_curve, response_method="predict_proba"), - "max_recall_at_precision_constraint", - ), - ], -) -def test_fit_and_score_over_thresholds_curve_scorers(curve_scorer, score_method): +def test_fit_and_score_over_thresholds_curve_scorers(): """Check that `_fit_and_score_over_thresholds` returns thresholds in ascending order for the different accepted curve scorers.""" X, y = make_classification(n_samples=100, random_state=0) train_idx, val_idx = np.arange(50), np.arange(50, 100) classifier = LogisticRegression() + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) thresholds, scores = _fit_and_score_over_thresholds( classifier, X, @@ -185,48 +158,11 @@ def test_fit_and_score_over_thresholds_curve_scorers(curve_scorer, score_method) ) assert np.all(thresholds[:-1] <= thresholds[1:]) - - if score_method.startswith("max_"): - assert isinstance(scores, tuple) and len(scores) == 2 - for sc in scores: - assert np.logical_and(sc >= 0, sc <= 1).all() - else: - assert isinstance(scores, np.ndarray) - assert np.logical_and(scores >= 0, scores <= 1).all() + assert isinstance(scores, np.ndarray) + assert np.logical_and(scores >= 0, scores <= 1).all() -@pytest.mark.parametrize( - "curve_scorer, expected_score", - [ - ( - _CurveScorer( - 
score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - thresholds=2, - kwargs={}, - ), - [0.5, 1.0], - ), - ( - make_scorer(roc_curve, response_method="predict_proba"), - [[0.0, 1.0], [1.0, 1.0]], - ), - ( - make_scorer(roc_curve, response_method="predict_proba"), - [[0.0, 1.0], [1.0, 1.0]], - ), - ( - make_scorer(precision_recall_curve, response_method="predict_proba"), - [[0.5, 1.0], [1.0, 1.0]], - ), - ( - make_scorer(precision_recall_curve, response_method="predict_proba"), - [[0.5, 1.0], [1.0, 1.0]], - ), - ], -) -def test_fit_and_score_over_thresholds_prefit(curve_scorer, expected_score): +def test_fit_and_score_over_thresholds_prefit(): """Check the behaviour with a prefit classifier.""" X, y = make_classification(n_samples=100, random_state=0) @@ -237,6 +173,13 @@ def test_fit_and_score_over_thresholds_prefit(curve_scorer, expected_score): # we get perfect predictions and thus match the expected score assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=2, + kwargs={}, + ) thresholds, scores = _fit_and_score_over_thresholds( classifier, X, @@ -248,27 +191,11 @@ def test_fit_and_score_over_thresholds_prefit(curve_scorer, expected_score): score_params={}, ) assert np.all(thresholds[:-1] <= thresholds[1:]) - assert_allclose(scores, expected_score) + assert_allclose(scores, [0.5, 1.0]) @pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize( - "curve_scorer", - [ - _CurveScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={}, - ), - make_scorer(roc_curve, response_method="predict_proba"), - make_scorer(roc_curve, response_method="predict_proba"), - make_scorer(precision_recall_curve, response_method="predict_proba"), - make_scorer(precision_recall_curve, response_method="predict_proba"), - ], -) -def test_fit_and_score_over_thresholds_sample_weight(curve_scorer): +def test_fit_and_score_over_thresholds_sample_weight(): """Check that we dispatch the sample-weight to fit and score the classifier.""" X, y = load_iris(return_X_y=True) X, y = X[:100], y[:100] # only 2 classes @@ -282,6 +209,13 @@ def test_fit_and_score_over_thresholds_sample_weight(curve_scorer): classifier = LogisticRegression() train_repeated_idx = np.arange(X_repeated.shape[0]) val_repeated_idx = np.arange(X_repeated.shape[0]) + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) thresholds_repeated, scores_repeated = _fit_and_score_over_thresholds( classifier, X_repeated, @@ -310,24 +244,8 @@ def test_fit_and_score_over_thresholds_sample_weight(curve_scorer): @pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize( - "curve_scorer", - [ - _CurveScorer( - score_func=balanced_accuracy_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={}, - ), - make_scorer(roc_curve, response_method="predict_proba"), - make_scorer(roc_curve, response_method="predict_proba"), - make_scorer(precision_recall_curve, response_method="predict_proba"), - make_scorer(precision_recall_curve, response_method="predict_proba"), - ], -) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) -def test_fit_and_score_over_thresholds_fit_params(curve_scorer, fit_params_type): +def test_fit_and_score_over_thresholds_fit_params(fit_params_type): """Check that we pass 
`fit_params` to the classifier when calling `fit`.""" X, y = make_classification(n_samples=100, random_state=0) fit_params = { @@ -339,6 +257,13 @@ def test_fit_and_score_over_thresholds_fit_params(curve_scorer, fit_params_type) classifier.set_fit_request(a=True, b=True) train_idx, val_idx = np.arange(50), np.arange(50, 100) + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) _fit_and_score_over_thresholds( classifier, X, @@ -459,24 +384,6 @@ def test_tuned_threshold_classifier_without_constraint_value(response_method): assert model.cv_results_["scores"].shape == (thresholds,) -def test_tuned_threshold_classifier_limit_metric_tradeoff(): - """Check that max TPR lead to opposite prediction of max TNR when constraint is - set to 0.0. - """ - X, y = load_breast_cancer(return_X_y=True) - estimator = make_pipeline(StandardScaler(), LogisticRegression()) - model = TunedThresholdClassifierCV( - estimator=estimator, - objective_metric="max_tpr_at_tnr_constraint", - constraint_value=0, - ) - y_pred_1 = model.fit(X, y).predict(X) - model.set_params(objective_metric="max_tnr_at_tpr_constraint") - y_pred_2 = (~model.fit(X, y).predict(X).astype(bool)).astype(int) - # check that we have opposite predictions with a slight tolerance - assert np.mean(y_pred_1 == y_pred_2) > 0.99 - - def test_tuned_threshold_classifier_metric_with_parameter(): """Check that we can pass a metric with a parameter in addition check that `f_beta` with `beta=1` is equivalent to `f1` and different from `f_beta` with @@ -504,10 +411,6 @@ def test_tuned_threshold_classifier_metric_with_parameter(): @pytest.mark.parametrize( "metric", [ - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", make_scorer(balanced_accuracy_score), make_scorer(f1_score, pos_label="cancer"), ], @@ -526,7 +429,6 @@ def test_tuned_threshold_classifier_with_string_targets(response_method, metric) model = TunedThresholdClassifierCV( estimator=make_pipeline(StandardScaler(), LogisticRegression()), objective_metric=metric, - constraint_value=0.9, pos_label="cancer", response_method=response_method, thresholds=100, @@ -589,18 +491,8 @@ def test_tuned_threshold_classifier_refit(with_sample_weight, global_random_seed @pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize( - "objective_metric", - [ - "max_tnr_at_tpr_constraint", - "max_tpr_at_tnr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - "balanced_accuracy", - ], -) @pytest.mark.parametrize("fit_params_type", ["list", "array"]) -def test_tuned_threshold_classifier_fit_params(objective_metric, fit_params_type): +def test_tuned_threshold_classifier_fit_params(fit_params_type): """Check that we pass `fit_params` to the classifier when calling `fit`.""" X, y = make_classification(n_samples=100, random_state=0) fit_params = { @@ -610,70 +502,10 @@ def test_tuned_threshold_classifier_fit_params(objective_metric, fit_params_type classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) classifier.set_fit_request(a=True, b=True) - model = TunedThresholdClassifierCV( - classifier, objective_metric=objective_metric, constraint_value=0.5 - ) + model = TunedThresholdClassifierCV(classifier) model.fit(X, y, **fit_params) -@pytest.mark.parametrize( - "objective_metric, constraint_value", - [ - ("max_tnr_at_tpr_constraint", 0.5), - ("max_tpr_at_tnr_constraint", 
0.5), - ("max_precision_at_recall_constraint", 0.5), - ("max_recall_at_precision_constraint", 0.5), - ], -) -@pytest.mark.parametrize( - "response_method", ["auto", "decision_function", "predict_proba"] -) -def test_tuned_threshold_classifier_response_method_curve_scorer_with_constraint_metric( - objective_metric, constraint_value, response_method, global_random_seed -): - """Check that we use the proper curve scorer and forwarding the requested - response method for TNR/TPR and precision/recall metrics. - """ - X, y = make_classification(n_samples=100, random_state=global_random_seed) - classifier = LogisticRegression() - - thresholds = 100 - model = TunedThresholdClassifierCV( - classifier, - objective_metric=objective_metric, - constraint_value=constraint_value, - response_method=response_method, - thresholds=thresholds, - store_cv_results=True, - ) - model.fit(X, y) - assert model.cv_results_["thresholds"].shape == (thresholds,) - assert model.cv_results_["constrained_scores"].shape == (thresholds,) - assert model.cv_results_["maximized_scores"].shape == (thresholds,) - - if response_method in ("auto", "predict_proba"): - # "auto" will fall back in priority on `predict_proba` if `estimator` - # supports it. We expect the decision threshold to be in [0, 1] - if objective_metric in ( - "max_tnr_at_tpr_constraint", - "max_precision_at_recall_constraint", - ): - assert 0.5 <= model.best_threshold_ <= 1 - else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" - assert 0 <= model.best_threshold_ <= 0.5 - else: # "decision_function" - # We expect the decision function to be centered in 0.0 and to be larger than - # -1 and 1. We therefore check that the threshold is positive in one case and - # negative in the other. - if objective_metric in ( - "max_tnr_at_tpr_constraint", - "max_precision_at_recall_constraint", - ): - assert 0 < model.best_threshold_ < 20 - else: # "max_tpr_at_tnr_constraint" or "max_recall_at_precision_constraint" - assert -20 < model.best_threshold_ < 0 - - @pytest.mark.usefixtures("enable_slep006") def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): """Check that passing removing some sample from the dataset `X` is @@ -704,84 +536,6 @@ def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): assert_allclose(y_pred_with_weights, y_pred_without_weights) -@pytest.mark.parametrize( - "objective_metric", - ["max_precision_at_recall_constraint", "max_recall_at_precision_constraint"], -) -@pytest.mark.parametrize("pos_label", [0, 1]) -def test_tuned_threshold_classifier_pos_label_precision_recall( - objective_metric, pos_label -): - """Check that `pos_label` is dispatched correctly by checking the precision and - recall score found during the optimization and the one found at `predict` time.""" - X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) - - # prefit the estimator to avoid variability due to the cross-validation - estimator = LogisticRegression().fit(X, y) - - constraint_value = 0.7 - model = TunedThresholdClassifierCV( - estimator, - objective_metric=objective_metric, - constraint_value=constraint_value, - cv="prefit", - refit=False, - pos_label=pos_label, - ).fit(X, y) - - precision = precision_score(y, model.predict(X), pos_label=pos_label) - recall = recall_score(y, model.predict(X), pos_label=pos_label) - - # due to internal interpolation, the scores will vary slightly - if objective_metric == "max_precision_at_recall_constraint": - assert precision == 
pytest.approx(model.best_score_, abs=1e-3) - assert recall == pytest.approx(model.constrained_score_, abs=1e-3) - else: - assert recall == pytest.approx(model.best_score_, abs=1e-3) - assert precision == pytest.approx(model.constrained_score_, abs=1e-3) - - -@pytest.mark.parametrize( - "objective_metric", ["max_tnr_at_tpr_constraint", "max_tpr_at_tnr_constraint"] -) -@pytest.mark.parametrize("pos_label", [0, 1]) -def test_tuned_threshold_classifier_pos_label_tnr_tpr(objective_metric, pos_label): - """Check that `pos_label` is dispatched correctly by checking the TNR and TPR - score found during the optimization and the one found at `predict` time.""" - X, y = make_classification(n_samples=5_000, weights=[0.6, 0.4], random_state=42) - - # prefit the estimator to avoid variability due to the cross-validation - estimator = LogisticRegression().fit(X, y) - - constraint_value = 0.7 - model = TunedThresholdClassifierCV( - estimator, - objective_metric=objective_metric, - constraint_value=constraint_value, - cv="prefit", - refit=False, - pos_label=pos_label, - ).fit(X, y) - - def tnr_tpr_score(y_true, y_pred, pos_label=pos_label): - cm = confusion_matrix(y_true, y_pred) - if pos_label == 0: - cm = cm[::-1, ::-1] - tn, fp, fn, tp = cm.ravel() - tnr = tn / (tn + fp) - tpr = tp / (tp + fn) - return tnr, tpr - - tnr, tpr = tnr_tpr_score(y, model.predict(X), pos_label=pos_label) - # due to internal interpolation, the scores will vary slightly - if objective_metric == "max_tnr_at_tpr_constraint": - assert tnr == pytest.approx(model.best_score_, abs=0.05) - assert tpr == pytest.approx(model.constrained_score_, abs=0.05) - else: - assert tpr == pytest.approx(model.best_score_, abs=0.05) - assert tnr == pytest.approx(model.constrained_score_, abs=0.05) - - @pytest.mark.parametrize( "metric_type", ["string", "scorer_without_pos_label", "scorer_with_pos_label"], @@ -832,31 +586,13 @@ def test_tuned_threshold_classifier_thresholds_array(): assert_allclose(tuned_model.cv_results_["thresholds"], thresholds) -@pytest.mark.parametrize( - "params", - [ - {"objective_metric": "balanced_accuracy"}, - {"objective_metric": "max_tpr_at_tnr_constraint", "constraint_value": 0.5}, - {"objective_metric": "max_tnr_at_tpr_constraint", "constraint_value": 0.5}, - { - "objective_metric": "max_precision_at_recall_constraint", - "constraint_value": 0.5, - }, - { - "objective_metric": "max_recall_at_precision_constraint", - "constraint_value": 0.5, - }, - ], -) @pytest.mark.parametrize("store_cv_results", [True, False]) -def test_tuned_threshold_classifier_store_cv_results(params, store_cv_results): +def test_tuned_threshold_classifier_store_cv_results(store_cv_results): """Check that if `cv_results_` exists depending on `store_cv_results`.""" X, y = make_classification(random_state=0) estimator = LogisticRegression() tuned_model = TunedThresholdClassifierCV( - estimator, - store_cv_results=store_cv_results, - **params, + estimator, store_cv_results=store_cv_results ).fit(X, y) if store_cv_results: assert hasattr(tuned_model, "cv_results_") @@ -892,27 +628,6 @@ def test_tuned_threshold_classifier_cv_float(): assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_) -@pytest.mark.parametrize( - "objective_metric", - [ - "max_tpr_at_tnr_constraint", - "max_tnr_at_tpr_constraint", - "max_precision_at_recall_constraint", - "max_recall_at_precision_constraint", - ], -) -def test_tuned_threshold_classifier_error_missing_constraint(objective_metric): - """Check that we raise an informative error when using a objective 
metric requesting - a constraint but no `constraint_value` is provided.""" - X, y = make_classification(random_state=0) - estimator = LogisticRegression() - tuned_model = TunedThresholdClassifierCV( - estimator, objective_metric=objective_metric - ) - with pytest.raises(ValueError, match="`constraint_value` must be provided"): - tuned_model.fit(X, y) - - def test_tuned_threshold_classifier_error_constant_predictor(): """Check that we raise a ValueError if the underlying classifier returns constant probabilities such that we cannot find any threshold. From e7c31b94a6c5a631f5855c3ab146db5e6032026b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 3 May 2024 15:41:53 +0200 Subject: [PATCH 191/194] partial review --- .../_classification_threshold.py | 17 +++++++---------- .../tests/test_classification_threshold.py | 8 ++++---- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 365a87f289222..b0b186342f68d 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -590,13 +590,13 @@ def _fit_and_score_over_thresholds( Returns ------- - potential_thresholds : ndarray of shape (thresholds,) - The decision thresholds used to compute the scores. They are returned in - ascending order. - scores : ndarray of shape (thresholds,) or tuple of such arrays The scores computed for each decision threshold. When TPR/TNR or precision/ recall are computed, `scores` is a tuple of two arrays. + + potential_thresholds : ndarray of shape (thresholds,) + The decision thresholds used to compute the scores. They are returned in + ascending order. """ if train_idx is not None: @@ -608,10 +608,7 @@ def _fit_and_score_over_thresholds( else: # prefit estimator, only a validation set is provided X_val, y_val, score_params_val = X, y, score_params - scores, potential_thresholds = curve_scorer( - classifier, X_val, y_val, **score_params_val - ) - return potential_thresholds, scores + return curve_scorer(classifier, X_val, y_val, **score_params_val) def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): @@ -661,7 +658,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): The classifier, fitted or not, for which we want to optimize the decision threshold used during `predict`. - objective_metric : str, dict or callable, default="balanced_accuracy" + objective_metric : str or callable, default="balanced_accuracy" The objective metric to be optimized. 
Can be one of: * a string associated to a scoring function for binary classification @@ -923,7 +920,7 @@ def _fit(self, X, y, **params): self.estimator_.fit(X_train, y_train, **fit_params_train) - cv_thresholds, cv_scores = zip( + cv_scores, cv_thresholds = zip( *Parallel(n_jobs=self.n_jobs)( delayed(_fit_and_score_over_thresholds)( clone(classifier) if cv != "prefit" else classifier, diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 038efef4c6762..55c772ebc95eb 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -146,7 +146,7 @@ def test_fit_and_score_over_thresholds_curve_scorers(): thresholds=10, kwargs={}, ) - thresholds, scores = _fit_and_score_over_thresholds( + scores, thresholds = _fit_and_score_over_thresholds( classifier, X, y, @@ -180,7 +180,7 @@ def test_fit_and_score_over_thresholds_prefit(): thresholds=2, kwargs={}, ) - thresholds, scores = _fit_and_score_over_thresholds( + scores, thresholds = _fit_and_score_over_thresholds( classifier, X, y, @@ -216,7 +216,7 @@ def test_fit_and_score_over_thresholds_sample_weight(): thresholds=10, kwargs={}, ) - thresholds_repeated, scores_repeated = _fit_and_score_over_thresholds( + scores_repeated, thresholds_repeated = _fit_and_score_over_thresholds( classifier, X_repeated, y_repeated, @@ -228,7 +228,7 @@ def test_fit_and_score_over_thresholds_sample_weight(): ) train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) - thresholds, scores = _fit_and_score_over_thresholds( + scores, thresholds = _fit_and_score_over_thresholds( classifier.set_fit_request(sample_weight=True), X, y, From 0fd667cac09ac9b092e287745f85e41c953bfd96 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 3 May 2024 16:14:02 +0200 Subject: [PATCH 192/194] rename objective_metric to scoring --- doc/modules/classification_threshold.rst | 4 ++-- .../plot_cost_sensitive_learning.py | 6 +++--- .../plot_tuned_decision_threshold.py | 4 +--- .../_classification_threshold.py | 18 +++++++++--------- .../tests/test_classification_threshold.py | 18 +++++++++--------- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index 7a774d80c5128..fb7ca7cfe93c6 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -84,7 +84,7 @@ Options to tune the decision threshold -------------------------------------- The decision threshold can be tuned through different strategies controlled by the -parameter `objective_metric`. +parameter `scoring`. One way to tune the threshold is by maximizing a pre-defined scikit-learn metric. These metrics can be found by calling the function :func:`~sklearn.metrics.get_scorer_names`. @@ -110,7 +110,7 @@ a meaningful metric for their use case. >>> pos_label = 0 >>> scorer = make_scorer(f1_score, pos_label=pos_label) >>> base_model = LogisticRegression() - >>> model = TunedThresholdClassifierCV(base_model, objective_metric=scorer) + >>> model = TunedThresholdClassifierCV(base_model, scoring=scorer) >>> scorer(model.fit(X, y), X, y) 0.88... 
>>> # compare it with the internal score found by cross-validation diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index cc691931cbfa9..6b14f84f656bc 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -256,7 +256,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): # implement by hand, but the # :class:`~sklearn.model_selection.TunedThresholdClassifierCV` class is here to help us. # It automatically computes the cost-gain for all possible cut-off points and optimizes -# for the `objective_metric`. +# for the `scoring`. # # .. _cost_sensitive_learning_example: # @@ -273,7 +273,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): tuned_model = TunedThresholdClassifierCV( estimator=model, pos_label=pos_label, - objective_metric=scoring["cost_gain"], + scoring=scoring["cost_gain"], store_cv_results=True, # necessary to inspect all results ) tuned_model.fit(X_train, y_train) @@ -630,7 +630,7 @@ def business_metric(y_true, y_pred, amount): # best estimator found during the previous grid-search. tuned_model = TunedThresholdClassifierCV( estimator=model.best_estimator_, - objective_metric=business_scorer, + scoring=business_scorer, thresholds=100, n_jobs=2, ) diff --git a/examples/model_selection/plot_tuned_decision_threshold.py b/examples/model_selection/plot_tuned_decision_threshold.py index ab6b8c27b2f07..7e997ee255e4d 100644 --- a/examples/model_selection/plot_tuned_decision_threshold.py +++ b/examples/model_selection/plot_tuned_decision_threshold.py @@ -112,9 +112,7 @@ # cross-validation strategy as previously. from sklearn.model_selection import TunedThresholdClassifierCV -tuned_model = TunedThresholdClassifierCV( - estimator=model, objective_metric="balanced_accuracy" -) +tuned_model = TunedThresholdClassifierCV(estimator=model, scoring="balanced_accuracy") cv_results_tuned_model = pd.DataFrame( cross_validate( tuned_model, diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index b0b186342f68d..b46b9c7d6f465 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -131,7 +131,7 @@ def fit(self, X, y, **params): **params : dict Parameters to pass to the `fit` method of the underlying - classifier and to the `objective_metric` scorer. + classifier. Returns ------- @@ -354,7 +354,7 @@ def _fit(self, X, y, **params): **params : dict Parameters to pass to the `fit` method of the underlying - classifier and to the `objective_metric` scorer. + classifier. Returns ------- @@ -658,7 +658,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): The classifier, fitted or not, for which we want to optimize the decision threshold used during `predict`. - objective_metric : str or callable, default="balanced_accuracy" + scoring : str or callable, default="balanced_accuracy" The objective metric to be optimized. Can be one of: * a string associated to a scoring function for binary classification @@ -789,7 +789,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): weighted avg 0.93 0.93 0.92 250 >>> classifier_tuned = TunedThresholdClassifierCV( - ... classifier, objective_metric="balanced_accuracy" + ... classifier, scoring="balanced_accuracy" ... ).fit(X_train, y_train) >>> print( ... 
f"Cut-off point found at {classifier_tuned.best_threshold_:.3f}" @@ -809,7 +809,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): _parameter_constraints: dict = { **BaseThresholdClassifier._parameter_constraints, - "objective_metric": [ + "scoring": [ StrOptions(set(get_scorer_names())), callable, MutableMapping, @@ -830,7 +830,7 @@ def __init__( self, estimator, *, - objective_metric="balanced_accuracy", + scoring="balanced_accuracy", pos_label=None, response_method="auto", thresholds=100, @@ -843,7 +843,7 @@ def __init__( super().__init__( estimator=estimator, response_method=response_method, pos_label=pos_label ) - self.objective_metric = objective_metric + self.scoring = scoring self.thresholds = thresholds self.cv = cv self.refit = refit @@ -864,7 +864,7 @@ def _fit(self, X, y, **params): **params : dict Parameters to pass to the `fit` method of the underlying - classifier and to the `objective_metric` scorer. + classifier and to the `scoring` scorer. Returns ------- @@ -1031,7 +1031,7 @@ def _get_curve_scorer(self): Here, we reuse the conventional "scorer API" via `make_scorer` or `_CurveScorer`. """ - scoring = check_scoring(self.estimator, scoring=self.objective_metric) + scoring = check_scoring(self.estimator, scoring=self.scoring) curve_scorer = _CurveScorer.from_scorer( scoring, self._response_method, self.thresholds, self.pos_label ) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 55c772ebc95eb..d7dbfa043a6c5 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -372,7 +372,7 @@ def test_tuned_threshold_classifier_without_constraint_value(response_method): thresholds = 100 model = TunedThresholdClassifierCV( estimator=lr, - objective_metric="balanced_accuracy", + scoring="balanced_accuracy", response_method=response_method, thresholds=thresholds, store_cv_results=True, @@ -392,13 +392,13 @@ def test_tuned_threshold_classifier_metric_with_parameter(): X, y = load_breast_cancer(return_X_y=True) lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) model_fbeta_1 = TunedThresholdClassifierCV( - estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1) + estimator=lr, scoring=make_scorer(fbeta_score, beta=1) ).fit(X, y) model_fbeta_2 = TunedThresholdClassifierCV( - estimator=lr, objective_metric=make_scorer(fbeta_score, beta=2) + estimator=lr, scoring=make_scorer(fbeta_score, beta=2) ).fit(X, y) model_f1 = TunedThresholdClassifierCV( - estimator=lr, objective_metric=make_scorer(f1_score) + estimator=lr, scoring=make_scorer(f1_score) ).fit(X, y) assert model_fbeta_1.best_threshold_ == pytest.approx(model_f1.best_threshold_) @@ -428,7 +428,7 @@ def test_tuned_threshold_classifier_with_string_targets(response_method, metric) y = classes[y] model = TunedThresholdClassifierCV( estimator=make_pipeline(StandardScaler(), LogisticRegression()), - objective_metric=metric, + scoring=metric, pos_label="cancer", response_method=response_method, thresholds=100, @@ -552,15 +552,15 @@ def test_tuned_threshold_classifier_pos_label_single_metric(pos_label, metric_ty estimator = LogisticRegression().fit(X, y) if metric_type == "string": - objective_metric = "precision" + scoring = "precision" elif metric_type == "scorer_without_pos_label": - objective_metric = make_scorer(precision_score) + scoring = make_scorer(precision_score) else: # metric_type == "scorer_with_pos_label" - 
objective_metric = make_scorer(precision_score, pos_label=pos_label) + scoring = make_scorer(precision_score, pos_label=pos_label) model = TunedThresholdClassifierCV( estimator, - objective_metric=objective_metric, + scoring=scoring, cv="prefit", refit=False, pos_label=pos_label, From 07e43875c282bb8f102afc6ecde03896fbed720e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 3 May 2024 16:16:21 +0200 Subject: [PATCH 193/194] fix typo --- doc/modules/classification_threshold.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst index fb7ca7cfe93c6..712a094a43246 100644 --- a/doc/modules/classification_threshold.rst +++ b/doc/modules/classification_threshold.rst @@ -94,7 +94,7 @@ a meaningful metric for their use case. .. note:: It is important to notice that these metrics come with default parameters, notably - the label of the class of interested (i.e. `pos_label`). Thus, if this label is not + the label of the class of interest (i.e. `pos_label`). Thus, if this label is not the right one for your application, you need to define a scorer and pass the right `pos_label` (and additional parameters) using the :func:`~sklearn.metrics.make_scorer`. Refer to :ref:`scoring` to get From 9bd68e6dec16aa265245f7ebb2765cd6c84cf095 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 3 May 2024 17:13:31 +0200 Subject: [PATCH 194/194] remove pos_label and delegate to make_scorer --- .../plot_cost_sensitive_learning.py | 2 +- .../_classification_threshold.py | 56 +++---------------- .../tests/test_classification_threshold.py | 37 ------------ 3 files changed, 10 insertions(+), 85 deletions(-) diff --git a/examples/model_selection/plot_cost_sensitive_learning.py b/examples/model_selection/plot_cost_sensitive_learning.py index 6b14f84f656bc..7b64af48139f2 100644 --- a/examples/model_selection/plot_cost_sensitive_learning.py +++ b/examples/model_selection/plot_cost_sensitive_learning.py @@ -272,7 +272,6 @@ def credit_gain_score(y, y_pred, neg_label, pos_label): tuned_model = TunedThresholdClassifierCV( estimator=model, - pos_label=pos_label, scoring=scoring["cost_gain"], store_cv_results=True, # necessary to inspect all results ) @@ -406,6 +405,7 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title): tuned_model.set_params(cv="prefit", refit=False).fit(X_train, y_train) print(f"{tuned_model.best_threshold_=:0.2f}") + # %% # Then, we evaluate our model with the same approach as before: title = "Tuned GBDT model without refitting and using the entire dataset" diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index b46b9c7d6f465..d5a864da10653 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -1,5 +1,4 @@ from collections.abc import MutableMapping -from inspect import signature from numbers import Integral, Real import numpy as np @@ -83,11 +82,6 @@ class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator The binary classifier, fitted or not, for which we want to optimize the decision threshold used during `predict`. - pos_label : int, float, bool or str, default=None - The label of the positive class. Used to process the output of the - `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or - `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. 
- response_method : {"auto", "decision_function", "predict_proba"}, default="auto" Methods by the classifier `estimator` corresponding to the decision function for which we want to find a threshold. It can be: @@ -105,13 +99,11 @@ class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator HasMethods(["fit", "predict_proba"]), HasMethods(["fit", "decision_function"]), ], - "pos_label": [Real, str, "boolean", None], "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], } - def __init__(self, estimator, *, pos_label=None, response_method="auto"): + def __init__(self, estimator, *, response_method="auto"): self.estimator = estimator - self.pos_label = pos_label self.response_method = response_method @_fit_context( @@ -326,6 +318,7 @@ class FixedThresholdClassifier(BaseThresholdClassifier): _parameter_constraints: dict = { **BaseThresholdClassifier._parameter_constraints, "threshold": [StrOptions({"auto"}), Real], + "pos_label": [Real, str, "boolean", None], } def __init__( @@ -336,9 +329,8 @@ def __init__( pos_label=None, response_method="auto", ): - super().__init__( - estimator=estimator, pos_label=pos_label, response_method=response_method - ) + super().__init__(estimator=estimator, response_method=response_method) + self.pos_label = pos_label self.threshold = threshold def _fit(self, X, y, **params): @@ -451,32 +443,14 @@ def __init__(self, score_func, sign, kwargs, thresholds, response_method): self._thresholds = thresholds @classmethod - def from_scorer(cls, scorer, response_method, thresholds, pos_label): + def from_scorer(cls, scorer, response_method, thresholds): """Create a continuous scorer from a normal scorer.""" - # add `pos_label` if requested by the scorer function - scorer_kwargs = {**scorer._kwargs} - signature_scoring_func = signature(scorer._score_func) - if ( - "pos_label" in signature_scoring_func.parameters - and "pos_label" not in scorer_kwargs - ): - if pos_label is None: - # Since the provided `pos_label` is the default, we need to - # use the default value of the scoring function that can be either - # `None` or `1`. - scorer_kwargs["pos_label"] = signature_scoring_func.parameters[ - "pos_label" - ].default - else: - scorer_kwargs["pos_label"] = pos_label - # transform a binary metric into a curve metric for all possible decision - # thresholds instance = cls( score_func=scorer._score_func, sign=scorer._sign, response_method=response_method, thresholds=thresholds, - kwargs=scorer_kwargs, + kwargs=scorer._kwargs, ) # transfer the metadata request instance._metadata_request = scorer._get_metadata_request() @@ -665,11 +639,6 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier): (see model evaluation documentation); * a scorer callable object created with :func:`~sklearn.metrics.make_scorer`; - pos_label : int, float, bool or str, default=None - The label of the positive class. Used to process the output of the - `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or - `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. - response_method : {"auto", "decision_function", "predict_proba"}, default="auto" Methods by the classifier `estimator` corresponding to the decision function for which we want to find a threshold. 
It can be: @@ -831,7 +800,6 @@ def __init__( estimator, *, scoring="balanced_accuracy", - pos_label=None, response_method="auto", thresholds=100, cv=None, @@ -840,9 +808,7 @@ def __init__( random_state=None, store_cv_results=False, ): - super().__init__( - estimator=estimator, response_method=response_method, pos_label=pos_label - ) + super().__init__(estimator=estimator, response_method=response_method) self.scoring = scoring self.thresholds = thresholds self.cv = cv @@ -1026,13 +992,9 @@ def get_metadata_routing(self): return router def _get_curve_scorer(self): - """Get the curve scorer based on the objective metric used. - - Here, we reuse the conventional "scorer API" via `make_scorer` or - `_CurveScorer`. - """ + """Get the curve scorer based on the objective metric used.""" scoring = check_scoring(self.estimator, scoring=self.scoring) curve_scorer = _CurveScorer.from_scorer( - scoring, self._response_method, self.thresholds, self.pos_label + scoring, self._response_method, self.thresholds ) return curve_scorer diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index d7dbfa043a6c5..f64edb2563c76 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -17,7 +17,6 @@ f1_score, fbeta_score, make_scorer, - precision_score, recall_score, ) from sklearn.model_selection import ( @@ -429,7 +428,6 @@ def test_tuned_threshold_classifier_with_string_targets(response_method, metric) model = TunedThresholdClassifierCV( estimator=make_pipeline(StandardScaler(), LogisticRegression()), scoring=metric, - pos_label="cancer", response_method=response_method, thresholds=100, ).fit(X, y) @@ -536,41 +534,6 @@ def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): assert_allclose(y_pred_with_weights, y_pred_without_weights) -@pytest.mark.parametrize( - "metric_type", - ["string", "scorer_without_pos_label", "scorer_with_pos_label"], -) -@pytest.mark.parametrize("pos_label", [0, 1]) -def test_tuned_threshold_classifier_pos_label_single_metric(pos_label, metric_type): - """Check that `pos_label` is dispatched correctly when getting a scorer linked to - a known metric. By default, the scorer in scikit-learn only have a default value - for `pos_label` which is 1. - """ - X, y = make_classification(n_samples=100, weights=[0.6, 0.4], random_state=42) - - # prefit the estimator to avoid variability due to the cross-validation - estimator = LogisticRegression().fit(X, y) - - if metric_type == "string": - scoring = "precision" - elif metric_type == "scorer_without_pos_label": - scoring = make_scorer(precision_score) - else: # metric_type == "scorer_with_pos_label" - scoring = make_scorer(precision_score, pos_label=pos_label) - - model = TunedThresholdClassifierCV( - estimator, - scoring=scoring, - cv="prefit", - refit=False, - pos_label=pos_label, - thresholds=500, - ).fit(X, y) - - precision = precision_score(y, model.predict(X), pos_label=pos_label) - assert precision == pytest.approx(model.best_score_, abs=1e-3) - - def test_tuned_threshold_classifier_thresholds_array(): """Check that we can pass an array to `thresholds` and it is used as candidate threshold internally."""
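# Illustrative usage sketch, assuming a scikit-learn build that includes this
# patch series. It shows the API the series converges on: the metric is passed
# via `scoring` (renamed from `objective_metric`), `pos_label` is delegated to
# `make_scorer` rather than to the meta-estimator, and results are exposed
# through `best_threshold_`, `best_score_` and `cv_results_`. The dataset and
# the choice of `f1_score` with `pos_label=1` are assumptions made purely for
# illustration; they do not come from the patches.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import TunedThresholdClassifierCV

# Imbalanced toy problem; the generation parameters are arbitrary.
X, y = make_classification(n_samples=1_000, weights=[0.8, 0.2], random_state=0)

# `pos_label` now travels with the scorer, not with the tuner itself.
scorer = make_scorer(f1_score, pos_label=1)

tuned = TunedThresholdClassifierCV(
    LogisticRegression(),
    scoring=scorer,           # formerly `objective_metric`
    response_method="auto",   # tries predict_proba, then decision_function
    thresholds=100,           # number of candidate decision thresholds
    store_cv_results=True,    # keep per-threshold scores for inspection
).fit(X, y)

print(f"best threshold: {tuned.best_threshold_:.3f}")
print(f"best score: {tuned.best_score_:.3f}")
# With the constrained metrics removed, `cv_results_` only holds two keys.
print(sorted(tuned.cv_results_))  # ['scores', 'thresholds']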