
Commit

Merge ccf8ec2 into 57aa05a
adam2392 committed Jul 1, 2020
2 parents 57aa05a + ccf8ec2 commit ced8072
Showing 3 changed files with 70 additions and 8 deletions.
2 changes: 1 addition & 1 deletion docs/sources/CHANGELOG.md
@@ -19,7 +19,7 @@ The CHANGELOG for the current development version is available at

##### New Features

- -
- Add `predict_proba` kwarg to bootstrap methods, to allow bootstrapping of scoring functions that take in probability values. ([#700](https://github.com/rasbt/mlxtend/pull/700) via [Adam Li](https://github.com/adam2392))

##### Changes

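As a quick illustration of the changelog entry above, here is a minimal sketch of the new kwarg, mirroring the usage exercised in the tests further down (the two-class data subset is illustrative):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from mlxtend.data import iris_data
from mlxtend.evaluate import bootstrap_point632_score

X, y = iris_data()

# restrict to the first two classes so roc_auc_score sees binary labels
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
scores = bootstrap_point632_score(lr, X[:100], y[:100],
                                  scoring_func=roc_auc_score,
                                  predict_proba=True,  # pass probabilities, not labels
                                  random_seed=123)
print(np.round(np.mean(scores), 2))
```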
39 changes: 36 additions & 3 deletions mlxtend/evaluate/bootstrap_point632.py
@@ -44,6 +44,7 @@ def mse(targets, predictions):

def bootstrap_point632_score(estimator, X, y, n_splits=200,
method='.632', scoring_func=None,
predict_proba=False,
random_seed=None,
clone_estimator=True):
"""
@@ -93,6 +94,17 @@ def bootstrap_point632_score(estimator, X, y, n_splits=200,
estimator is a classifier and mean squared error
if the estimator is a regressor.
predict_proba : bool (default=False)
Whether to use the `predict_proba` method of the
`estimator`. Use this in conjunction with a
`scoring_func` that takes probability values
instead of class-label predictions.
For example, if `scoring_func` is
:meth:`sklearn.metrics.roc_auc_score`, set
`predict_proba=True`.
Note that this requires the `estimator` to
implement a `predict_proba` method.
random_seed : int (default=None)
If int, random_seed is the seed used by
the random number generator.
@@ -153,21 +165,42 @@ def bootstrap_point632_score(estimator, X, y, n_splits=200,
raise AttributeError('Estimator type undefined. '
'Please provide a scoring_func argument.')

# determine which prediction function to use:
# label prediction or probability prediction
if not predict_proba:
predict_func = cloned_est.predict
else:
if not getattr(cloned_est, 'predict_proba', None):
raise RuntimeError(f'The estimator {cloned_est} does not '
f'support predicting probabilities via '
f'`predict_proba` function.')
predict_func = cloned_est.predict_proba

oob = BootstrapOutOfBag(n_splits=n_splits, random_seed=random_seed)
scores = np.empty(dtype=float, shape=(n_splits,))
cnt = 0
for train, test in oob.split(X):
cloned_est.fit(X[train], y[train])

test_acc = scoring_func(y[test], cloned_est.predict(X[test]))
# predict labels or probabilities on the test and train sets;
# for binary classification, keep only the positive-class column
predicted_test_val = predict_func(X[test])
predicted_train_val = predict_func(X[train])
if predict_proba:
num_classes = len(np.unique(y))

if num_classes == 2:
predicted_train_val = predicted_train_val[:, 1]
predicted_test_val = predicted_test_val[:, 1]

test_acc = scoring_func(y[test], predicted_test_val)

if method == 'oob':
acc = test_acc

else:
test_err = 1 - test_acc
train_err = 1 - scoring_func(y[train],
cloned_est.predict(X[train]))
train_err = 1 - scoring_func(y[train], predicted_train_val)
if method == '.632+':
gamma = 1 - (no_information_rate(
y,
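For context on the positive-class slicing in the hunk above: `predict_proba` returns an array of shape `(n_samples, n_classes)`, and binary metrics such as `roc_auc_score` expect only the column for class 1. A standalone sketch (the toy data is illustrative):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [0.2], [0.8], [1.0]])
y = np.array([0, 0, 1, 1])

clf = LogisticRegression().fit(X, y)
proba = clf.predict_proba(X)  # shape (4, 2): P(class 0) and P(class 1) per row
positive = proba[:, 1]        # the same slice as `predicted_test_val[:, 1]` above
print(proba.shape, positive)
```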
37 changes: 33 additions & 4 deletions mlxtend/evaluate/tests/test_bootstrap_point632.py
@@ -5,15 +5,23 @@
# License: BSD 3 clause

import numpy as np
from mlxtend.evaluate import bootstrap_point632_score
from mlxtend.utils import assert_raises
from mlxtend.data import iris_data
import pytest
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from mlxtend.data import iris_data
from mlxtend.evaluate import bootstrap_point632_score
from mlxtend.utils import assert_raises

X, y = iris_data()


class FakeClassifier(BaseEstimator):
def __init__(self):
pass


def test_defaults():
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
scores = bootstrap_point632_score(lr, X, y, random_seed=123)
@@ -63,10 +71,10 @@ def test_632plus():


def test_custom_accuracy():

def accuracy2(targets, predictions):
return sum([i == j for i, j in
zip(targets, predictions)]) / len(targets)

lr = LogisticRegression(solver='liblinear', multi_class='ovr')
scores = bootstrap_point632_score(lr, X, y,
random_seed=123,
@@ -121,3 +129,24 @@ def test_scoring():
f1 = np.mean(scores)
assert len(scores) == 200
assert np.round(f1, 2) == 1.0, f1


def test_scoring_proba():
from sklearn.metrics import roc_auc_score
lr = LogisticRegression(solver='liblinear', multi_class='ovr')

# test predict_proba
scores = bootstrap_point632_score(lr, X[:100], y[:100],
scoring_func=roc_auc_score,
predict_proba=True,
random_seed=123)
roc_auc = np.mean(scores)
assert len(scores) == 200
assert np.round(roc_auc, 2) == 1.0, roc_auc

with pytest.raises(RuntimeError):
clf = FakeClassifier()
scores = bootstrap_point632_score(clf, X[:100], y[:100],
scoring_func=roc_auc_score,
predict_proba=True,
random_seed=123)
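
The RuntimeError path tested above can also be reproduced directly; a hedged sketch with a hypothetical estimator that, like `FakeClassifier`, lacks `predict_proba`:

```python
from sklearn.base import BaseEstimator
from sklearn.metrics import roc_auc_score

from mlxtend.data import iris_data
from mlxtend.evaluate import bootstrap_point632_score


class NoProbaClassifier(BaseEstimator):  # hypothetical: defines no predict_proba
    def fit(self, X, y):
        return self


X, y = iris_data()

try:
    bootstrap_point632_score(NoProbaClassifier(), X[:100], y[:100],
                             scoring_func=roc_auc_score,
                             predict_proba=True,
                             random_seed=123)
except RuntimeError as err:
    print(err)  # the estimator does not support `predict_proba`
```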
