[MRG+2] Update BaggingRegressor to relax checking X for finite values #9707

Merged · 9 commits · May 31, 2018
@@ -8,23 +8,22 @@
 import itertools
 import numbers
 import numpy as np
-from warnings import warn
 from abc import ABCMeta, abstractmethod
+from warnings import warn
 
-from .base import BaseEnsemble, _partition_estimators
 from ..base import ClassifierMixin, RegressorMixin
 from ..externals.joblib import Parallel, delayed
 from ..externals.six import with_metaclass
 from ..externals.six.moves import zip
 from ..metrics import r2_score, accuracy_score
 from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
 from ..utils import check_random_state, check_X_y, check_array, column_or_1d
-from ..utils.random import sample_without_replacement
-from ..utils.validation import has_fit_parameter, check_is_fitted
 from ..utils import indices_to_mask, check_consistent_length
 from ..utils.metaestimators import if_delegate_has_method
 from ..utils.multiclass import check_classification_targets
 
+from .base import BaseEnsemble, _partition_estimators
+from ..utils.random import sample_without_replacement
+from ..utils.validation import has_fit_parameter, check_is_fitted
 
 
 __all__ = ["BaggingClassifier",
@@ -279,8 +278,11 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
         """
         random_state = check_random_state(self.random_state)
 
-        # Convert data
-        X, y = check_X_y(X, y, ['csr', 'csc'])
+        # Convert data (X is required to be 2d and indexable)
+        X, y = check_X_y(
+            X, y, ['csr', 'csc'], dtype=None, force_all_finite=False,
+            multi_output=True
+        )
         if sample_weight is not None:
             sample_weight = check_array(sample_weight, ensure_2d=False)
             check_consistent_length(y, sample_weight)

@jnothman (Member), Sep 11, 2017:
Perhaps add a comment that we require X to be 2d and indexable on both axes.

@jimmywan (Author, Contributor), Sep 12, 2017:
Added.
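As an aside, here is a minimal standalone sketch (not part of the diff) of what the relaxed call changes: with dtype=None and force_all_finite=False, check_X_y still enforces a 2d, indexable X, but no longer rejects NaN or infinity, leaving that decision to the base estimator.

import numpy as np
from sklearn.utils import check_X_y

X = np.array([[1.0, np.nan], [2.0, 3.0]])
y = np.array([0.5, 1.5])

# The strict default check rejects non-finite values in X outright.
try:
    check_X_y(X, y)
except ValueError as exc:
    print(exc)  # "Input contains NaN, infinity ..." (exact wording may vary)

# The relaxed call adopted in this PR keeps NaN/inf in X, so a base
# estimator (e.g. a pipeline starting with an imputer) can handle them.
X_out, y_out = check_X_y(X, y, ['csr', 'csc'], dtype=None,
                         force_all_finite=False, multi_output=True)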
@@ -390,8 +392,10 @@ def _set_oob_score(self, X, y):
         """Calculate out of bag predictions and score."""
 
     def _validate_y(self, y):
-        # Default implementation
-        return column_or_1d(y, warn=True)
+        if len(y.shape) == 1 or y.shape[1] == 1:
+            return column_or_1d(y, warn=True)
+        else:
+            return y
 
     def _get_estimators_indices(self):
         # Get drawn indices along both sample and feature axes
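To see what the new branch does, a standalone sketch (not part of the diff): column_or_1d ravels (n, 1)-shaped targets, with a warning, while genuinely multi-output targets now pass through untouched.

import numpy as np
from sklearn.utils import column_or_1d

y_col = np.array([[1], [2], [3]])      # shape (3, 1)
print(column_or_1d(y_col, warn=True))  # -> array([1, 2, 3]), raveled with a
                                       # DataConversionWarning

y_multi = np.array([[1, 4], [2, 5]])   # shape (2, 2): the new _validate_y
                                       # returns it unchanged, enabling
                                       # multi-output bagging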
@@ -667,7 +671,10 @@ def predict_proba(self, X):
         """
         check_is_fitted(self, "classes_")
         # Check data
-        X = check_array(X, accept_sparse=['csr', 'csc'])
+        X = check_array(
+            X, accept_sparse=['csr', 'csc'], dtype=None,
+            force_all_finite=False
+        )
 
         if self.n_features_ != X.shape[1]:
             raise ValueError("Number of features of the model must "
@@ -714,7 +721,10 @@ def predict_log_proba(self, X):
check_is_fitted(self, "classes_")
if hasattr(self.base_estimator_, "predict_log_proba"):
# Check data
X = check_array(X, accept_sparse=['csr', 'csc'])
X = check_array(
X, accept_sparse=['csr', 'csc'], dtype=None,
force_all_finite=False
)

if self.n_features_ != X.shape[1]:
raise ValueError("Number of features of the model must "
@@ -769,7 +779,10 @@ def decision_function(self, X):
         check_is_fitted(self, "classes_")
 
         # Check data
-        X = check_array(X, accept_sparse=['csr', 'csc'])
+        X = check_array(
+            X, accept_sparse=['csr', 'csc'], dtype=None,
+            force_all_finite=False
+        )
 
         if self.n_features_ != X.shape[1]:
             raise ValueError("Number of features of the model must "
@@ -945,7 +958,10 @@ def predict(self, X):
         """
        check_is_fitted(self, "estimators_features_")
         # Check data
-        X = check_array(X, accept_sparse=['csr', 'csc'])
+        X = check_array(
+            X, accept_sparse=['csr', 'csc'], dtype=None,
+            force_all_finite=False
+        )
 
         # Parallel loop
         n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators,
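Taken together, these relaxed checks let a bagging meta-estimator defer input validation to its base estimator. A minimal usage sketch, mirroring the tests added below (Imputer being the sklearn 0.19-era imputation transformer):

import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeRegressor

X = np.array([[1, 3, 5], [2, np.nan, 6], [4, 7, 8]])
y = np.array([2.0, 3.0, 3.5])

# The pipeline's Imputer fills the NaN before the tree sees it; before
# this PR, BaggingRegressor.fit itself raised on the non-finite input.
model = BaggingRegressor(make_pipeline(Imputer(), DecisionTreeRegressor()))
print(model.fit(X, y).predict(X))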
@@ -33,6 +33,7 @@
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston, load_iris, make_hastie_10_2
 from sklearn.utils import check_random_state
+from sklearn.preprocessing import Imputer
 
 from scipy.sparse import csc_matrix, csr_matrix

@@ -748,3 +749,99 @@ def test_set_oob_score_label_encoding():
     x3 = BaggingClassifier(oob_score=True,
                            random_state=random_state).fit(X, Y3).oob_score_
     assert_equal([x1, x2], [x3, x3])


+def test_bagging_regressor_with_missing_inputs_and_1d_output():
+    # Check that BaggingRegressor can accept X with missing/infinite data
+    X = [
+        [1, 3, 5],
+        [2, None, 6],
+        [2, np.nan, 6],
+        [2, np.inf, 6],
+        [2, np.NINF, 6],
+    ]
@jnothman (Member), Sep 11, 2017:
I think you want np.nan rather than None?

@jimmywan (Author, Contributor), Sep 12, 2017:
TBH, I generally test against both, as I'm not entirely sure what happens under the covers. I know I can initialize a Pandas DataFrame either way.

@jimmywan (Author, Contributor), Sep 12, 2017:
Added separate imputers/inputs for None/np.nan, np.inf and np.NINF.
+    Y = [2, 3, 3, 3, 3]

@jnothman (Member), Sep 13, 2017:
Lowercase y if 1d, please.

@jimmywan (Author, Contributor), Sep 14, 2017:
Done.

+    regressor = DecisionTreeRegressor()
+    pipeline = make_pipeline(
+        Imputer(),
+        Imputer(missing_values=np.inf),
+        Imputer(missing_values=np.NINF),
+        regressor
+    )
+    pipeline.fit(X, Y).predict(X)
+    bagging_regressor = BaggingRegressor(pipeline)
+    bagging_regressor.fit(X, Y).predict(X)

@jnothman (Member), Sep 13, 2017:
Should check decision_function etc. also, no?

@jimmywan (Author, Contributor), Sep 14, 2017:
There is no decision_function on BaggingRegressor. I'll add it to the BaggingClassifier test.

@jimmywan (Author, Contributor), Sep 14, 2017:
I actually can't add it to the test for BaggingClassifier because DecisionTreeRegressor does not implement decision_function. I don't normally work with classification, and this PR was meant to be a patch for BaggingRegressor. Feel free to add additional tests if you think it's necessary.


+    # Verify that exceptions can be raised by wrapper regressor
+    regressor = DecisionTreeRegressor()
+    pipeline = make_pipeline(regressor)
+    assert_raises(ValueError, pipeline.fit, X, Y)
+    bagging_regressor = BaggingRegressor(pipeline)
+    assert_raises(ValueError, bagging_regressor.fit, X, Y)

@jnothman (Member), Sep 13, 2017:
Might be good to check the message here, but okay.

@jimmywan (Author, Contributor), Sep 14, 2017:
Since this is just a test for BaggingRegressor, I don't really want to couple the behavior of the underlying regressor to this test case. The fact that it raises an error seems sufficient to me.
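For reference, had the message check been added, it might have looked like the following hypothetical sketch, using assert_raises_regex from sklearn.utils.testing; the matched message text is an assumption about check_array's wording, not something taken from this PR.

from sklearn.utils.testing import assert_raises_regex

# Hypothetical stricter variant of the check above: also match the start
# of the validation error message (wording assumed).
assert_raises_regex(ValueError, "Input contains NaN",
                    pipeline.fit, X, Y)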



+def test_bagging_regressor_with_missing_inputs_and_2d_output():
+    # Check that BaggingRegressor can accept X with missing/infinite data
+    X = [
+        [1, 3, 5],
+        [2, None, 6],
+        [2, np.nan, 6],
+        [2, np.inf, 6],
+        [2, np.NINF, 6],
+    ]
+    Y = [
+        [2, 1, 9],
+        [3, 6, 8],
+        [3, 6, 8],
+        [3, 6, 8],
+        [3, 6, 8],
+    ]

@jnothman (Member), Sep 13, 2017:
I'd be okay with this test not dealing with missing data and just being about 2d. I'd also be okay with the previous test just looping over 1d and 2d.

@jimmywan (Author, Contributor), Sep 14, 2017:
Changed, though now I have to reuse y for 1D and 2D...
+    regressor = DecisionTreeRegressor()
+    pipeline = make_pipeline(
+        Imputer(),
+        Imputer(missing_values=np.inf),
+        Imputer(missing_values=np.NINF),
+        regressor
+    )
+    pipeline.fit(X, Y).predict(X)
+    bagging_regressor = BaggingRegressor(pipeline)
+    bagging_regressor.fit(X, Y).predict(X)

@jnothman (Member), Sep 13, 2017:
Please check that the prediction shape matches the input shape.

@jimmywan (Author, Contributor), Sep 14, 2017:
Done.


+    # Verify that exceptions can be raised by wrapper regressor
+    regressor = DecisionTreeRegressor()
+    pipeline = make_pipeline(regressor)
+    assert_raises(ValueError, pipeline.fit, X, Y)
+    bagging_regressor = BaggingRegressor(pipeline)
+    assert_raises(ValueError, bagging_regressor.fit, X, Y)


+def test_bagging_classifier_with_missing_inputs():

@jnothman (Member), Sep 13, 2017:
What does this test add?

@jimmywan (Author, Contributor), Sep 14, 2017:
This tests BaggingClassifier, while the other tests BaggingRegressor.

+    # Check that BaggingClassifier can accept X with missing/infinite data
+    X = [
+        [1, 3, 5],
+        [2, None, 6],
+        [2, np.nan, 6],
+        [2, np.inf, 6],
+        [2, np.NINF, 6],
+    ]
+    Y = [3, 6, 6, 6, 6]
+    classifier = DecisionTreeClassifier()
+    pipeline = make_pipeline(
+        Imputer(),
+        Imputer(missing_values=np.inf),
+        Imputer(missing_values=np.NINF),
+        classifier
+    )
+    pipeline.fit(X, Y).predict(X)
+    bagging_classifier = BaggingClassifier(pipeline)
+    bagging_classifier.fit(X, Y)
+    bagging_classifier.predict(X)
+    bagging_classifier.predict_log_proba(X)
+    bagging_classifier.predict_proba(X)
+
+    # Verify that exceptions can be raised by wrapper classifier
+    classifier = DecisionTreeClassifier()
+    pipeline = make_pipeline(classifier)
+    assert_raises(ValueError, pipeline.fit, X, Y)
+    bagging_classifier = BaggingClassifier(pipeline)
+    assert_raises(ValueError, bagging_classifier.fit, X, Y)