
[MRG+2] Update BaggingRegressor to relax checking X for finite values #9707

Merged · 9 commits · May 31, 2018
@@ -8,23 +8,22 @@
 import itertools
 import numbers
 import numpy as np
-from warnings import warn
 from abc import ABCMeta, abstractmethod
+from warnings import warn
 
+from .base import BaseEnsemble, _partition_estimators
 from ..base import ClassifierMixin, RegressorMixin
 from ..externals.joblib import Parallel, delayed
 from ..externals.six import with_metaclass
 from ..externals.six.moves import zip
 from ..metrics import r2_score, accuracy_score
 from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
 from ..utils import check_random_state, check_X_y, check_array, column_or_1d
-from ..utils.random import sample_without_replacement
-from ..utils.validation import has_fit_parameter, check_is_fitted
 from ..utils import indices_to_mask, check_consistent_length
 from ..utils.metaestimators import if_delegate_has_method
 from ..utils.multiclass import check_classification_targets
-
-from .base import BaseEnsemble, _partition_estimators
+from ..utils.random import sample_without_replacement
+from ..utils.validation import has_fit_parameter, check_is_fitted


__all__ = ["BaggingClassifier",
@@ -280,7 +279,10 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
         random_state = check_random_state(self.random_state)
 
         # Convert data
-        X, y = check_X_y(X, y, ['csr', 'csc'])
+        X, y = check_X_y(

jnothman (Member) commented on Sep 11, 2017:
Perhaps add a comment that we require X to be 2d and indexable on both axes.

jimmywan (Author, Contributor) replied on Sep 12, 2017:
Added

+            X, y, ['csr', 'csc'], dtype=None, force_all_finite=False,
+            multi_output=True
+        )
         if sample_weight is not None:
             sample_weight = check_array(sample_weight, ensure_2d=False)
             check_consistent_length(y, sample_weight)
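
For context, a minimal sketch (not part of the diff) of what the relaxed validation changes: by default check_X_y rejects NaN/inf in X, while the new arguments let non-finite values and non-numeric dtypes pass through so the wrapped estimator (e.g. a pipeline that starts with an imputer) can handle them itself. The force_all_finite name matches the scikit-learn releases this PR targets; much later releases rename it to ensure_all_finite.

    import numpy as np
    from sklearn.utils import check_X_y

    X = np.array([[1.0, 3.0, 5.0],
                  [2.0, np.nan, 6.0]])
    y = np.array([3.0, 6.0])

    # Default validation: raises ValueError ("Input contains NaN ...").
    try:
        check_X_y(X, y, accept_sparse=['csr', 'csc'])
    except ValueError as exc:
        print(exc)

    # Relaxed validation as used by BaseBagging._fit after this change:
    # NaN/inf and non-numeric dtypes are left for the base estimator to handle.
    X_checked, y_checked = check_X_y(
        X, y, accept_sparse=['csr', 'csc'], dtype=None,
        force_all_finite=False, multi_output=True
    )
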
@@ -390,8 +392,10 @@ def _set_oob_score(self, X, y):
"""Calculate out of bag predictions and score."""

def _validate_y(self, y):
# Default implementation
return column_or_1d(y, warn=True)
if len(y.shape) == 1 or y.shape[1] == 1:
return column_or_1d(y, warn=True)
else:
return y

def _get_estimators_indices(self):
# Get drawn indices along both sample and feature axes
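
A small illustrative sketch (not from the diff) of why the branch is needed: column_or_1d flattens a single-column y but raises on genuinely multi-output targets, which the old default implementation therefore could not accept.

    import numpy as np
    from sklearn.utils import column_or_1d

    y_col = np.array([[3.0], [6.0]])
    column_or_1d(y_col, warn=True)      # shape (2, 1) -> shape (2,), with a DataConversionWarning

    y_multi = np.array([[2, 1, 9],
                        [3, 6, 8]])
    # column_or_1d(y_multi) raises ValueError, so _validate_y now returns
    # multi-output targets unchanged instead of failing.
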
@@ -667,7 +671,10 @@ def predict_proba(self, X):
"""
check_is_fitted(self, "classes_")
# Check data
X = check_array(X, accept_sparse=['csr', 'csc'])
X = check_array(
X, accept_sparse=['csr', 'csc'], dtype=None,
force_all_finite=False
)

if self.n_features_ != X.shape[1]:
raise ValueError("Number of features of the model must "
@@ -714,7 +721,10 @@ def predict_log_proba(self, X):
         check_is_fitted(self, "classes_")
         if hasattr(self.base_estimator_, "predict_log_proba"):
             # Check data
-            X = check_array(X, accept_sparse=['csr', 'csc'])
+            X = check_array(
+                X, accept_sparse=['csr', 'csc'], dtype=None,
+                force_all_finite=False
+            )
 
             if self.n_features_ != X.shape[1]:
                 raise ValueError("Number of features of the model must "
@@ -769,7 +779,10 @@ def decision_function(self, X):
         check_is_fitted(self, "classes_")
 
         # Check data
-        X = check_array(X, accept_sparse=['csr', 'csc'])
+        X = check_array(
+            X, accept_sparse=['csr', 'csc'], dtype=None,
+            force_all_finite=False
+        )
 
         if self.n_features_ != X.shape[1]:
             raise ValueError("Number of features of the model must "
@@ -945,7 +958,10 @@ def predict(self, X):
"""
check_is_fitted(self, "estimators_features_")
# Check data
X = check_array(X, accept_sparse=['csr', 'csc'])
X = check_array(
X, accept_sparse=['csr', 'csc'], dtype=None,
force_all_finite=False
)

# Parallel loop
n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators,
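
Taken together with the _fit change above, the relaxed check_array calls mean neither fit-time nor predict-time validation insists on finite values any more. A rough sketch of the end-to-end effect (illustrative only; SimpleImputer lives in sklearn.impute in later releases, whereas this PR's tests use the then-current sklearn.preprocessing.Imputer):

    import numpy as np
    from sklearn.ensemble import BaggingRegressor
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import make_pipeline
    from sklearn.tree import DecisionTreeRegressor

    X = np.array([[1.0, 3.0, 5.0],
                  [2.0, np.nan, 6.0],
                  [2.0, 1.0, 9.0],
                  [3.0, 6.0, 8.0]])
    y = np.array([3.0, 6.0, 2.0, 5.0])

    model = BaggingRegressor(
        make_pipeline(SimpleImputer(), DecisionTreeRegressor()),
        random_state=0,
    )
    # Before this PR, fit() and predict() both rejected the NaN in X up front;
    # now the pipeline's imputer gets to deal with it.
    model.fit(X, y).predict(X)
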
@@ -33,6 +33,7 @@
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston, load_iris, make_hastie_10_2
 from sklearn.utils import check_random_state
+from sklearn.preprocessing import Imputer
 
 from scipy.sparse import csc_matrix, csr_matrix

@@ -748,3 +749,51 @@ def test_set_oob_score_label_encoding():
     x3 = BaggingClassifier(oob_score=True,
                            random_state=random_state).fit(X, Y3).oob_score_
     assert_equal([x1, x2], [x3, x3])
+
+
+def test_bagging_pipeline_with_sparse_inputs():
+    # Check that BaggingRegressor can accept sparse pipeline inputs

jnothman (Member) commented on Sep 11, 2017:
What you're testing with is not what we call sparse. I presume sparse inputs are already tested too. Do you mean to test nans and multioutput (which probably belongs in a separate test function)?

jimmywan (Author, Contributor) replied on Sep 12, 2017:
Renamed "sparse" to "missing"
I created separate tests for single and multioutput BaggingRegressors.
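
For readers skimming the thread, the distinction being drawn (illustrative, not from the PR): in scikit-learn, "sparse" refers to scipy.sparse matrices, a storage format for mostly-zero data, whereas the data in this test is dense with missing values.

    import numpy as np
    from scipy.sparse import csr_matrix

    X_sparse = csr_matrix([[1, 0, 5],
                           [2, 0, 6]])           # sparse: mostly zeros, no NaN
    X_missing = np.array([[1.0, 3.0, 5.0],
                          [2.0, np.nan, 6.0]])   # dense, but with a missing value
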

+    X = [
+        [1, 3, 5],
+        [2, None, 6],

jnothman (Member) commented on Sep 11, 2017:
I think you want np.nan rather than None?

jimmywan (Author, Contributor) replied on Sep 12, 2017:
TBH, I generally test against both as I'm not entirely sure what happens under the covers. I know I can initialize a Pandas DataFrame either way.

jimmywan (Author, Contributor) added on Sep 12, 2017:
Added separate imputers/inputs for None/np.nan, np.inf, np.NINF.
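
The None-versus-np.nan question has a concrete answer in NumPy (small sketch, not part of the PR): None turns the array into an object array unless a float dtype is forced, while np.nan keeps it floating point, which is what the finiteness checks and imputers actually operate on.

    import numpy as np

    np.array([[2, None, 6]]).dtype           # dtype('O'): object array
    np.array([[2, None, 6]], dtype=float)    # None is coerced to nan
    np.array([[2, np.nan, 6]]).dtype         # dtype('float64')
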

+    ]
+    Y = [

jnothman (Member) commented on Sep 13, 2017:
I'd be okay with this test not dealing with missing data, and just being about 2d. I'd also be okay with the previous test just looping over 1d, 2d.

jimmywan (Author, Contributor) replied on Sep 14, 2017:
Changed, though now I have to reuse y for 1D and 2D...

+        [2, 1, 9],
+        [3, 6, 8],
+    ]
+    regressor = DecisionTreeRegressor()
+    pipeline = make_pipeline(Imputer(), regressor)
+    pipeline.fit(X, Y).predict(X)
+    bagging_regressor = BaggingRegressor(pipeline)
+    bagging_regressor.fit(X, Y).predict(X)

jnothman (Member) commented on Sep 13, 2017:
please check that the prediction shape matches the input shape

jimmywan (Author, Contributor) replied on Sep 14, 2017:
Done.


+    # Also verify that it fails on bad pipelines
+    regressor = DecisionTreeRegressor()
+    pipeline = make_pipeline(regressor)
+    assert_raises(ValueError, pipeline.fit, X, Y)
+    bagging_regressor = BaggingRegressor(pipeline)
+    assert_raises(ValueError, bagging_regressor.fit, X, Y)
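
The request above was to check that the prediction shape matches the input; the assertion is not visible in this revision of the diff, but a minimal version (purely illustrative, reusing the names from the test above) might look like:

    import numpy as np

    good_pipeline = make_pipeline(Imputer(), DecisionTreeRegressor())
    y_hat = BaggingRegressor(good_pipeline).fit(X, Y).predict(X)
    # One row per sample, one column per output: (2, 3) for the Y above.
    assert y_hat.shape == np.asarray(Y).shape
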


+def test_bagging_classifier_with_sparse_inputs():
+    # Check that BaggingClassifier can accept sparse pipeline inputs
+    X = [
+        [1, 3, 5],
+        [2, None, 6],
+    ]
+    Y = [3, 6]
+    classifier = DecisionTreeClassifier()
+    pipeline = make_pipeline(Imputer(), classifier)
+    pipeline.fit(X, Y).predict(X)
+    bagging_classifier = BaggingClassifier(pipeline)
+    bagging_classifier.fit(X, Y)
+    bagging_classifier.predict(X)
+    bagging_classifier.predict_log_proba(X)
+    bagging_classifier.predict_proba(X)
+
+    # Also verify that it fails on bad pipelines
+    classifier = DecisionTreeClassifier()
+    pipeline = make_pipeline(classifier)
+    assert_raises(ValueError, pipeline.fit, X, Y)
+    bagging_classifier = BaggingClassifier(pipeline)
+    assert_raises(ValueError, bagging_classifier.fit, X, Y)
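
As a quick follow-up usage note (illustrative, not part of the diff): the classifier path exercised here keeps the usual (n_samples, n_classes) probability shape even though the missing value is only handled inside the wrapped pipeline.

    good_pipeline = make_pipeline(Imputer(), DecisionTreeClassifier())
    proba = BaggingClassifier(good_pipeline).fit(X, Y).predict_proba(X)
    assert proba.shape == (2, 2)     # 2 samples x 2 classes ({3, 6})
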