diff --git a/doc/api.rst b/doc/api.rst
index 7254eaeb0..15ac28d5d 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -109,6 +109,7 @@ Prototype selection
    :template: class.rst
 
    ensemble.BalanceCascade
+   ensemble.BalancedBaggingClassifier
    ensemble.EasyEnsemble
 
diff --git a/doc/ensemble.rst b/doc/ensemble.rst
index 01846e039..fad737cd1 100644
--- a/doc/ensemble.rst
+++ b/doc/ensemble.rst
@@ -6,6 +6,11 @@ Ensemble of samplers
 
 .. currentmodule:: imblearn.ensemble
 
+.. _ensemble_samplers:
+
+Samplers
+--------
+
 An imbalanced data set can be balanced by creating several balanced subsets.
 The module :mod:`imblearn.ensemble` allows to create such sets.
 
@@ -54,3 +59,54 @@ parameter ``n_max_subset`` and an additional bootstraping can be activated with
 See
 :ref:`sphx_glr_auto_examples_ensemble_plot_easy_ensemble.py` and
 :ref:`sphx_glr_auto_examples_ensemble_plot_balance_cascade.py`.
+
+.. _ensemble_meta_estimators:
+
+Chaining ensemble of samplers and estimators
+--------------------------------------------
+
+In ensemble classifiers, bagging methods build several estimators on
+different randomly selected subsets of data. In scikit-learn, this classifier
+is named ``BaggingClassifier``. However, this classifier does not balance
+each subset of data. Therefore, when trained on an imbalanced data set, this
+classifier will favor the majority classes::
+
+  >>> from sklearn.model_selection import train_test_split
+  >>> from sklearn.metrics import confusion_matrix
+  >>> from sklearn.ensemble import BaggingClassifier
+  >>> from sklearn.tree import DecisionTreeClassifier
+  >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+  >>> bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
+  ...                        random_state=0)
+  >>> bc.fit(X_train, y_train) #doctest: +ELLIPSIS
+  BaggingClassifier(...)
+  >>> y_pred = bc.predict(X_test)
+  >>> confusion_matrix(y_test, y_pred)
+  array([[   0,    0,   12],
+         [   0,    0,   59],
+         [   0,    0, 1179]])
+
+:class:`BalancedBaggingClassifier` resamples each subset of data before
+training each estimator of the ensemble. In short, it combines the output of
+an :class:`EasyEnsemble` sampler with an ensemble of classifiers (i.e.
+``BaggingClassifier``). Therefore, :class:`BalancedBaggingClassifier` takes
+the same parameters as the scikit-learn ``BaggingClassifier``. In addition,
+it accepts two extra parameters, ``ratio`` and ``replacement``, as in the
+:class:`EasyEnsemble` sampler::
+
+
+  >>> from imblearn.ensemble import BalancedBaggingClassifier
+  >>> bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
+  ...                                 ratio='auto',
+  ...                                 replacement=False,
+  ...                                 random_state=0)
+  >>> bbc.fit(X, y) # doctest: +ELLIPSIS
+  BalancedBaggingClassifier(...)
+  >>> y_pred = bbc.predict(X_test)
+  >>> confusion_matrix(y_test, y_pred)
+  array([[  12,    0,    0],
+         [   0,   55,    4],
+         [  68,   53, 1058]])
+
+See
+:ref:`sphx_glr_auto_examples_ensemble_plot_comparison_bagging_classifier.py`.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 3852051ed..fb6901d75 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -53,6 +53,10 @@ New features
 Enhancement
 ~~~~~~~~~~~
 
+- Add :class:`ensemble.BalancedBaggingClassifier`, a meta-estimator which
+  directly chains the resampling of :class:`ensemble.EasyEnsemble` with a
+  classifier. By `Guillaume Lemaitre`_.
+
 - All samplers accepts sparse matrices with defaulting on CSR type. By
   `Guillaume Lemaitre`_.
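Under the hood, :class:`BalancedBaggingClassifier` implements the balancing by
wrapping the base estimator into an ``imblearn.pipeline.Pipeline`` together
with a ``RandomUnderSampler`` (see ``_validate_estimator`` in
``imblearn/ensemble/classifier.py`` below). A minimal sketch of that rough
equivalence using only the existing public classes; the ``make_classification``
toy data set and the variable names are purely illustrative::

  >>> from sklearn.datasets import make_classification
  >>> from sklearn.ensemble import BaggingClassifier
  >>> from sklearn.tree import DecisionTreeClassifier
  >>> from imblearn.pipeline import Pipeline
  >>> from imblearn.under_sampling import RandomUnderSampler
  >>> X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
  ...                            random_state=0)
  >>> # each bootstrap subset drawn by the bagging ensemble is first
  >>> # under-sampled and then used to fit the tree; this is the behaviour
  >>> # that BalancedBaggingClassifier automates
  >>> resampled_tree = Pipeline([
  ...     ('sampler', RandomUnderSampler(ratio='auto', replacement=False)),
  ...     ('classifier', DecisionTreeClassifier())])
  >>> bc = BaggingClassifier(base_estimator=resampled_tree, random_state=0)
  >>> bc.fit(X, y)  # doctest: +ELLIPSIS
  BaggingClassifier(...)

The sketch does not reproduce the bookkeeping performed by the monkey-patched
``_generate_bagging_indices`` below, which stores ``estimators_samples_`` as
boolean masks.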
diff --git a/examples/ensemble/plot_comparison_bagging_classifier.py b/examples/ensemble/plot_comparison_bagging_classifier.py
new file mode 100644
index 000000000..62176c0a0
--- /dev/null
+++ b/examples/ensemble/plot_comparison_bagging_classifier.py
@@ -0,0 +1,104 @@
+"""
+=========================================================
+Comparison of balanced and imbalanced bagging classifiers
+=========================================================
+
+This example shows the benefit of balancing the training set when using a
+bagging classifier. ``BalancedBaggingClassifier`` chains a
+``RandomUnderSampler`` and a given classifier while ``BaggingClassifier``
+uses the imbalanced data directly.
+
+Balancing the data set before training the classifier improves the
+classification performance. In addition, it prevents the ensemble from
+focusing on the majority class, which is a known drawback of decision tree
+classifiers.
+
+"""
+
+# Authors: Guillaume Lemaitre
+# License: MIT
+
+from collections import Counter
+import itertools
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import BaggingClassifier
+from sklearn.metrics import confusion_matrix
+
+from imblearn.datasets import make_imbalance
+from imblearn.ensemble import BalancedBaggingClassifier
+
+from imblearn.metrics import classification_report_imbalanced
+
+
+def plot_confusion_matrix(cm, classes,
+                          normalize=False,
+                          title='Confusion matrix',
+                          cmap=plt.cm.Blues):
+    """
+    This function prints and plots the confusion matrix.
+    Normalization can be applied by setting `normalize=True`.
+    """
+    if normalize:
+        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        print("Normalized confusion matrix")
+    else:
+        print('Confusion matrix, without normalization')
+
+    print(cm)
+
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+    plt.title(title)
+    plt.colorbar()
+    tick_marks = np.arange(len(classes))
+    plt.xticks(tick_marks, classes, rotation=45)
+    plt.yticks(tick_marks, classes)
+
+    fmt = '.2f' if normalize else 'd'
+    thresh = cm.max() / 2.
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.tight_layout() + plt.ylabel('True label') + plt.xlabel('Predicted label') + + +iris = load_iris() +X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 40, 2: 50}, + random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +bagging = BaggingClassifier(random_state=0) +balanced_bagging = BalancedBaggingClassifier(random_state=0) + +print('Class distribution of the training set: {}'.format(Counter(y_train))) + +bagging.fit(X_train, y_train) +balanced_bagging.fit(X_train, y_train) + +print('Class distribution of the test set: {}'.format(Counter(y_test))) + +print('Classification results using a bagging classifier on imbalanced data') +y_pred_bagging = bagging.predict(X_test) +print(classification_report_imbalanced(y_test, y_pred_bagging)) +cm_bagging = confusion_matrix(y_test, y_pred_bagging) +plt.figure() +plot_confusion_matrix(cm_bagging, classes=iris.target_names, + title='Confusion matrix using BaggingClassifier') + +print('Classification results using a bagging classifier on balanced data') +y_pred_balanced_bagging = balanced_bagging.predict(X_test) +print(classification_report_imbalanced(y_test, y_pred_balanced_bagging)) +cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging) +plt.figure() +plot_confusion_matrix(cm_balanced_bagging, classes=iris.target_names, + title='Confusion matrix using BalancedBaggingClassifier') + +plt.show() diff --git a/imblearn/ensemble/__init__.py b/imblearn/ensemble/__init__.py index 6c17409e5..35cbd24eb 100644 --- a/imblearn/ensemble/__init__.py +++ b/imblearn/ensemble/__init__.py @@ -6,4 +6,6 @@ from .easy_ensemble import EasyEnsemble from .balance_cascade import BalanceCascade -__all__ = ['EasyEnsemble', 'BalanceCascade'] +from .classifier import BalancedBaggingClassifier + +__all__ = ['EasyEnsemble', 'BalancedBaggingClassifier', 'BalanceCascade'] diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 20276a79a..abd9be7f4 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -27,7 +27,7 @@ class BalanceCascade(BaseEnsembleSampler): This method iteratively select subset and make an ensemble of the different sets. The selection is performed using a specific classifier. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. 
Parameters ---------- @@ -99,7 +99,7 @@ class BalanceCascade(BaseEnsembleSampler): See also -------- - EasyEnsemble + BalancedBaggingClassifier, EasyEnsemble References ---------- diff --git a/imblearn/ensemble/classifier.py b/imblearn/ensemble/classifier.py new file mode 100644 index 000000000..81d528ac9 --- /dev/null +++ b/imblearn/ensemble/classifier.py @@ -0,0 +1,271 @@ +"""Ensemble predictors combining a sampler and a classifier.""" + +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + +import numbers + +import numpy as np + +import sklearn +from sklearn.base import clone +from sklearn.ensemble import BaggingClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble.bagging import _generate_bagging_indices +from sklearn.utils import indices_to_mask + +from ..pipeline import Pipeline +from ..under_sampling import RandomUnderSampler + +old_generate = _generate_bagging_indices + + +def _masked_bagging_indices(random_state, bootstrap_features, + bootstrap_samples, n_features, n_samples, + max_features, max_samples): + """Monkey-patch to always get a mask instead of indices""" + feature_indices, sample_indices = old_generate(random_state, + bootstrap_features, + bootstrap_samples, + n_features, n_samples, + max_features, max_samples) + sample_indices = indices_to_mask(sample_indices, n_samples) + + return feature_indices, sample_indices + + +sklearn.ensemble.bagging._generate_bagging_indices = _masked_bagging_indices + + +class BalancedBaggingClassifier(BaggingClassifier): + """A Bagging classifier with additional balancing. + + This implementation of Bagging is similar to the scikit-learn + implementation. It includes an additional step to balance the training set + at fit time using a ``RandomUnderSampler``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + base_estimator : object or None, optional (default=None) + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a decision tree. + + n_estimators : int, optional (default=10) + The number of base estimators in the ensemble. + + max_samples : int or float, optional (default=1.0) + The number of samples to draw from X to train each base estimator. + + - If int, then draw ``max_samples`` samples. + - If float, then draw ``max_samples * X.shape[0]`` samples. + + max_features : int or float, optional (default=1.0) + The number of features to draw from X to train each base estimator. + + - If int, then draw ``max_features`` features. + - If float, then draw ``max_features * X.shape[1]`` features. + + bootstrap : boolean, optional (default=True) + Whether samples are drawn with replacement. + + bootstrap_features : boolean, optional (default=False) + Whether features are drawn with replacement. + + oob_score : bool + Whether to use out-of-bag samples to estimate + the generalization error. + + warm_start : bool, optional (default=False) + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. + + .. versionadded:: 0.17 + *warm_start* constructor parameter. + + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. 
+
+        - If ``str``, has to be one of: (i) ``'minority'``: resample the
+          minority class; (ii) ``'majority'``: resample the majority class,
+          (iii) ``'not minority'``: resample all classes apart from the
+          minority class, (iv) ``'all'``: resample all classes, and (v)
+          ``'auto'``: corresponds to ``'all'`` for over-sampling methods and
+          ``'not minority'`` for under-sampling methods. The classes targeted
+          will be over-sampled or under-sampled to achieve an equal number of
+          samples as the majority or minority class.
+        - If ``dict``, the keys correspond to the targeted classes. The values
+          correspond to the desired number of samples.
+        - If callable, a function taking ``y`` and returning a ``dict``. The
+          keys correspond to the targeted classes. The values correspond to
+          the desired number of samples.
+
+    replacement : bool, optional (default=False)
+        Whether to sample randomly with or without replacement.
+
+    n_jobs : int, optional (default=1)
+        The number of jobs to run in parallel for both `fit` and `predict`.
+        If -1, then the number of jobs is set to the number of cores.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        - If int, ``random_state`` is the seed used by the random number
+          generator;
+        - If ``RandomState`` instance, random_state is the random
+          number generator;
+        - If ``None``, the random number generator is the
+          ``RandomState`` instance used by ``np.random``.
+
+    verbose : int, optional (default=0)
+        Controls the verbosity of the building process.
+
+    Attributes
+    ----------
+    base_estimator_ : estimator
+        The base estimator from which the ensemble is grown.
+
+    estimators_ : list of estimators
+        The collection of fitted base estimators.
+
+    estimators_samples_ : list of arrays
+        The subset of drawn samples (i.e., the in-bag samples) for each base
+        estimator. Each subset is defined by a boolean mask.
+
+    estimators_features_ : list of arrays
+        The subset of drawn features for each base estimator.
+
+    classes_ : array of shape = [n_classes]
+        The class labels.
+
+    n_classes_ : int or list
+        The number of classes.
+
+    oob_score_ : float
+        Score of the training dataset obtained using an out-of-bag estimate.
+
+    oob_decision_function_ : array of shape = [n_samples, n_classes]
+        Decision function computed with out-of-bag estimate on the training
+        set. If n_estimators is small, it might be possible that a data point
+        was never left out during the bootstrap. In this case,
+        ``oob_decision_function_`` might contain NaN.
+
+    Notes
+    -----
+
+    See
+    :ref:`sphx_glr_auto_examples_ensemble_plot_comparison_bagging_classifier.py`.
+
+    See also
+    --------
+    BalanceCascade, EasyEnsemble
+
+    References
+    ----------
+    .. [1] L. Breiman, "Pasting small votes for classification in large
+       databases and on-line", Machine Learning, 36(1), 85-103, 1999.
+    .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140,
+       1996.
+    .. [3] T. Ho, "The random subspace method for constructing decision
+       forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844,
+       1998.
+    .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
+       Learning and Knowledge Discovery in Databases, 346-361, 2012.
+ + Examples + -------- + + >>> from collections import Counter + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.metrics import confusion_matrix + >>> from imblearn.ensemble import \ +BalancedBaggingClassifier # doctest: +NORMALIZE_WHITESPACE + >>> X, y = make_classification(n_classes=2, class_sep=2, + ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, + ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) + >>> print('Original dataset shape {}'.format(Counter(y))) + Original dataset shape Counter({1: 900, 0: 100}) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... random_state=0) + >>> bbc = BalancedBaggingClassifier(random_state=42) + >>> bbc.fit(X_train, y_train) # doctest: +ELLIPSIS + BalancedBaggingClassifier(...) + >>> y_pred = bbc.predict(X_test) + >>> print(confusion_matrix(y_test, y_pred)) + [[ 23 0] + [ 2 225]] + + """ + def __init__(self, + base_estimator=None, + n_estimators=10, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + ratio='auto', + replacement=False, + n_jobs=1, + random_state=None, + verbose=0): + + super(BaggingClassifier, self).__init__( + base_estimator, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + oob_score=oob_score, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose) + self.ratio = ratio + self.replacement = replacement + + def _validate_estimator(self, default=DecisionTreeClassifier()): + """Check the estimator and the n_estimator attribute, set the + `base_estimator_` attribute.""" + if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): + raise ValueError("n_estimators must be an integer, " + "got {0}.".format(type(self.n_estimators))) + + if self.n_estimators <= 0: + raise ValueError("n_estimators must be greater than zero, " + "got {0}.".format(self.n_estimators)) + + if self.base_estimator is not None: + base_estimator = clone(self.base_estimator) + else: + base_estimator = clone(default) + + self.base_estimator_ = Pipeline( + [('sampler', RandomUnderSampler(ratio=self.ratio, + replacement=self.replacement)), + ('classifier', base_estimator)]) + + def fit(self, X, y): + """Build a Bagging ensemble of estimators from the training + set (X, y). + + Parameters + ---------- + X : array-like of shape = [n_samples, n_features] + The training input samples. + + y : array-like, shape = [n_samples] + The target values. + + Returns + ------- + self : object + Returns self. + """ + # RandomUnderSampler is not supporting sample_weight. We need to pass + # None. + return self._fit(X, y, self.max_samples, sample_weight=None) diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index 5fc018167..a10837034 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -20,7 +20,7 @@ class EasyEnsemble(BaseEnsembleSampler): This method iteratively select a random subset and make an ensemble of the different sets. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. 
Parameters ---------- @@ -67,7 +67,7 @@ class EasyEnsemble(BaseEnsembleSampler): See also -------- - BalanceCascade + BalanceCascade, BalancedBaggingClassifier References ---------- diff --git a/imblearn/ensemble/tests/test_classifier.py b/imblearn/ensemble/tests/test_classifier.py new file mode 100644 index 000000000..a52f09eb6 --- /dev/null +++ b/imblearn/ensemble/tests/test_classifier.py @@ -0,0 +1,451 @@ +"""Test the module ensemble classifiers.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + +import numpy as np + +from sklearn.datasets import load_iris, make_hastie_10_2 +from sklearn.model_selection import (GridSearchCV, ParameterGrid, + train_test_split) +from sklearn.dummy import DummyClassifier +from sklearn.linear_model import Perceptron, LogisticRegression +from sklearn.tree import DecisionTreeClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.feature_selection import SelectKBest +from sklearn.utils.testing import (assert_array_equal, + assert_array_almost_equal, + assert_raises, + assert_warns, + assert_warns_message) + +from imblearn.datasets import make_imbalance +from imblearn.ensemble import BalancedBaggingClassifier +from imblearn.pipeline import make_pipeline +from imblearn.under_sampling import RandomUnderSampler + +iris = load_iris() + + +def test_balanced_bagging_classifier(): + # Check classification for various parameter settings. + X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=0) + grid = ParameterGrid({"max_samples": [0.5, 1.0], + "max_features": [1, 2, 4], + "bootstrap": [True, False], + "bootstrap_features": [True, False]}) + + for base_estimator in [None, + DummyClassifier(), + Perceptron(), + DecisionTreeClassifier(), + KNeighborsClassifier(), + SVC()]: + for params in grid: + BalancedBaggingClassifier( + base_estimator=base_estimator, + random_state=0, + **params).fit(X_train, y_train).predict(X_test) + + +def test_bootstrap_samples(): + # Test that bootstrapping samples generate non-perfect base estimators. + X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=0) + + base_estimator = DecisionTreeClassifier().fit(X_train, y_train) + + # without bootstrap, all trees are perfect on the training set + # disable the resampling by passing an empty dictionary. + ensemble = BalancedBaggingClassifier( + base_estimator=DecisionTreeClassifier(), + max_samples=1.0, + bootstrap=False, + n_estimators=10, + ratio={}, + random_state=0).fit(X_train, y_train) + + assert (ensemble.score(X_train, y_train) == + base_estimator.score(X_train, y_train)) + + # with bootstrap, trees are no longer perfect on the training set + ensemble = BalancedBaggingClassifier( + base_estimator=DecisionTreeClassifier(), + max_samples=1.0, + bootstrap=True, + random_state=0).fit(X_train, y_train) + + assert (ensemble.score(X_train, y_train) < + base_estimator.score(X_train, y_train)) + + +def test_bootstrap_features(): + # Test that bootstrapping features may generate duplicate features. 
+ X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=0) + + ensemble = BalancedBaggingClassifier( + base_estimator=DecisionTreeClassifier(), + max_features=1.0, + bootstrap_features=False, + random_state=0).fit(X_train, y_train) + + for features in ensemble.estimators_features_: + assert np.unique(features).shape[0] == X.shape[1] + + ensemble = BalancedBaggingClassifier( + base_estimator=DecisionTreeClassifier(), + max_features=1.0, + bootstrap_features=True, + random_state=0).fit(X_train, y_train) + + unique_features = [np.unique(features).shape[0] + for features in ensemble.estimators_features_] + assert np.median(unique_features) < X.shape[1] + + +def test_probability(): + # Predict probabilities. + X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=0) + + with np.errstate(divide="ignore", invalid="ignore"): + # Normal case + ensemble = BalancedBaggingClassifier( + base_estimator=DecisionTreeClassifier(), + random_state=0).fit(X_train, y_train) + + assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test), + axis=1), + np.ones(len(X_test))) + + assert_array_almost_equal(ensemble.predict_proba(X_test), + np.exp(ensemble.predict_log_proba(X_test))) + + # Degenerate case, where some classes are missing + ensemble = BalancedBaggingClassifier( + base_estimator=LogisticRegression(), + random_state=0, + max_samples=5).fit(X_train, y_train) + + assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test), + axis=1), + np.ones(len(X_test))) + + assert_array_almost_equal(ensemble.predict_proba(X_test), + np.exp(ensemble.predict_log_proba(X_test))) + + +def test_oob_score_classification(): + # Check that oob prediction is a good estimation of the generalization + # error. + X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=0) + + for base_estimator in [DecisionTreeClassifier(), SVC()]: + clf = BalancedBaggingClassifier( + base_estimator=base_estimator, + n_estimators=100, + bootstrap=True, + oob_score=True, + random_state=0).fit(X_train, y_train) + + test_score = clf.score(X_test, y_test) + + assert abs(test_score - clf.oob_score_) < 0.1 + + # Test with few estimators + assert_warns(UserWarning, + BalancedBaggingClassifier( + base_estimator=base_estimator, + n_estimators=1, + bootstrap=True, + oob_score=True, + random_state=0).fit, + X_train, + y_train) + + +def test_single_estimator(): + # Check singleton ensembles. + X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=0) + + clf1 = BalancedBaggingClassifier( + base_estimator=KNeighborsClassifier(), + n_estimators=1, + bootstrap=False, + bootstrap_features=False, + random_state=0).fit(X_train, y_train) + + clf2 = make_pipeline(RandomUnderSampler( + random_state=clf1.estimators_[0].steps[0][1].random_state), + KNeighborsClassifier()).fit(X_train, y_train) + + assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) + + +def test_error(): + # Test that it gives proper exception on deficient input. 
+ X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}) + base = DecisionTreeClassifier() + + # Test n_estimators + assert_raises(ValueError, + BalancedBaggingClassifier(base, n_estimators=1.5).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, n_estimators=-1).fit, X, y) + + # Test max_samples + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_samples=-1).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_samples=0.0).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_samples=2.0).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_samples=1000).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_samples="foobar").fit, + X, y) + + # Test max_features + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_features=-1).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_features=0.0).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_features=2.0).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_features=5).fit, X, y) + assert_raises(ValueError, + BalancedBaggingClassifier(base, max_features="foobar").fit, + X, y) + + # Test support of decision_function + assert not (hasattr(BalancedBaggingClassifier(base).fit(X, y), + 'decision_function')) + + +def test_gridsearch(): + # Check that bagging ensembles can be grid-searched. + # Transform iris into a binary classification task + X, y = iris.data, iris.target.copy() + y[y == 2] = 1 + + # Grid search with scoring based on decision_function + parameters = {'n_estimators': (1, 2), + 'base_estimator__C': (1, 2)} + + GridSearchCV(BalancedBaggingClassifier(SVC()), + parameters, + scoring="roc_auc").fit(X, y) + + +def test_base_estimator(): + # Check base_estimator and its default values. + X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=0) + + ensemble = BalancedBaggingClassifier(None, + n_jobs=3, + random_state=0).fit(X_train, y_train) + + assert isinstance(ensemble.base_estimator_.steps[-1][1], + DecisionTreeClassifier) + + ensemble = BalancedBaggingClassifier(DecisionTreeClassifier(), + n_jobs=3, + random_state=0).fit(X_train, y_train) + + assert isinstance(ensemble.base_estimator_.steps[-1][1], + DecisionTreeClassifier) + + ensemble = BalancedBaggingClassifier(Perceptron(), + n_jobs=3, + random_state=0).fit(X_train, y_train) + + assert isinstance(ensemble.base_estimator_.steps[-1][1], + Perceptron) + + +def test_bagging_with_pipeline(): + X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, + random_state=0) + estimator = BalancedBaggingClassifier( + make_pipeline(SelectKBest(k=1), + DecisionTreeClassifier()), + max_features=2) + estimator.fit(X, y).predict(X) + + +def test_warm_start(random_state=42): + # Test if fitting incrementally with warm start gives a forest of the + # right size and the same results as a normal fit. 
+ X, y = make_hastie_10_2(n_samples=20, random_state=1) + + clf_ws = None + for n_estimators in [5, 10]: + if clf_ws is None: + clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators, + random_state=random_state, + warm_start=True) + else: + clf_ws.set_params(n_estimators=n_estimators) + clf_ws.fit(X, y) + assert len(clf_ws) == n_estimators + + clf_no_ws = BalancedBaggingClassifier(n_estimators=10, + random_state=random_state, + warm_start=False) + clf_no_ws.fit(X, y) + + assert (set([pipe.steps[-1][1].random_state for pipe in clf_ws]) == + set([pipe.steps[-1][1].random_state for pipe in clf_no_ws])) + + +def test_warm_start_smaller_n_estimators(): + # Test if warm start'ed second fit with smaller n_estimators raises error. + X, y = make_hastie_10_2(n_samples=20, random_state=1) + clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True) + clf.fit(X, y) + clf.set_params(n_estimators=4) + assert_raises(ValueError, clf.fit, X, y) + + +def test_warm_start_equal_n_estimators(): + # Test that nothing happens when fitting without increasing n_estimators + X, y = make_hastie_10_2(n_samples=20, random_state=1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) + + clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True, + random_state=83) + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + # modify X to nonsense values, this should not change anything + X_train += 1. + + assert_warns_message(UserWarning, + "Warm-start fitting without increasing n_estimators" + " does not", clf.fit, X_train, y_train) + assert_array_equal(y_pred, clf.predict(X_test)) + + +def test_warm_start_equivalence(): + # warm started classifier with 5+5 estimators should be equivalent to + # one classifier with 10 estimators + X, y = make_hastie_10_2(n_samples=20, random_state=1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) + + clf_ws = BalancedBaggingClassifier(n_estimators=5, warm_start=True, + random_state=3141) + clf_ws.fit(X_train, y_train) + clf_ws.set_params(n_estimators=10) + clf_ws.fit(X_train, y_train) + y1 = clf_ws.predict(X_test) + + clf = BalancedBaggingClassifier(n_estimators=10, warm_start=False, + random_state=3141) + clf.fit(X_train, y_train) + y2 = clf.predict(X_test) + + assert_array_almost_equal(y1, y2) + + +def test_warm_start_with_oob_score_fails(): + # Check using oob_score and warm_start simultaneously fails + X, y = make_hastie_10_2(n_samples=20, random_state=1) + clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True, + oob_score=True) + assert_raises(ValueError, clf.fit, X, y) + + +def test_oob_score_removed_on_warm_start(): + X, y = make_hastie_10_2(n_samples=2000, random_state=1) + + clf = BalancedBaggingClassifier(n_estimators=50, oob_score=True) + clf.fit(X, y) + + clf.set_params(warm_start=True, oob_score=False, n_estimators=100) + clf.fit(X, y) + + assert_raises(AttributeError, getattr, clf, "oob_score_") + + +def test_oob_score_consistency(): + # Make sure OOB scores are identical when random_state, estimator, and + # training data are fixed and fitting is done twice + X, y = make_hastie_10_2(n_samples=200, random_state=1) + bagging = BalancedBaggingClassifier(KNeighborsClassifier(), + max_samples=0.5, + max_features=0.5, oob_score=True, + random_state=1) + assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_ + + +def test_estimators_samples(): + # Check that format of estimators_samples_ is correct and that results + # generated at fit time can be identically reproduced at a later 
time + # using data saved in object attributes. + X, y = make_hastie_10_2(n_samples=200, random_state=1) + + # remap the y outside of the BalancedBaggingclassifier + # _, y = np.unique(y, return_inverse=True) + bagging = BalancedBaggingClassifier(LogisticRegression(), max_samples=0.5, + max_features=0.5, random_state=1, + bootstrap=False) + bagging.fit(X, y) + + # Get relevant attributes + estimators_samples = bagging.estimators_samples_ + estimators_features = bagging.estimators_features_ + estimators = bagging.estimators_ + + # Test for correct formatting + assert len(estimators_samples) == len(estimators) + assert len(estimators_samples[0]) == len(X) + assert estimators_samples[0].dtype.kind == 'b' + + # Re-fit single estimator to test for consistent sampling + estimator_index = 0 + estimator_samples = estimators_samples[estimator_index] + estimator_features = estimators_features[estimator_index] + estimator = estimators[estimator_index] + + X_train = (X[estimator_samples])[:, estimator_features] + y_train = y[estimator_samples] + + orig_coefs = estimator.steps[-1][1].coef_ + estimator.fit(X_train, y_train) + new_coefs = estimator.steps[-1][1].coef_ + + assert_array_almost_equal(orig_coefs, new_coefs) + + +def test_max_samples_consistency(): + # Make sure validated max_samples and original max_samples are identical + # when valid integer max_samples supplied by user + max_samples = 100 + X, y = make_hastie_10_2(n_samples=2*max_samples, random_state=1) + bagging = BalancedBaggingClassifier(KNeighborsClassifier(), + max_samples=max_samples, + max_features=0.5, random_state=1) + bagging.fit(X, y) + assert bagging._max_samples == max_samples diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index 7d665b1c0..2aaa128b4 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -3,13 +3,15 @@ # Christos Aridas # License: MIT -from __future__ import print_function - import numpy as np + +from sklearn.datasets import load_iris from sklearn.utils.testing import assert_array_equal from imblearn.ensemble import EasyEnsemble +iris = load_iris() + # Generate a global dataset to use RND_SEED = 0 X = np.array([[0.5220963, 0.11349303], [0.59091459, 0.40692742],