
Commit

Make stacking estimators capable of replacing base estimator in searchcv (#522)

* introduce cross_val_predict to StackingCVRegressor

* fix test for stacking_cv_regression

* remove duplicate key-values from get_params of StackingCVRegressor

* make regressor replaceable in GridSearchCV for StackingCVRegressor

* changelog entry

* allow dropping regressor in gridsearch for StackingCVRegressor

* make a base parameter handler for estimators composed of a list of base estimators

* add a test for stackingcvregressor with gridsearch and update notebook documentation

* make stacking cv estimators more compatible with gridsearch

* update stacking_classifier and stacking_regressor with _BaseXComposition

* add verbose to stackingcvregressor and fix test errors

* fix flake8 warnings

* update changelog

* update jupyter user guide

* update docs for api modules
qiagu authored and rasbt committed Apr 28, 2019
1 parent a3a539e commit c338a1f
Showing 13 changed files with 323 additions and 608 deletions.
3 changes: 3 additions & 0 deletions docs/sources/CHANGELOG.md
@@ -17,13 +17,16 @@ The CHANGELOG for the current development version is available at

##### New Features

+- Other stacking estimators, including `StackingClassifier`, `StackingCVClassifier` and `StackingRegressor`, now support grid search over their lists of base estimators (`classifiers`/`regressors`) and even over a single base estimator. ([#522](https://github.com/rasbt/mlxtend/pull/522) via [Qiang Gu](https://github.com/qiagu))
+- Adds multiprocessing support to `StackingCVClassifier`. ([#522](https://github.com/rasbt/mlxtend/pull/522) via [Qiang Gu](https://github.com/qiagu))
- Adds multiprocessing support to `StackingCVRegressor`. ([#512](https://github.com/rasbt/mlxtend/pull/512) via [Qiang Gu](https://github.com/qiagu))
- Now, the `StackingCVRegressor` also enables grid search over the `regressors` and even a single base regressor. When there are level-mixed parameters, `GridSearchCV` will try to replace hyperparameters in a top-down order (see the [documentation](http://rasbt.github.io/mlxtend/user_guide/regressor/StackingCVRegressor/) for examples and details). ([#512](https://github.com/rasbt/mlxtend/pull/512) via [Qiang Gu](https://github.com/qiagu))
- Adds a `verbose` parameter to `apriori` to show the current iteration number as well as the itemset size currently being sampled. ([#519](https://github.com/rasbt/mlxtend/pull/519))
- Adds an optional `class_name` parameter to the confusion matrix function to display class names on the axis as tick marks. ([#487](https://github.com/rasbt/mlxtend/pull/487) via [sandpiturtle](https://github.com/sandpiturtle))

##### Changes

+- The same change mentioned below is now applied to other stacking estimators, including `StackingClassifier`, `StackingCVClassifier` and `StackingRegressor`. ([#522](https://github.com/rasbt/mlxtend/pull/522) via [Qiang Gu](https://github.com/qiagu))
- Due to new features, restructuring, and better scikit-learn support (for `GridSearchCV`, etc.), the `StackingCVRegressor`'s meta-regressor is now accessed via `'meta_regressor__*'` in the parameter grid. E.g., if a `RandomForestRegressor` meta-regressor was previously tuned via `'randomforestregressor__n_estimators'`, this has now changed to `'meta_regressor__n_estimators'`. ([#512](https://github.com/rasbt/mlxtend/pull/512) via [Qiang Gu](https://github.com/qiagu))


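For orientation (not part of the commit diff), a minimal sketch of what the new grid-search interface looks like for `StackingCVRegressor`; the base/meta estimators and value ranges are illustrative assumptions, not taken from this commit:

```python
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Illustrative choice of base and meta regressors.
stack = StackingCVRegressor(regressors=[Lasso(), Ridge()],
                            meta_regressor=RandomForestRegressor())

params = {
    # nested hyperparameter of an auto-named base regressor
    'lasso__alpha': [0.1, 1.0],
    # new naming scheme; previously 'randomforestregressor__n_estimators'
    'meta_regressor__n_estimators': [10, 100],
    # the whole list of base regressors can now be searched over, too
    'regressors': [[Lasso()], [Lasso(), Ridge()]],
}

grid = GridSearchCV(estimator=stack, param_grid=params, cv=5)
# grid.fit(X, y) as usual
```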
119 changes: 61 additions & 58 deletions docs/sources/user_guide/classifier/StackingCVClassifier.ipynb

Large diffs are not rendered by default.

126 changes: 66 additions & 60 deletions docs/sources/user_guide/classifier/StackingClassifier.ipynb

Large diffs are not rendered by default.

@@ -613,7 +613,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.7.0"
   },
   "toc": {
    "nav_menu": {},
426 changes: 71 additions & 355 deletions docs/sources/user_guide/regressor/StackingRegressor.ipynb

Large diffs are not rendered by default.

44 changes: 18 additions & 26 deletions mlxtend/classifier/stacking_classification.py
@@ -10,16 +10,16 @@

from ..externals.estimator_checks import check_is_fitted
from ..externals.name_estimators import _name_estimators
-from ..externals import six
+from ..utils.base_compostion import _BaseXComposition
from scipy import sparse
-from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
import numpy as np


-class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
+class StackingClassifier(_BaseXComposition, ClassifierMixin,
+                         TransformerMixin):

    """A Stacking classifier for scikit-learn estimators for classification.
@@ -93,19 +93,17 @@ def __init__(self, classifiers, meta_classifier,

        self.classifiers = classifiers
        self.meta_classifier = meta_classifier
-        self.named_classifiers = {key: value for
-                                  key, value in
-                                  _name_estimators(classifiers)}
-        self.named_meta_classifier = {'meta-%s' % key: value for
-                                      key, value in
-                                      _name_estimators([meta_classifier])}
        self.use_probas = use_probas
        self.average_probas = average_probas
        self.verbose = verbose
        self.use_features_in_secondary = use_features_in_secondary
        self.store_train_meta_features = store_train_meta_features
        self.use_clones = use_clones

+    @property
+    def named_classifiers(self):
+        return _name_estimators(self.classifiers)

    def fit(self, X, y, sample_weight=None):
        """ Fit ensemble classifiers and the meta-classifier.
@@ -128,7 +126,7 @@ def fit(self, X, y, sample_weight=None):
        """
        if self.use_clones:
-            self.clfs_ = [clone(clf) for clf in self.classifiers]
+            self.clfs_ = clone(self.classifiers)
            self.meta_clf_ = clone(self.meta_classifier)
        else:
            self.clfs_ = self.classifiers
@@ -176,25 +174,19 @@ def fit(self, X, y, sample_weight=None):

    def get_params(self, deep=True):
        """Return estimator parameter names for GridSearch support."""
-        if not deep:
-            return super(StackingClassifier, self).get_params(deep=False)
-        else:
-            out = self.named_classifiers.copy()
-            for name, step in six.iteritems(self.named_classifiers):
-                for key, value in six.iteritems(step.get_params(deep=True)):
-                    out['%s__%s' % (name, key)] = value
-
-            out.update(self.named_meta_classifier.copy())
-            for name, step in six.iteritems(self.named_meta_classifier):
-                for key, value in six.iteritems(step.get_params(deep=True)):
-                    out['%s__%s' % (name, key)] = value
-
-            for key, value in six.iteritems(super(StackingClassifier,
-                                                  self).get_params(deep=False)):
-                out['%s' % key] = value
-
-            return out
+        return self._get_params('named_classifiers', deep=deep)
+
+    def set_params(self, **params):
+        """Set the parameters of this estimator.
+
+        Valid parameter keys can be listed with ``get_params()``.
+
+        Returns
+        -------
+        self
+        """
+        self._set_params('classifiers', 'named_classifiers', **params)
+        return self

    def predict_meta_features(self, X):
        """ Get meta-features of test-data.
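The `_BaseXComposition` helper referenced above lives in `mlxtend/utils/base_compostion.py` (the module name's spelling is as in the repository) and is not shown in this diff. As a rough, hypothetical sketch of the delegation pattern it provides — modeled on scikit-learn's `_BaseComposition` and on the changelog's "top-down" replacement order, not the actual mlxtend source:

```python
from sklearn.utils.metaestimators import _BaseComposition


class _BaseXComposition(_BaseComposition):
    """Hypothetical sketch: parameter handling for meta-estimators that
    hold a plain *list* of base estimators (e.g. `classifiers`)."""

    def _set_params(self, attr, named_attr, **params):
        # 1. Replace the whole estimator list, e.g. set_params(classifiers=[...]).
        if attr in params:
            setattr(self, attr, params.pop(attr))
        # 2. Replace a single estimator addressed by its generated name,
        #    e.g. set_params(randomforestclassifier=SomeOtherClassifier()).
        names = [name for name, _ in getattr(self, named_attr)]
        for key in list(params.keys()):
            if '__' not in key and key in names:
                estimators = list(getattr(self, attr))
                estimators[names.index(key)] = params.pop(key)
                setattr(self, attr, estimators)
        # 3. Everything else (plain and nested hyperparameters) goes through
        #    the standard scikit-learn set_params machinery.
        super(_BaseXComposition, self).set_params(**params)
        return self
```

This ordering is what lets `GridSearchCV` swap out an entire base-estimator list, a single named estimator, or just a nested hyperparameter with one uniform `set_params` call.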
127 changes: 61 additions & 66 deletions mlxtend/classifier/stacking_cv_classification.py
@@ -11,18 +11,19 @@

from ..externals.name_estimators import _name_estimators
from ..externals.estimator_checks import check_is_fitted
+from ..utils.base_compostion import _BaseXComposition
import numpy as np
from scipy import sparse
-from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
-from sklearn.externals import six
+from sklearn.model_selection import cross_val_predict
from sklearn.model_selection._split import check_cv
-from sklearn.utils import safe_indexing
+from sklearn.utils import check_X_y


-class StackingCVClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
+class StackingCVClassifier(_BaseXComposition, ClassifierMixin,
+                           TransformerMixin):

    """A 'Stacking Cross-Validation' classifier for scikit-learn estimators.
@@ -98,6 +99,24 @@ class StackingCVClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
        recommended if you are working with estimators that support
        the scikit-learn fit/predict API interface but are not compatible
        with scikit-learn's `clone` function.
+    n_jobs : int or None, optional (default=None)
+        The number of CPUs to use to do the computation.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+    pre_dispatch : int or string, optional
+        Controls the number of jobs that get dispatched during parallel
+        execution. Reducing this number can be useful to avoid an
+        explosion of memory consumption when more jobs get dispatched
+        than CPUs can process. This parameter can be:
+            - None, in which case all the jobs are immediately
+              created and spawned. Use this for lightweight and
+              fast-running jobs, to avoid delays due to on-demand
+              spawning of the jobs
+            - An int, giving the exact number of total jobs that are
+              spawned
+            - A string, giving an expression as a function of n_jobs,
+              as in '2*n_jobs'

    Attributes
@@ -123,16 +142,11 @@ def __init__(self, classifiers, meta_classifier,
                 stratify=True,
                 shuffle=True, verbose=0,
                 store_train_meta_features=False,
-                 use_clones=True):
+                 use_clones=True, n_jobs=None,
+                 pre_dispatch='2*n_jobs'):

        self.classifiers = classifiers
        self.meta_classifier = meta_classifier
-        self.named_classifiers = {key: value for
-                                  key, value in
-                                  _name_estimators(classifiers)}
-        self.named_meta_classifier = {'meta-%s' % key: value for
-                                      key, value in
-                                      _name_estimators([meta_classifier])}
        self.use_probas = use_probas
        self.verbose = verbose
        self.cv = cv
@@ -141,6 +155,12 @@ def __init__(self, classifiers, meta_classifier,
        self.shuffle = shuffle
        self.store_train_meta_features = store_train_meta_features
        self.use_clones = use_clones
+        self.n_jobs = n_jobs
+        self.pre_dispatch = pre_dispatch
+
+    @property
+    def named_classifiers(self):
+        return _name_estimators(self.classifiers)

    def fit(self, X, y, groups=None, sample_weight=None):
        """ Fit ensemble classifiers and the meta-classifier.
@@ -170,7 +190,7 @@
        """
        if self.use_clones:
-            self.clfs_ = [clone(clf) for clf in self.classifiers]
+            self.clfs_ = clone(self.classifiers)
            self.meta_clf_ = clone(self.meta_classifier)
        else:
            self.clfs_ = self.classifiers
@@ -184,16 +204,15 @@
        # cross-validation strategy
        final_cv.shuffle = self.shuffle

-        folds = list(final_cv.split(X, y, groups))
+        # Input validation.
+        X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'])

-        # Handle the case of X being a list of lists
-        # by converting X into a numpy array
-        if isinstance(X, list):
-            X = np.array(X)
+        if sample_weight is None:
+            fit_params = None
+        else:
+            fit_params = dict(sample_weight=sample_weight)

        meta_features = None
-        n_folds = final_cv.get_n_splits()
-        n_models = len(self.clfs_)

        for n, model in enumerate(self.clfs_):

@@ -210,38 +229,19 @@
            if self.verbose > 1:
                print(_name_estimators((model,))[0][1])

-            for num, (train_indices, test_indices) in enumerate(folds):
-
-                X_train = safe_indexing(X, train_indices)
-                y_train = safe_indexing(y, train_indices)
-
-                if self.verbose > 0:
-                    print("Training and fitting fold %d of %d..." %
-                          ((num + 1), n_folds))
-
-                if sample_weight is None:
-                    model.fit(X_train, y_train)
-                else:
-                    w = safe_indexing(sample_weight, train_indices)
-                    model.fit(X_train, y_train, sample_weight=w)
-
-                X_test = safe_indexing(X, test_indices)
-                if not self.use_probas:
-                    prediction = model.predict(X_test)[:, np.newaxis]
-                else:
-                    prediction = model.predict_proba(X_test)
-
-                if meta_features is None:
-                    # First run, use prediction to get the number of classes
-                    n_classes = prediction.shape[1]
-                    meta_features_shape = (X.shape[0], n_classes * n_models)
-                    meta_features = np.empty(shape=meta_features_shape)
-                    meta_features[np.array(test_indices)[:, np.newaxis],
-                                  np.arange(n_classes)] = prediction
-                else:
-                    row_idx = np.array(test_indices)[:, np.newaxis]
-                    col_idx = np.arange(n_classes) + n * n_classes
-                    meta_features[row_idx, col_idx] = prediction
+            prediction = cross_val_predict(
+                model, X, y, groups=groups, cv=final_cv,
+                n_jobs=self.n_jobs, fit_params=fit_params,
+                verbose=self.verbose, pre_dispatch=self.pre_dispatch,
+                method='predict_proba' if self.use_probas else 'predict')
+
+            if not self.use_probas:
+                prediction = prediction[:, np.newaxis]
+
+            if meta_features is None:
+                meta_features = prediction
+            else:
+                meta_features = np.column_stack((meta_features, prediction))

        if self.store_train_meta_features:
            self.train_meta_features_ = meta_features
@@ -270,24 +270,19 @@ def fit(self, X, y, groups=None, sample_weight=None):

    def get_params(self, deep=True):
        """Return estimator parameter names for GridSearch support."""
-        if not deep:
-            return super(StackingCVClassifier, self).get_params(deep=False)
-        else:
-            out = self.named_classifiers.copy()
-            for name, step in six.iteritems(self.named_classifiers):
-                for key, value in six.iteritems(step.get_params(deep=True)):
-                    out['%s__%s' % (name, key)] = value
-
-            out.update(self.named_meta_classifier.copy())
-            for name, step in six.iteritems(self.named_meta_classifier):
-                for key, value in six.iteritems(step.get_params(deep=True)):
-                    out['%s__%s' % (name, key)] = value
-
-            for key, value in six.iteritems(super(StackingCVClassifier,
-                                                  self).get_params(deep=False)):
-                out['%s' % key] = value
-
-            return out
+        return self._get_params('named_classifiers', deep=deep)
+
+    def set_params(self, **params):
+        """Set the parameters of this estimator.
+
+        Valid parameter keys can be listed with ``get_params()``.
+
+        Returns
+        -------
+        self
+        """
+        self._set_params('classifiers', 'named_classifiers', **params)
+        return self

    def predict_meta_features(self, X):
        """ Get meta-features of test-data.
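The net effect of this refactor: the hand-rolled fold loop is gone, and the out-of-fold meta-features come from scikit-learn's `cross_val_predict`, which also brings `n_jobs` parallelism for free. A self-contained sketch of the idea (dataset and base models are illustrative, not from the diff):

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict

X, y = load_iris(return_X_y=True)

# Out-of-fold predictions for each base model become the meta-features:
# exactly one row per training sample; n_jobs=-1 fits folds in parallel.
meta_features = np.column_stack([
    cross_val_predict(model, X, y, cv=5, n_jobs=-1, method='predict_proba')
    for model in (LogisticRegression(max_iter=1000), KNeighborsClassifier())
])
print(meta_features.shape)  # (150, 6): 2 models x 3 class probabilities
```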
5 changes: 2 additions & 3 deletions mlxtend/classifier/tests/test_stacking_classifier.py
@@ -216,7 +216,7 @@ def test_gridsearch():
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)

-    params = {'meta-logisticregression__C': [1.0, 100.0],
+    params = {'meta_classifier__C': [1.0, 100.0],
              'randomforestclassifier__n_estimators': [20, 200]}

    grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5, iid=False)
Expand All @@ -238,7 +238,7 @@ def test_gridsearch_enumerate_names():
sclf = StackingClassifier(classifiers=[clf1, clf1, clf2],
meta_classifier=meta)

params = {'meta-logisticregression__C': [1.0, 100.0],
params = {'meta_classifier__C': [1.0, 100.0],
'randomforestclassifier-1__n_estimators': [5, 10],
'randomforestclassifier-2__n_estimators': [5, 20],
'use_probas': [True, False]}
@@ -407,7 +407,6 @@ def test_get_params():
              'classifiers',
              'gaussiannb',
              'kneighborsclassifier',
-              'meta-logisticregression',
              'meta_classifier',
              'randomforestclassifier',
              'store_train_meta_features',
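The `randomforestclassifier-1`/`-2` keys in the tests above come from mlxtend's automatic estimator naming, which enumerates duplicate classes. A quick illustration; the internal import path and the exact output shown are my assumptions:

```python
from mlxtend.externals.name_estimators import _name_estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Duplicate estimator classes get enumerated suffixes; unique ones don't.
named = _name_estimators([RandomForestClassifier(),
                          RandomForestClassifier(),
                          LogisticRegression()])
print([name for name, _ in named])
# Assumed output:
# ['randomforestclassifier-1', 'randomforestclassifier-2', 'logisticregression']
```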
12 changes: 7 additions & 5 deletions mlxtend/classifier/tests/test_stacking_cv_classifier.py
@@ -174,7 +174,7 @@ def test_gridsearch():
                               use_probas=True,
                               shuffle=False)

-    params = {'meta-logisticregression__C': [1.0, 100.0],
+    params = {'meta_classifier__C': [1.0, 100.0],
              'randomforestclassifier__n_estimators': [20, 200]}

    grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5, iid=False)
Expand All @@ -196,7 +196,7 @@ def test_gridsearch_enumerate_names():
meta_classifier=meta,
shuffle=False)

params = {'meta-logisticregression__C': [1.0, 100.0],
params = {'meta_classifier__C': [1.0, 100.0],
'randomforestclassifier-1__n_estimators': [5, 10],
'randomforestclassifier-2__n_estimators': [5, 20],
'use_probas': [True, False]}
@@ -259,7 +259,7 @@ def test_do_not_stratify():
                            cv=5,
                            scoring='accuracy')
    scores_mean = (round(scores.mean(), 2))
-    assert scores_mean == 0.94
+    assert scores_mean == 0.93, scores.mean()

def test_cross_validation_technique():
@@ -281,7 +281,7 @@ def test_cross_validation_technique():
                            cv=5,
                            scoring='accuracy')
    scores_mean = (round(scores.mean(), 2))
-    assert scores_mean == 0.94
+    assert scores_mean == 0.93, scores.mean()


def test_not_fitted():
@@ -338,12 +338,14 @@ def test_get_params():
                               meta_classifier=lr)

    got = sorted(list({s.split('__')[0] for s in sclf.get_params().keys()}))
+
    expect = ['classifiers',
              'cv',
              'gaussiannb',
              'kneighborsclassifier',
-              'meta-logisticregression',
              'meta_classifier',
+              'n_jobs',
+              'pre_dispatch',
              'randomforestclassifier',
              'shuffle',
              'store_train_meta_features',
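The two new keys appear because `n_jobs` and `pre_dispatch` are now plain constructor parameters, forwarded to `cross_val_predict` during fitting. A hedged usage sketch (estimators, data, and values are illustrative):

```python
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)

# n_jobs/pre_dispatch are passed through to the internal cross_val_predict.
sclf = StackingCVClassifier(classifiers=[KNeighborsClassifier()],
                            meta_classifier=LogisticRegression(),
                            cv=3, n_jobs=-1, pre_dispatch='2*n_jobs')
sclf.fit(X, y)
```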