Skip to content

Commit

Permalink
Add random_state parameter to stacking cv estimators (#523)
Browse files Browse the repository at this point in the history
* add random_state parameter to stacking cv estimators

* update changelog and jupyter docs

* update changelog again

* minor change

* default stacking cv random_state to None
  • Loading branch information
qiagu authored and rasbt committed Apr 29, 2019
1 parent c338a1f commit ec2658c
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 221 deletions.
1 change: 1 addition & 0 deletions docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ The CHANGELOG for the current development version is available at

##### New Features

- `StackingCVClassifier` and `StackingCVRegressor` now support `random_state` parameter, which, together with `shuffle`, controls the randomness in the cv splitting. ([#523](https://github.com/rasbt/mlxtend/pull/523) via [Qiang Gu](https://github.com/qiagu))
- Other stacking estimators, including `StackingClassifier`, `StackingCVClassifier` and `StackingRegressor`, support grid search over the `regressors` and even a single base regressor. ([#522](https://github.com/rasbt/mlxtend/pull/522) via [Qiang Gu](https://github.com/qiagu))
- Adds multiprocessing support to `StackingCVClassifier`. ([#522](https://github.com/rasbt/mlxtend/pull/522) via [Qiang Gu](https://github.com/qiagu))
- Adds multiprocessing support to `StackingCVRegressor`. ([#512](https://github.com/rasbt/mlxtend/pull/512) via [Qiang Gu](https://github.com/qiagu))
Expand Down
151 changes: 81 additions & 70 deletions docs/sources/user_guide/classifier/StackingCVClassifier.ipynb

Large diffs are not rendered by default.

152 changes: 87 additions & 65 deletions docs/sources/user_guide/regressor/StackingCVRegressor.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"R^2 Score: 0.46 (+/- 0.29) [SVM]\n",
"R^2 Score: 0.43 (+/- 0.14) [Lasso]\n",
"R^2 Score: 0.53 (+/- 0.28) [Random Forest]\n",
"R^2 Score: 0.58 (+/- 0.23) [StackingCVRegressor]\n"
"R^2 Score: 0.57 (+/- 0.24) [StackingCVRegressor]\n"
]
}
],
Expand All @@ -109,13 +109,11 @@
"rf = RandomForestRegressor(n_estimators=5, \n",
" random_state=RANDOM_SEED)\n",
"\n",
"# The StackingCVRegressor uses scikit-learn's check_cv\n",
"# internally, which doesn't support a random seed. Thus\n",
"# NumPy's random seed need to be specified explicitely for\n",
"# deterministic behavior\n",
"np.random.seed(RANDOM_SEED)\n",
"# Starting from v0.16.0, StackingCVRegressor supports\n",
"# `random_state` to get deterministic result.\n",
"stack = StackingCVRegressor(regressors=(svr, lasso, rf),\n",
" meta_regressor=lasso)\n",
" meta_regressor=lasso,\n",
" random_state=RANDOM_SEED)\n",
"\n",
"print('5-fold cross validation scores:\\n')\n",
"\n",
Expand All @@ -141,16 +139,11 @@
"Neg. MSE Score: -33.34 (+/- 22.36) [SVM]\n",
"Neg. MSE Score: -35.53 (+/- 16.99) [Lasso]\n",
"Neg. MSE Score: -27.25 (+/- 16.76) [Random Forest]\n",
"Neg. MSE Score: -25.56 (+/- 18.22) [StackingCVRegressor]\n"
"Neg. MSE Score: -25.82 (+/- 18.10) [StackingCVRegressor]\n"
]
}
],
"source": [
"# The StackingCVRegressor uses scikit-learn's check_cv\n",
"# internally, which doesn't support a random seed. Thus\n",
"# NumPy's random seed need to be specified explicitely for\n",
"# deterministic behavior\n",
"np.random.seed(RANDOM_SEED)\n",
"stack = StackingCVRegressor(regressors=(svr, lasso, rf),\n",
" meta_regressor=lasso)\n",
"\n",
Expand Down Expand Up @@ -186,18 +179,18 @@
"metadata": {},
"outputs": [
{
"name": "stderr",
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/guq/miniconda3/envs/python3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n",
" DeprecationWarning)\n"
"Best: 0.679576 using {'lasso__alpha': 1.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.4}\n"
]
},
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"Best: 0.674237 using {'lasso__alpha': 1.6, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.2}\n"
"/Users/guq/miniconda3/envs/python3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n",
" DeprecationWarning)\n"
]
}
],
Expand All @@ -215,13 +208,9 @@
"lasso = Lasso(random_state=RANDOM_SEED)\n",
"rf = RandomForestRegressor(random_state=RANDOM_SEED)\n",
"\n",
"# The StackingCVRegressor uses scikit-learn's check_cv\n",
"# internally, which doesn't support a random seed. Thus\n",
"# NumPy's random seed need to be specified explicitely for\n",
"# deterministic behavior\n",
"np.random.seed(RANDOM_SEED)\n",
"stack = StackingCVRegressor(regressors=(lasso, ridge),\n",
" meta_regressor=rf, \n",
" random_state=RANDOM_SEED,\n",
" use_features_in_secondary=True)\n",
"\n",
"params = {'lasso__alpha': [0.1, 1.0, 10.0],\n",
Expand Down Expand Up @@ -252,21 +241,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.616 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.05}\n",
"0.637 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.05}\n",
"0.656 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.1}\n",
"0.653 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.15}\n",
"0.669 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.2}\n",
"0.632 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.25}\n",
"0.664 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.3}\n",
"0.632 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.35}\n",
"0.642 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.4}\n",
"0.653 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.45}\n",
"0.657 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.05}\n",
"0.650 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.1}\n",
"0.648 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.15}\n",
"0.635 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.15}\n",
"0.647 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.2}\n",
"0.630 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.25}\n",
"0.628 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.3}\n",
"0.639 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.35}\n",
"0.641 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.4}\n",
"0.653 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.45}\n",
"0.644 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.05}\n",
"0.642 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.1}\n",
"0.646 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.15}\n",
"...\n",
"Best parameters: {'lasso__alpha': 1.6, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.2}\n",
"Accuracy: 0.67\n"
"Best parameters: {'lasso__alpha': 1.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.4}\n",
"Accuracy: 0.68\n"
]
}
],
Expand Down Expand Up @@ -318,21 +307,12 @@
"text": [
"## StackingCVRegressor\n",
"\n",
"*StackingCVRegressor(regressors, meta_regressor, cv=5, shuffle=True, use_features_in_secondary=False, store_train_meta_features=False, refit=True)*\n",
"*StackingCVRegressor(regressors, meta_regressor, cv=5, shuffle=True, random_state=None, verbose=0, refit=True, use_features_in_secondary=False, store_train_meta_features=False, n_jobs=None, pre_dispatch='2*n_jobs')*\n",
"\n",
"A 'Stacking Cross-Validation' regressor for scikit-learn estimators.\n",
"\n",
"New in mlxtend v0.7.0\n",
"\n",
"**Notes**\n",
"\n",
"The StackingCVRegressor uses scikit-learn's check_cv\n",
"internally, which doesn't support a random seed. Thus\n",
"NumPy's random seed need to be specified explicitely for\n",
"deterministic behavior, for instance, by setting\n",
"np.random.seed(RANDOM_SEED)\n",
"prior to fitting the StackingCVRegressor\n",
"\n",
"**Parameters**\n",
"\n",
"- `regressors` : array-like, shape = [n_regressors]\n",
Expand All @@ -357,28 +337,21 @@
" - An iterable yielding train, test splits.\n",
" For integer/None inputs, it will use `KFold` cross-validation\n",
"\n",
"- `use_features_in_secondary` : bool (default: False)\n",
"\n",
" If True, the meta-regressor will be trained both on\n",
" the predictions of the original regressors and the\n",
" original dataset.\n",
" If False, the meta-regressor will be trained only on\n",
" the predictions of the original regressors.\n",
"\n",
"- `shuffle` : bool (default: True)\n",
"\n",
" If True, and the `cv` argument is integer, the training data will\n",
" be shuffled at fitting stage prior to cross-validation. If the `cv`\n",
" argument is a specific cross validation technique, this argument is\n",
" omitted.\n",
"\n",
"- `store_train_meta_features` : bool (default: False)\n",
"- `random_state` : int, RandomState instance or None, optional (default: None)\n",
"\n",
" If True, the meta-features computed from the training data\n",
" used for fitting the\n",
" meta-regressor stored in the `self.train_meta_features_` array,\n",
" which can be\n",
" accessed after calling `fit`.\n",
"    Controls the randomness of the cv splitter. Used when `cv` is\n",
"    integer and `shuffle=True`. New in v0.16.0.\n",
"\n",
"- `verbose` : int, optional (default=0)\n",
"\n",
" Controls the verbosity of the building process. New in v0.16.0\n",
"\n",
"- `refit` : bool (default: True)\n",
"\n",
Expand All @@ -389,6 +362,45 @@
" the scikit-learn fit/predict API interface but are not compatible\n",
" to scikit-learn's `clone` function.\n",
"\n",
"- `use_features_in_secondary` : bool (default: False)\n",
"\n",
" If True, the meta-regressor will be trained both on\n",
" the predictions of the original regressors and the\n",
" original dataset.\n",
" If False, the meta-regressor will be trained only on\n",
" the predictions of the original regressors.\n",
"\n",
"- `store_train_meta_features` : bool (default: False)\n",
"\n",
" If True, the meta-features computed from the training data\n",
" used for fitting the\n",
" meta-regressor stored in the `self.train_meta_features_` array,\n",
" which can be\n",
" accessed after calling `fit`.\n",
"\n",
"- `n_jobs` : int or None, optional (default=None)\n",
"\n",
" The number of CPUs to use to do the computation.\n",
" ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n",
" ``-1`` means using all processors. See :term:`Glossary <n_jobs>`\n",
" for more details. New in v0.16.0.\n",
"\n",
"- `pre_dispatch` : int, or string, optional\n",
"\n",
" Controls the number of jobs that get dispatched during parallel\n",
" execution. Reducing this number can be useful to avoid an\n",
" explosion of memory consumption when more jobs get dispatched\n",
" than CPUs can process. This parameter can be:\n",
" - None, in which case all the jobs are immediately\n",
" created and spawned. Use this for lightweight and\n",
" fast-running jobs, to avoid delays due to on-demand\n",
" spawning of the jobs\n",
" - An int, giving the exact number of total jobs that are\n",
" spawned\n",
" - A string, giving an expression as a function of n_jobs,\n",
" as in '2*n_jobs'\n",
" New in v0.16.0.\n",
"\n",
"**Attributes**\n",
"\n",
"- `train_meta_features` : numpy array, shape = [n_samples, n_regressors]\n",
Expand Down Expand Up @@ -546,7 +558,10 @@
"\n",
"- `X` : array-like, shape = (n_samples, n_features)\n",
"\n",
" Test samples.\n",
" Test samples. For some estimators this may be a\n",
" precomputed kernel matrix instead, shape = (n_samples,\n",
" n_samples_fitted], where n_samples_fitted is the number of\n",
" samples used in the fitting for the estimator.\n",
"\n",
"\n",
"- `y` : array-like, shape = (n_samples) or (n_samples, n_outputs)\n",
Expand All @@ -570,15 +585,22 @@
"\n",
"Set the parameters of this estimator.\n",
"\n",
"The method works on simple estimators as well as on nested objects\n",
"(such as pipelines). The latter have parameters of the form\n",
"``<component>__<parameter>`` so that it's possible to update each\n",
"component of a nested object.\n",
"Valid parameter keys can be listed with ``get_params()``.\n",
"\n",
"**Returns**\n",
"\n",
"self\n",
"\n",
"### Properties\n",
"\n",
"<hr>\n",
"\n",
"*named_regressors*\n",
"\n",
"**Returns**\n",
"\n",
"List of named estimator tuples, like [('svc', SVC(...))]\n",
"\n",
"\n"
]
}
Expand Down
47 changes: 21 additions & 26 deletions mlxtend/classifier/stacking_cv_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,6 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin,
New in mlxtend v0.4.3
Notes
-------
The StackingCVClassifier uses scikit-learn's check_cv
internally, which doesn't support a random seed. Thus
NumPy's random seed need to be specified explicitely for
deterministic behavior, for instance, by setting
np.random.seed(RANDOM_SEED)
prior to fitting the StackingCVClassifier
Parameters
----------
classifiers : array-like, shape = [n_classifiers]
Expand All @@ -61,20 +52,18 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin,
For integer/None inputs, it will use either a `KFold` or
`StratifiedKFold` cross validation depending on the value of `stratify`
argument.
use_features_in_secondary : bool (default: False)
If True, the meta-classifier will be trained both on the predictions
of the original classifiers and the original dataset.
If False, the meta-classifier will be trained only on the predictions
of the original classifiers.
stratify : bool (default: True)
If True, and the `cv` argument is integer it will follow a stratified
K-Fold cross validation technique. If the `cv` argument is a specific
cross validation technique, this argument is omitted.
shuffle : bool (default: True)
If True, and the `cv` argument is integer, the training data will be
shuffled at fitting stage prior to cross-validation. If the `cv`
argument is a specific cross validation technique, this argument is
omitted.
random_state : int, RandomState instance or None, optional (default: None)
Controls the randomness of the cv splitter. Used when `cv` is
integer and `shuffle=True`. New in v0.16.0.
stratify : bool (default: True)
If True, and the `cv` argument is integer it will follow a stratified
K-Fold cross validation technique. If the `cv` argument is a specific
cross validation technique, this argument is omitted.
verbose : int, optional (default=0)
Controls the verbosity of the building process.
- `verbose=0` (default): Prints nothing
Expand All @@ -84,6 +73,11 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin,
regressor being fitted
- `verbose>2`: Changes `verbose` param of the underlying regressor to
self.verbose - 2
use_features_in_secondary : bool (default: False)
If True, the meta-classifier will be trained both on the predictions
of the original classifiers and the original dataset.
If False, the meta-classifier will be trained only on the predictions
of the original classifiers.
store_train_meta_features : bool (default: False)
If True, the meta-features computed from the training data used
for fitting the meta-classifier stored in the
Expand All @@ -103,7 +97,7 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin,
The number of CPUs to use to do the computation.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
for more details. New in v0.16.0.
pre_dispatch : int, or string, optional
Controls the number of jobs that get dispatched during parallel
execution. Reducing this number can be useful to avoid an
Expand All @@ -117,7 +111,7 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin,
spawned
- A string, giving an expression as a function of n_jobs,
as in '2*n_jobs'
New in v0.16.0.
Attributes
----------
Expand All @@ -137,22 +131,22 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin,
"""
def __init__(self, classifiers, meta_classifier,
use_probas=False, cv=2,
use_probas=False, cv=2, shuffle=True,
random_state=None, stratify=True, verbose=0,
use_features_in_secondary=False,
stratify=True,
shuffle=True, verbose=0,
store_train_meta_features=False,
use_clones=True, n_jobs=None,
pre_dispatch='2*n_jobs'):

self.classifiers = classifiers
self.meta_classifier = meta_classifier
self.use_probas = use_probas
self.verbose = verbose
self.cv = cv
self.use_features_in_secondary = use_features_in_secondary
self.stratify = stratify
self.shuffle = shuffle
self.random_state = random_state
self.stratify = stratify
self.verbose = verbose
self.use_features_in_secondary = use_features_in_secondary
self.store_train_meta_features = store_train_meta_features
self.use_clones = use_clones
self.n_jobs = n_jobs
Expand Down Expand Up @@ -203,6 +197,7 @@ def fit(self, X, y, groups=None, sample_weight=None):
# Override shuffle parameter in case of self generated
# cross-validation strategy
final_cv.shuffle = self.shuffle
final_cv.random_state = self.random_state

# Input validation.
X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'])
Expand Down
Loading

0 comments on commit ec2658c

Please sign in to comment.