diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index a128a787b..ac587767f 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -17,6 +17,7 @@ The CHANGELOG for the current development version is available at ##### New Features +- `StackingCVClassifier` and `StackingCVRegressor` now support the `random_state` parameter, which, together with `shuffle`, controls the randomness in the cv splitting. ([#523](https://github.com/rasbt/mlxtend/pull/523) via [Qiang Gu](https://github.com/qiagu)) - Other stacking estimators, including `StackingClassifier`, `StackingCVClassifier` and `StackingRegressor`, support grid search over the `regressors` and even a single base regressor. ([#522](https://github.com/rasbt/mlxtend/pull/522) via [Qiang Gu](https://github.com/qiaguhttps://github.com/qiagu)) - Adds multiprocessing support to `StackingCVClassifier`. ([#522](https://github.com/rasbt/mlxtend/pull/522) via [Qiang Gu](https://github.com/qiaguhttps://github.com/qiagu)) - Adds multiprocessing support to `StackingCVRegressor`. ([#512](https://github.com/rasbt/mlxtend/pull/512) via [Qiang Gu](https://github.com/qiaguhttps://github.com/qiagu)) diff --git a/docs/sources/user_guide/classifier/StackingCVClassifier.ipynb b/docs/sources/user_guide/classifier/StackingCVClassifier.ipynb index 724a576a2..e8c035984 100644 --- a/docs/sources/user_guide/classifier/StackingCVClassifier.ipynb +++ b/docs/sources/user_guide/classifier/StackingCVClassifier.ipynb @@ -110,7 +110,7 @@ "Accuracy: 0.91 (+/- 0.01) [KNN]\n", "Accuracy: 0.90 (+/- 0.03) [Random Forest]\n", "Accuracy: 0.92 (+/- 0.03) [Naive Bayes]\n", - "Accuracy: 0.93 (+/- 0.02) [StackingClassifier]\n" + "Accuracy: 0.91 (+/- 0.01) [StackingClassifier]\n" ] } ], @@ -133,13 +133,11 @@ "clf3 = GaussianNB()\n", "lr = LogisticRegression()\n", "\n", - "# The StackingCVClassifier uses scikit-learn's check_cv\n", - "# internally, which doesn't support a random seed.
Thus\n", - "# NumPy's random seed need to be specified explicitely for\n", - "# deterministic behavior\n", - "np.random.seed(RANDOM_SEED)\n", - "sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3], \n", - " meta_classifier=lr)\n", + "# Starting from v0.16.0, StackingCVClassifier supports\n", + "# `random_state` to get deterministic results.\n", + "sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],\n", + " meta_classifier=lr,\n", + " random_state=RANDOM_SEED)\n", "\n", "print('3-fold cross validation:\\n')\n", "\n", @@ -162,7 +160,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -242,14 +240,10 @@ "clf3 = GaussianNB()\n", "lr = LogisticRegression()\n", "\n", - "# The StackingCVClassifier uses scikit-learn's check_cv\n", - "# internally, which doesn't support a random seed. Thus\n", - "# NumPy's random seed need to be specified explicitely for\n", - "# deterministic behavior\n", - "np.random.seed(RANDOM_SEED)\n", "sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],\n", " use_probas=True,\n", - " meta_classifier=lr)\n", + " meta_classifier=lr,\n", + " random_state=42)\n", "\n", "print('3-fold cross validation:\\n')\n", "\n", @@ -290,14 +284,14 @@ "text": [ "0.673 +/- 0.01 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}\n", "0.667 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}\n", - "0.927 +/- 0.03 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", - "0.893 +/- 0.02 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}\n", - "0.667 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}\n", + "0.933 +/- 0.02 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", + "0.920 +/- 0.02 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}\n", + "0.673 +/- 0.01 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}\n", "0.667 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}\n", - "0.947 +/- 0.02 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", - "0.947 +/- 0.02 {'kneighborsclassifier__n_neighbors': 5, 
'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}\n", + "0.940 +/- 0.02 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", + "0.927 +/- 0.02 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}\n", "Best parameters: {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", - "Accuracy: 0.95\n" + "Accuracy: 0.94\n" ] } ], @@ -316,13 +310,9 @@ "clf3 = GaussianNB()\n", "lr = LogisticRegression()\n", "\n", - "# The StackingCVClassifier uses scikit-learn's check_cv\n", - "# internally, which doesn't support a random seed. Thus\n", - "# NumPy's random seed need to be specified explicitely for\n", - "# deterministic behavior\n", - "np.random.seed(RANDOM_SEED)\n", "sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3], \n", - " meta_classifier=lr)\n", + " meta_classifier=lr,\n", + " random_state=42)\n", "\n", "params = {'kneighborsclassifier__n_neighbors': [1, 5],\n", " 'randomforestclassifier__n_estimators': [10, 50],\n", @@ -355,9 +345,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.673 +/- 0.01 {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}\n", + "0.667 +/- 0.00 {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}\n", + "0.947 +/- 0.02 {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", + "0.920 +/- 0.02 {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 1, 
'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}\n", + "0.673 +/- 0.01 {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}\n", + "0.667 +/- 0.00 {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}\n", + "0.960 +/- 0.02 {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", + "0.933 +/- 0.02 {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}\n", + "0.673 +/- 0.01 {'kneighborsclassifier-1__n_neighbors': 5, 'kneighborsclassifier-2__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}\n", + "0.667 +/- 0.00 {'kneighborsclassifier-1__n_neighbors': 5, 'kneighborsclassifier-2__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}\n", + "0.960 +/- 0.02 {'kneighborsclassifier-1__n_neighbors': 5, 'kneighborsclassifier-2__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", + "0.933 +/- 0.02 {'kneighborsclassifier-1__n_neighbors': 5, 'kneighborsclassifier-2__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}\n", + "0.673 +/- 0.01 {'kneighborsclassifier-1__n_neighbors': 5, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}\n", + "0.667 +/- 0.00 {'kneighborsclassifier-1__n_neighbors': 5, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}\n", + "0.953 +/- 0.02 {'kneighborsclassifier-1__n_neighbors': 5, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 10.0, 
'randomforestclassifier__n_estimators': 10}\n", + "0.927 +/- 0.02 {'kneighborsclassifier-1__n_neighbors': 5, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}\n", + "Best parameters: {'kneighborsclassifier-1__n_neighbors': 1, 'kneighborsclassifier-2__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}\n", + "Accuracy: 0.96\n" + ] + } + ], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", @@ -368,13 +383,9 @@ "clf3 = GaussianNB()\n", "lr = LogisticRegression()\n", "\n", - "# The StackingCVClassifier uses scikit-learn's check_cv\n", - "# internally, which doesn't support a random seed. Thus\n", - "# NumPy's random seed need to be specified explicitely for\n", - "# deterministic behavior\n", - "np.random.seed(RANDOM_SEED)\n", "sclf = StackingCVClassifier(classifiers=[clf1, clf1, clf2, clf3], \n", - " meta_classifier=lr)\n", + " meta_classifier=lr,\n", + " random_state=RANDOM_SEED)\n", "\n", "params = {'kneighborsclassifier-1__n_neighbors': [1, 5],\n", " 'kneighborsclassifier-2__n_neighbors': [1, 5],\n", @@ -429,26 +440,29 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "StackingCVClassifier(classifiers=[Pipeline(steps=[('columnselector', ColumnSelector(cols=(0, 2))), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", - " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", - " penalty='l2', random_state=None, solve...='l2', random_state=None, solver='liblinear', tol=0.0001,\n", - " verbose=0, warm_start=False))])],\n", + "StackingCVClassifier(classifiers=[Pipeline(memory=None,\n", + " steps=[('columnselector', ColumnSelector(cols=(0, 2), drop_axis=False)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, 
max_iter=100, multi_class='warn',\n", + " n_jobs=None,...nalty='l2', random_state=None, solver='warn',\n", + " tol=0.0001, verbose=0, warm_start=False))])],\n", " cv=2,\n", " meta_classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", - " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", - " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", - " verbose=0, warm_start=False),\n", - " shuffle=True, stratify=True, use_features_in_secondary=False,\n", + " intercept_scaling=1, max_iter=100, multi_class='warn',\n", + " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", + " tol=0.0001, verbose=0, warm_start=False),\n", + " n_jobs=None, pre_dispatch='2*n_jobs', random_state=42,\n", + " shuffle=True, store_train_meta_features=False, stratify=True,\n", + " use_clones=True, use_features_in_secondary=False,\n", " use_probas=False, verbose=0)" ] }, - "execution_count": 1, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -470,7 +484,8 @@ " LogisticRegression())\n", "\n", "sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], \n", - " meta_classifier=LogisticRegression())\n", + " meta_classifier=LogisticRegression(),\n", + " random_state=42)\n", "\n", "sclf.fit(X, y)" ] @@ -493,21 +508,12 @@ "text": [ "## StackingCVClassifier\n", "\n", - "*StackingCVClassifier(classifiers, meta_classifier, use_probas=False, cv=2, use_features_in_secondary=False, stratify=True, shuffle=True, verbose=0, store_train_meta_features=False, use_clones=True, n_jobs=None, pre_dispatch='2*n_jobs')*\n", + "*StackingCVClassifier(classifiers, meta_classifier, use_probas=False, cv=2, shuffle=True, random_state=None, stratify=True, verbose=0, use_features_in_secondary=False, store_train_meta_features=False, use_clones=True, n_jobs=None, pre_dispatch='2*n_jobs')*\n", "\n", "A 'Stacking Cross-Validation' classifier for scikit-learn estimators.\n", "\n", "New in mlxtend v0.4.3\n", "\n", -
"**Notes**\n", - "\n", - "The StackingCVClassifier uses scikit-learn's check_cv\n", - "internally, which doesn't support a random seed. Thus\n", - "NumPy's random seed need to be specified explicitely for\n", - "deterministic behavior, for instance, by setting\n", - "np.random.seed(RANDOM_SEED)\n", - "prior to fitting the StackingCVClassifier\n", - "\n", "**Parameters**\n", "\n", "- `classifiers` : array-like, shape = [n_classifiers]\n", @@ -539,12 +545,17 @@ " `StratifiedKFold` cross validation depending the value of `stratify`\n", " argument.\n", "\n", - "- `use_features_in_secondary` : bool (default: False)\n", + "- `shuffle` : bool (default: True)\n", "\n", - " If True, the meta-classifier will be trained both on the predictions\n", - " of the original classifiers and the original dataset.\n", - " If False, the meta-classifier will be trained only on the predictions\n", - " of the original classifiers.\n", + " If True, and the `cv` argument is integer, the training data will be\n", + " shuffled at fitting stage prior to cross-validation. If the `cv`\n", + " argument is a specific cross validation technique, this argument is\n", + " omitted.\n", + "\n", + "- `random_state` : int, RandomState instance or None, optional (default: None)\n", + "\n", + " Controls the randomness of the cv splitter. Used when `cv` is\n", + " integer and `shuffle=True`. New in v0.16.0.\n", "\n", "- `stratify` : bool (default: True)\n", "\n", @@ -552,13 +563,6 @@ " K-Fold cross validation technique. If the `cv` argument is a specific\n", " cross validation technique, this argument is omitted.\n", "\n", - "- `shuffle` : bool (default: True)\n", - "\n", - " If True, and the `cv` argument is integer, the training data will be\n", - " shuffled at fitting stage prior to cross-validation.
If the `cv`\n", - " argument is a specific cross validation technique, this argument is\n", - " omitted.\n", - "\n", "- `verbose` : int, optional (default=0)\n", "\n", " Controls the verbosity of the building process.\n", @@ -570,6 +574,13 @@ " - `verbose>2`: Changes `verbose` param of the underlying regressor to\n", " self.verbose - 2\n", "\n", + "- `use_features_in_secondary` : bool (default: False)\n", + "\n", + " If True, the meta-classifier will be trained both on the predictions\n", + " of the original classifiers and the original dataset.\n", + " If False, the meta-classifier will be trained only on the predictions\n", + " of the original classifiers.\n", + "\n", "- `store_train_meta_features` : bool (default: False)\n", "\n", " If True, the meta-features computed from the training data used\n", @@ -594,7 +605,7 @@ " The number of CPUs to use to do the computation.\n", " ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n", " ``-1`` means using all processors. See :term:`Glossary `\n", - " for more details.\n", + " for more details. 
New in v0.16.0.\n", "\n", "- `pre_dispatch` : int, or string, optional\n", "\n", @@ -610,7 +621,7 @@ " spawned\n", " - A string, giving an expression as a function of n_jobs,\n", " as in '2*n_jobs'\n", - "\n", + " New in v0.16.0.\n", "\n", "**Attributes**\n", "\n", diff --git a/docs/sources/user_guide/regressor/StackingCVRegressor.ipynb b/docs/sources/user_guide/regressor/StackingCVRegressor.ipynb index 307196b5c..3df0cdbdb 100644 --- a/docs/sources/user_guide/regressor/StackingCVRegressor.ipynb +++ b/docs/sources/user_guide/regressor/StackingCVRegressor.ipynb @@ -87,7 +87,7 @@ "R^2 Score: 0.46 (+/- 0.29) [SVM]\n", "R^2 Score: 0.43 (+/- 0.14) [Lasso]\n", "R^2 Score: 0.53 (+/- 0.28) [Random Forest]\n", - "R^2 Score: 0.58 (+/- 0.23) [StackingCVRegressor]\n" + "R^2 Score: 0.57 (+/- 0.24) [StackingCVRegressor]\n" ] } ], @@ -109,13 +109,11 @@ "rf = RandomForestRegressor(n_estimators=5, \n", " random_state=RANDOM_SEED)\n", "\n", - "# The StackingCVRegressor uses scikit-learn's check_cv\n", - "# internally, which doesn't support a random seed. Thus\n", - "# NumPy's random seed need to be specified explicitely for\n", - "# deterministic behavior\n", - "np.random.seed(RANDOM_SEED)\n", + "# Starting from v0.16.0, StackingCVRegressor supports\n", + "# `random_state` to get deterministic result.\n", "stack = StackingCVRegressor(regressors=(svr, lasso, rf),\n", - " meta_regressor=lasso)\n", + " meta_regressor=lasso,\n", + " random_state=RANDOM_SEED)\n", "\n", "print('5-fold cross validation scores:\\n')\n", "\n", @@ -141,16 +139,11 @@ "Neg. MSE Score: -33.34 (+/- 22.36) [SVM]\n", "Neg. MSE Score: -35.53 (+/- 16.99) [Lasso]\n", "Neg. MSE Score: -27.25 (+/- 16.76) [Random Forest]\n", - "Neg. MSE Score: -25.56 (+/- 18.22) [StackingCVRegressor]\n" + "Neg. MSE Score: -25.82 (+/- 18.10) [StackingCVRegressor]\n" ] } ], "source": [ - "# The StackingCVRegressor uses scikit-learn's check_cv\n", - "# internally, which doesn't support a random seed. 
Thus\n", - "# NumPy's random seed need to be specified explicitely for\n", - "# deterministic behavior\n", - "np.random.seed(RANDOM_SEED)\n", "stack = StackingCVRegressor(regressors=(svr, lasso, rf),\n", " meta_regressor=lasso)\n", "\n", @@ -186,18 +179,18 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/Users/guq/miniconda3/envs/python3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n", - " DeprecationWarning)\n" + "Best: 0.679576 using {'lasso__alpha': 1.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.4}\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Best: 0.674237 using {'lasso__alpha': 1.6, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.2}\n" + "/Users/guq/miniconda3/envs/python3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n", + " DeprecationWarning)\n" ] } ], @@ -215,13 +208,9 @@ "lasso = Lasso(random_state=RANDOM_SEED)\n", "rf = RandomForestRegressor(random_state=RANDOM_SEED)\n", "\n", - "# The StackingCVRegressor uses scikit-learn's check_cv\n", - "# internally, which doesn't support a random seed. 
Thus\n", - "# NumPy's random seed need to be specified explicitely for\n", - "# deterministic behavior\n", - "np.random.seed(RANDOM_SEED)\n", "stack = StackingCVRegressor(regressors=(lasso, ridge),\n", " meta_regressor=rf, \n", + " random_state=RANDOM_SEED,\n", " use_features_in_secondary=True)\n", "\n", "params = {'lasso__alpha': [0.1, 1.0, 10.0],\n", @@ -252,21 +241,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.616 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.05}\n", + "0.637 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.05}\n", "0.656 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.1}\n", - "0.653 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.15}\n", - "0.669 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.2}\n", - "0.632 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.25}\n", - "0.664 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.3}\n", - "0.632 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.35}\n", - "0.642 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.4}\n", - "0.653 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.45}\n", - "0.657 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.05}\n", - "0.650 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.1}\n", - "0.648 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.15}\n", + "0.635 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.15}\n", + "0.647 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.2}\n", + "0.630 +/- 0.09 {'lasso__alpha': 0.2, 
'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.25}\n", + "0.628 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.3}\n", + "0.639 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.35}\n", + "0.641 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.4}\n", + "0.653 +/- 0.08 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.45}\n", + "0.644 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.05}\n", + "0.642 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.1}\n", + "0.646 +/- 0.09 {'lasso__alpha': 0.2, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.15}\n", "...\n", - "Best parameters: {'lasso__alpha': 1.6, 'meta_regressor__n_estimators': 100, 'ridge__alpha': 0.2}\n", - "Accuracy: 0.67\n" + "Best parameters: {'lasso__alpha': 1.2, 'meta_regressor__n_estimators': 10, 'ridge__alpha': 0.4}\n", + "Accuracy: 0.68\n" ] } ], @@ -318,21 +307,12 @@ "text": [ "## StackingCVRegressor\n", "\n", - "*StackingCVRegressor(regressors, meta_regressor, cv=5, shuffle=True, use_features_in_secondary=False, store_train_meta_features=False, refit=True)*\n", + "*StackingCVRegressor(regressors, meta_regressor, cv=5, shuffle=True, random_state=None, verbose=0, refit=True, use_features_in_secondary=False, store_train_meta_features=False, n_jobs=None, pre_dispatch='2*n_jobs')*\n", "\n", "A 'Stacking Cross-Validation' regressor for scikit-learn estimators.\n", "\n", "New in mlxtend v0.7.0\n", "\n", - "**Notes**\n", - "\n", - "The StackingCVRegressor uses scikit-learn's check_cv\n", - "internally, which doesn't support a random seed. 
Thus\n", - "NumPy's random seed need to be specified explicitely for\n", - "deterministic behavior, for instance, by setting\n", - "np.random.seed(RANDOM_SEED)\n", - "prior to fitting the StackingCVRegressor\n", - "\n", "**Parameters**\n", "\n", "- `regressors` : array-like, shape = [n_regressors]\n", @@ -357,14 +337,6 @@ " - An iterable yielding train, test splits.\n", " For integer/None inputs, it will use `KFold` cross-validation\n", "\n", - "- `use_features_in_secondary` : bool (default: False)\n", - "\n", - " If True, the meta-regressor will be trained both on\n", - " the predictions of the original regressors and the\n", - " original dataset.\n", - " If False, the meta-regressor will be trained only on\n", - " the predictions of the original regressors.\n", - "\n", "- `shuffle` : bool (default: True)\n", "\n", " If True, and the `cv` argument is integer, the training data will\n", @@ -372,13 +344,14 @@ " argument is a specific cross validation technique, this argument is\n", " omitted.\n", "\n", - "- `store_train_meta_features` : bool (default: False)\n", + "- `random_state` : int, RandomState instance or None, optional (default: None)\n", "\n", - " If True, the meta-features computed from the training data\n", - " used for fitting the\n", - " meta-regressor stored in the `self.train_meta_features_` array,\n", - " which can be\n", - " accessed after calling `fit`.\n", + " Controls the randomness of the cv splitter. Used when `cv` is\n", + " integer and `shuffle=True`. New in v0.16.0.\n", + "\n", + "- `verbose` : int, optional (default=0)\n", + "\n", + " Controls the verbosity of the building process.
New in v0.16.0\n", "\n", "- `refit` : bool (default: True)\n", "\n", @@ -389,6 +362,45 @@ " the scikit-learn fit/predict API interface but are not compatible\n", " to scikit-learn's `clone` function.\n", "\n", + "- `use_features_in_secondary` : bool (default: False)\n", + "\n", + " If True, the meta-regressor will be trained both on\n", + " the predictions of the original regressors and the\n", + " original dataset.\n", + " If False, the meta-regressor will be trained only on\n", + " the predictions of the original regressors.\n", + "\n", + "- `store_train_meta_features` : bool (default: False)\n", + "\n", + " If True, the meta-features computed from the training data\n", + " used for fitting the\n", + " meta-regressor stored in the `self.train_meta_features_` array,\n", + " which can be\n", + " accessed after calling `fit`.\n", + "\n", + "- `n_jobs` : int or None, optional (default=None)\n", + "\n", + " The number of CPUs to use to do the computation.\n", + " ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n", + " ``-1`` means using all processors. See :term:`Glossary `\n", + " for more details. New in v0.16.0.\n", + "\n", + "- `pre_dispatch` : int, or string, optional\n", + "\n", + " Controls the number of jobs that get dispatched during parallel\n", + " execution. Reducing this number can be useful to avoid an\n", + " explosion of memory consumption when more jobs get dispatched\n", + " than CPUs can process. This parameter can be:\n", + " - None, in which case all the jobs are immediately\n", + " created and spawned. 
Use this for lightweight and\n", + " fast-running jobs, to avoid delays due to on-demand\n", + " spawning of the jobs\n", + " - An int, giving the exact number of total jobs that are\n", + " spawned\n", + " - A string, giving an expression as a function of n_jobs,\n", + " as in '2*n_jobs'\n", + " New in v0.16.0.\n", + "\n", "**Attributes**\n", "\n", "- `train_meta_features` : numpy array, shape = [n_samples, n_regressors]\n", @@ -546,7 +558,10 @@ "\n", "- `X` : array-like, shape = (n_samples, n_features)\n", "\n", - " Test samples.\n", + " Test samples. For some estimators this may be a\n", + " precomputed kernel matrix instead, shape = (n_samples,\n", + " n_samples_fitted], where n_samples_fitted is the number of\n", + " samples used in the fitting for the estimator.\n", "\n", "\n", "- `y` : array-like, shape = (n_samples) or (n_samples, n_outputs)\n", @@ -570,15 +585,22 @@ "\n", "Set the parameters of this estimator.\n", "\n", - "The method works on simple estimators as well as on nested objects\n", - "(such as pipelines). The latter have parameters of the form\n", - "``__`` so that it's possible to update each\n", - "component of a nested object.\n", + "Valid parameter keys can be listed with ``get_params()``.\n", "\n", "**Returns**\n", "\n", "self\n", "\n", + "### Properties\n", + "\n", + "
\n", + "\n", + "*named_regressors*\n", + "\n", + "**Returns**\n", + "\n", + "List of named estimator tuples, like [('svc', SVC(...))]\n", + "\n", "\n" ] } diff --git a/mlxtend/classifier/stacking_cv_classification.py b/mlxtend/classifier/stacking_cv_classification.py index 16331e577..a45b3b6a2 100644 --- a/mlxtend/classifier/stacking_cv_classification.py +++ b/mlxtend/classifier/stacking_cv_classification.py @@ -29,15 +29,6 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin, New in mlxtend v0.4.3 - Notes - ------- - The StackingCVClassifier uses scikit-learn's check_cv - internally, which doesn't support a random seed. Thus - NumPy's random seed need to be specified explicitely for - deterministic behavior, for instance, by setting - np.random.seed(RANDOM_SEED) - prior to fitting the StackingCVClassifier - Parameters ---------- classifiers : array-like, shape = [n_classifiers] @@ -61,20 +52,18 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin, For integer/None inputs, it will use either a `KFold` or `StratifiedKFold` cross validation depending the value of `stratify` argument. - use_features_in_secondary : bool (default: False) - If True, the meta-classifier will be trained both on the predictions - of the original classifiers and the original dataset. - If False, the meta-classifier will be trained only on the predictions - of the original classifiers. - stratify : bool (default: True) - If True, and the `cv` argument is integer it will follow a stratified - K-Fold cross validation technique. If the `cv` argument is a specific - cross validation technique, this argument is omitted. shuffle : bool (default: True) If True, and the `cv` argument is integer, the training data will be shuffled at fitting stage prior to cross-validation. If the `cv` argument is a specific cross validation technique, this argument is omitted. 
+ random_state : int, RandomState instance or None, optional (default: None) + Controls the randomness of the cv splitter. Used when `cv` is + integer and `shuffle=True`. New in v0.16.0. + stratify : bool (default: True) + If True, and the `cv` argument is integer it will follow a stratified + K-Fold cross validation technique. If the `cv` argument is a specific + cross validation technique, this argument is omitted. verbose : int, optional (default=0) Controls the verbosity of the building process. - `verbose=0` (default): Prints nothing @@ -84,6 +73,11 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin, regressor being fitted - `verbose>2`: Changes `verbose` param of the underlying regressor to self.verbose - 2 + use_features_in_secondary : bool (default: False) + If True, the meta-classifier will be trained both on the predictions + of the original classifiers and the original dataset. + If False, the meta-classifier will be trained only on the predictions + of the original classifiers. store_train_meta_features : bool (default: False) If True, the meta-features computed from the training data used for fitting the meta-classifier stored in the @@ -103,7 +97,7 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin, The number of CPUs to use to do the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` - for more details. + for more details. New in v0.16.0. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -117,7 +111,7 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin, spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' - + New in v0.16.0.
Attributes ---------- @@ -137,10 +131,9 @@ class StackingCVClassifier(_BaseXComposition, ClassifierMixin, """ def __init__(self, classifiers, meta_classifier, - use_probas=False, cv=2, + use_probas=False, cv=2, shuffle=True, + random_state=None, stratify=True, verbose=0, use_features_in_secondary=False, - stratify=True, - shuffle=True, verbose=0, store_train_meta_features=False, use_clones=True, n_jobs=None, pre_dispatch='2*n_jobs'): @@ -148,11 +141,12 @@ def __init__(self, classifiers, meta_classifier, self.classifiers = classifiers self.meta_classifier = meta_classifier self.use_probas = use_probas - self.verbose = verbose self.cv = cv - self.use_features_in_secondary = use_features_in_secondary - self.stratify = stratify self.shuffle = shuffle + self.random_state = random_state + self.stratify = stratify + self.verbose = verbose + self.use_features_in_secondary = use_features_in_secondary self.store_train_meta_features = store_train_meta_features self.use_clones = use_clones self.n_jobs = n_jobs @@ -203,6 +197,7 @@ def fit(self, X, y, groups=None, sample_weight=None): # Override shuffle parameter in case of self generated # cross-validation strategy final_cv.shuffle = self.shuffle + final_cv.random_state = self.random_state # Input validation. 
X, y = check_X_y(X, y, accept_sparse=['csc', 'csr']) diff --git a/mlxtend/classifier/tests/test_stacking_cv_classifier.py b/mlxtend/classifier/tests/test_stacking_cv_classifier.py index e794bd515..63f9a1dfa 100644 --- a/mlxtend/classifier/tests/test_stacking_cv_classifier.py +++ b/mlxtend/classifier/tests/test_stacking_cv_classifier.py @@ -245,12 +245,12 @@ def test_use_features_in_secondary(): def test_do_not_stratify(): - np.random.seed(123) meta = LogisticRegression(multi_class='ovr', solver='liblinear') clf1 = RandomForestClassifier(n_estimators=10) clf2 = GaussianNB() sclf = StackingCVClassifier(classifiers=[clf1, clf2], meta_classifier=meta, + random_state=42, stratify=False) scores = cross_val_score(sclf, @@ -266,14 +266,14 @@ def test_cross_validation_technique(): # This is like the `test_do_not_stratify` but instead # autogenerating the cross validation strategy it provides # a pre-created object - np.random.seed(123) cv = KFold(n_splits=2, shuffle=True) meta = LogisticRegression(multi_class='ovr', solver='liblinear') - clf1 = RandomForestClassifier(n_estimators=10) + clf1 = RandomForestClassifier(n_estimators=10, random_state=42) clf2 = GaussianNB() sclf = StackingCVClassifier(classifiers=[clf1, clf2], meta_classifier=meta, - cv=cv) + cv=cv, + random_state=42) scores = cross_val_score(sclf, X_iris, @@ -281,7 +281,7 @@ def test_cross_validation_technique(): cv=5, scoring='accuracy') scores_mean = (round(scores.mean(), 2)) - assert scores_mean == 0.93, scores.mean() + assert scores_mean == 0.92, scores.mean() def test_not_fitted(): @@ -346,6 +346,7 @@ def test_get_params(): 'meta_classifier', 'n_jobs', 'pre_dispatch', + 'random_state', 'randomforestclassifier', 'shuffle', 'store_train_meta_features', @@ -358,13 +359,13 @@ def test_get_params(): def test_classifier_gridsearch(): - np.random.seed(123) clf1 = KNeighborsClassifier(n_neighbors=1) - clf2 = RandomForestClassifier(n_estimators=10) + clf2 = RandomForestClassifier(n_estimators=10, random_state=42) 
clf3 = GaussianNB() lr = LogisticRegression(multi_class='ovr', solver='liblinear') sclf = StackingCVClassifier(classifiers=[clf1], - meta_classifier=lr) + meta_classifier=lr, + random_state=42) params = {'classifiers': [[clf1], [clf1, clf2, clf3]]} @@ -409,20 +410,21 @@ def test_predict_meta_features(): def test_meta_feat_reordering(): - np.random.seed(123) knn = KNeighborsClassifier() lr = LogisticRegression(multi_class='ovr', solver='liblinear') gnb = GaussianNB() stclf = StackingCVClassifier(classifiers=[knn, gnb], meta_classifier=lr, shuffle=True, + random_state=42, store_train_meta_features=True) X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast, + random_state=0, test_size=0.3) stclf.fit(X_train, y_train) assert round(roc_auc_score(y_train, - stclf.train_meta_features_[:, 1]), 2) == 0.87, \ + stclf.train_meta_features_[:, 1]), 2) == 0.86, \ round(roc_auc_score(y_train, stclf.train_meta_features_[:, 1]), 2) @@ -442,7 +444,8 @@ def test_sparse_inputs(): rf = RandomForestClassifier(n_estimators=10) lr = LogisticRegression(multi_class='ovr', solver='liblinear') stclf = StackingCVClassifier(classifiers=[rf, rf], - meta_classifier=lr) + meta_classifier=lr, + random_state=42) X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast, test_size=0.3) @@ -456,23 +459,23 @@ def test_sparse_inputs(): def test_sparse_inputs_with_features_in_secondary(): - np.random.seed(123) - rf = RandomForestClassifier(n_estimators=10) + rf = RandomForestClassifier(n_estimators=10, random_state=42) lr = LogisticRegression(multi_class='ovr', solver='liblinear') stclf = StackingCVClassifier(classifiers=[rf, rf], meta_classifier=lr, + random_state=42, use_features_in_secondary=True) X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast, test_size=0.3) # dense stclf.fit(X_train, y_train) - assert round(stclf.score(X_train, y_train), 2) == 0.99, \ + assert round(stclf.score(X_train, y_train), 2) == 1.0, \ round(stclf.score(X_train, y_train), 
2) # sparse stclf.fit(sparse.csr_matrix(X_train), y_train) - assert round(stclf.score(X_train, y_train), 2) == 0.99, \ + assert round(stclf.score(X_train, y_train), 2) == 1.0, \ round(stclf.score(X_train, y_train), 2) @@ -494,10 +497,11 @@ def test_works_with_df_if_fold_indexes_missing(): """ np.random.seed(123) - rf = RandomForestClassifier(n_estimators=10) + rf = RandomForestClassifier(n_estimators=10, random_state=42) lr = LogisticRegression(multi_class='ovr', solver='liblinear') stclf = StackingCVClassifier(classifiers=[rf, rf], meta_classifier=lr, + random_state=42, use_features_in_secondary=True) X_modded = pd.DataFrame(X_breast, diff --git a/mlxtend/regressor/stacking_cv_regression.py b/mlxtend/regressor/stacking_cv_regression.py index a9faeb9dc..091acdd84 100644 --- a/mlxtend/regressor/stacking_cv_regression.py +++ b/mlxtend/regressor/stacking_cv_regression.py @@ -32,15 +32,6 @@ class StackingCVRegressor(_BaseXComposition, RegressorMixin, TransformerMixin): New in mlxtend v0.7.0 - Notes - ------- - The StackingCVRegressor uses scikit-learn's check_cv - internally, which doesn't support a random seed. Thus - NumPy's random seed need to be specified explicitely for - deterministic behavior, for instance, by setting - np.random.seed(RANDOM_SEED) - prior to fitting the StackingCVRegressor - Parameters ---------- regressors : array-like, shape = [n_regressors] @@ -59,25 +50,16 @@ class StackingCVRegressor(_BaseXComposition, RegressorMixin, TransformerMixin): - An object to be used as a cross-validation generator. - An iterable yielding train, test splits. For integer/None inputs, it will use `KFold` cross-validation - use_features_in_secondary : bool (default: False) - If True, the meta-regressor will be trained both on - the predictions of the original regressors and the - original dataset. - If False, the meta-regressor will be trained only on - the predictions of the original regressors. 
shuffle : bool (default: True) If True, and the `cv` argument is integer, the training data will be shuffled at fitting stage prior to cross-validation. If the `cv` argument is a specific cross validation technique, this argument is omitted. + random_state : int, RandomState instance or None, optional (default: None) + Controls the randomness of the cv splitter. Used when `cv` is + integer and `shuffle=True`. New in v0.16.0. verbose : int, optional (default=0) - Controls the verbosity of the building process. - store_train_meta_features : bool (default: False) - If True, the meta-features computed from the training data - used for fitting the - meta-regressor stored in the `self.train_meta_features_` array, - which can be - accessed after calling `fit`. + Controls the verbosity of the building process. New in v0.16.0. refit : bool (default: True) Clones the regressors for stacking regression if True (default) or else uses the original ones, which will be refitted on the dataset @@ -85,11 +67,23 @@ class StackingCVRegressor(_BaseXComposition, RegressorMixin, TransformerMixin): recommended if you are working with estimators that are supporting the scikit-learn fit/predict API interface but are not compatible to scikit-learn's `clone` function. + use_features_in_secondary : bool (default: False) + If True, the meta-regressor will be trained both on + the predictions of the original regressors and the + original dataset. + If False, the meta-regressor will be trained only on + the predictions of the original regressors. + store_train_meta_features : bool (default: False) + If True, the meta-features computed from the training data + used for fitting the + meta-regressor stored in the `self.train_meta_features_` array, + which can be + accessed after calling `fit`. n_jobs : int or None, optional (default=None) The number of CPUs to use to do the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. 
See :term:`Glossary ` - for more details. + for more details. New in v0.16.0. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -103,6 +97,7 @@ class StackingCVRegressor(_BaseXComposition, RegressorMixin, TransformerMixin): spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' + New in v0.16.0. Attributes ---------- @@ -118,20 +113,21 @@ class StackingCVRegressor(_BaseXComposition, RegressorMixin, TransformerMixin): """ def __init__(self, regressors, meta_regressor, cv=5, - shuffle=True, verbose=0, n_jobs=1, - use_features_in_secondary=False, - store_train_meta_features=False, - refit=True, pre_dispatch='2*n_jobs'): + shuffle=True, random_state=None, verbose=0, + refit=True, use_features_in_secondary=False, + store_train_meta_features=False, n_jobs=None, + pre_dispatch='2*n_jobs'): self.regressors = regressors self.meta_regressor = meta_regressor self.cv = cv self.shuffle = shuffle + self.random_state = random_state self.verbose = verbose - self.n_jobs = n_jobs + self.refit = refit self.use_features_in_secondary = use_features_in_secondary self.store_train_meta_features = store_train_meta_features - self.refit = refit + self.n_jobs = n_jobs self.pre_dispatch = pre_dispatch def fit(self, X, y, groups=None, sample_weight=None): @@ -175,6 +171,7 @@ def fit(self, X, y, groups=None, sample_weight=None): # Override shuffle parameter in case of self generated # cross-validation strategy kfold.shuffle = self.shuffle + kfold.random_state = self.random_state # # The meta_features are collection of the prediction data, # in shape of [n_samples, len(self.regressors)]. 
Each column diff --git a/mlxtend/regressor/tests/test_stacking_cv_regression.py b/mlxtend/regressor/tests/test_stacking_cv_regression.py index c3306cfd0..10d22b499 100644 --- a/mlxtend/regressor/tests/test_stacking_cv_regression.py +++ b/mlxtend/regressor/tests/test_stacking_cv_regression.py @@ -37,11 +37,12 @@ def test_different_models(): ridge = Ridge(random_state=1) svr_rbf = SVR(kernel='rbf', gamma='auto') stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge], - meta_regressor=svr_rbf) + meta_regressor=svr_rbf, + random_state=0) stack.fit(X1, y).predict(X1) - mse = 0.21 + mse = 0.20 got = np.mean((stack.predict(X1) - y) ** 2) - assert round(got, 2) == mse + assert round(got, 2) == mse, got def test_use_features_in_secondary(): @@ -52,6 +53,7 @@ def test_use_features_in_secondary(): stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge], meta_regressor=svr_rbf, cv=3, + random_state=0, use_features_in_secondary=True) stack.fit(X1, y).predict(X1) mse = 0.2 @@ -65,9 +67,10 @@ def test_multivariate(): ridge = Ridge(random_state=1) svr_rbf = SVR(kernel='rbf', gamma='auto') stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge], - meta_regressor=svr_rbf) + meta_regressor=svr_rbf, + random_state=0) stack.fit(X2, y).predict(X2) - mse = 0.19 + mse = 0.20 got = np.mean((stack.predict(X2) - y) ** 2) assert round(got, 2) == mse, '%f != %f' % (round(got, 2), mse) @@ -78,7 +81,8 @@ def test_internals(): cv = 10 stack = StackingCVRegressor(regressors=[lr, lr, lr, lr, lr], meta_regressor=lr, - cv=cv) + cv=cv, + random_state=0) stack.fit(X3, y2) assert stack.predict(X3).mean() == y2.mean() assert stack.meta_regr_.intercept_ == 0.0 @@ -93,7 +97,8 @@ def test_gridsearch_numerate_regr(): ridge = Ridge(random_state=1) svr_rbf = SVR(kernel='rbf', gamma='auto') stack = StackingCVRegressor(regressors=[svr_lin, ridge, ridge], - meta_regressor=svr_rbf) + meta_regressor=svr_rbf, + random_state=42) params = {'ridge-1__alpha': [0.01, 1.0], 'ridge-2__alpha': [0.01, 1.0], @@ 
-117,7 +122,8 @@ def test_get_params(): svr_rbf = SVR(kernel='rbf') ridge = Ridge(random_state=1) stregr = StackingCVRegressor(regressors=[ridge, lr], - meta_regressor=svr_rbf) + meta_regressor=svr_rbf, + random_state=42) got = sorted(list({s.split('__')[0] for s in stregr.get_params().keys()})) expect = ['cv', @@ -125,6 +131,7 @@ def test_get_params(): 'meta_regressor', 'n_jobs', 'pre_dispatch', + 'random_state', 'refit', 'regressors', 'ridge', @@ -140,7 +147,8 @@ def test_regressor_gridsearch(): svr_rbf = SVR(kernel='rbf', gamma='auto') ridge = Ridge(random_state=1) stregr = StackingCVRegressor(regressors=[lr], - meta_regressor=svr_rbf) + meta_regressor=svr_rbf, + random_state=1) params = {'regressors': [[ridge, lr], [lr, ridge, lr]]} @@ -218,19 +226,20 @@ def test_sparse_matrix_inputs(): ridge = Ridge(random_state=1) svr_rbf = SVR(kernel='rbf', gamma='auto') stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge], - meta_regressor=svr_rbf) + meta_regressor=svr_rbf, + random_state=42) # dense stack.fit(X1, y).predict(X1) - mse = 0.20 + mse = 0.21 got = np.mean((stack.predict(X1) - y) ** 2) - assert round(got, 2) == mse + assert round(got, 2) == mse, got # sparse stack.fit(sparse.csr_matrix(X1), y) mse = 0.20 got = np.mean((stack.predict(sparse.csr_matrix(X1)) - y) ** 2) - assert round(got, 2) == mse + assert round(got, 2) == mse, got def test_sparse_matrix_inputs_with_features_in_secondary(): @@ -240,19 +249,20 @@ def test_sparse_matrix_inputs_with_features_in_secondary(): svr_rbf = SVR(kernel='rbf', gamma='auto') stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge], meta_regressor=svr_rbf, + random_state=42, use_features_in_secondary=True) # dense stack.fit(X1, y).predict(X1) mse = 0.20 got = np.mean((stack.predict(X1) - y) ** 2) - assert round(got, 2) == mse + assert round(got, 2) == mse, got # sparse stack.fit(sparse.csr_matrix(X1), y) mse = 0.20 got = np.mean((stack.predict(sparse.csr_matrix(X1)) - y) ** 2) - assert round(got, 2) == mse + assert 
round(got, 2) == mse, got # Calling for np.random will break the existing tests by changing the