From c80d78f6361a216698a4470024aa8d163a7f9646 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Sat, 20 Jan 2018 13:25:15 -0500
Subject: [PATCH] add 5x2cv paired t test

---
 docs/mkdocs.yml                               |   1 +
 docs/sources/CHANGELOG.md                     |   2 +
 docs/sources/USER_GUIDE_INDEX.md              |   1 +
 .../evaluate/paired_ttest_5x2cv.ipynb         | 326 ++++++++++++++++++
 .../evaluate/paired_ttest_kfold_cv.ipynb      |   6 +-
 .../evaluate/paired_ttest_resampled.ipynb     |   8 +-
 mlxtend/evaluate/__init__.py                  |   3 +-
 .../evaluate/tests/test_paired_ttest_5x2cv.py | 112 ++++++
 .../evaluate/tests/test_paired_ttest_kfold.py |   2 -
 mlxtend/evaluate/ttest.py                     | 100 ++++++
 10 files changed, 551 insertions(+), 10 deletions(-)
 create mode 100644 docs/sources/user_guide/evaluate/paired_ttest_5x2cv.ipynb
 create mode 100644 mlxtend/evaluate/tests/test_paired_ttest_5x2cv.py

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 6bc311c7b..597bfa9b7 100755
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -61,6 +61,7 @@ pages:
   - user_guide/evaluate/mcnemar_table.md
   - user_guide/evaluate/mcnemar_tables.md
   - user_guide/evaluate/mcnemar.md
+  - user_guide/evaluate/paired_ttest_5x2cv.md
   - user_guide/evaluate/paired_ttest_kfold_cv.md
   - user_guide/evaluate/paired_ttest_resampled.md
   - user_guide/evaluate/permutation_test.md
diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md
index 75d1ee214..f3276bff9 100755
--- a/docs/sources/CHANGELOG.md
+++ b/docs/sources/CHANGELOG.md
@@ -22,6 +22,8 @@ The CHANGELOG for the current development version is available at
 - New function implementing the k-fold paired t-test procedure (`paired_ttest_kfold_cv`)
   to compare the performance of two models (also called k-hold-out paired t-test).
   ([#324](https://github.com/rasbt/mlxtend/issues/324))
+- New function implementing the 5x2cv paired t-test procedure (`paired_ttest_5x2cv`) proposed by Dietterich (1998)
+  to compare the performance of two models.
+  ([#325](https://github.com/rasbt/mlxtend/issues/325))
 
 ##### Changes
 
diff --git a/docs/sources/USER_GUIDE_INDEX.md b/docs/sources/USER_GUIDE_INDEX.md
index fc6a7f92a..b784de282 100755
--- a/docs/sources/USER_GUIDE_INDEX.md
+++ b/docs/sources/USER_GUIDE_INDEX.md
@@ -33,6 +33,7 @@
 - [mcnemar_table](user_guide/evaluate/mcnemar_table.md)
 - [mcnemar_tables](user_guide/evaluate/mcnemar_tables.md)
 - [mcnemar](user_guide/evaluate/mcnemar.md)
+- [paired_ttest_5x2cv](user_guide/evaluate/paired_ttest_5x2cv.md)
 - [paired_ttest_kfold_cv](user_guide/evaluate/paired_ttest_kfold_cv.md)
 - [paired_ttest_resampled](user_guide/evaluate/paired_ttest_resampled.md)
 - [permutation_test](user_guide/evaluate/permutation_test.md)
diff --git a/docs/sources/user_guide/evaluate/paired_ttest_5x2cv.ipynb b/docs/sources/user_guide/evaluate/paired_ttest_5x2cv.ipynb
new file mode 100644
index 000000000..e6bcf268b
--- /dev/null
+++ b/docs/sources/user_guide/evaluate/paired_ttest_5x2cv.ipynb
@@ -0,0 +1,326 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 5x2cv paired *t* test"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "5x2cv paired *t* test procedure to compare the performance of two models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> `from mlxtend.evaluate import paired_ttest_5x2cv` "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Overview"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The 5x2cv paired *t* test is a procedure for comparing the performance of two models (classifiers or regressors)\n",
+    "that was proposed by Dietterich [1] to address shortcomings in other methods such as the resampled paired *t* test (see [`paired_ttest_resampled`](paired_ttest_resampled.md)) and the k-fold cross-validated paired *t* test (see [`paired_ttest_kfold_cv`](paired_ttest_kfold_cv.md)).\n",
+    "\n",
+    "To explain how this method works, let's consider two estimators (e.g., classifiers) A and B. Further, we have a labeled dataset *D*. In the common hold-out method, we typically split the dataset into 2 parts: a training and a test set. In the 5x2cv paired *t* test, we repeat the splitting (50% training and 50% test data) 5 times. \n",
+    "\n",
+    "In each of the 5 iterations, we fit A and B to the training split and evaluate their performance ($p_A$ and $p_B$) on the test split. Then, we rotate the training and test sets (the training set becomes the test set and vice versa) and compute the performance again, which results in 2 performance difference measures:\n",
+    "\n",
+    "$$p^{(1)} = p^{(1)}_A - p^{(1)}_B$$\n",
+    "\n",
+    "and\n",
+    "\n",
+    "$$p^{(2)} = p^{(2)}_A - p^{(2)}_B.$$\n",
+    "\n",
+    "Then, we estimate the mean and variance of the differences:\n",
+    "\n",
+    "$\\overline{p} = \\frac{p^{(1)} + p^{(2)}}{2}$\n",
+    "\n",
+    "and\n",
+    "\n",
+    "$s^2 = (p^{(1)} - \\overline{p})^2 + (p^{(2)} - \\overline{p})^2.$\n",
+    "\n",
+    "The variance of the difference is computed for each of the 5 iterations and then used to compute the *t* statistic as follows:\n",
+    "\n",
+    "$$t = \\frac{p_1^{(1)}}{\\sqrt{(1/5) \\sum_{i=1}^{5}s_i^2}},$$\n",
+    "\n",
+    "where $p_1^{(1)}$ is the performance difference $p^{(1)}$ from the very first iteration.\n",
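+    "\n",
+    "As a quick sanity check, the computation of $\\overline{p}$, $s^2$, and the *t* statistic can be reproduced with a few lines of NumPy; note that the score differences below are made-up numbers that only serve to illustrate the formulas:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "# hypothetical score differences p^(1) and p^(2) for the 5 repetitions\n",
+    "p1 = np.array([0.02, 0.01, 0.03, 0.00, 0.02])\n",
+    "p2 = np.array([0.01, 0.02, 0.00, 0.01, 0.03])\n",
+    "\n",
+    "p_bar = (p1 + p2) / 2.                  # mean difference per repetition\n",
+    "s2 = (p1 - p_bar)**2 + (p2 - p_bar)**2  # variance estimate per repetition\n",
+    "t = p1[0] / np.sqrt(s2.mean())          # p_1^(1) / sqrt((1/5) * sum(s_i^2))\n",
+    "```\n",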
+    "Under the null hypothesis that the models A and B have equal performance, the *t* statistic approximately follows a *t* distribution with 5 degrees of freedom. Using the *t* statistic, the p value can be computed and compared with a previously chosen significance level, e.g., $\\alpha=0.05$. If the p value is smaller than $\\alpha$, we reject the null hypothesis and accept that there is a significant difference in the two models.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### References\n",
+    "\n",
+    "- [1] Dietterich TG (1998) Approximate Statistical Tests for Comparing Supervised Classification Learning Algorithms. *Neural Comput* 10:1895–1923."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 1 - 5x2cv paired *t* test"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Assume we want to compare two classification algorithms, logistic regression and a decision tree algorithm:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Logistic regression accuracy: 97.37%\n",
+      "Decision tree accuracy: 94.74%\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from mlxtend.data import iris_data\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "\n",
+    "X, y = iris_data()\n",
+    "clf1 = LogisticRegression(random_state=1)\n",
+    "clf2 = DecisionTreeClassifier(random_state=1)\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = \\\n",
+    "    train_test_split(X, y, test_size=0.25,\n",
+    "                     random_state=123)\n",
+    "\n",
+    "score1 = clf1.fit(X_train, y_train).score(X_test, y_test)\n",
+    "score2 = clf2.fit(X_train, y_train).score(X_test, y_test)\n",
+    "\n",
+    "print('Logistic regression accuracy: %.2f%%' % (score1*100))\n",
+    "print('Decision tree accuracy: %.2f%%' % (score2*100))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that these accuracy values are not used in the paired *t* test procedure, since new test/train splits are generated during the resampling procedure; the values above merely serve the purpose of intuition.\n",
+    "\n",
+    "Now, let's assume a significance threshold of $\\alpha=0.05$ for rejecting the null hypothesis that both algorithms perform equally well on the dataset and conduct the 5x2cv *t* test:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "t statistic: -1.539\n",
+      "p value: 0.184\n"
+     ]
+    }
+   ],
+   "source": [
+    "from mlxtend.evaluate import paired_ttest_5x2cv\n",
+    "\n",
+    "\n",
+    "t, p = paired_ttest_5x2cv(estimator1=clf1,\n",
+    "                          estimator2=clf2,\n",
+    "                          X=X, y=y,\n",
+    "                          random_seed=1)\n",
+    "\n",
+    "print('t statistic: %.3f' % t)\n",
+    "print('p value: %.3f' % p)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Since $p > \\alpha$, we cannot reject the null hypothesis and may conclude that the performance of the two algorithms is not significantly different.\n",
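+    "\n",
+    "As a side note, the two-tailed p value reported above is obtained from a *t* distribution with 5 degrees of freedom, so it can be reproduced directly from the returned *t* statistic (a minimal check, assuming SciPy is installed):\n",
+    "\n",
+    "```python\n",
+    "from scipy import stats\n",
+    "\n",
+    "# two-tailed p value for the t statistic computed in the previous cell\n",
+    "p_check = stats.t.sf(abs(t), 5) * 2\n",
+    "```\n",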
+    "\n",
+    "While it is generally not recommended to apply statistical tests multiple times without correction for multiple hypothesis testing, let us take a look at an example where the decision tree algorithm is limited to producing a very simple decision boundary that would result in a relatively bad performance:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Decision tree accuracy: 63.16%\n",
+      "t statistic: 5.386\n",
+      "p value: 0.003\n"
+     ]
+    }
+   ],
+   "source": [
+    "clf2 = DecisionTreeClassifier(random_state=1, max_depth=1)\n",
+    "\n",
+    "score2 = clf2.fit(X_train, y_train).score(X_test, y_test)\n",
+    "print('Decision tree accuracy: %.2f%%' % (score2*100))\n",
+    "\n",
+    "\n",
+    "t, p = paired_ttest_5x2cv(estimator1=clf1,\n",
+    "                          estimator2=clf2,\n",
+    "                          X=X, y=y,\n",
+    "                          random_seed=1)\n",
+    "\n",
+    "print('t statistic: %.3f' % t)\n",
+    "print('p value: %.3f' % p)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Assuming that we also conducted this test with a significance level of $\\alpha=0.05$, we can reject the null hypothesis that both models perform equally well on this dataset, since the p-value ($p \\approx 0.003$) is smaller than $\\alpha$."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "## paired_ttest_5x2cv\n",
+      "\n",
+      "*paired_ttest_5x2cv(estimator1, estimator2, X, y, scoring=None, random_seed=None)*\n",
+      "\n",
+      "Implements the 5x2cv paired t test proposed\n",
+      "by Dietterich (1998)\n",
+      "to compare the performance of two models.\n",
+      "\n",
+      "**Parameters**\n",
+      "\n",
+      "- `estimator1` : scikit-learn classifier or regressor\n",
+      "\n",
+      "\n",
+      "\n",
+      "- `estimator2` : scikit-learn classifier or regressor\n",
+      "\n",
+      "\n",
+      "\n",
+      "- `X` : {array-like, sparse matrix}, shape = [n_samples, n_features]\n",
+      "\n",
+      "    Training vectors, where n_samples is the number of samples and\n",
+      "    n_features is the number of features.\n",
+      "\n",
+      "\n",
+      "- `y` : array-like, shape = [n_samples]\n",
+      "\n",
+      "    Target values.\n",
+      "\n",
+      "\n",
+      "- `scoring` : str, callable, or None (default: None)\n",
+      "\n",
+      "    If None (default), uses 'accuracy' for sklearn classifiers\n",
+      "    and 'r2' for sklearn regressors.\n",
+      "    If str, uses a sklearn scoring metric string identifier, for example\n",
+      "    {accuracy, f1, precision, recall, roc_auc} for classifiers,\n",
+      "    {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',\n",
+      "    'median_absolute_error', 'r2'} for regressors.\n",
+      "    If a callable object or function is provided, it has to conform to\n",
+      "    sklearn's signature ``scorer(estimator, X, y)``; see\n",
+      "    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html\n",
+      "    for more information.\n",
+      "\n",
+      "\n",
+      "- `random_seed` : int or None (default: None)\n",
+      "\n",
+      "    Random seed for creating the test/train splits.\n",
+      "\n",
+      "**Returns**\n",
+      "\n",
+      "- `t` : float\n",
+      "\n",
+      "    The t-statistic\n",
+      "\n",
+      "\n",
+      "- `pvalue` : float\n",
+      "\n",
+      "    Two-tailed p-value.\n",
+      "    If the chosen significance level is larger\n",
+      "    than the p-value, we reject the null hypothesis\n",
+      "    and accept that there are significant differences\n",
+      "    in the two compared models.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open('../../api_modules/mlxtend.evaluate/paired_ttest_5x2cv.md', 'r') as f:\n",
+    "    s = f.read() \n",
+    "print(s)"
+   ]
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/docs/sources/user_guide/evaluate/paired_ttest_kfold_cv.ipynb b/docs/sources/user_guide/evaluate/paired_ttest_kfold_cv.ipynb
index f5b5bd4f1..3ecc7c2b6 100644
--- a/docs/sources/user_guide/evaluate/paired_ttest_kfold_cv.ipynb
+++ b/docs/sources/user_guide/evaluate/paired_ttest_kfold_cv.ipynb
@@ -13,7 +13,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# K-fold cross-validated paired t-test"
+    "# K-fold cross-validated paired *t* test"
    ]
   },
   {
@@ -41,7 +41,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "K-fold cross-validated paired t-test procedure is a common method for comparing the performance of two models (classifiers or regressors) and addresses some of the drawbacks of the [resampled t-test procedure](paired_ttest_resampled.md); however, this method has still the problem that the training sets overlap and is not recommended to be used in practice [1], and techniques such as the [`paired_ttest_5times2_cv`](paired_ttest_5times2_cv.md) should be used instead.\n",
+    "K-fold cross-validated paired t-test procedure is a common method for comparing the performance of two models (classifiers or regressors) and addresses some of the drawbacks of the [resampled t-test procedure](paired_ttest_resampled.md); however, this method still has the problem that the training sets overlap and is not recommended to be used in practice [1], and techniques such as the [`paired_ttest_5x2cv`](paired_ttest_5x2cv.md) should be used instead.\n",
     "\n",
     "To explain how this method works, let's consider to estimator (e.g., classifiers) A and B. Further, we have a labeled dataset *D*. In the common hold-out method, we typically split the dataset into 2 parts: a training and a test set.\n",
    "In the k-fold cross-validated paired t-test procedure, we split the test set into *k* parts of equal size, and each of these parts is then used for testing while the remaining *k-1* parts (joined together) are used for training a classifier or regressor (i.e., the standard k-fold cross-validation procedure).\n",
     "\n",
@@ -73,7 +73,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Example 1 - Paired resampled t test"
+    "## Example 1 - K-fold cross-validated paired *t* test"
    ]
   },
   {
diff --git a/docs/sources/user_guide/evaluate/paired_ttest_resampled.ipynb b/docs/sources/user_guide/evaluate/paired_ttest_resampled.ipynb
index 1ec66a838..fe4dbe010 100644
--- a/docs/sources/user_guide/evaluate/paired_ttest_resampled.ipynb
+++ b/docs/sources/user_guide/evaluate/paired_ttest_resampled.ipynb
@@ -41,7 +41,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Resampled paired *t* test procedure (also called k-hold-out paired *t* test) is a popular method for comparing the performance of two models (classifiers or regressors); however, this method has many drawbacks and is not recommended to be used in practice [1], and techniques such as the [`paired_ttest_5times2_cv`](paired_ttest_5times2_cv.md) should be used instead.\n",
+    "Resampled paired *t* test procedure (also called k-hold-out paired *t* test) is a popular method for comparing the performance of two models (classifiers or regressors); however, this method has many drawbacks and is not recommended to be used in practice [1], and techniques such as the [`paired_ttest_5x2cv`](paired_ttest_5x2cv.md) should be used instead.\n",
     "\n",
     "To explain how this method works, let's consider to estimator (e.g., classifiers) A and B. Further, we have a labeled dataset *D*. In the common hold-out method, we typically split the dataset into 2 parts: a training and a test set. In the resampled paired *t* test procedure, we repeat this splitting procedure (with typically 2/3 training data and 1/3 test data) *k* times (usually 30). In each iteration, we train A and B on the training set and evaluate it on the test set. Then, we compute the difference in performance between A and B in each iteration so that we obtain *k* difference measures.\n",
    "Now, by making the assumption that these *k* differences were independently drawn and follow an approximately normal distribution, we can compute the following *t* statistic with *k-1* degrees of freedom according to Student's *t* test, under the null hypothesis that the models A and B have equal performance:\n",
     "\n",
@@ -85,7 +85,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Example 1 - Paired resampled t test"
+    "## Example 1 - Resampled paired *t* test"
    ]
   },
   {
@@ -234,9 +234,9 @@
     "\n",
     "*paired_ttest_resampled(estimator1, estimator2, X, y, num_rounds=30, test_size=0.3, scoring=None, random_seed=None)*\n",
     "\n",
-    "Implements the resampled paired t-test procedure\n",
+    "Implements the resampled paired t test procedure\n",
     "to compare the performance of two models\n",
-    "(also called k-hold-out paired t-test).\n",
+    "(also called k-hold-out paired t test).\n",
     "\n",
     "**Parameters**\n",
     "\n",
diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py
index fbe529d21..c2154b1e2 100644
--- a/mlxtend/evaluate/__init__.py
+++ b/mlxtend/evaluate/__init__.py
@@ -17,6 +17,7 @@
 from .cochrans_q import cochrans_q
 from .ttest import paired_ttest_resampled
 from .ttest import paired_ttest_kfold_cv
+from .ttest import paired_ttest_5x2cv
 
 __all__ = ["scoring", "confusion_matrix",
@@ -25,4 +26,4 @@
            "bootstrap", "permutation_test", "BootstrapOutOfBag",
            "bootstrap_point632_score", "cochrans_q", "paired_ttest_resampled",
-           "paired_ttest_kfold_cv"]
+           "paired_ttest_kfold_cv", "paired_ttest_5x2cv"]
diff --git a/mlxtend/evaluate/tests/test_paired_ttest_5x2cv.py b/mlxtend/evaluate/tests/test_paired_ttest_5x2cv.py
new file mode 100644
index 000000000..4e4953e0f
--- /dev/null
+++ b/mlxtend/evaluate/tests/test_paired_ttest_5x2cv.py
@@ -0,0 +1,112 @@
+# Sebastian Raschka 2014-2018
+# mlxtend Machine Learning Library Extensions
+# Author: Sebastian Raschka
+#
+# License: BSD 3 clause
+
+from mlxtend.evaluate import paired_ttest_5x2cv
+from mlxtend.data import iris_data
+from mlxtend.data import boston_housing_data
+from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import Lasso
+from sklearn.linear_model import Ridge
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.model_selection import train_test_split
+
+
+def test_classifier_defaults():
+    X, y = iris_data()
+    clf1 = LogisticRegression(random_state=1)
+    clf2 = DecisionTreeClassifier(random_state=1)
+
+    X_train, X_test, y_train, y_test = \
+        train_test_split(X, y, test_size=0.25,
+                         random_state=123)
+
+    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
+    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)
+
+    assert round(score1, 2) == 0.97
+    assert round(score2, 2) == 0.95
+
+    t, p = paired_ttest_5x2cv(estimator1=clf1,
+                              estimator2=clf2,
+                              X=X, y=y,
+                              random_seed=1)
+
+    assert round(t, 3) == -1.539, t
+    assert round(p, 3) == 0.184, p
+
+    # change maxdepth of decision tree classifier
+
+    clf2 = DecisionTreeClassifier(max_depth=1, random_state=1)
+
+    score3 = clf2.fit(X_train, y_train).score(X_test, y_test)
+
+    assert round(score3, 2) == 0.63
+
+    t, p = paired_ttest_5x2cv(estimator1=clf1,
+                              estimator2=clf2,
+                              X=X, y=y,
+                              random_seed=1)
+
+    assert round(t, 3) == 5.386, t
+    assert round(p, 3) == 0.003, p
+
+
+def test_scoring():
+    X, y = iris_data()
+    clf1 = LogisticRegression(random_state=1)
+    clf2 = DecisionTreeClassifier(random_state=1)
+
+    X_train, X_test, y_train, y_test = \
+        train_test_split(X, y, test_size=0.25,
+                         random_state=123)
+
+    score1 = clf1.fit(X_train,
+                      y_train).score(X_test, y_test)
+    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)
+
+    assert round(score1, 2) == 0.97
+    assert round(score2, 2) == 0.95
+
+    t, p = paired_ttest_5x2cv(estimator1=clf1,
+                              estimator2=clf2,
+                              X=X, y=y,
+                              scoring='accuracy',
+                              random_seed=1)
+
+    assert round(t, 3) == -1.539, t
+    assert round(p, 3) == 0.184, p
+
+    t, p = paired_ttest_5x2cv(estimator1=clf1,
+                              estimator2=clf2,
+                              X=X, y=y,
+                              scoring='f1_macro',
+                              random_seed=1)
+
+    assert round(t, 3) == -1.510, t
+    assert round(p, 3) == 0.191, p
+
+
+def test_regressor():
+    X, y = boston_housing_data()
+    reg1 = Lasso(random_state=1)
+    reg2 = Ridge(random_state=1)
+
+    X_train, X_test, y_train, y_test = \
+        train_test_split(X, y, test_size=0.25,
+                         random_state=123)
+
+    score1 = reg1.fit(X_train, y_train).score(X_test, y_test)
+    score2 = reg2.fit(X_train, y_train).score(X_test, y_test)
+
+    assert round(score1, 2) == 0.66, score1
+    assert round(score2, 2) == 0.68, score2
+
+    t, p = paired_ttest_5x2cv(estimator1=reg1,
+                              estimator2=reg2,
+                              X=X, y=y,
+                              random_seed=1)
+
+    assert round(t, 3) == -0.599, t
+    assert round(p, 3) == 0.575, p
diff --git a/mlxtend/evaluate/tests/test_paired_ttest_kfold.py b/mlxtend/evaluate/tests/test_paired_ttest_kfold.py
index cf6921909..15b4ca885 100644
--- a/mlxtend/evaluate/tests/test_paired_ttest_kfold.py
+++ b/mlxtend/evaluate/tests/test_paired_ttest_kfold.py
@@ -4,9 +4,7 @@
 #
 # License: BSD 3 clause
 
-import sys
 from mlxtend.evaluate import paired_ttest_kfold_cv
-from mlxtend.utils import assert_raises
 from mlxtend.data import iris_data
 from mlxtend.data import boston_housing_data
 from sklearn.linear_model import LogisticRegression
diff --git a/mlxtend/evaluate/ttest.py b/mlxtend/evaluate/ttest.py
index e055ad2b4..4581984ed 100644
--- a/mlxtend/evaluate/ttest.py
+++ b/mlxtend/evaluate/ttest.py
@@ -216,3 +216,103 @@ def paired_ttest_kfold_cv(estimator1, estimator2, X, y,
     pvalue = stats.t.sf(np.abs(t_stat), cv - 1)*2.
 
     return float(t_stat), float(pvalue)
+
+
+def paired_ttest_5x2cv(estimator1, estimator2, X, y,
+                       scoring=None,
+                       random_seed=None):
+    """
+    Implements the 5x2cv paired t test proposed
+    by Dietterich (1998)
+    to compare the performance of two models.
+
+    Parameters
+    ----------
+    estimator1 : scikit-learn classifier or regressor
+
+    estimator2 : scikit-learn classifier or regressor
+
+    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+        Training vectors, where n_samples is the number of samples and
+        n_features is the number of features.
+
+    y : array-like, shape = [n_samples]
+        Target values.
+
+    scoring : str, callable, or None (default: None)
+        If None (default), uses 'accuracy' for sklearn classifiers
+        and 'r2' for sklearn regressors.
+        If str, uses a sklearn scoring metric string identifier, for example
+        {accuracy, f1, precision, recall, roc_auc} for classifiers,
+        {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
+        'median_absolute_error', 'r2'} for regressors.
+        If a callable object or function is provided, it has to conform to
+        sklearn's signature ``scorer(estimator, X, y)``; see
+        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
+        for more information.
+
+    random_seed : int or None (default: None)
+        Random seed for creating the test/train splits.
+
+    Returns
+    ----------
+    t : float
+        The t-statistic
+
+    pvalue : float
+        Two-tailed p-value.
+        If the chosen significance level is larger
+        than the p-value, we reject the null hypothesis
+        and accept that there are significant differences
+        in the two compared models.
+
+    """
+    rng = np.random.RandomState(random_seed)
+
+    if scoring is None:
+        if estimator1._estimator_type == 'classifier':
+            scoring = 'accuracy'
+        elif estimator1._estimator_type == 'regressor':
+            scoring = 'r2'
+        else:
+            raise AttributeError('Estimator must '
+                                 'be a Classifier or Regressor.')
+    if isinstance(scoring, str):
+        scorer = get_scorer(scoring)
+    else:
+        scorer = scoring
+
+    variance_sum = 0.
+    first_diff = None
+
+    def score_diff(X_1, X_2, y_1, y_2):
+
+        estimator1.fit(X_1, y_1)
+        estimator2.fit(X_1, y_1)
+        est1_score = scorer(estimator1, X_2, y_2)
+        est2_score = scorer(estimator2, X_2, y_2)
+        score_diff = est1_score - est2_score
+        return score_diff
+
+    for i in range(5):
+
+        randint = rng.randint(low=0, high=32767)
+        X_1, X_2, y_1, y_2 = \
+            train_test_split(X, y, test_size=0.5,
+                             random_state=randint)
+
+        score_diff_1 = score_diff(X_1, X_2, y_1, y_2)
+        score_diff_2 = score_diff(X_2, X_1, y_2, y_1)
+        score_mean = (score_diff_1 + score_diff_2) / 2.
+        score_var = ((score_diff_1 - score_mean)**2 +
+                     (score_diff_2 - score_mean)**2)
+        variance_sum += score_var
+        if first_diff is None:
+            first_diff = score_diff_1
+
+    numerator = first_diff
+    denominator = np.sqrt(1/5. * variance_sum)
+    t_stat = numerator / denominator
+
+    pvalue = stats.t.sf(np.abs(t_stat), 5)*2.
+    return float(t_stat), float(pvalue)
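
As a quick end-to-end check of the new function, a minimal usage sketch (assuming scikit-learn and this branch of mlxtend are installed; the expected numbers are simply the ones asserted in the new unit tests above):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from mlxtend.data import iris_data
from mlxtend.evaluate import paired_ttest_5x2cv

X, y = iris_data()
t, p = paired_ttest_5x2cv(estimator1=LogisticRegression(random_state=1),
                          estimator2=DecisionTreeClassifier(random_state=1),
                          X=X, y=y,
                          random_seed=1)

# per test_paired_ttest_5x2cv.py above, this should print roughly t=-1.539, p=0.184
print('t statistic: %.3f' % t)
print('p value: %.3f' % p)
```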