From 83a7a0567772c908be80099d2d5856f2e3059e84 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 19 Sep 2016 13:26:34 -0700 Subject: [PATCH] edit train/test_size default behavior --- sklearn/cross_validation.py | 41 +++++++++++----- sklearn/model_selection/_split.py | 54 ++++++++++++++------- sklearn/model_selection/tests/test_split.py | 15 +++++- sklearn/tests/test_cross_validation.py | 36 +++++++++----- 4 files changed, 101 insertions(+), 45 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 508b0460ec154..ea5a0d73f704b 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -230,8 +230,8 @@ def __repr__(self): ) def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) - / factorial(self.p)) + return int(factorial(self.n) / factorial(self.n - self.p) / + factorial(self.p)) class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): @@ -738,7 +738,7 @@ def __len__(self): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, + def __init__(self, n, n_iter=10, test_size=None, train_size=None, random_state=None): self.n = n self.n_iter = n_iter @@ -845,9 +845,8 @@ def __len__(self): def _validate_shuffle_split(n, test_size, train_size): if test_size is None and train_size is None: - raise ValueError( - 'test_size and train_size can not both be None') - + train_size = 0.9 + test_size = 0.1 if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if test_size >= 1.: @@ -881,21 +880,37 @@ def _validate_shuffle_split(n, test_size, train_size): else: raise ValueError("Invalid value for train_size: %r" % train_size) - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) + if test_size is None: + # only train_size set, so set test_size as + # n - n_train + if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n) + elif np.asarray(train_size).dtype.kind == 'i': + n_train = float(train_size) + + # set n_test to be the complement of n_train + n_test = n - n_train + + elif train_size is None: + # only test_size was set, so set train_size as + # n - n_test + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) - if train_size is None: n_train = n - n_test else: + # both train_size and test_size set, so subsample if np.asarray(train_size).dtype.kind == 'f': n_train = floor(train_size * n) else: n_train = float(train_size) - if test_size is None: - n_test = n - n_train + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + else: + n_test = float(test_size) if n_train + n_test > n: raise ValueError('The sum of train_size and test_size = %d, ' diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 5989edd30b109..7818dc759ed4d 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -896,7 +896,7 @@ def get_n_splits(self, X, y, labels): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n_splits=10, test_size=0.1, train_size=None, + def __init__(self, n_splits=10, test_size=None, train_size=None, random_state=None): _validate_shuffle_split_init(test_size, train_size) self.n_splits = n_splits @@ -1251,9 +1251,6 @@ def _validate_shuffle_split_init(test_size, train_size): NOTE This does not take into account the number of samples which is known only at split """ - if test_size is None and train_size is None: - raise ValueError('test_size and train_size can not both be None') - if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if test_size >= 1.: @@ -1285,30 +1282,51 @@ def _validate_shuffle_split(n_samples, test_size, train_size): Validation helper to check if the test/test sizes are meaningful wrt to the size of the data (n_samples) """ - if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' - and test_size >= n_samples): + if test_size is None and train_size is None: + train_size = 0.9 + test_size = 0.1 + + if (test_size is not None and np.asarray(test_size).dtype.kind == 'i'and + test_size >= n_samples): raise ValueError('test_size=%d should be smaller than the number of ' 'samples %d' % (test_size, n_samples)) - if (train_size is not None and np.asarray(train_size).dtype.kind == 'i' - and train_size >= n_samples): + if (train_size is not None and np.asarray(train_size).dtype.kind == 'i' and + train_size >= n_samples): raise ValueError("train_size=%d should be smaller than the number of" " samples %d" % (train_size, n_samples)) - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n_samples) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) + if test_size is None: + # only train_size set, so set test_size as + # n - n_train + if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n_samples) + elif np.asarray(train_size).dtype.kind == 'i': + n_train = float(train_size) + + # set n_test to be the complement of n_train + n_test = n_samples - n_train + + elif train_size is None: + # only test_size was set, so set train_size as + # n - n_test + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n_samples) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) - if train_size is None: n_train = n_samples - n_test - elif np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n_samples) else: - n_train = float(train_size) + # both train_size and test_size set, so subsample + if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n_samples) + else: + n_train = float(train_size) - if test_size is None: - n_test = n_samples - n_train + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n_samples) + else: + n_test = float(test_size) if n_train + n_test > n_samples: raise ValueError('The sum of train_size and test_size = %d, ' diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index d28148efe6956..172b8de1d7c27 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -156,7 +156,7 @@ def test_cross_validator_with_default_params(): skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" lolo_repr = "LeaveOneLabelOut()" lopo_repr = "LeavePLabelOut(n_labels=2)" - ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, " + ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=None, " "train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" @@ -807,7 +807,6 @@ def train_test_split_mock_pandas(): def test_shufflesplit_errors(): # When the {test|train}_size is a float/invalid, error is raised at init - assert_raises(ValueError, ShuffleSplit, test_size=None, train_size=None) assert_raises(ValueError, ShuffleSplit, test_size=2.0) assert_raises(ValueError, ShuffleSplit, test_size=1.0) assert_raises(ValueError, ShuffleSplit, test_size=0.1, train_size=0.95) @@ -829,6 +828,18 @@ def test_shufflesplit_reproducible(): list(a for a, b in ss.split(X))) +def test_shufflesplit_train_test_size(): + # check that same sequence of train-test is given + # when setting train_size to be the complement of test_size + # and vice-versa + ss_default = ShuffleSplit(random_state=0) + ss_train = ShuffleSplit(random_state=0, train_size=.9) + ss_test = ShuffleSplit(random_state=0, test_size=.1) + assert_array_equal(list(a for a, b in ss_default.split(X)), + list(a for a, b in ss_train.split(X)), + list(a for a, b in ss_test.split(X))) + + def test_safe_split_with_precomputed_kernel(): clf = SVC() clfp = SVC(kernel="precomputed") diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 0e03cad783e53..9ac8650da75e1 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -24,10 +24,6 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.mocking import CheckingClassifier, MockDataFrame -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn import cross_validation as cval - from sklearn.datasets import make_regression from sklearn.datasets import load_boston from sklearn.datasets import load_digits @@ -48,6 +44,10 @@ from sklearn.preprocessing import Imputer from sklearn.pipeline import Pipeline +with warnings.catch_warnings(): + warnings.simplefilter('ignore') + from sklearn import cross_validation as cval + class MockClassifier(object): """Dummy classifier to test the cross-validation""" @@ -490,10 +490,11 @@ def test_stratified_shuffle_split_iter(): for train, test in sss: assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions - p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1]) - / float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) - / float(len(y[test]))) + p_train = (np.bincount(np.unique(y[train], + return_inverse=True)[1]) / + float(len(y[train]))) + p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) / + float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) assert_equal(y[train].size + y[test].size, y.size) assert_array_equal(np.intersect1d(train, test), []) @@ -862,6 +863,7 @@ def train_test_split_pandas(): assert_true(isinstance(X_train, InputFeatureType)) assert_true(isinstance(X_test, InputFeatureType)) + def train_test_split_mock_pandas(): # X mock dataframe X_df = MockDataFrame(X) @@ -948,8 +950,8 @@ def test_permutation_score(): # test with custom scoring object def custom_score(y_true, y_pred): - return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) - / y_true.shape[0]) + return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) / + y_true.shape[0]) scorer = make_scorer(custom_score) score, _, pvalue = cval.permutation_test_score( @@ -1018,8 +1020,6 @@ def test_shufflesplit_errors(): assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, - train_size=None) def test_shufflesplit_reproducible(): @@ -1029,6 +1029,18 @@ def test_shufflesplit_reproducible(): assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) +def test_shufflesplit_train_test_size(): + # check that same sequence of train-test is given + # when setting train_size to be the complement of test_size + # and vice-versa + ss_default = cval.ShuffleSplit(10, random_state=0) + ss_train = cval.ShuffleSplit(10, random_state=0, train_size=.9) + ss_test = cval.ShuffleSplit(10, random_state=0, test_size=.1) + assert_array_equal(list(a for a, b in ss_default), + list(a for a, b in ss_train), + list(a for a, b in ss_test)) + + def test_safe_split_with_precomputed_kernel(): clf = SVC() clfp = SVC(kernel="precomputed")