Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+3] edit train/test_size default behavior #7459

Merged
merged 26 commits into from
Jun 14, 2017
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2cd9ab3
edit train/test_size default behavior
nelson-liu Sep 19, 2016
8e0e817
revert changes to cross_Validation
nelson-liu Sep 20, 2016
f0dddd9
fix improper merge resolution
nelson-liu Sep 20, 2016
2746367
edit default train/test_size behavior for other splitters
nelson-liu Sep 21, 2016
4aa0f77
add deprecation warnings to groupshufflesplit and train_test_split
nelson-liu Oct 4, 2016
2d1c51c
edit train/test_size default behavior
nelson-liu Sep 19, 2016
719662a
revert changes to cross_Validation
nelson-liu Sep 20, 2016
177c48d
correctly format docstrings and remove warnings of changed default va…
nelson-liu Dec 25, 2016
e0ca540
restored original behavior with added DeprecationWarnings
nelson-liu Dec 25, 2016
2d50779
add unit tests for deprecationwarnings
nelson-liu Dec 25, 2016
125844d
reset GroupShuffleSplit default test_size to 0.2
nelson-liu Dec 25, 2016
7abd7ad
remove extraneous test
nelson-liu Dec 25, 2016
18d5c1f
fix flake8 violations
nelson-liu Dec 25, 2016
71aabb0
fix indentation error overriding test size in groupsamplesplit
nelson-liu Dec 25, 2016
3707437
change DeprecationWarnings to FutureWarnings
nelson-liu Dec 25, 2016
da58d82
reword docstrings for test_size parameters
nelson-liu Dec 25, 2016
95ce853
fix flake8 error in line length
nelson-liu Dec 25, 2016
281bd53
remove extraneous newline
nelson-liu Dec 25, 2016
51e6397
edit indentation errors and clarify future test_size behavior
nelson-liu Dec 26, 2016
453ada5
ignore FutureWarnings in unrelated tests
nelson-liu Dec 26, 2016
7b3dd0f
fix flake8 error
nelson-liu Dec 27, 2016
beaf8d0
add more details about the defaults
nelson-liu Mar 7, 2017
21b4b7a
Merge branch 'master' into edit_train_test_split_api
nelson-liu Mar 7, 2017
5392f85
fix typo in GroupShuffleSplit stating default is 0.1
nelson-liu Mar 10, 2017
fd49cb9
Add what's new
jnothman Jun 14, 2017
a042aac
Merge branch 'master' into edit_train_test_split_api
jnothman Jun 14, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 76 additions & 34 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,7 @@ def get_n_splits(self, X, y, groups):
class BaseShuffleSplit(with_metaclass(ABCMeta)):
"""Base class for ShuffleSplit and StratifiedShuffleSplit"""

def __init__(self, n_splits=10, test_size=0.1, train_size=None,
def __init__(self, n_splits=10, test_size="default", train_size=None,
random_state=None):
_validate_shuffle_split_init(test_size, train_size)
self.n_splits = n_splits
Expand Down Expand Up @@ -994,16 +994,19 @@ class ShuffleSplit(BaseShuffleSplit):

Parameters
----------
n_splits : int (default 10)
n_splits : int, default 10
Number of re-shuffling & splitting iterations.

test_size : float, int, or None, default 0.1
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the test split. If
int, represents the absolute number of test samples. If None,
the value is automatically set to the complement of the train size.
test_size : float, int, None, optional
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should specify 'default "default"' in the same way the other parameters specify their defaults

If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default, the value is set to 0.1.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should explicitly mention what "default" means here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By default, the value is set to 0.1.

Hmmm really? Quickly looking at the code below, it looks like the default value is 0.2 ...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whoops, good catch @lesteve .

The default will change in version 0.21. It will remain 0.1 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.

train_size : float, int, or None (default is None)
train_size : float, int, or None, default None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
Expand Down Expand Up @@ -1040,7 +1043,8 @@ class ShuffleSplit(BaseShuffleSplit):

def _iter_indices(self, X, y=None, groups=None):
n_samples = _num_samples(X)
n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
n_train, n_test = _validate_shuffle_split(n_samples,
self.test_size,
self.train_size)
rng = check_random_state(self.random_state)
for i in range(self.n_splits):
Expand Down Expand Up @@ -1079,13 +1083,16 @@ class GroupShuffleSplit(ShuffleSplit):
n_splits : int (default 5)
Number of re-shuffling & splitting iterations.

test_size : float (default 0.2), int, or None
If float, should be between 0.0 and 1.0 and represent the
proportion of the groups to include in the test split. If
int, represents the absolute number of test groups. If None,
the value is automatically set to the complement of the train size.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the groups to include in the test split. If int, represents the
absolute number of test groups. If None, the value is set to the
complement of the train size. By default, the value is set to 0.2.
The default will change in version 0.21. It will remain 0.2 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.

train_size : float, int, or None (default is None)
train_size : float, int, or None, default is None
If float, should be between 0.0 and 1.0 and represent the
proportion of the groups to include in the train split. If
int, represents the absolute number of train groups. If None,
Expand All @@ -1095,8 +1102,16 @@ class GroupShuffleSplit(ShuffleSplit):
Pseudo-random number generator state used for random sampling.
'''

def __init__(self, n_splits=5, test_size=0.2, train_size=None,
def __init__(self, n_splits=5, test_size="default", train_size=None,
random_state=None):
if test_size == "default":
if train_size is not None:
warnings.warn("From version 0.21, test_size will always "
"complement train_size unless both "
"are specified.",
FutureWarning)
test_size = 0.2

super(GroupShuffleSplit, self).__init__(
n_splits=n_splits,
test_size=test_size,
Expand Down Expand Up @@ -1203,16 +1218,19 @@ class StratifiedShuffleSplit(BaseShuffleSplit):

Parameters
----------
n_splits : int (default 10)
n_splits : int, default 10
Number of re-shuffling & splitting iterations.

test_size : float (default 0.1), int, or None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the test split. If
int, represents the absolute number of test samples. If None,
the value is automatically set to the complement of the train size.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default, the value is set to 0.1.
The default will change in version 0.21. It will remain 0.1 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.

train_size : float, int, or None (default is None)
train_size : float, int, or None, default is None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
Expand Down Expand Up @@ -1240,7 +1258,7 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
TRAIN: [0 2] TEST: [3 1]
"""

def __init__(self, n_splits=10, test_size=0.1, train_size=None,
def __init__(self, n_splits=10, test_size="default", train_size=None,
random_state=None):
super(StratifiedShuffleSplit, self).__init__(
n_splits, test_size, train_size, random_state)
Expand Down Expand Up @@ -1330,6 +1348,14 @@ def _validate_shuffle_split_init(test_size, train_size):
NOTE This does not take into account the number of samples which is known
only at split
"""
if test_size == "default":
if train_size is not None:
warnings.warn("From version 0.21, test_size will always "
"complement train_size unless both "
"are specified.",
FutureWarning)
test_size = 0.1

if test_size is None and train_size is None:
raise ValueError('test_size and train_size can not both be None')

Expand Down Expand Up @@ -1364,16 +1390,21 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
Validation helper to check if the test/train sizes are meaningful wrt the
size of the data (n_samples)
"""
if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' and
if (test_size is not None and
np.asarray(test_size).dtype.kind == 'i' and
test_size >= n_samples):
raise ValueError('test_size=%d should be smaller than the number of '
'samples %d' % (test_size, n_samples))

if (train_size is not None and np.asarray(train_size).dtype.kind == 'i' and
if (train_size is not None and
np.asarray(train_size).dtype.kind == 'i' and
train_size >= n_samples):
raise ValueError("train_size=%d should be smaller than the number of"
" samples %d" % (train_size, n_samples))

if test_size == "default":
test_size = 0.1

if np.asarray(test_size).dtype.kind == 'f':
n_test = ceil(test_size * n_samples)
elif np.asarray(test_size).dtype.kind == 'i':
Expand Down Expand Up @@ -1611,14 +1642,16 @@ def train_test_split(*arrays, **options):
Allowed inputs are lists, numpy arrays, scipy-sparse
matrices or pandas dataframes.

test_size : float, int, or None (default is None)
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the test split. If
int, represents the absolute number of test samples. If None,
the value is automatically set to the complement of the train size.
If train size is also None, test size is set to 0.25.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default, the value is set to 0.25.
The default will change in version 0.21. It will remain 0.25 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.

train_size : float, int, or None (default is None)
train_size : float, int, or None, default None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
Expand Down Expand Up @@ -1674,14 +1707,22 @@ def train_test_split(*arrays, **options):
n_arrays = len(arrays)
if n_arrays == 0:
raise ValueError("At least one array required as input")
test_size = options.pop('test_size', None)
test_size = options.pop('test_size', 'default')
train_size = options.pop('train_size', None)
random_state = options.pop('random_state', None)
stratify = options.pop('stratify', None)

if options:
raise TypeError("Invalid parameters passed: %s" % str(options))

if test_size == 'default':
test_size = None
if train_size is not None:
warnings.warn("From version 0.21, test_size will always "
"complement train_size unless both "
"are specified.",
FutureWarning)

if test_size is None and train_size is None:
test_size = 0.25

Expand All @@ -1703,6 +1744,7 @@ def train_test_split(*arrays, **options):

train_test_split.__test__ = False # to avoid a pb with nosetests


def _build_repr(self):
# XXX This is copied from BaseEstimator's get_params
cls = self.__class__
Expand Down
16 changes: 14 additions & 2 deletions sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.validation import _num_samples
Expand Down Expand Up @@ -160,8 +161,8 @@ def test_cross_validator_with_default_params():
skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
lolo_repr = "LeaveOneGroupOut()"
lopo_repr = "LeavePGroupsOut(n_groups=2)"
ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, "
"train_size=None)")
ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
"test_size='default',\n train_size=None)")
ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
Expand Down Expand Up @@ -517,6 +518,7 @@ def test_shuffle_split():
assert_array_equal(t3[1], t4[1])


@ignore_warnings
def test_stratified_shuffle_split_init():
X = np.arange(7)
y = np.asarray([0, 1, 1, 1, 2, 2, 2])
Expand Down Expand Up @@ -804,6 +806,7 @@ def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
LeavePGroupsOut(n_groups=3).split(X, y, groups))


@ignore_warnings
def test_train_test_split_errors():
assert_raises(ValueError, train_test_split)
assert_raises(ValueError, train_test_split, range(3), train_size=1.1)
Expand Down Expand Up @@ -920,6 +923,7 @@ def train_test_split_list_input():
np.testing.assert_equal(y_test3, y_test2)


@ignore_warnings
def test_shufflesplit_errors():
# When the {test|train}_size is a float/invalid, error is raised at init
assert_raises(ValueError, ShuffleSplit, test_size=None, train_size=None)
Expand Down Expand Up @@ -1193,6 +1197,14 @@ def test_nested_cv():
fit_params={'groups': groups})


def test_train_test_default_warning():
assert_warns(FutureWarning, ShuffleSplit, train_size=0.75)
assert_warns(FutureWarning, GroupShuffleSplit, train_size=0.75)
assert_warns(FutureWarning, StratifiedShuffleSplit, train_size=0.75)
assert_warns(FutureWarning, train_test_split, range(3),
train_size=0.75)


def test_build_repr():
class MockSplitter:
def __init__(self, a, b=0, c=None):
Expand Down