-
-
Notifications
You must be signed in to change notification settings - Fork 25.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG+3] edit train/test_size default behavior #7459
Changes from 21 commits
2cd9ab3
8e0e817
f0dddd9
2746367
4aa0f77
2d1c51c
719662a
177c48d
e0ca540
2d50779
125844d
7abd7ad
18d5c1f
71aabb0
3707437
da58d82
95ce853
281bd53
51e6397
453ada5
7b3dd0f
beaf8d0
21b4b7a
5392f85
fd49cb9
a042aac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -916,7 +916,7 @@ def get_n_splits(self, X, y, groups): | |
class BaseShuffleSplit(with_metaclass(ABCMeta)): | ||
"""Base class for ShuffleSplit and StratifiedShuffleSplit""" | ||
|
||
def __init__(self, n_splits=10, test_size=0.1, train_size=None, | ||
def __init__(self, n_splits=10, test_size="default", train_size=None, | ||
random_state=None): | ||
_validate_shuffle_split_init(test_size, train_size) | ||
self.n_splits = n_splits | ||
|
@@ -994,16 +994,19 @@ class ShuffleSplit(BaseShuffleSplit): | |
|
||
Parameters | ||
---------- | ||
n_splits : int (default 10) | ||
n_splits : int, default 10 | ||
Number of re-shuffling & splitting iterations. | ||
|
||
test_size : float, int, or None, default 0.1 | ||
If float, should be between 0.0 and 1.0 and represent the | ||
proportion of the dataset to include in the test split. If | ||
int, represents the absolute number of test samples. If None, | ||
the value is automatically set to the complement of the train size. | ||
test_size : float, int, None, optional | ||
If float, should be between 0.0 and 1.0 and represent the proportion | ||
of the dataset to include in the test split. If int, represents the | ||
absolute number of test samples. If None, the value is set to the | ||
complement of the train size. By default, the value is set to 0.1. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should explicitly mention what "default" means here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Hmmm really? Quickly looking at the code below, it looks like the default value is 0.2 ... There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. whoops, good catch @lesteve. |
||
The default will change in version 0.21. It will remain 0.1 only | ||
if ``train_size`` is unspecified, otherwise it will complement | ||
the specified ``train_size``. | ||
|
||
train_size : float, int, or None (default is None) | ||
train_size : float, int, or None, default None | ||
If float, should be between 0.0 and 1.0 and represent the | ||
proportion of the dataset to include in the train split. If | ||
int, represents the absolute number of train samples. If None, | ||
|
@@ -1040,7 +1043,8 @@ class ShuffleSplit(BaseShuffleSplit): | |
|
||
def _iter_indices(self, X, y=None, groups=None): | ||
n_samples = _num_samples(X) | ||
n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, | ||
n_train, n_test = _validate_shuffle_split(n_samples, | ||
self.test_size, | ||
self.train_size) | ||
rng = check_random_state(self.random_state) | ||
for i in range(self.n_splits): | ||
|
@@ -1079,13 +1083,16 @@ class GroupShuffleSplit(ShuffleSplit): | |
n_splits : int (default 5) | ||
Number of re-shuffling & splitting iterations. | ||
|
||
test_size : float (default 0.2), int, or None | ||
If float, should be between 0.0 and 1.0 and represent the | ||
proportion of the groups to include in the test split. If | ||
int, represents the absolute number of test groups. If None, | ||
the value is automatically set to the complement of the train size. | ||
test_size : float, int, None, optional | ||
If float, should be between 0.0 and 1.0 and represent the proportion | ||
of the groups to include in the test split. If int, represents the | ||
absolute number of test groups. If None, the value is set to the | ||
complement of the train size. By default, the value is set to 0.2. | ||
The default will change in version 0.21. It will remain 0.2 only | ||
if ``train_size`` is unspecified, otherwise it will complement | ||
the specified ``train_size``. | ||
|
||
train_size : float, int, or None (default is None) | ||
train_size : float, int, or None, default None | ||
If float, should be between 0.0 and 1.0 and represent the | ||
proportion of the groups to include in the train split. If | ||
int, represents the absolute number of train groups. If None, | ||
|
@@ -1095,8 +1102,16 @@ class GroupShuffleSplit(ShuffleSplit): | |
Pseudo-random number generator state used for random sampling. | ||
''' | ||
|
||
def __init__(self, n_splits=5, test_size=0.2, train_size=None, | ||
def __init__(self, n_splits=5, test_size="default", train_size=None, | ||
random_state=None): | ||
if test_size == "default": | ||
if train_size is not None: | ||
warnings.warn("From version 0.21, test_size will always " | ||
"complement train_size unless both " | ||
"are specified.", | ||
FutureWarning) | ||
test_size = 0.2 | ||
|
||
super(GroupShuffleSplit, self).__init__( | ||
n_splits=n_splits, | ||
test_size=test_size, | ||
|
@@ -1203,16 +1218,19 @@ class StratifiedShuffleSplit(BaseShuffleSplit): | |
|
||
Parameters | ||
---------- | ||
n_splits : int (default 10) | ||
n_splits : int, default 10 | ||
Number of re-shuffling & splitting iterations. | ||
|
||
test_size : float (default 0.1), int, or None | ||
If float, should be between 0.0 and 1.0 and represent the | ||
proportion of the dataset to include in the test split. If | ||
int, represents the absolute number of test samples. If None, | ||
the value is automatically set to the complement of the train size. | ||
test_size : float, int, None, optional | ||
If float, should be between 0.0 and 1.0 and represent the proportion | ||
of the dataset to include in the test split. If int, represents the | ||
absolute number of test samples. If None, the value is set to the | ||
complement of the train size. By default, the value is set to 0.1. | ||
The default will change in version 0.21. It will remain 0.1 only | ||
if ``train_size`` is unspecified, otherwise it will complement | ||
the specified ``train_size``. | ||
|
||
train_size : float, int, or None (default is None) | ||
train_size : float, int, or None, default None | ||
If float, should be between 0.0 and 1.0 and represent the | ||
proportion of the dataset to include in the train split. If | ||
int, represents the absolute number of train samples. If None, | ||
|
@@ -1240,7 +1258,7 @@ class StratifiedShuffleSplit(BaseShuffleSplit): | |
TRAIN: [0 2] TEST: [3 1] | ||
""" | ||
|
||
def __init__(self, n_splits=10, test_size=0.1, train_size=None, | ||
def __init__(self, n_splits=10, test_size="default", train_size=None, | ||
random_state=None): | ||
super(StratifiedShuffleSplit, self).__init__( | ||
n_splits, test_size, train_size, random_state) | ||
|
@@ -1330,6 +1348,14 @@ def _validate_shuffle_split_init(test_size, train_size): | |
NOTE This does not take into account the number of samples which is known | ||
only at split | ||
""" | ||
if test_size == "default": | ||
if train_size is not None: | ||
warnings.warn("From version 0.21, test_size will always " | ||
"complement train_size unless both " | ||
"are specified.", | ||
FutureWarning) | ||
test_size = 0.1 | ||
|
||
if test_size is None and train_size is None: | ||
raise ValueError('test_size and train_size can not both be None') | ||
|
||
|
@@ -1364,16 +1390,21 @@ def _validate_shuffle_split(n_samples, test_size, train_size): | |
Validation helper to check if the test/train sizes are meaningful wrt the | ||
size of the data (n_samples) | ||
""" | ||
if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' and | ||
if (test_size is not None and | ||
np.asarray(test_size).dtype.kind == 'i' and | ||
test_size >= n_samples): | ||
raise ValueError('test_size=%d should be smaller than the number of ' | ||
'samples %d' % (test_size, n_samples)) | ||
|
||
if (train_size is not None and np.asarray(train_size).dtype.kind == 'i' and | ||
if (train_size is not None and | ||
np.asarray(train_size).dtype.kind == 'i' and | ||
train_size >= n_samples): | ||
raise ValueError("train_size=%d should be smaller than the number of" | ||
" samples %d" % (train_size, n_samples)) | ||
|
||
if test_size == "default": | ||
test_size = 0.1 | ||
|
||
if np.asarray(test_size).dtype.kind == 'f': | ||
n_test = ceil(test_size * n_samples) | ||
elif np.asarray(test_size).dtype.kind == 'i': | ||
|
@@ -1611,14 +1642,16 @@ def train_test_split(*arrays, **options): | |
Allowed inputs are lists, numpy arrays, scipy-sparse | ||
matrices or pandas dataframes. | ||
|
||
test_size : float, int, or None (default is None) | ||
If float, should be between 0.0 and 1.0 and represent the | ||
proportion of the dataset to include in the test split. If | ||
int, represents the absolute number of test samples. If None, | ||
the value is automatically set to the complement of the train size. | ||
If train size is also None, test size is set to 0.25. | ||
test_size : float, int, None, optional | ||
If float, should be between 0.0 and 1.0 and represent the proportion | ||
of the dataset to include in the test split. If int, represents the | ||
absolute number of test samples. If None, the value is set to the | ||
complement of the train size. By default, the value is set to 0.25. | ||
The default will change in version 0.21. It will remain 0.25 only | ||
if ``train_size`` is unspecified, otherwise it will complement | ||
the specified ``train_size``. | ||
|
||
train_size : float, int, or None (default is None) | ||
train_size : float, int, or None, default None | ||
If float, should be between 0.0 and 1.0 and represent the | ||
proportion of the dataset to include in the train split. If | ||
int, represents the absolute number of train samples. If None, | ||
|
@@ -1674,14 +1707,22 @@ def train_test_split(*arrays, **options): | |
n_arrays = len(arrays) | ||
if n_arrays == 0: | ||
raise ValueError("At least one array required as input") | ||
test_size = options.pop('test_size', None) | ||
test_size = options.pop('test_size', 'default') | ||
train_size = options.pop('train_size', None) | ||
random_state = options.pop('random_state', None) | ||
stratify = options.pop('stratify', None) | ||
|
||
if options: | ||
raise TypeError("Invalid parameters passed: %s" % str(options)) | ||
|
||
if test_size == 'default': | ||
test_size = None | ||
if train_size is not None: | ||
warnings.warn("From version 0.21, test_size will always " | ||
"complement train_size unless both " | ||
"are specified.", | ||
FutureWarning) | ||
|
||
if test_size is None and train_size is None: | ||
test_size = 0.25 | ||
|
||
|
@@ -1703,6 +1744,7 @@ def train_test_split(*arrays, **options): | |
|
||
train_test_split.__test__ = False # to avoid a pb with nosetests | ||
|
||
|
||
def _build_repr(self): | ||
# XXX This is copied from BaseEstimator's get_params | ||
cls = self.__class__ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should specify 'default "default"' in the same way the other parameters specify their defaults