Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

We’re showing branches in this repository, but you can also compare across forks.

base fork: scikit-learn/scikit-learn
...
head fork: scikit-learn/scikit-learn
  • 9 commits
  • 4 files changed
  • 0 commit comments
  • 2 contributors
8 doc/modules/cross_validation.rst
View
@@ -33,7 +33,7 @@ We can now quickly sample a training set while holding out 40% of the
data for testing (evaluating) our classifier::
>>> X_train, X_test, y_train, y_test = cross_validation.train_test_split(
- ... iris.data, iris.target, test_fraction=0.4, random_state=0)
+ ... iris.data, iris.target, test_size=0.4, random_state=0)
>>> X_train.shape, y_train.shape
((90, 4), (90,))
@@ -103,7 +103,7 @@ validation iterator instead, for instance::
>>> n_samples = iris.data.shape[0]
>>> cv = cross_validation.ShuffleSplit(n_samples, n_iterations=3,
- ... test_fraction=0.3, random_state=0)
+ ... test_size=0.3, random_state=0)
>>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=cv)
... # doctest: +ELLIPSIS
@@ -339,12 +339,12 @@ generator.
Here is a usage example::
- >>> ss = cross_validation.ShuffleSplit(5, n_iterations=3, test_fraction=0.25,
+ >>> ss = cross_validation.ShuffleSplit(5, n_iterations=3, test_size=0.25,
... random_state=0)
>>> len(ss)
3
>>> print ss # doctest: +ELLIPSIS
- ShuffleSplit(5, n_iterations=3, test_fraction=0.25, indices=True, ...)
+ ShuffleSplit(5, n_iterations=3, test_size=0.25, indices=True, ...)
>>> for train_index, test_index in ss:
... print train_index, test_index
12 doc/whats_new.rst
View
@@ -82,6 +82,9 @@ Changelog
``shrink_threshold`` parameter, which implements shrunken centroid
classification, by `Robert Layton`_.
+ - Classes in :ref:`neighbors` now support an arbitrary Minkowski metric for
+ nearest-neighbor searches. The metric is specified by the argument ``p``.
+
API changes summary
-------------------
@@ -159,6 +162,15 @@ API changes summary
- The SVMlight format loader now supports files with both zero-based and
one-based column indices, since both occur "in the wild".
+ - The options of :class:`ShuffleSplit` are now consistent with those of
+ :class:`StratifiedShuffleSplit`: ``test_fraction`` and ``train_fraction``
+ are deprecated in favor of ``test_size`` and ``train_size``, which accept
+ either a ``float`` (proportion) or an ``int`` (absolute number of samples).
+
+ - Argument ``p`` added to classes in :ref:`neighbors` to specify an
+ arbitrary Minkowski metric for nearest neighbors searches.
+
+
.. _changes_0_10:
0.10
197 sklearn/cross_validation.py
View
@@ -11,6 +11,7 @@
from itertools import combinations
from math import ceil, floor, factorial
import operator
+import warnings
import numpy as np
import scipy.sparse as sp
@@ -686,14 +687,16 @@ class ShuffleSplit(object):
n_iterations : int (default 10)
Number of re-shuffling & splitting iterations.
- test_fraction : float (default 0.1)
- Should be between 0.0 and 1.0 and represent the proportion of
- the dataset to include in the test split.
+ test_size : float (default 0.1) or int
+ If float, should be between 0.0 and 1.0 and represent the
+ proportion of the dataset to include in the test split. If
+ int, represents the absolute number of test samples.
- train_fraction : float or None (default is None)
- Should be between 0.0 and 1.0 and represent the proportion of
- the dataset to include in the train split. If None, the value is
- automatically set to the complement of the test fraction.
+ train_size : float, int, or None (default is None)
+ If float, should be between 0.0 and 1.0 and represent the
+ proportion of the dataset to include in the train split. If
+ int, represents the absolute number of train samples. If None,
+ the value is automatically set to the complement of the test size.
indices : boolean, optional (default True)
Return train/test split as arrays of indices, rather than a boolean
@@ -707,12 +710,12 @@ class ShuffleSplit(object):
--------
>>> from sklearn import cross_validation
>>> rs = cross_validation.ShuffleSplit(4, n_iterations=3,
- ... test_fraction=.25, random_state=0)
+ ... test_size=.25, random_state=0)
>>> len(rs)
3
>>> print rs
... # doctest: +ELLIPSIS
- ShuffleSplit(4, n_iterations=3, test_fraction=0.25, indices=True, ...)
+ ShuffleSplit(4, n_iterations=3, test_size=0.25, indices=True, ...)
>>> for train_index, test_index in rs:
... print "TRAIN:", train_index, "TEST:", test_index
...
@@ -721,7 +724,7 @@ class ShuffleSplit(object):
TRAIN: [0 2 1] TEST: [3]
>>> rs = cross_validation.ShuffleSplit(4, n_iterations=3,
- ... train_fraction=0.5, test_fraction=.25, random_state=0)
+ ... train_size=0.5, test_size=.25, random_state=0)
>>> for train_index, test_index in rs:
... print "TRAIN:", train_index, "TEST:", test_index
...
@@ -734,36 +737,39 @@ class ShuffleSplit(object):
Bootstrap: cross-validation using re-sampling with replacement.
"""
- def __init__(self, n, n_iterations=10, test_fraction=0.1,
- train_fraction=None, indices=True, random_state=None):
+ def __init__(self, n, n_iterations=10, test_size=0.1,
+ train_size=None, indices=True, random_state=None,
+ test_fraction=None, train_fraction=None):
self.n = n
self.n_iterations = n_iterations
- self.test_fraction = test_fraction
- self.train_fraction = train_fraction
+
+ if test_fraction is not None:
+ warnings.warn(
+ "test_fraction is deprecated in 0.11 and scheduled for "
+ "removal in 0.12, use test_size instead")
+ test_size = test_fraction
+ if train_fraction is not None:
+ warnings.warn(
+ "train_fraction is deprecated in 0.11 and scheduled for "
+ "removal in 0.12, use train_size instead")
+ train_size = train_fraction
+
+ self.test_size = test_size
+ self.train_size = train_size
self.random_state = random_state
self.indices = indices
- if test_fraction >= 1.0:
- raise ValueError(
- "test_fraction=%f should be smaller than 1.0" % test_fraction)
- if (train_fraction is not None
- and train_fraction + test_fraction > 1.0):
- raise ValueError(
- 'The sum of train_fraction=%f and test_fraction=%f '
- 'should be smaller or equal than 1.0' %
- (train_fraction, test_fraction))
+
+ self.n_train, self.n_test = _validate_shuffle_split(n,
+ test_size,
+ train_size)
def __iter__(self):
rng = check_random_state(self.random_state)
- n_test = ceil(self.test_fraction * self.n)
- if self.train_fraction is None:
- n_train = self.n - n_test
- else:
- n_train = floor(self.train_fraction * self.n)
for i in range(self.n_iterations):
# random partition
permutation = rng.permutation(self.n)
- ind_test = permutation[:n_test]
- ind_train = permutation[n_test:n_test + n_train]
+ ind_test = permutation[:self.n_test]
+ ind_train = permutation[self.n_test:self.n_test + self.n_train]
if self.indices:
yield ind_train, ind_test
@@ -775,12 +781,12 @@ def __iter__(self):
yield train_mask, test_mask
def __repr__(self):
- return ('%s(%d, n_iterations=%d, test_fraction=%s, indices=%s, '
+ return ('%s(%d, n_iterations=%d, test_size=%s, indices=%s, '
'random_state=%s)' % (
self.__class__.__name__,
self.n,
self.n_iterations,
- str(self.test_fraction),
+ str(self.test_size),
self.indices,
self.random_state,
))
@@ -789,57 +795,72 @@ def __len__(self):
return self.n_iterations
-def _validate_stratified_shuffle_split(y, test_size, train_size):
- y = unique(y, return_inverse=True)[1]
- if np.min(np.bincount(y)) < 2:
- raise ValueError("The least populated class in y has only 1"
- " member, which is too few. The minimum"
- " number of labels for any class cannot"
- " be less than 2.")
-
- if isinstance(test_size, float) and test_size >= 1.:
- raise ValueError(
- 'test_size=%f should be smaller '
- 'than 1.0 or be an integer' % test_size)
- elif isinstance(test_size, int) and test_size >= y.size:
- raise ValueError(
- 'test_size=%d should be smaller '
- 'than the number of samples %d' % (test_size, y.size))
+def _validate_shuffle_split(n, test_size, train_size):
+ if np.asarray(test_size).dtype.kind == 'f':
+ if test_size >= 1.:
+ raise ValueError(
+ 'test_size=%f should be smaller '
+ 'than 1.0 or be an integer' % test_size)
+ elif np.asarray(test_size).dtype.kind == 'i':
+ if test_size >= n:
+ raise ValueError(
+ 'test_size=%d should be smaller '
+ 'than the number of samples %d' % (test_size, n))
+ else:
+ raise ValueError("Invalid value for test_size: %r" % test_size)
if train_size is not None:
- if isinstance(train_size, float) and train_size >= 1.:
- raise ValueError("train_size=%f should be smaller "
- "than 1.0 or be an integer" % train_size)
- elif isinstance(train_size, int) and train_size >= y.size:
- raise ValueError("train_size=%d should be smaller "
- "than the number of samples %d" %
- (train_size, y.size))
-
- if isinstance(test_size, float):
- n_test = ceil(test_size * y.size)
+ if np.asarray(train_size).dtype.kind == 'f':
+ if train_size >= 1.:
+ raise ValueError("train_size=%f should be smaller "
+ "than 1.0 or be an integer" % train_size)
+ elif np.asarray(test_size).dtype.kind == 'f' and \
+ train_size + test_size > 1.:
+ raise ValueError('The sum of test_size and train_size = %f, '
+ 'should be smaller than 1.0. Reduce '
+ 'test_size and/or train_size.' %
+ (train_size + test_size))
+ elif np.asarray(train_size).dtype.kind == 'i':
+ if train_size >= n:
+ raise ValueError("train_size=%d should be smaller "
+ "than the number of samples %d" %
+ (train_size, n))
+ else:
+ raise ValueError("Invalid value for train_size: %r" % train_size)
+
+ if np.asarray(test_size).dtype.kind == 'f':
+ n_test = ceil(test_size * n)
else:
n_test = float(test_size)
if train_size is None:
- if isinstance(test_size, float):
- n_train = y.size - n_test
- else:
- n_train = float(y.size - test_size)
+ n_train = n - n_test
else:
- if isinstance(train_size, float):
- n_train = floor(train_size * y.size)
+ if np.asarray(train_size).dtype.kind == 'f':
+ n_train = floor(train_size * n)
else:
n_train = float(train_size)
- if n_train + n_test > y.size:
- raise ValueError('The sum of n_train and n_test = %d, should '
- 'be smaller than the number of samples %d. '
- 'Reduce test_size and/or train_size.' %
- (n_train + n_test, y.size))
+ if n_train + n_test > n:
+ raise ValueError('The sum of train_size and test_size = %d, '
+ 'should be smaller than the number of '
+ 'samples %d. Reduce test_size and/or '
+ 'train_size.' % (n_train + n_test, n))
return n_train, n_test
+def _validate_stratified_shuffle_split(y, test_size, train_size):
+ y = unique(y, return_inverse=True)[1]
+ if np.min(np.bincount(y)) < 2:
+ raise ValueError("The least populated class in y has only 1"
+ " member, which is too few. The minimum"
+ " number of labels for any class cannot"
+ " be less than 2.")
+
+ return _validate_shuffle_split(y.size, test_size, train_size)
+
+
class StratifiedShuffleSplit(object):
"""Stratified ShuffleSplit cross validation iterator
@@ -1184,14 +1205,16 @@ def train_test_split(*arrays, **options):
Python lists or tuples occurring in arrays are converted to 1D numpy
arrays.
- test_fraction : float (default 0.25)
- Should be between 0.0 and 1.0 and represent the proportion of
- the dataset to include in the test split.
+ test_size : float (default 0.25) or int
+ If float, should be between 0.0 and 1.0 and represent the
+ proportion of the dataset to include in the test split. If
+ int, represents the absolute number of test samples.
- train_fraction : float or None (default is None)
- Should be between 0.0 and 1.0 and represent the proportion of
- the dataset to include in the train split. If None, the value is
- automatically set to the complement of the test fraction.
+ train_size : float, int, or None (default is None)
+ If float, should be between 0.0 and 1.0 and represent the
+ proportion of the dataset to include in the train split. If
+ int, represents the absolute number of train samples. If None,
+ the value is automatically set to the complement of the test size.
random_state : int or RandomState
Pseudo-random number generator state used for random sampling.
@@ -1214,7 +1237,7 @@ def train_test_split(*arrays, **options):
[0, 1, 2, 3, 4]
>>> a_train, a_test, b_train, b_test = train_test_split(
- ... a, b, test_fraction=0.33, random_state=42)
+ ... a, b, test_size=0.33, random_state=42)
...
>>> a_train
array([[4, 5],
@@ -1233,15 +1256,29 @@ def train_test_split(*arrays, **options):
if n_arrays == 0:
raise ValueError("At least one array required as input")
- test_fraction = options.pop('test_fraction', 0.25)
+ test_fraction = options.pop('test_fraction', None)
+ if test_fraction is not None:
+ warnings.warn(
+ "test_fraction is deprecated in 0.11 and scheduled for "
+ "removal in 0.12, use test_size instead")
+ else:
+ test_fraction = 0.25
+
train_fraction = options.pop('train_fraction', None)
+ if train_fraction is not None:
+ warnings.warn(
+ "train_fraction is deprecated in 0.11 and scheduled for "
+ "removal in 0.12, use train_size instead")
+
+ test_size = options.pop('test_size', test_fraction)
+ train_size = options.pop('train_size', train_fraction)
random_state = options.pop('random_state', None)
options['sparse_format'] = 'csr'
arrays = check_arrays(*arrays, **options)
n_samples = arrays[0].shape[0]
- cv = ShuffleSplit(n_samples, test_fraction=test_fraction,
- train_fraction=train_fraction,
+ cv = ShuffleSplit(n_samples, test_size=test_size,
+ train_size=train_size,
random_state=random_state,
indices=True)
train, test = iter(cv).next()
51 sklearn/tests/test_cross_validation.py
View
@@ -1,9 +1,10 @@
"""Test the cross_validation module"""
+import warnings
import numpy as np
from scipy.sparse import coo_matrix
-from nose.tools import assert_true
+from nose.tools import assert_true, assert_equal
from nose.tools import assert_raises
from ..base import BaseEstimator
@@ -91,6 +92,20 @@ def test_shuffle_kfold():
assert_array_equal(all_folds, ind)
+def test_shuffle_split():
+ ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
+ ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
+ ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
+ ss4 = cval.ShuffleSplit(10, test_size=long(2), random_state=0)
+ for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
+ assert_array_equal(t1[0], t2[0])
+ assert_array_equal(t2[0], t3[0])
+ assert_array_equal(t3[0], t4[0])
+ assert_array_equal(t1[1], t2[1])
+ assert_array_equal(t2[1], t3[1])
+ assert_array_equal(t3[1], t4[1])
+
+
def test_stratified_shuffle_split():
y = np.asarray([0, 1, 1, 1, 2, 2, 2])
# Check that error is raised if there is a class with only one sample
@@ -133,14 +148,40 @@ def test_cross_val_score():
def test_train_test_split_errors():
assert_raises(ValueError, cval.train_test_split)
assert_raises(ValueError, cval.train_test_split, range(3),
- train_fraction=1.1)
+ train_size=1.1)
+ assert_raises(ValueError, cval.train_test_split, range(3),
+ test_size=0.6, train_size=0.6)
+ assert_raises(ValueError, cval.train_test_split, range(3),
+ test_size=np.float32(0.6), train_size=np.float32(0.6))
+ assert_raises(ValueError, cval.train_test_split, range(3),
+ test_size="wrong_type")
assert_raises(ValueError, cval.train_test_split, range(3),
- test_fraction=0.6, train_fraction=0.6)
+ test_size=2, train_size=4)
assert_raises(TypeError, cval.train_test_split, range(3),
some_argument=1.1)
assert_raises(ValueError, cval.train_test_split, range(3), range(42))
+def test_shuffle_split_warnings():
+ # change warnings.warn to catch the message
+ expected_message = ("test_fraction is deprecated in 0.11 and scheduled "
+ "for removal in 0.12, use test_size instead",
+ "train_fraction is deprecated in 0.11 and scheduled "
+ "for removal in 0.12, use train_size instead")
+
+ with warnings.catch_warnings(record=True) as warn_queue:
+ cval.ShuffleSplit(10, 3, test_fraction=0.1)
+ cval.ShuffleSplit(10, 3, train_fraction=0.1)
+ cval.train_test_split(range(3), test_fraction=0.1)
+ cval.train_test_split(range(3), train_fraction=0.1)
+
+ assert_equal(len(warn_queue), 4)
+ assert_equal(warn_queue[0].message.message, expected_message[0])
+ assert_equal(warn_queue[1].message.message, expected_message[1])
+ assert_equal(warn_queue[2].message.message, expected_message[0])
+ assert_equal(warn_queue[3].message.message, expected_message[1])
+
+
def test_train_test_split():
X = np.arange(100).reshape((10, 10))
X_s = coo_matrix(X)
@@ -287,6 +328,10 @@ def test_shufflesplit_errors():
assert_raises(ValueError, cval.ShuffleSplit, 10, test_fraction=1.0)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_fraction=0.1,
train_fraction=0.95)
+ assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
+ assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
+ assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8,
+ train_size=3)
def test_shufflesplit_reproducible():

No commit comments for this range

Something went wrong with that request. Please try again.