[MRG+1] fix sampling in stratified shuffle split (#6472)

amueller · ogrisel · commit 31a4691d7671 · 2016-09-09T19:00:49.000+02:00
Fix sampling in stratified shuffle split, break tests that test sampling.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -397,6 +397,11 @@ Bug fixes
     - Fix :class:`linear_model.ElasticNet` sparse decision function to match
       output with dense in the multioutput case.
 
+    - Fix in :class:`sklearn.model_selection.StratifiedShuffleSplit` to
+      return splits of size ``train_size`` and ``test_size`` in all cases
+      (`#6472 <https://github.com/scikit-learn/scikit-learn/pull/6472>`).
+      By `Andreas Müller`_.
+
 API changes summary
 -------------------
 
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
@@ -27,6 +27,7 @@
 from .utils.validation import (_is_arraylike, _num_samples,
                                column_or_1d)
 from .utils.multiclass import type_of_target
+from .utils.random import choice
 from .externals.joblib import Parallel, delayed, logger
 from .externals.six import with_metaclass
 from .externals.six.moves import zip
@@ -414,9 +415,9 @@ def __init__(self, labels, n_folds=3):
 
         if n_folds > n_labels:
             raise ValueError(
-                    ("Cannot have number of folds n_folds={0} greater"
-                     " than the number of labels: {1}.").format(n_folds,
-                                                                n_labels))
+                ("Cannot have number of folds n_folds={0} greater"
+                 " than the number of labels: {1}.").format(n_folds,
+                                                            n_labels))
 
         # Weight labels by their number of occurrences
         n_samples_per_label = np.bincount(labels)
@@ -906,6 +907,59 @@ def _validate_shuffle_split(n, test_size, train_size):
     return int(n_train), int(n_test)
 
 
+def _approximate_mode(class_counts, n_draws, rng):
+    """Computes approximate mode of multivariate hypergeometric.
+
+    This is an approximation to the mode of the multivariate
+    hypergeometric given by class_counts and n_draws.
+    It shouldn't be off by more than one.
+
+    It is the mostly likely outcome of drawing n_draws many
+    samples from the population given by class_counts.
+
+    Parameters
+    ----------
+    class_counts : ndarray of int
+        Population per class.
+    n_draws : int
+        Number of draws (samples to draw) from the overall population.
+    rng : random state
+        Used to break ties.
+
+    Returns
+    -------
+    sampled_classes : ndarray of int
+        Number of samples drawn from each class.
+        np.sum(sampled_classes) == n_draws
+    """
+    # this computes a bad approximation to the mode of the
+    # multivariate hypergeometric given by class_counts and n_draws
+    continuous = n_draws * class_counts / class_counts.sum()
+    # floored means we don't overshoot n_samples, but probably undershoot
+    floored = np.floor(continuous)
+    # we add samples according to how much "left over" probability
+    # they had, until we arrive at n_samples
+    need_to_add = int(n_draws - floored.sum())
+    if need_to_add > 0:
+        remainder = continuous - floored
+        values = np.sort(np.unique(remainder))[::-1]
+        # add according to remainder, but break ties
+        # randomly to avoid biases
+        for value in values:
+            inds, = np.where(remainder == value)
+            # if we need_to_add less than what's in inds
+            # we draw randomly from them.
+            # if we need to add more, we add them all and
+            # go to the next value
+            add_now = min(len(inds), need_to_add)
+            inds = choice(inds, size=add_now, replace=False, random_state=rng)
+            floored[inds] += 1
+            need_to_add -= add_now
+            if need_to_add == 0:
+                    break
+    return floored.astype(np.int)
+
+
 class StratifiedShuffleSplit(BaseShuffleSplit):
     """Stratified ShuffleSplit cross validation iterator
 
@@ -991,39 +1045,24 @@ def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
     def _iter_indices(self):
         rng = check_random_state(self.random_state)
         cls_count = bincount(self.y_indices)
-        p_i = cls_count / float(self.n)
-        n_i = np.round(self.n_train * p_i).astype(int)
-        t_i = np.minimum(cls_count - n_i,
-                         np.round(self.n_test * p_i).astype(int))
 
         for n in range(self.n_iter):
+            # if there are ties in the class-counts, we want
+            # to make sure to break them anew in each iteration
+            n_i = _approximate_mode(cls_count, self.n_train, rng)
+            class_counts_remaining = cls_count - n_i
+            t_i = _approximate_mode(class_counts_remaining, self.n_test, rng)
+
             train = []
             test = []
 
-            for i, cls in enumerate(self.classes):
+            for i, _ in enumerate(self.classes):
                 permutation = rng.permutation(cls_count[i])
-                cls_i = np.where((self.y == cls))[0][permutation]
-
-                train.extend(cls_i[:n_i[i]])
-                test.extend(cls_i[n_i[i]:n_i[i] + t_i[i]])
-
-            # Because of rounding issues (as n_train and n_test are not
-            # dividers of the number of elements per class), we may end
-            # up here with less samples in train and test than asked for.
-            if len(train) + len(test) < self.n_train + self.n_test:
-                # We complete by affecting randomly the missing indexes
-                missing_idx = np.where(bincount(train + test,
-                                                minlength=len(self.y)) == 0,
-                                       )[0]
-                missing_idx = rng.permutation(missing_idx)
-                n_missing_train = self.n_train - len(train)
-                n_missing_test = self.n_test - len(test)
-
-                if n_missing_train > 0:
-                    train.extend(missing_idx[:n_missing_train])
-                if n_missing_test > 0:
-                    test.extend(missing_idx[-n_missing_test:])
+                perm_indices_class_i = np.where(
+                    (i == self.y_indices))[0][permutation]
 
+                train.extend(perm_indices_class_i[:n_i[i]])
+                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])
             train = rng.permutation(train)
             test = rng.permutation(test)
 
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
@@ -30,6 +30,7 @@
 from ..externals.six.moves import zip
 from ..utils.fixes import bincount
 from ..utils.fixes import signature
+from ..utils.random import choice
 from ..base import _pprint
 from ..gaussian_process.kernels import Kernel as GPKernel
 
@@ -1098,6 +1099,73 @@ def _iter_indices(self, X, y, labels):
             yield train, test
 
 
+def _approximate_mode(class_counts, n_draws, rng):
+    """Computes approximate mode of multivariate hypergeometric.
+
+    This is an approximation to the mode of the multivariate
+    hypergeometric given by class_counts and n_draws.
+    It shouldn't be off by more than one.
+
+    It is the mostly likely outcome of drawing n_draws many
+    samples from the population given by class_counts.
+
+    Parameters
+    ----------
+    class_counts : ndarray of int
+        Population per class.
+    n_draws : int
+        Number of draws (samples to draw) from the overall population.
+    rng : random state
+        Used to break ties.
+
+    Returns
+    -------
+    sampled_classes : ndarray of int
+        Number of samples drawn from each class.
+        np.sum(sampled_classes) == n_draws
+
+    Examples
+    --------
+    >>> from sklearn.model_selection._split import _approximate_mode
+    >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
+    array([2, 1])
+    >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
+    array([3, 1])
+    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
+    ...                   n_draws=2, rng=0)
+    array([0, 1, 1, 0])
+    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
+    ...                   n_draws=2, rng=42)
+    array([1, 1, 0, 0])
+    """
+    # this computes a bad approximation to the mode of the
+    # multivariate hypergeometric given by class_counts and n_draws
+    continuous = n_draws * class_counts / class_counts.sum()
+    # floored means we don't overshoot n_samples, but probably undershoot
+    floored = np.floor(continuous)
+    # we add samples according to how much "left over" probability
+    # they had, until we arrive at n_samples
+    need_to_add = int(n_draws - floored.sum())
+    if need_to_add > 0:
+        remainder = continuous - floored
+        values = np.sort(np.unique(remainder))[::-1]
+        # add according to remainder, but break ties
+        # randomly to avoid biases
+        for value in values:
+            inds, = np.where(remainder == value)
+            # if we need_to_add less than what's in inds
+            # we draw randomly from them.
+            # if we need to add more, we add them all and
+            # go to the next value
+            add_now = min(len(inds), need_to_add)
+            inds = choice(inds, size=add_now, replace=False, random_state=rng)
+            floored[inds] += 1
+            need_to_add -= add_now
+            if need_to_add == 0:
+                break
+    return floored.astype(np.int)
+
+
 class StratifiedShuffleSplit(BaseShuffleSplit):
     """Stratified ShuffleSplit cross-validator
 
@@ -1181,12 +1249,14 @@ def _iter_indices(self, X, y, labels=None):
                              (n_test, n_classes))
 
         rng = check_random_state(self.random_state)
-        p_i = class_counts / float(n_samples)
-        n_i = np.round(n_train * p_i).astype(int)
-        t_i = np.minimum(class_counts - n_i,
-                         np.round(n_test * p_i).astype(int))
 
         for _ in range(self.n_splits):
+            # if there are ties in the class-counts, we want
+            # to make sure to break them anew in each iteration
+            n_i = _approximate_mode(class_counts, n_train, rng)
+            class_counts_remaining = class_counts - n_i
+            t_i = _approximate_mode(class_counts_remaining, n_test, rng)
+
             train = []
             test = []
 
@@ -1196,23 +1266,6 @@ def _iter_indices(self, X, y, labels=None):
 
                 train.extend(perm_indices_class_i[:n_i[i]])
                 test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])
-
-            # Because of rounding issues (as n_train and n_test are not
-            # dividers of the number of elements per class), we may end
-            # up here with less samples in train and test than asked for.
-            if len(train) + len(test) < n_train + n_test:
-                # We complete by affecting randomly the missing indexes
-                missing_indices = np.where(bincount(train + test,
-                                                    minlength=len(y)) == 0)[0]
-                missing_indices = rng.permutation(missing_indices)
-                n_missing_train = n_train - len(train)
-                n_missing_test = n_test - len(test)
-
-                if n_missing_train > 0:
-                    train.extend(missing_indices[:n_missing_train])
-                if n_missing_test > 0:
-                    test.extend(missing_indices[-n_missing_test:])
-
             train = rng.permutation(train)
             test = rng.permutation(test)
 
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
@@ -535,17 +535,33 @@ def test_stratified_shuffle_split_init():
                   StratifiedShuffleSplit(test_size=2).split(X, y))
 
 
+def test_stratified_shuffle_split_respects_test_size():
+    y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2])
+    test_size = 5
+    train_size = 10
+    sss = StratifiedShuffleSplit(6, test_size=test_size, train_size=train_size,
+                                 random_state=0).split(np.ones(len(y)), y)
+    for train, test in sss:
+        assert_equal(len(train), train_size)
+        assert_equal(len(test), test_size)
+
+
 def test_stratified_shuffle_split_iter():
     ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
           np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
-          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
+          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
           np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
-          np.array([-1] * 800 + [1] * 50)
+          np.array([-1] * 800 + [1] * 50),
+          np.concatenate([[i] * (100 + i) for i in range(11)])
           ]
 
     for y in ys:
         sss = StratifiedShuffleSplit(6, test_size=0.33,
                                      random_state=0).split(np.ones(len(y)), y)
+        # this is how test-size is computed internally
+        # in _validate_shuffle_split
+        test_size = np.ceil(0.33 * len(y))
+        train_size = len(y) - test_size
         for train, test in sss:
             assert_array_equal(np.unique(y[train]), np.unique(y[test]))
             # Checks if folds keep classes proportions
@@ -556,7 +572,9 @@ def test_stratified_shuffle_split_iter():
                                   return_inverse=True)[1]) /
                       float(len(y[test])))
             assert_array_almost_equal(p_train, p_test, 1)
-            assert_equal(y[train].size + y[test].size, y.size)
+            assert_equal(len(train) + len(test), y.size)
+            assert_equal(len(train), train_size)
+            assert_equal(len(test), test_size)
             assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
 
 
@@ -572,8 +590,8 @@ def assert_counts_are_ok(idx_counts, p):
         threshold = 0.05 / n_splits
         bf = stats.binom(n_splits, p)
         for count in idx_counts:
-            p = bf.pmf(count)
-            assert_true(p > threshold,
+            prob = bf.pmf(count)
+            assert_true(prob > threshold,
                         "An index is not drawn with chance corresponding "
                         "to even draws")
 
@@ -593,9 +611,8 @@ def assert_counts_are_ok(idx_counts, p):
                     counter[id] += 1
         assert_equal(n_splits_actual, n_splits)
 
-        n_train, n_test = _validate_shuffle_split(n_samples,
-                                                  test_size=1./n_folds,
-                                                  train_size=1.-(1./n_folds))
+        n_train, n_test = _validate_shuffle_split(
+            n_samples, test_size=1. / n_folds, train_size=1. - (1. / n_folds))
 
         assert_equal(len(train), n_train)
         assert_equal(len(test), n_test)
@@ -656,7 +673,7 @@ def test_label_shuffle_split():
     for l in labels:
         X = y = np.ones(len(l))
         n_splits = 6
-        test_size = 1./3
+        test_size = 1. / 3
         slo = LabelShuffleSplit(n_splits, test_size=test_size, random_state=0)
 
         # Make sure the repr works
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
@@ -479,24 +479,30 @@ def test_stratified_shuffle_split_init():
 def test_stratified_shuffle_split_iter():
     ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
           np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
-          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
+          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
           np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
           np.array([-1] * 800 + [1] * 50)
           ]
 
     for y in ys:
         sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                           random_state=0)
+        test_size = np.ceil(0.33 * len(y))
+        train_size = len(y) - test_size
         for train, test in sss:
             assert_array_equal(np.unique(y[train]), np.unique(y[test]))
             # Checks if folds keep classes proportions
-            p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1])
-                       / float(len(y[train])))
-            p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1])
-                      / float(len(y[test])))
+            p_train = (np.bincount(np.unique(y[train],
+                                   return_inverse=True)[1]) /
+                       float(len(y[train])))
+            p_test = (np.bincount(np.unique(y[test],
+                                  return_inverse=True)[1]) /
+                      float(len(y[test])))
             assert_array_almost_equal(p_train, p_test, 1)
-            assert_equal(y[train].size + y[test].size, y.size)
-            assert_array_equal(np.intersect1d(train, test), [])
+            assert_equal(len(train) + len(test), y.size)
+            assert_equal(len(train), train_size)
+            assert_equal(len(test), test_size)
+            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
 
 
 def test_stratified_shuffle_split_even():
diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py