bug in groupby when key space exceeds int64 bounds

behzadnouri committed Jan 31, 2015
1 parent 391f46a commit c7f363b
Showing 9 changed files with 174 additions and 73 deletions.
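For context, a minimal sketch of the failure mode being fixed (a hypothetical repro in the spirit of issue 9096, not a script taken from it): with enough grouping keys, the cartesian product of the level cardinalities exceeds int64 bounds, the flat group index computed as sum(label * stride) overflows, and distinct key tuples can collide.

    import numpy as np
    import pandas as pd

    # 20 keys, each with up to 2**10 distinct values: the key space is
    # ~2**200, far beyond int64, although only 1000 rows are observed
    n_keys, n_rows = 20, 1000
    df = pd.DataFrame(np.random.randint(0, 1 << 10, (n_rows, n_keys)))
    counts = df.groupby(list(df.columns)).size()
    assert len(counts) == len(df.drop_duplicates())  # could fail before the fix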
3 changes: 2 additions & 1 deletion bench/bench_groupby.py
@@ -47,7 +47,8 @@ def g():

 from pandas.core.groupby import get_group_index
-group_index = get_group_index(label_list, shape).astype('i4')
+group_index = get_group_index(label_list, shape,
+                              sort=True, xnull=True).astype('i4')
 ngroups = np.prod(shape)
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
@@ -156,6 +156,7 @@ Bug Fixes
 - Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`)
 - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
 - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
+- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).


 - Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
114 changes: 56 additions & 58 deletions pandas/core/groupby.py
@@ -1367,30 +1367,16 @@ def group_info(self):

     def _get_compressed_labels(self):
         all_labels = [ping.labels for ping in self.groupings]
-        if self._overflow_possible:
-            tups = lib.fast_zip(all_labels)
-            labs, uniques = algos.factorize(tups)
+        if len(all_labels) > 1:
+            group_index = get_group_index(all_labels, self.shape,
+                                          sort=True, xnull=True)
+            return _compress_group_index(group_index)

-            if self.sort:
-                uniques, labs = _reorder_by_uniques(uniques, labs)
+        ping = self.groupings[0]
+        self.compressed = False
+        self._filter_empty_groups = False

-            return labs, uniques
-        else:
-            if len(all_labels) > 1:
-                group_index = get_group_index(all_labels, self.shape)
-                comp_ids, obs_group_ids = _compress_group_index(group_index)
-            else:
-                ping = self.groupings[0]
-                comp_ids = ping.labels
-                obs_group_ids = np.arange(len(ping.group_index))
-                self.compressed = False
-                self._filter_empty_groups = False
-
-            return comp_ids, obs_group_ids
+        return ping.labels, np.arange(len(ping.group_index))

-    @cache_readonly
-    def _overflow_possible(self):
-        return _int64_overflow_possible(self.shape)
-
     @cache_readonly
     def ngroups(self):
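To see what the _compress_group_index call on the multi-key path returns, here is a stand-in sketch using the public pandas.factorize (the private helper yields the same pair: dense per-row comp_ids plus the observed flat ids):

    import numpy as np
    import pandas as pd

    group_index = np.array([5, 11, 5, 0], dtype='i8')
    comp_ids, obs_ids = pd.factorize(group_index, sort=True)
    print(comp_ids)  # [1 2 1 0] -- dense ids, one per row
    print(obs_ids)   # [ 0  5 11] -- observed flat ids only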
@@ -1402,15 +1388,13 @@ def result_index(self):
         return MultiIndex.from_arrays(recons, names=self.names)

     def get_group_levels(self):
-        obs_ids = self.group_info[1]
+        comp_ids, obs_ids, _ = self.group_info

         if not self.compressed and len(self.groupings) == 1:
             return [self.groupings[0].group_index]

-        if self._overflow_possible:
-            recons_labels = [np.array(x) for x in zip(*obs_ids)]
-        else:
-            recons_labels = decons_group_index(obs_ids, self.shape)
+        recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
+                self.shape, (ping.labels for ping in self.groupings))

         name_list = []
         for ping, labels in zip(self.groupings, recons_labels):
@@ -3490,42 +3474,28 @@ def get_splitter(data, *args, **kwargs):
 # Misc utilities


-def get_group_index(label_list, shape):
+def get_group_index(labels, shape, sort, xnull):
     """
     For the particular label_list, gets the offsets into the hypothetical list
     representing the totally ordered cartesian product of all possible label
-    combinations.
-    """
-    if len(label_list) == 1:
-        return label_list[0]
-
-    n = len(label_list[0])
-    group_index = np.zeros(n, dtype=np.int64)
-    mask = np.zeros(n, dtype=bool)
-    for i in range(len(shape)):
-        stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64)
-        group_index += com._ensure_int64(label_list[i]) * stride
-        mask |= label_list[i] < 0
-
-    np.putmask(group_index, mask, -1)
-    return group_index
-
-
-def get_flat_ids(labels, shape, retain_lex_rank):
-    """
-    Given a list of labels at each level, returns a flat array of int64 ids
-    corresponding to unique tuples across the labels. If `retain_lex_rank`,
-    rank of returned ids preserve lexical ranks of labels.
-
+    combinations, *as long as* this space fits within int64 bounds;
+    otherwise, though group indices identify unique combinations of
+    labels, they cannot be deconstructed.
+    - If `sort`, the ranks of returned ids preserve the lexical ranks of
+      labels, i.e. the returned ids can be used to do a lexical sort on labels;
+    - If `xnull`, nulls (-1 labels) are passed through.
     Parameters
     ----------
     labels: sequence of arrays
         Integers identifying levels at each location
     shape: sequence of ints same length as labels
         Number of unique levels at each location
-    retain_lex_rank: boolean
+    sort: boolean
         If the ranks of returned ids should match lexical ranks of labels
+    xnull: boolean
+        If true, nulls are excluded, i.e. -1 values in the labels are
+        passed through
     Returns
     -------
     An array of type int64 where two elements are equal if their corresponding
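As a worked instance of the offset arithmetic on the in-bounds path (plain NumPy, independent of pandas internals):

    import numpy as np

    # id = label_0 * stride_0 + label_1 * stride_1, stride_i = prod(shape[i+1:])
    labels = [np.array([0, 1, 1]), np.array([2, 0, 1])]
    shape = (2, 3)      # 2 distinct values at level 0, 3 at level 1
    ids = labels[0] * 3 + labels[1]
    print(ids)          # [2 3 4] -- lexical ranks inside the 2x3 product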
@@ -3544,12 +3514,18 @@ def loop(labels, shape):
             stride //= shape[i]
             out += labels[i] * stride

+        if xnull:  # exclude nulls
+            mask = labels[0] == -1
+            for lab in labels[1:nlev]:
+                mask |= lab == -1
+            out[mask] = -1
+
         if nlev == len(shape):  # all levels done!
             return out

         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
-        comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)
+        comp_ids, obs_ids = _compress_group_index(out, sort=sort)

         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]
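A toy illustration of this compress-as-you-go recursion, with the public pandas.factorize standing in for the private compressor (the real loop folds as many levels per pass as still fit in int64, not one at a time, and also handles nulls):

    import numpy as np
    import pandas as pd

    def toy_flat_ids(labels, shape):
        out = np.asarray(labels[0], dtype='i8')
        for lab, size in zip(labels[1:], shape[1:]):
            out = out * size + np.asarray(lab, dtype='i8')
            # re-factorize so ids stay inside [0, n_observed);
            # sort=True preserves the lexical rank of the partial keys
            out = pd.factorize(out, sort=True)[0].astype('i8')
        return out

    print(toy_flat_ids([[0, 1, 1], [2, 0, 1]], (2, 3)))  # [0 1 2]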
@@ -3560,9 +3536,10 @@ def maybe_lift(lab, size):  # promote nan values
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

     labels = map(com._ensure_int64, labels)
-    labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))

-    return loop(labels, shape)
+    return loop(list(labels), list(shape))


 _INT64_MAX = np.iinfo(np.int64).max
@@ -3578,6 +3555,11 @@ def _int64_overflow_possible(shape):

 def decons_group_index(comp_labels, shape):
     # reconstruct labels
+    if _int64_overflow_possible(shape):
+        # at some point group indices are factorized,
+        # and may not be deconstructed here! wrong path!
+        raise ValueError('cannot deconstruct factorized group indices!')
+
     label_list = []
     factor = 1
     y = 0
@@ -3591,12 +3573,25 @@ def decons_group_index(comp_labels, shape):
     return label_list[::-1]


+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
+    """reconstruct labels from observed ids"""
+    from pandas.hashtable import unique_label_indices
+
+    if not _int64_overflow_possible(shape):
+        # obs ids are deconstructable! take the fast route!
+        return decons_group_index(obs_ids, shape)
+
+    i = unique_label_indices(comp_ids)
+    i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+    return [i8copy(lab[i]) for lab in labels]
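A round-trip sketch of the two paths, reusing the 2x3 example from above (plain NumPy):

    import numpy as np

    # fast route: in-bounds ids are arithmetic offsets, so divide/modulo
    # recovers the per-level labels
    obs_ids = np.array([2, 3, 4], dtype='i8')   # shape == (2, 3)
    print(obs_ids // 3)   # [0 1 1] -- level 0
    print(obs_ids % 3)    # [2 0 1] -- level 1
    # overflow route: the offsets were factorized away, so instead the labels
    # are read back at the first occurrence of each compressed id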


 def _indexer_from_factorized(labels, shape, compress=True):
     if _int64_overflow_possible(shape):
         indexer = np.lexsort(np.array(labels[::-1]))
         return indexer

-    group_index = get_group_index(labels, shape)
+    group_index = get_group_index(labels, shape, sort=True, xnull=True)

     if compress:
         comp_ids, obs_ids = _compress_group_index(group_index)
@@ -3712,9 +3707,12 @@ def get_key(self, comp_id):

 def _get_indices_dict(label_list, keys):
     shape = list(map(len, keys))
-    ngroups = np.prod(shape)

-    group_index = get_group_index(label_list, shape)
+    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    ngroups = ((group_index.size and group_index.max()) + 1) \
+        if _int64_overflow_possible(shape) \
+        else np.prod(shape, dtype='i8')

     sorter = _get_group_index_sorter(group_index, ngroups)

     sorted_labels = [lab.take(sorter) for lab in label_list]
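One note on the new ngroups expression: on the overflow path the ids are already factorized into [0, number of observed groups), so max() + 1 counts groups, and the `size and` guard short-circuits before calling max() on an empty array:

    import numpy as np

    ids = np.array([0, 2, 1, 2], dtype='i8')   # factorized ids, 3 groups
    print((ids.size and ids.max()) + 1)        # 3
    empty = np.array([], dtype='i8')
    print((empty.size and empty.max()) + 1)    # 1; max() is never evaluated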
4 changes: 2 additions & 2 deletions pandas/core/index.py
@@ -3229,11 +3229,11 @@ def is_unique(self):

     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, take_last=False):
-        from pandas.core.groupby import get_flat_ids
+        from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64

         shape = map(len, self.levels)
-        ids = get_flat_ids(self.labels, shape, False)
+        ids = get_group_index(self.labels, shape, sort=False, xnull=False)

         return duplicated_int64(ids, take_last)
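A sketch of the behaviour this preserves (my example, not from the commit): with xnull=False the -1 labels that encode NaN are lifted to ordinary values, so duplicated() also treats repeated all-NaN tuples as duplicates:

    import numpy as np
    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[1, 1, np.nan, np.nan],
                                    ['a', 'a', 'b', 'b']])
    print(mi.duplicated())   # [False  True False  True]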

18 changes: 8 additions & 10 deletions pandas/core/reshape.py
@@ -12,8 +12,8 @@
 from pandas.core.categorical import Categorical
 from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
                                 isnull)
-from pandas.core.groupby import (get_group_index, _compress_group_index,
-                                 decons_group_index)
+from pandas.core.groupby import get_group_index, _compress_group_index

 import pandas.core.common as com
 import pandas.algos as algos
@@ -103,10 +103,6 @@ def _make_sorted_values_labels(self):
         sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

         comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
-
-        # group_index = get_group_index(to_sort, sizes)
-        # comp_index, obs_ids = _compress_group_index(group_index)
-
         ngroups = len(obs_ids)

         indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
@@ -252,6 +248,8 @@ def _make_new_index(lev, lab):


 def _unstack_multiple(data, clocs):
+    from pandas.core.groupby import decons_obs_group_ids
+
     if len(clocs) == 0:
         return data

@@ -271,10 +269,10 @@ def _unstack_multiple(data, clocs):
     rnames = [index.names[i] for i in rlocs]

     shape = [len(x) for x in clevels]
-    group_index = get_group_index(clabels, shape)
+    group_index = get_group_index(clabels, shape, sort=False, xnull=False)

     comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
-    recons_labels = decons_group_index(obs_ids, shape)
+    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels)

     dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                              labels=rlabels + [comp_ids],
@@ -449,9 +447,9 @@ def _unstack_frame(obj, level):


 def get_compressed_ids(labels, sizes):
-    from pandas.core.groupby import get_flat_ids
+    from pandas.core.groupby import get_group_index

-    ids = get_flat_ids(labels, sizes, True)
+    ids = get_group_index(labels, sizes, sort=True, xnull=False)
     return _compress_group_index(ids, sort=True)

33 changes: 32 additions & 1 deletion pandas/hashtable.pyx
@@ -17,6 +17,7 @@ cnp.import_array()
 cnp.import_ufunc()

 cdef int64_t iNaT = util.get_nat()
+_SIZE_HINT_LIMIT = (1 << 20) + 7

 cdef extern from "datetime.h":
     bint PyDateTime_Check(object o)
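The new constant caps the khash preallocation hint: small inputs get a table sized to the input, while anything larger starts at just over 2**20 buckets and grows on demand instead of reserving memory up front:

    _SIZE_HINT_LIMIT = (1 << 20) + 7
    print(min(10000, _SIZE_HINT_LIMIT))      # 10000
    print(min(50000000, _SIZE_HINT_LIMIT))   # 1048583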
@@ -1073,7 +1074,7 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

-    kh_resize_int64(table, min(1 << 20, n))
+    kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))

     if take_last:
         for i from n > i >= 0:
@@ -1086,3 +1087,33 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):

     kh_destroy_int64(table)
     return out
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def unique_label_indices(ndarray[int64_t, ndim=1] labels):
+    """
+    indices of the first occurrences of the unique labels
+    *excluding* -1. equivalent to:
+        np.unique(labels, return_index=True)[1]
+    """
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(labels)
+        kh_int64_t * table = kh_init_int64()
+        Int64Vector idx = Int64Vector()
+        ndarray[int64_t, ndim=1] arr
+
+    kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
+
+    for i in range(n):
+        kh_put_int64(table, labels[i], &ret)
+        if ret != 0:
+            idx.append(i)
+
+    kh_destroy_int64(table)
+
+    arr = idx.to_array()
+    arr = arr[labels[arr].argsort()]
+
+    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
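A quick worked example of the helper (importable from pandas.hashtable, as the test below does):

    import numpy as np
    from pandas.hashtable import unique_label_indices

    labels = np.array([3, -1, 3, 0, 0, 7], dtype='i8')
    print(unique_label_indices(labels))
    # [3 0 5] -- first 0 is at 3, first 3 at 0, first 7 at 5; the -1 is dropped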
15 changes: 15 additions & 0 deletions pandas/tests/test_algos.py
@@ -261,6 +261,21 @@ def test_quantile():
     expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
     tm.assert_almost_equal(result, expected)

+def test_unique_label_indices():
+    from pandas.hashtable import unique_label_indices
+
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')
+
+    left = unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1]
+
+    tm.assert_array_equal(left, right)
+
+    a[np.random.choice(len(a), 10)] = -1
+    left = unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1][1:]
+    tm.assert_array_equal(left, right)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
