bug in groupby when key space exceeds int64 bounds

behzadnouri committed Jan 31, 2015
1 parent 391f46a commit c7f363b
Showing 9 changed files with 174 additions and 73 deletions.
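For context, a minimal sketch of the failure mode being fixed (a hypothetical repro in the spirit of issue 9096, not a script taken from it): with enough grouping keys, the cartesian product of the level cardinalities exceeds int64 bounds, the flat group index computed as sum(label * stride) overflows, and distinct key tuples can collide.

    import numpy as np
    import pandas as pd

    # 20 keys, each with up to 2**10 distinct values: the key space is
    # ~2**200, far beyond int64, although only 1000 rows are observed
    n_keys, n_rows = 20, 1000
    df = pd.DataFrame(np.random.randint(0, 1 << 10, (n_rows, n_keys)))
    counts = df.groupby(list(df.columns)).size()
    assert len(counts) == len(df.drop_duplicates())  # could fail before the fix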
3 changes: 2 additions & 1 deletion bench/bench_groupby.py
@@ -47,7 +47,8 @@ def g():

 from pandas.core.groupby import get_group_index
-group_index = get_group_index(label_list, shape).astype('i4')
+group_index = get_group_index(label_list, shape,
+                              sort=True, xnull=True).astype('i4')
 ngroups = np.prod(shape)
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
@@ -156,6 +156,7 @@ Bug Fixes
 - Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`)
 - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
 - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
+- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).


 - Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
114 changes: 56 additions & 58 deletions pandas/core/groupby.py
@@ -1367,30 +1367,16 @@ def group_info(self):

     def _get_compressed_labels(self):
         all_labels = [ping.labels for ping in self.groupings]
-        if self._overflow_possible:
-            tups = lib.fast_zip(all_labels)
-            labs, uniques = algos.factorize(tups)
+        if len(all_labels) > 1:
+            group_index = get_group_index(all_labels, self.shape,
+                                          sort=True, xnull=True)
+            return _compress_group_index(group_index)

-            if self.sort:
-                uniques, labs = _reorder_by_uniques(uniques, labs)
+        ping = self.groupings[0]
+        self.compressed = False
+        self._filter_empty_groups = False

-            return labs, uniques
-        else:
-            if len(all_labels) > 1:
-                group_index = get_group_index(all_labels, self.shape)
-                comp_ids, obs_group_ids = _compress_group_index(group_index)
-            else:
-                ping = self.groupings[0]
-                comp_ids = ping.labels
-                obs_group_ids = np.arange(len(ping.group_index))
-                self.compressed = False
-                self._filter_empty_groups = False
-
-            return comp_ids, obs_group_ids
+        return ping.labels, np.arange(len(ping.group_index))

-    @cache_readonly
-    def _overflow_possible(self):
-        return _int64_overflow_possible(self.shape)
-
     @cache_readonly
     def ngroups(self):
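To see what the _compress_group_index call on the multi-key path returns, here is a stand-in sketch using the public pandas.factorize (the private helper yields the same pair: dense per-row comp_ids plus the observed flat ids):

    import numpy as np
    import pandas as pd

    group_index = np.array([5, 11, 5, 0], dtype='i8')
    comp_ids, obs_ids = pd.factorize(group_index, sort=True)
    print(comp_ids)  # [1 2 1 0] -- dense ids, one per row
    print(obs_ids)   # [ 0  5 11] -- observed flat ids only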
@@ -1402,15 +1388,13 @@ def result_index(self):
         return MultiIndex.from_arrays(recons, names=self.names)

     def get_group_levels(self):
-        obs_ids = self.group_info[1]
+        comp_ids, obs_ids, _ = self.group_info

         if not self.compressed and len(self.groupings) == 1:
             return [self.groupings[0].group_index]

-        if self._overflow_possible:
-            recons_labels = [np.array(x) for x in zip(*obs_ids)]
-        else:
-            recons_labels = decons_group_index(obs_ids, self.shape)
+        recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
+                self.shape, (ping.labels for ping in self.groupings))

         name_list = []
         for ping, labels in zip(self.groupings, recons_labels):
@@ -3490,42 +3474,28 @@ def get_splitter(data, *args, **kwargs):
 # Misc utilities


-def get_group_index(label_list, shape):
+def get_group_index(labels, shape, sort, xnull):
     """
     For the particular label_list, gets the offsets into the hypothetical list
     representing the totally ordered cartesian product of all possible label
-    combinations.
-    """
-    if len(label_list) == 1:
-        return label_list[0]
-
-    n = len(label_list[0])
-    group_index = np.zeros(n, dtype=np.int64)
-    mask = np.zeros(n, dtype=bool)
-    for i in range(len(shape)):
-        stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64)
-        group_index += com._ensure_int64(label_list[i]) * stride
-        mask |= label_list[i] < 0
-
-    np.putmask(group_index, mask, -1)
-    return group_index
-
-
-def get_flat_ids(labels, shape, retain_lex_rank):
-    """
-    Given a list of labels at each level, returns a flat array of int64 ids
-    corresponding to unique tuples across the labels. If `retain_lex_rank`,
-    rank of returned ids preserve lexical ranks of labels.
-
+    combinations, *as long as* this space fits within int64 bounds;
+    otherwise, though group indices identify unique combinations of
+    labels, they cannot be deconstructed.
+    - If `sort`, the ranks of returned ids preserve the lexical ranks of
+      labels, i.e. the returned ids can be used to do a lexical sort on labels;
+    - If `xnull`, nulls (-1 labels) are passed through.
     Parameters
     ----------
     labels: sequence of arrays
         Integers identifying levels at each location
     shape: sequence of ints same length as labels
         Number of unique levels at each location
-    retain_lex_rank: boolean
+    sort: boolean
         If the ranks of returned ids should match lexical ranks of labels
+    xnull: boolean
+        If true, nulls are excluded, i.e. -1 values in the labels are
+        passed through
     Returns
     -------
     An array of type int64 where two elements are equal if their corresponding
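As a worked instance of the offset arithmetic on the in-bounds path (plain NumPy, independent of pandas internals):

    import numpy as np

    # id = label_0 * stride_0 + label_1 * stride_1, stride_i = prod(shape[i+1:])
    labels = [np.array([0, 1, 1]), np.array([2, 0, 1])]
    shape = (2, 3)      # 2 distinct values at level 0, 3 at level 1
    ids = labels[0] * 3 + labels[1]
    print(ids)          # [2 3 4] -- lexical ranks inside the 2x3 product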
@@ -3544,12 +3514,18 @@ def loop(labels, shape):
             stride //= shape[i]
             out += labels[i] * stride

+        if xnull:  # exclude nulls
+            mask = labels[0] == -1
+            for lab in labels[1:nlev]:
+                mask |= lab == -1
+            out[mask] = -1
+
         if nlev == len(shape):  # all levels done!
             return out

         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
-        comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)
+        comp_ids, obs_ids = _compress_group_index(out, sort=sort)

         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]
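A toy illustration of this compress-as-you-go recursion, with the public pandas.factorize standing in for the private compressor (the real loop folds as many levels per pass as still fit in int64, not one at a time, and also handles nulls):

    import numpy as np
    import pandas as pd

    def toy_flat_ids(labels, shape):
        out = np.asarray(labels[0], dtype='i8')
        for lab, size in zip(labels[1:], shape[1:]):
            out = out * size + np.asarray(lab, dtype='i8')
            # re-factorize so ids stay inside [0, n_observed);
            # sort=True preserves the lexical rank of the partial keys
            out = pd.factorize(out, sort=True)[0].astype('i8')
        return out

    print(toy_flat_ids([[0, 1, 1], [2, 0, 1]], (2, 3)))  # [0 1 2]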
@@ -3560,9 +3536,10 @@ def maybe_lift(lab, size):  # promote nan values
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

     labels = map(com._ensure_int64, labels)
-    labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))

-    return loop(labels, shape)
+    return loop(list(labels), list(shape))


 _INT64_MAX = np.iinfo(np.int64).max
@@ -3578,6 +3555,11 @@ def _int64_overflow_possible(shape):

 def decons_group_index(comp_labels, shape):
     # reconstruct labels
+    if _int64_overflow_possible(shape):
+        # at some point group indices are factorized,
+        # and may not be deconstructed here! wrong path!
+        raise ValueError('cannot deconstruct factorized group indices!')
+
     label_list = []
     factor = 1
     y = 0
@@ -3591,12 +3573,25 @@ def decons_group_index(comp_labels, shape):
     return label_list[::-1]


+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
+    """reconstruct labels from observed ids"""
+    from pandas.hashtable import unique_label_indices
+
+    if not _int64_overflow_possible(shape):
+        # obs ids are deconstructable! take the fast route!
+        return decons_group_index(obs_ids, shape)
+
+    i = unique_label_indices(comp_ids)
+    i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+    return [i8copy(lab[i]) for lab in labels]
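A round-trip sketch of the two paths, reusing the 2x3 example from above (plain NumPy):

    import numpy as np

    # fast route: in-bounds ids are arithmetic offsets, so divide/modulo
    # recovers the per-level labels
    obs_ids = np.array([2, 3, 4], dtype='i8')   # shape == (2, 3)
    print(obs_ids // 3)   # [0 1 1] -- level 0
    print(obs_ids % 3)    # [2 0 1] -- level 1
    # overflow route: the offsets were factorized away, so instead the labels
    # are read back at the first occurrence of each compressed id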


 def _indexer_from_factorized(labels, shape, compress=True):
     if _int64_overflow_possible(shape):
         indexer = np.lexsort(np.array(labels[::-1]))
         return indexer

-    group_index = get_group_index(labels, shape)
+    group_index = get_group_index(labels, shape, sort=True, xnull=True)

     if compress:
         comp_ids, obs_ids = _compress_group_index(group_index)
@@ -3712,9 +3707,12 @@ def get_key(self, comp_id):

 def _get_indices_dict(label_list, keys):
     shape = list(map(len, keys))
-    ngroups = np.prod(shape)

-    group_index = get_group_index(label_list, shape)
+    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    ngroups = ((group_index.size and group_index.max()) + 1) \
+        if _int64_overflow_possible(shape) \
+        else np.prod(shape, dtype='i8')

     sorter = _get_group_index_sorter(group_index, ngroups)

     sorted_labels = [lab.take(sorter) for lab in label_list]
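One note on the new ngroups expression: on the overflow path the ids are already factorized into [0, number of observed groups), so max() + 1 counts groups, and the `size and` guard short-circuits before calling max() on an empty array:

    import numpy as np

    ids = np.array([0, 2, 1, 2], dtype='i8')   # factorized ids, 3 groups
    print((ids.size and ids.max()) + 1)        # 3
    empty = np.array([], dtype='i8')
    print((empty.size and empty.max()) + 1)    # 1; max() is never evaluated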
4 changes: 2 additions & 2 deletions pandas/core/index.py
@@ -3229,11 +3229,11 @@ def is_unique(self):

     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, take_last=False):
-        from pandas.core.groupby import get_flat_ids
+        from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64

         shape = map(len, self.levels)
-        ids = get_flat_ids(self.labels, shape, False)
+        ids = get_group_index(self.labels, shape, sort=False, xnull=False)

         return duplicated_int64(ids, take_last)
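A sketch of the behaviour this preserves (my example, not from the commit): with xnull=False the -1 labels that encode NaN are lifted to ordinary values, so duplicated() also treats repeated all-NaN tuples as duplicates:

    import numpy as np
    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[1, 1, np.nan, np.nan],
                                    ['a', 'a', 'b', 'b']])
    print(mi.duplicated())   # [False  True False  True]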

18 changes: 8 additions & 10 deletions pandas/core/reshape.py
@@ -12,8 +12,8 @@
 from pandas.core.categorical import Categorical
 from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
                                 isnull)
-from pandas.core.groupby import (get_group_index, _compress_group_index,
-                                 decons_group_index)
+from pandas.core.groupby import get_group_index, _compress_group_index

 import pandas.core.common as com
 import pandas.algos as algos
@@ -103,10 +103,6 @@ def _make_sorted_values_labels(self):
         sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

         comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
-
-        # group_index = get_group_index(to_sort, sizes)
-        # comp_index, obs_ids = _compress_group_index(group_index)
-
         ngroups = len(obs_ids)

         indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
@@ -252,6 +248,8 @@ def _make_new_index(lev, lab):


 def _unstack_multiple(data, clocs):
+    from pandas.core.groupby import decons_obs_group_ids
+
     if len(clocs) == 0:
         return data

@@ -271,10 +269,10 @@ def _unstack_multiple(data, clocs):
     rnames = [index.names[i] for i in rlocs]

     shape = [len(x) for x in clevels]
-    group_index = get_group_index(clabels, shape)
+    group_index = get_group_index(clabels, shape, sort=False, xnull=False)

     comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
-    recons_labels = decons_group_index(obs_ids, shape)
+    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels)

     dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                              labels=rlabels + [comp_ids],
@@ -449,9 +447,9 @@ def _unstack_frame(obj, level):


 def get_compressed_ids(labels, sizes):
-    from pandas.core.groupby import get_flat_ids
+    from pandas.core.groupby import get_group_index

-    ids = get_flat_ids(labels, sizes, True)
+    ids = get_group_index(labels, sizes, sort=True, xnull=False)
     return _compress_group_index(ids, sort=True)

33 changes: 32 additions & 1 deletion pandas/hashtable.pyx
@@ -17,6 +17,7 @@ cnp.import_array()
 cnp.import_ufunc()

 cdef int64_t iNaT = util.get_nat()
+_SIZE_HINT_LIMIT = (1 << 20) + 7

 cdef extern from "datetime.h":
     bint PyDateTime_Check(object o)
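The new constant caps the khash preallocation hint: small inputs get a table sized to the input, while anything larger starts at just over 2**20 buckets and grows on demand instead of reserving memory up front:

    _SIZE_HINT_LIMIT = (1 << 20) + 7
    print(min(10000, _SIZE_HINT_LIMIT))      # 10000
    print(min(50000000, _SIZE_HINT_LIMIT))   # 1048583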
@@ -1073,7 +1074,7 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

-    kh_resize_int64(table, min(1 << 20, n))
+    kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))

     if take_last:
         for i from n > i >= 0:
@@ -1086,3 +1087,33 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):

     kh_destroy_int64(table)
     return out
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def unique_label_indices(ndarray[int64_t, ndim=1] labels):
+    """
+    indices of the first occurrences of the unique labels
+    *excluding* -1. equivalent to:
+        np.unique(labels, return_index=True)[1]
+    """
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(labels)
+        kh_int64_t * table = kh_init_int64()
+        Int64Vector idx = Int64Vector()
+        ndarray[int64_t, ndim=1] arr
+
+    kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
+
+    for i in range(n):
+        kh_put_int64(table, labels[i], &ret)
+        if ret != 0:
+            idx.append(i)
+
+    kh_destroy_int64(table)
+
+    arr = idx.to_array()
+    arr = arr[labels[arr].argsort()]
+
+    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
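A quick worked example of the helper (importable from pandas.hashtable, as the test below does):

    import numpy as np
    from pandas.hashtable import unique_label_indices

    labels = np.array([3, -1, 3, 0, 0, 7], dtype='i8')
    print(unique_label_indices(labels))
    # [3 0 5] -- first 0 is at 3, first 3 at 0, first 7 at 5; the -1 is dropped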
15 changes: 15 additions & 0 deletions pandas/tests/test_algos.py
@@ -261,6 +261,21 @@ def test_quantile():
     expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
     tm.assert_almost_equal(result, expected)

+def test_unique_label_indices():
+    from pandas.hashtable import unique_label_indices
+
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')
+
+    left = unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1]
+
+    tm.assert_array_equal(left, right)
+
+    a[np.random.choice(len(a), 10)] = -1
+    left = unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1][1:]
+    tm.assert_array_equal(left, right)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
