ENH: reimplement groupby_indices using better algorithmic tricks, associated vbenchmark. close #609
wesm committed May 13, 2012
1 parent 88e6bcf commit e7af2b9
Showing 5 changed files with 123 additions and 185 deletions.
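
For orientation, this is the user-facing dict that the reimplemented code path builds for multi-key groupbys; the frame below is illustrative, not part of the commit:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': ['x', 'x', 'y', 'y'],
                   'b': [1, 2, 1, 1],
                   'v': np.arange(4.0)})

# With multiple keys, .indices maps each key tuple to the integer
# positions of that group's rows.
df.groupby(['a', 'b']).indices
# -> {('x', 1): array([0]), ('x', 2): array([1]), ('y', 1): array([2, 3])}
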
3 changes: 2 additions & 1 deletion pandas/core/algorithms.py
@@ -94,6 +94,7 @@ def _unique_generic(values, table_type, type_caster):
uniques = table.unique(values)
return uniques


def factorize(values, sort=False, order=None, na_sentinel=-1):
"""
Encode input values as an enumerated type or categorical variable
@@ -118,7 +119,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
uniques = com._asarray_tuplesafe(uniques)
if sort and len(counts) > 0:
sorter = uniques.argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.int32)
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))

mask = labels < 0
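
For context, the dtype change above sits in factorize's relabeling step: after sorting the uniques, a reverse indexer remaps the original codes onto the sorted order. A minimal numpy sketch with invented values:

import numpy as np

uniques = np.array(['b', 'a', 'c'])   # uniques in order of first appearance
labels = np.array([0, 1, 0, 2])       # codes pointing into `uniques`

sorter = uniques.argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))

labels = reverse_indexer.take(labels)  # -> array([1, 0, 1, 2])
uniques = uniques.take(sorter)         # -> array(['a', 'b', 'c'], dtype='<U1')
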
21 changes: 17 additions & 4 deletions pandas/core/groupby.py
@@ -536,10 +536,9 @@ def indices(self):
if len(self.groupings) == 1:
return self.groupings[0].indices
else:
# TODO: this is massively inefficient
to_groupby = zip(*(ping.grouper for ping in self.groupings))
to_groupby = Index(to_groupby)
return lib.groupby_indices(to_groupby)
label_list = [ping.labels for ping in self.groupings]
keys = [ping.group_index for ping in self.groupings]
return _get_indices_dict(label_list, keys)

@property
def labels(self):
@@ -1972,6 +1971,20 @@ def get_key(self, comp_id):
return tuple(level[table.get_item(comp_id)]
for table, level in zip(self.tables, self.levels))


def _get_indices_dict(label_list, keys):
shape = [len(x) for x in keys]
group_index = get_group_index(label_list, shape)

sorter, _ = lib.groupsort_indexer(com._ensure_int64(group_index),
np.prod(shape))

sorted_labels = [lab.take(sorter) for lab in label_list]
group_index = group_index.take(sorter)
index = np.arange(len(group_index)).take(sorter)

return lib.indices_fast(index, group_index, keys, sorted_labels)
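
_get_indices_dict leans on get_group_index to fold the per-key label arrays into a single int64 composite key, which groupsort_indexer can then order with a counting sort instead of hashing key tuples. A simplified numpy sketch of the folding step, not the exact pandas implementation, with illustrative data:

import numpy as np

def get_group_index_sketch(label_list, shape):
    # Treat the per-key labels as digits of a mixed-radix number:
    # each level multiplies the running key by its number of groups.
    group_index = np.zeros(len(label_list[0]), dtype=np.int64)
    for labels, radix in zip(label_list, shape):
        group_index = group_index * radix + labels
    return group_index

labels_a = np.array([0, 0, 1, 1])  # key 'a' has 2 observed levels
labels_b = np.array([0, 1, 0, 0])  # key 'b' has 2 observed levels
get_group_index_sketch([labels_a, labels_b], [2, 2])
# -> array([0, 1, 2, 2])
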

#----------------------------------------------------------------------
# sorting levels...cleverly?

113 changes: 50 additions & 63 deletions pandas/src/groupby.pyx
@@ -746,7 +746,6 @@ def group_var(ndarray[float64_t, ndim=2] out,

@cython.boundscheck(False)
@cython.wraparound(False)

def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
object closed='left'):
"""
@@ -1107,8 +1106,8 @@ def group_ohlc(ndarray[float64_t, ndim=2] out,
out[b, 3] = vclose


# @cython.boundscheck(False)
# @cython.wraparound(False)
@cython.boundscheck(False)
@cython.wraparound(False)
def group_mean_bin(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float64_t, ndim=2] values,
@@ -1268,62 +1267,6 @@ def lookup_values(ndarray[object] values, dict mapping):
result[i] = mapping[values[i]]
return maybe_convert_objects(result)

def reduce_mean(ndarray[object] indices,
ndarray[object] buckets,
ndarray[float64_t] values,
inclusive=False):
cdef:
Py_ssize_t i, j, nbuckets, nvalues
ndarray[float64_t] output
float64_t the_sum, val, nobs



nbuckets = len(buckets)
nvalues = len(indices)

assert(len(values) == len(indices))

output = np.empty(nbuckets, dtype=float)
output.fill(np.NaN)

j = 0
for i from 0 <= i < nbuckets:
next_bound = buckets[i]
the_sum = 0
nobs = 0
if inclusive:
while j < nvalues and indices[j] <= next_bound:
val = values[j]
# not NaN
if val == val:
the_sum += val
nobs += 1
j += 1
else:
while j < nvalues and indices[j] < next_bound:
val = values[j]
# not NaN
if val == val:
the_sum += val
nobs += 1
j += 1

if nobs > 0:
output[i] = the_sum / nobs

if j >= nvalues:
break

return output

def _bucket_locs(index, buckets, inclusive=False):
if inclusive:
locs = index.searchsorted(buckets, side='left')
else:
locs = index.searchsorted(buckets, side='right')

return locs

def count_level_1d(ndarray[uint8_t, cast=True] mask,
ndarray[int64_t] labels, Py_ssize_t max_bin):
@@ -1341,6 +1284,7 @@ def count_level_1d(ndarray[uint8_t, cast=True] mask,

return counts


def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
ndarray[int64_t] labels, Py_ssize_t max_bin):
cdef:
@@ -1357,6 +1301,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,

return counts


def duplicated(list values, take_last=False):
cdef:
Py_ssize_t i, n
@@ -1411,18 +1356,19 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
return starts, ends


def groupby_arrays(ndarray index, ndarray[int64_t] labels):
def groupby_arrays(ndarray index, ndarray[int64_t] labels, sort=True):
cdef:
Py_ssize_t i, lab, cur, start, n = len(index)
dict result = {}

index = np.asarray(index)

    # this is N log N. If this is a bottleneck it may be worth fixing someday
indexer = labels.argsort(kind='mergesort')
if sort:
indexer = labels.argsort(kind='mergesort')

        labels = labels.take(indexer)
        index = index.take(indexer)

if n == 0:
return result
@@ -1438,4 +1384,45 @@
start = i
cur = lab

result[cur] = index[start:]
return result
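
A pure-Python sketch of the run-slicing idea above, under the assumption that every label is valid; the data is illustrative:

import numpy as np

def groupby_arrays_sketch(index, labels, sort=True):
    index = np.asarray(index)
    labels = np.asarray(labels)
    if sort:
        # stable sort keeps the original order within each label run
        indexer = labels.argsort(kind='mergesort')
        labels = labels.take(indexer)
        index = index.take(indexer)
    result = {}
    if len(labels) == 0:
        return result
    # labels are now contiguous; slice out each run
    start = 0
    for i in range(1, len(labels)):
        if labels[i] != labels[i - 1]:
            result[labels[i - 1]] = index[start:i]
            start = i
    result[labels[-1]] = index[start:]
    return result

groupby_arrays_sketch(np.array(['p', 'q', 'r', 's']), np.array([1, 0, 1, 0]))
# -> {0: array(['q', 's'], ...), 1: array(['p', 'r'], ...)}
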

def indices_fast(object index, ndarray[int64_t] labels, list keys,
list sorted_labels):
cdef:
Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
dict result = {}
object tup

k = len(keys)

if n == 0:
return result

start = 0
cur = labels[0]
for i in range(1, n):
lab = labels[i]

if lab != cur:
if lab != -1:
tup = PyTuple_New(k)
for j in range(k):
val = util.get_value_at(keys[j],
sorted_labels[j][i-1])
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)

result[tup] = index[start:i]
start = i
cur = lab

tup = PyTuple_New(k)
for j in range(k):
val = util.get_value_at(keys[j],
sorted_labels[j][n - 1])
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)
result[tup] = index[start:]

return result
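
Tying it together, a rough pure-Python model of what indices_fast returns, given the pre-sorted inputs that _get_indices_dict prepares. This is a sketch of the behavior, not the compiled implementation, and the example data is invented:

import numpy as np

def indices_fast_sketch(index, labels, keys, sorted_labels):
    # index: row positions already reordered by group
    # labels: sorted composite group ids, one per row
    # keys: the distinct values of each grouping level
    # sorted_labels: per-level label arrays in the same sorted order
    result = {}
    n = len(labels)
    if n == 0:
        return result
    start = 0
    for i in range(1, n + 1):
        if i == n or labels[i] != labels[i - 1]:
            if labels[i - 1] != -1:
                tup = tuple(keys[j][sorted_labels[j][i - 1]]
                            for j in range(len(keys)))
                result[tup] = index[start:i]
            start = i
    return result

keys = [np.array(['x', 'y']), np.array([1, 2])]
sorted_labels = [np.array([0, 0, 1]), np.array([0, 1, 0])]
indices_fast_sketch(np.arange(3), np.array([0, 1, 2]), keys, sorted_labels)
# -> {('x', 1): array([0]), ('x', 2): array([1]), ('y', 1): array([2])}
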
151 changes: 34 additions & 117 deletions pandas/src/sandbox.pyx
@@ -421,117 +421,6 @@ def int64_unique(ndarray[int64_t] arr):

return np.sort(uniques[:j])

def group_add_bin(ndarray[float64_t, ndim=2] out,
ndarray[int32_t] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int32_t] bins):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, N, K, ngroups, b
float64_t val, count
ndarray[float64_t, ndim=2] sumx, nobs

nobs = np.zeros_like(out)
sumx = np.zeros_like(out)

ngroups = len(bins) + 1
N, K = (<object> values).shape

b = 0
if K > 1:
for i in range(N):
while b < ngroups - 1 and i >= bins[b]:
b += 1

counts[b] += 1
for j in range(K):
val = values[i, j]

# not nan
if val == val:
nobs[b, j] += 1
sumx[b, j] += val
else:
for i in range(N):
while b < ngroups - 1 and i >= bins[b]:
b += 1

counts[b] += 1
val = values[i, 0]

# not nan
if val == val:
nobs[b, 0] += 1
sumx[b, 0] += val
print i, b, counts, nobs.squeeze()

for i in range(ngroups):
print 'writing %d' % i
for j in range(K):
if nobs[i] == 0:
out[i, j] = nan
else:
out[i, j] = sumx[i, j]

@cython.boundscheck(False)
@cython.wraparound(False)
def group_add(ndarray[float64_t, ndim=2] out,
ndarray[int32_t] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int32_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, N, K, lab
float64_t val, count
ndarray[float64_t, ndim=2] sumx, nobs

nobs = np.zeros_like(out)
sumx = np.zeros_like(out)

N, K = (<object> values).shape

if K > 1:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
if val == val:
nobs[lab, j] += 1
sumx[lab, j] += val
else:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
val = values[i, 0]

# not nan
if val == val:
nobs[lab, 0] += 1
sumx[lab, 0] += val

for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
else:
out[i, j] = sumx[i, j]


from datetime cimport getAbsTime


# cdef extern from "kvec.h":

@@ -546,12 +435,6 @@ def test_foo(ndarray[int64_t] values):
val = values[0]
print val

def get_abs_time(freq, dailyDate, originalDate):
return getAbsTime(freq, dailyDate, originalDate)

have_pytz = 1
import pytz

# cdef extern from "foo.h":
# double add_things(double *a, double *b, double *c, int n)

@@ -581,3 +464,37 @@ def inner(ndarray[float64_t] x, ndarray[float64_t] y):
for i in range(n):
result += x[i] * y[i]
return result

def indices_fast(ndarray[int64_t] labels, list keys,
list sorted_labels):
cdef:
Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
dict result = {}
object tup

index = np.arange(n)

k = len(keys)

if n == 0:
return result

start = 0
cur = labels[0]
for i in range(1, n):
lab = labels[i]

if lab != cur:
if lab != -1:
tup = PyTuple_New(k)
for j in range(k):
val = util.get_value_at(keys[j],
sorted_labels[j][cur])
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)

result[tup] = index[start:i]
start = i
cur = lab

return result