ENH: reimplement groupby_indices using better algorithmic tricks, associated vbenchmark. close #609
commit e7af2b99634f5514554d10731f8b99dc070139cb (1 parent: 88e6bcf)
@wesm authored
pandas/core/algorithms.py (3 changed lines)
@@ -94,6 +94,7 @@ def _unique_generic(values, table_type, type_caster):
uniques = table.unique(values)
return uniques
+
def factorize(values, sort=False, order=None, na_sentinel=-1):
"""
Encode input values as an enumerated type or categorical variable
@@ -118,7 +119,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
uniques = com._asarray_tuplesafe(uniques)
if sort and len(counts) > 0:
sorter = uniques.argsort()
- reverse_indexer = np.empty(len(sorter), dtype=np.int32)
+ reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))
mask = labels < 0
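
The substantive change in this hunk is the reverse indexer's dtype moving from np.int32 to np.int_, the platform's native integer, which likely avoids dtype mismatches when the result feeds later take/put calls on 64-bit builds. The argsort/put idiom it is part of computes the inverse of a sorting permutation; a minimal standalone sketch (example values are hypothetical, not pandas internals):

import numpy as np

uniques = np.array([30, 10, 20])
labels = np.array([0, 2, 1, 0])            # codes into the unsorted uniques

sorter = uniques.argsort()                 # [1, 2, 0]
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))

# reverse_indexer is the inverse permutation, so old codes can be remapped
# to codes into the sorted uniques without changing the decoded values.
new_labels = reverse_indexer.take(labels)
assert (np.sort(uniques).take(new_labels) == uniques.take(labels)).all()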
pandas/core/groupby.py (21 changed lines)
@@ -536,10 +536,9 @@ def indices(self):
if len(self.groupings) == 1:
return self.groupings[0].indices
else:
- # TODO: this is massively inefficient
- to_groupby = zip(*(ping.grouper for ping in self.groupings))
- to_groupby = Index(to_groupby)
- return lib.groupby_indices(to_groupby)
+ label_list = [ping.labels for ping in self.groupings]
+ keys = [ping.group_index for ping in self.groupings]
+ return _get_indices_dict(label_list, keys)
@property
def labels(self):
@@ -1972,6 +1971,20 @@ def get_key(self, comp_id):
return tuple(level[table.get_item(comp_id)]
for table, level in zip(self.tables, self.levels))
+
+def _get_indices_dict(label_list, keys):
+ shape = [len(x) for x in keys]
+ group_index = get_group_index(label_list, shape)
+
+ sorter, _ = lib.groupsort_indexer(com._ensure_int64(group_index),
+ np.prod(shape))
+
+ sorted_labels = [lab.take(sorter) for lab in label_list]
+ group_index = group_index.take(sorter)
+ index = np.arange(len(group_index)).take(sorter)
+
+ return lib.indices_fast(index, group_index, keys, sorted_labels)
+
#----------------------------------------------------------------------
# sorting levels...cleverly?
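
This replacement is the heart of the commit. The old path built a Python tuple per row and handed an Index of tuples to lib.groupby_indices; the new one compresses the per-key label arrays into a single int64 id per row (get_group_index is a mixed-radix encoding over the group shape), counting-sorts the ids with lib.groupsort_indexer in O(n + ngroups), and then reads each group off as a contiguous run. A pure-NumPy sketch of the same idea, with hypothetical helper names, a mergesort standing in for the counting sort, and NA labels (-1) ignored for brevity:

import numpy as np

def get_group_index_sketch(label_list, shape):
    # Mixed-radix encoding: treat each key's label as one digit, so every
    # distinct key combination maps to a unique id below np.prod(shape).
    group_index = np.zeros(len(label_list[0]), dtype=np.int64)
    for labels, radix in zip(label_list, shape):
        group_index = group_index * radix + labels
    return group_index

def indices_dict_sketch(label_list, keys):
    shape = [len(k) for k in keys]
    group_index = get_group_index_sketch(label_list, shape)
    # A stable sort gathers equal ids into contiguous runs; the real code
    # uses lib.groupsort_indexer, an O(n + ngroups) counting sort.
    sorter = group_index.argsort(kind='mergesort')
    sorted_ids = group_index.take(sorter)
    result = {}
    start = 0
    for i in range(1, len(sorted_ids) + 1):
        if i == len(sorted_ids) or sorted_ids[i] != sorted_ids[start]:
            # Decode the run's key tuple from any row in the run.
            tup = tuple(keys[j][label_list[j][sorter[start]]]
                        for j in range(len(keys)))
            result[tup] = sorter[start:i]
            start = i
    return result

# Example: two keys, four rows.
labels_a = np.array([0, 0, 1, 0]); keys_a = np.array(['x', 'y'])
labels_b = np.array([0, 1, 0, 0]); keys_b = np.array([1, 2])
indices_dict_sketch([labels_a, labels_b], [keys_a, keys_b])
# {('x', 1): array([0, 3]), ('x', 2): array([1]), ('y', 1): array([2])}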
pandas/src/groupby.pyx (113 changed lines)
@@ -746,7 +746,6 @@ def group_var(ndarray[float64_t, ndim=2] out,
@cython.boundscheck(False)
@cython.wraparound(False)
-
def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
object closed='left'):
"""
@@ -1107,8 +1106,8 @@ def group_ohlc(ndarray[float64_t, ndim=2] out,
out[b, 3] = vclose
-# @cython.boundscheck(False)
-# @cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.wraparound(False)
def group_mean_bin(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float64_t, ndim=2] values,
@@ -1268,62 +1267,6 @@ def lookup_values(ndarray[object] values, dict mapping):
result[i] = mapping[values[i]]
return maybe_convert_objects(result)
-def reduce_mean(ndarray[object] indices,
- ndarray[object] buckets,
- ndarray[float64_t] values,
- inclusive=False):
- cdef:
- Py_ssize_t i, j, nbuckets, nvalues
- ndarray[float64_t] output
- float64_t the_sum, val, nobs
-
-
-
- nbuckets = len(buckets)
- nvalues = len(indices)
-
- assert(len(values) == len(indices))
-
- output = np.empty(nbuckets, dtype=float)
- output.fill(np.NaN)
-
- j = 0
- for i from 0 <= i < nbuckets:
- next_bound = buckets[i]
- the_sum = 0
- nobs = 0
- if inclusive:
- while j < nvalues and indices[j] <= next_bound:
- val = values[j]
- # not NaN
- if val == val:
- the_sum += val
- nobs += 1
- j += 1
- else:
- while j < nvalues and indices[j] < next_bound:
- val = values[j]
- # not NaN
- if val == val:
- the_sum += val
- nobs += 1
- j += 1
-
- if nobs > 0:
- output[i] = the_sum / nobs
-
- if j >= nvalues:
- break
-
- return output
-
-def _bucket_locs(index, buckets, inclusive=False):
- if inclusive:
- locs = index.searchsorted(buckets, side='left')
- else:
- locs = index.searchsorted(buckets, side='right')
-
- return locs
def count_level_1d(ndarray[uint8_t, cast=True] mask,
ndarray[int64_t] labels, Py_ssize_t max_bin):
@@ -1341,6 +1284,7 @@ def count_level_1d(ndarray[uint8_t, cast=True] mask,
return counts
+
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
ndarray[int64_t] labels, Py_ssize_t max_bin):
cdef:
@@ -1357,6 +1301,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
return counts
+
def duplicated(list values, take_last=False):
cdef:
Py_ssize_t i, n
@@ -1411,7 +1356,7 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
return starts, ends
-def groupby_arrays(ndarray index, ndarray[int64_t] labels):
+def groupby_arrays(ndarray index, ndarray[int64_t] labels, sort=True):
cdef:
Py_ssize_t i, lab, cur, start, n = len(index)
dict result = {}
@@ -1419,10 +1364,11 @@ def groupby_arrays(ndarray index, ndarray[int64_t] labels):
index = np.asarray(index)
    # this is N log N. If this is a bottleneck it may be worth fixing someday
- indexer = labels.argsort(kind='mergesort')
+ if sort:
+ indexer = labels.argsort(kind='mergesort')
- labels = labels.take(indexer)
- index = index.take(indexer)
+ labels = labels.take(indexer)
+ index = index.take(indexer)
if n == 0:
return result
@@ -1438,4 +1384,45 @@ def groupby_arrays(ndarray index, ndarray[int64_t] labels):
start = i
cur = lab
+ result[cur] = index[start:]
+ return result
+
+def indices_fast(object index, ndarray[int64_t] labels, list keys,
+ list sorted_labels):
+ cdef:
+ Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
+ dict result = {}
+ object tup
+
+ k = len(keys)
+
+ if n == 0:
+ return result
+
+ start = 0
+ cur = labels[0]
+ for i in range(1, n):
+ lab = labels[i]
+
+ if lab != cur:
+ if lab != -1:
+ tup = PyTuple_New(k)
+ for j in range(k):
+ val = util.get_value_at(keys[j],
+ sorted_labels[j][i-1])
+ PyTuple_SET_ITEM(tup, j, val)
+ Py_INCREF(val)
+
+ result[tup] = index[start:i]
+ start = i
+ cur = lab
+
+ tup = PyTuple_New(k)
+ for j in range(k):
+ val = util.get_value_at(keys[j],
+ sorted_labels[j][n - 1])
+ PyTuple_SET_ITEM(tup, j, val)
+ Py_INCREF(val)
+ result[tup] = index[start:]
+
return result
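
indices_fast is the compiled kernel for the last step: a single pass over the sorted ids, emitting an index slice at each run boundary. The tuples are assembled with the raw C API; PyTuple_SET_ITEM steals a reference, which is why each stored value gets an explicit Py_INCREF. Note the flush after the loop, which emits the final run; the earlier prototype in sandbox.pyx below lacks that flush and also indexes sorted_labels by cur rather than by the row position. A pure-Python rendering of the loop structure, not a line-for-line port (the NA guard here is applied to the run being emitted):

def indices_fast_sketch(index, labels, keys, sorted_labels):
    result = {}
    n, k = len(labels), len(keys)
    if n == 0:
        return result
    start, cur = 0, labels[0]
    for i in range(1, n):
        lab = labels[i]
        if lab != cur:
            if cur != -1:  # skip the NA group
                tup = tuple(keys[j][sorted_labels[j][i - 1]] for j in range(k))
                result[tup] = index[start:i]
            start, cur = i, lab
    # Flush the final run, which the loop's boundary test never reaches.
    if cur != -1:
        tup = tuple(keys[j][sorted_labels[j][n - 1]] for j in range(k))
        result[tup] = index[start:]
    return result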
pandas/src/sandbox.pyx (151 changed lines)
@@ -421,117 +421,6 @@ def int64_unique(ndarray[int64_t] arr):
return np.sort(uniques[:j])
-def group_add_bin(ndarray[float64_t, ndim=2] out,
- ndarray[int32_t] counts,
- ndarray[float64_t, ndim=2] values,
- ndarray[int32_t] bins):
- '''
- Only aggregates on axis=0
- '''
- cdef:
- Py_ssize_t i, j, N, K, ngroups, b
- float64_t val, count
- ndarray[float64_t, ndim=2] sumx, nobs
-
- nobs = np.zeros_like(out)
- sumx = np.zeros_like(out)
-
- ngroups = len(bins) + 1
- N, K = (<object> values).shape
-
- b = 0
- if K > 1:
- for i in range(N):
- while b < ngroups - 1 and i >= bins[b]:
- b += 1
-
- counts[b] += 1
- for j in range(K):
- val = values[i, j]
-
- # not nan
- if val == val:
- nobs[b, j] += 1
- sumx[b, j] += val
- else:
- for i in range(N):
- while b < ngroups - 1 and i >= bins[b]:
- b += 1
-
- counts[b] += 1
- val = values[i, 0]
-
- # not nan
- if val == val:
- nobs[b, 0] += 1
- sumx[b, 0] += val
- print i, b, counts, nobs.squeeze()
-
- for i in range(ngroups):
- print 'writing %d' % i
- for j in range(K):
- if nobs[i] == 0:
- out[i, j] = nan
- else:
- out[i, j] = sumx[i, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_add(ndarray[float64_t, ndim=2] out,
- ndarray[int32_t] counts,
- ndarray[float64_t, ndim=2] values,
- ndarray[int32_t] labels):
- '''
- Only aggregates on axis=0
- '''
- cdef:
- Py_ssize_t i, j, N, K, lab
- float64_t val, count
- ndarray[float64_t, ndim=2] sumx, nobs
-
- nobs = np.zeros_like(out)
- sumx = np.zeros_like(out)
-
- N, K = (<object> values).shape
-
- if K > 1:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- # not nan
- if val == val:
- nobs[lab, j] += 1
- sumx[lab, j] += val
- else:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- val = values[i, 0]
-
- # not nan
- if val == val:
- nobs[lab, 0] += 1
- sumx[lab, 0] += val
-
- for i in range(len(counts)):
- for j in range(K):
- if nobs[i, j] == 0:
- out[i, j] = nan
- else:
- out[i, j] = sumx[i, j]
-
-
-from datetime cimport getAbsTime
-
# cdef extern from "kvec.h":
@@ -546,12 +435,6 @@ def test_foo(ndarray[int64_t] values):
val = values[0]
print val
-def get_abs_time(freq, dailyDate, originalDate):
- return getAbsTime(freq, dailyDate, originalDate)
-
-have_pytz = 1
-import pytz
-
# cdef extern from "foo.h":
# double add_things(double *a, double *b, double *c, int n)
@@ -581,3 +464,37 @@ def inner(ndarray[float64_t] x, ndarray[float64_t] y):
for i in range(n):
result += x[i] * y[i]
return result
+
+def indices_fast(ndarray[int64_t] labels, list keys,
+ list sorted_labels):
+ cdef:
+ Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
+ dict result = {}
+ object tup
+
+ index = np.arange(n)
+
+ k = len(keys)
+
+ if n == 0:
+ return result
+
+ start = 0
+ cur = labels[0]
+ for i in range(1, n):
+ lab = labels[i]
+
+ if lab != cur:
+ if lab != -1:
+ tup = PyTuple_New(k)
+ for j in range(k):
+ val = util.get_value_at(keys[j],
+ sorted_labels[j][cur])
+ PyTuple_SET_ITEM(tup, j, val)
+ Py_INCREF(val)
+
+ result[tup] = index[start:i]
+ start = i
+ cur = lab
+
+ return result
vb_suite/groupby.py (20 changed lines)
@@ -172,3 +172,23 @@ def f():
groupby_last = Benchmark('data.groupby(labels).last()', setup,
start_date=datetime(2012, 5, 1))
+
+
+#----------------------------------------------------------------------
+# groupby_indices replacement, chop up Series
+
+setup = common_setup + """
+try:
+ rng = date_range('1/1/2000', '12/31/2005', freq='H')
+ year, month, day = rng.year, rng.month, rng.day
+except:
+ rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour())
+ year = rng.map(lambda x: x.year)
+ month = rng.map(lambda x: x.month)
+ day = rng.map(lambda x: x.day)
+
+ts = Series(np.random.randn(len(rng)), index=rng)
+"""
+
+groupby_indices = Benchmark('len(ts.groupby([year, month, day]))',
+ setup, start_date=datetime(2012, 1, 1))
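
The new vbench case drives the whole path end to end: grouping an hourly Series by three aligned key arrays and forcing the group mapping to be materialized. A minimal interactive reproduction (hypothetical data, written against the public groupby API):

import numpy as np
import pandas as pd

rng = pd.date_range('1/1/2000', periods=60, freq='H')
ts = pd.Series(np.random.randn(len(rng)), index=rng)

grouped = ts.groupby([rng.year, rng.month, rng.day])

# .indices is the dict the reimplemented path builds: it maps each
# (year, month, day) tuple to the integer positions of that group's rows.
indices = grouped.indices
print(len(indices))        # 3 distinct days in this 60-hour range
print(sorted(indices)[0])  # (2000, 1, 1)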