ENH: reimplement groupby_indices using better algorithmic tricks, asso…

`…ciated vbenchmark. close #609`
commit e7af2b99634f5514554d10731f8b99dc070139cb 1 parent 88e6bcf
authored May 13, 2012
3  pandas/core/algorithms.py
 `@@ -94,6 +94,7 @@ def _unique_generic(values, table_type, type_caster):` 94 94 ` uniques = table.unique(values)` 95 95 ` return uniques` 96 96 ` ` 97 `+` 97 98 ` def factorize(values, sort=False, order=None, na_sentinel=-1):` 98 99 ` """` 99 100 ` Encode input values as an enumerated type or categorical variable` `@@ -118,7 +119,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):` 118 119 ` uniques = com._asarray_tuplesafe(uniques)` 119 120 ` if sort and len(counts) > 0:` 120 121 ` sorter = uniques.argsort()` 121 `- reverse_indexer = np.empty(len(sorter), dtype=np.int32)` 122 `+ reverse_indexer = np.empty(len(sorter), dtype=np.int_)` 122 123 ` reverse_indexer.put(sorter, np.arange(len(sorter)))` 123 124 ` ` 124 125 ` mask = labels < 0`
21  pandas/core/groupby.py
 `@@ -536,10 +536,9 @@ def indices(self):` 536 536 ` if len(self.groupings) == 1:` 537 537 ` return self.groupings[0].indices` 538 538 ` else:` 539 `- # TODO: this is massively inefficient` 540 `- to_groupby = zip(*(ping.grouper for ping in self.groupings))` 541 `- to_groupby = Index(to_groupby)` 542 `- return lib.groupby_indices(to_groupby)` 539 `+ label_list = [ping.labels for ping in self.groupings]` 540 `+ keys = [ping.group_index for ping in self.groupings]` 541 `+ return _get_indices_dict(label_list, keys)` 543 542 ` ` 544 543 ` @property` 545 544 ` def labels(self):` `@@ -1972,6 +1971,20 @@ def get_key(self, comp_id):` 1972 1971 ` return tuple(level[table.get_item(comp_id)]` 1973 1972 ` for table, level in zip(self.tables, self.levels))` 1974 1973 ` ` 1974 `+` 1975 `+def _get_indices_dict(label_list, keys):` 1976 `+ shape = [len(x) for x in keys]` 1977 `+ group_index = get_group_index(label_list, shape)` 1978 `+` 1979 `+ sorter, _ = lib.groupsort_indexer(com._ensure_int64(group_index),` 1980 `+ np.prod(shape))` 1981 `+` 1982 `+ sorted_labels = [lab.take(sorter) for lab in label_list]` 1983 `+ group_index = group_index.take(sorter)` 1984 `+ index = np.arange(len(group_index)).take(sorter)` 1985 `+` 1986 `+ return lib.indices_fast(index, group_index, keys, sorted_labels)` 1987 `+` 1975 1988 ` #----------------------------------------------------------------------` 1976 1989 ` # sorting levels...cleverly?` 1977 1990 ` `
113  pandas/src/groupby.pyx
 `@@ -746,7 +746,6 @@ def group_var(ndarray[float64_t, ndim=2] out,` 746 746 ` ` 747 747 ` @cython.boundscheck(False)` 748 748 ` @cython.wraparound(False)` 749 `-` 750 749 ` def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,` 751 750 ` object closed='left'):` 752 751 ` """` `@@ -1107,8 +1106,8 @@ def group_ohlc(ndarray[float64_t, ndim=2] out,` 1107 1106 ` out[b, 3] = vclose` 1108 1107 ` ` 1109 1108 ` ` 1110 `-# @cython.boundscheck(False)` 1111 `-# @cython.wraparound(False)` 1109 `+@cython.boundscheck(False)` 1110 `+@cython.wraparound(False)` 1112 1111 ` def group_mean_bin(ndarray[float64_t, ndim=2] out,` 1113 1112 ` ndarray[int64_t] counts,` 1114 1113 ` ndarray[float64_t, ndim=2] values,` `@@ -1268,62 +1267,6 @@ def lookup_values(ndarray[object] values, dict mapping):` 1268 1267 ` result[i] = mapping[values[i]]` 1269 1268 ` return maybe_convert_objects(result)` 1270 1269 ` ` 1271 `-def reduce_mean(ndarray[object] indices,` 1272 `- ndarray[object] buckets,` 1273 `- ndarray[float64_t] values,` 1274 `- inclusive=False):` 1275 `- cdef:` 1276 `- Py_ssize_t i, j, nbuckets, nvalues` 1277 `- ndarray[float64_t] output` 1278 `- float64_t the_sum, val, nobs` 1279 `-` 1280 `-` 1281 `-` 1282 `- nbuckets = len(buckets)` 1283 `- nvalues = len(indices)` 1284 `-` 1285 `- assert(len(values) == len(indices))` 1286 `-` 1287 `- output = np.empty(nbuckets, dtype=float)` 1288 `- output.fill(np.NaN)` 1289 `-` 1290 `- j = 0` 1291 `- for i from 0 <= i < nbuckets:` 1292 `- next_bound = buckets[i]` 1293 `- the_sum = 0` 1294 `- nobs = 0` 1295 `- if inclusive:` 1296 `- while j < nvalues and indices[j] <= next_bound:` 1297 `- val = values[j]` 1298 `- # not NaN` 1299 `- if val == val:` 1300 `- the_sum += val` 1301 `- nobs += 1` 1302 `- j += 1` 1303 `- else:` 1304 `- while j < nvalues and indices[j] < next_bound:` 1305 `- val = values[j]` 1306 `- # not NaN` 1307 `- if val == val:` 1308 `- the_sum += val` 1309 `- nobs += 1` 1310 `- j += 1` 1311 `-` 1312 `- if nobs > 0:` 1313 
`- output[i] = the_sum / nobs` 1314 `-` 1315 `- if j >= nvalues:` 1316 `- break` 1317 `-` 1318 `- return output` 1319 `-` 1320 `-def _bucket_locs(index, buckets, inclusive=False):` 1321 `- if inclusive:` 1322 `- locs = index.searchsorted(buckets, side='left')` 1323 `- else:` 1324 `- locs = index.searchsorted(buckets, side='right')` 1325 `-` 1326 `- return locs` 1327 1270 ` ` 1328 1271 ` def count_level_1d(ndarray[uint8_t, cast=True] mask,` 1329 1272 ` ndarray[int64_t] labels, Py_ssize_t max_bin):` `@@ -1341,6 +1284,7 @@ def count_level_1d(ndarray[uint8_t, cast=True] mask,` 1341 1284 ` ` 1342 1285 ` return counts` 1343 1286 ` ` 1287 `+` 1344 1288 ` def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,` 1345 1289 ` ndarray[int64_t] labels, Py_ssize_t max_bin):` 1346 1290 ` cdef:` `@@ -1357,6 +1301,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,` 1357 1301 ` ` 1358 1302 ` return counts` 1359 1303 ` ` 1304 `+` 1360 1305 ` def duplicated(list values, take_last=False):` 1361 1306 ` cdef:` 1362 1307 ` Py_ssize_t i, n` `@@ -1411,7 +1356,7 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):` 1411 1356 ` return starts, ends` 1412 1357 ` ` 1413 1358 ` ` 1414 `-def groupby_arrays(ndarray index, ndarray[int64_t] labels):` 1359 `+def groupby_arrays(ndarray index, ndarray[int64_t] labels, sort=True):` 1415 1360 ` cdef:` 1416 1361 ` Py_ssize_t i, lab, cur, start, n = len(index)` 1417 1362 ` dict result = {}` `@@ -1419,10 +1364,11 @@ def groupby_arrays(ndarray index, ndarray[int64_t] labels):` 1419 1364 ` index = np.asarray(index)` 1420 1365 ` ` 1421 1366 ` # this is N log N. 
If this is a bottleneck may we worth fixing someday` 1422 `- indexer = labels.argsort(kind='mergesort')` 1367 `+ if sort:` 1368 `+ indexer = labels.argsort(kind='mergesort')` 1423 1369 ` ` 1424 `- labels = labels.take(indexer)` 1425 `- index = index.take(indexer)` 1370 `+ labels = labels.take(indexer)` 1371 `+ index = index.take(indexer)` 1426 1372 ` ` 1427 1373 ` if n == 0:` 1428 1374 ` return result` `@@ -1438,4 +1384,45 @@ def groupby_arrays(ndarray index, ndarray[int64_t] labels):` 1438 1384 ` start = i` 1439 1385 ` cur = lab` 1440 1386 ` ` 1387 `+ result[cur] = index[start:]` 1388 `+ return result` 1389 `+` 1390 `+def indices_fast(object index, ndarray[int64_t] labels, list keys,` 1391 `+ list sorted_labels):` 1392 `+ cdef:` 1393 `+ Py_ssize_t i, j, k, lab, cur, start, n = len(labels)` 1394 `+ dict result = {}` 1395 `+ object tup` 1396 `+` 1397 `+ k = len(keys)` 1398 `+` 1399 `+ if n == 0:` 1400 `+ return result` 1401 `+` 1402 `+ start = 0` 1403 `+ cur = labels[0]` 1404 `+ for i in range(1, n):` 1405 `+ lab = labels[i]` 1406 `+` 1407 `+ if lab != cur:` 1408 `+ if lab != -1:` 1409 `+ tup = PyTuple_New(k)` 1410 `+ for j in range(k):` 1411 `+ val = util.get_value_at(keys[j],` 1412 `+ sorted_labels[j][i-1])` 1413 `+ PyTuple_SET_ITEM(tup, j, val)` 1414 `+ Py_INCREF(val)` 1415 `+` 1416 `+ result[tup] = index[start:i]` 1417 `+ start = i` 1418 `+ cur = lab` 1419 `+` 1420 `+ tup = PyTuple_New(k)` 1421 `+ for j in range(k):` 1422 `+ val = util.get_value_at(keys[j],` 1423 `+ sorted_labels[j][n - 1])` 1424 `+ PyTuple_SET_ITEM(tup, j, val)` 1425 `+ Py_INCREF(val)` 1426 `+ result[tup] = index[start:]` 1427 `+` 1441 1428 ` return result`
151  pandas/src/sandbox.pyx
 `@@ -421,117 +421,6 @@ def int64_unique(ndarray[int64_t] arr):` 421 421 ` ` 422 422 ` return np.sort(uniques[:j])` 423 423 ` ` 424 `-def group_add_bin(ndarray[float64_t, ndim=2] out,` 425 `- ndarray[int32_t] counts,` 426 `- ndarray[float64_t, ndim=2] values,` 427 `- ndarray[int32_t] bins):` 428 `- '''` 429 `- Only aggregates on axis=0` 430 `- '''` 431 `- cdef:` 432 `- Py_ssize_t i, j, N, K, ngroups, b` 433 `- float64_t val, count` 434 `- ndarray[float64_t, ndim=2] sumx, nobs` 435 `-` 436 `- nobs = np.zeros_like(out)` 437 `- sumx = np.zeros_like(out)` 438 `-` 439 `- ngroups = len(bins) + 1` 440 `- N, K = (
20  vb_suite/groupby.py
 `@@ -172,3 +172,23 @@ def f():` 172 172 ` ` 173 173 ` groupby_last = Benchmark('data.groupby(labels).last()', setup,` 174 174 ` start_date=datetime(2012, 5, 1))` 175 `+` 176 `+` 177 `+#----------------------------------------------------------------------` 178 `+# groupby_indices replacement, chop up Series` 179 `+` 180 `+setup = common_setup + """` 181 `+try:` 182 `+ rng = date_range('1/1/2000', '12/31/2005', freq='H')` 183 `+ year, month, day = rng.year, rng.month, rng.day` 184 `+except:` 185 `+ rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour())` 186 `+ year = rng.map(lambda x: x.year)` 187 `+ month = rng.map(lambda x: x.month)` 188 `+ day = rng.map(lambda x: x.day)` 189 `+` 190 `+ts = Series(np.random.randn(len(rng)), index=rng)` 191 `+"""` 192 `+` 193 `+groupby_indices = Benchmark('len(ts.groupby([year, month, day]))',` 194 `+ setup, start_date=datetime(2012, 1, 1))`