ENH: reimplement groupby_indices using better algorithmic tricks, associated vbenchmark. close #609

commit e7af2b99634f5514554d10731f8b99dc070139cb (1 parent: 88e6bcf)
Wes McKinney authored May 13, 2012
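The commit replaces the multi-key GroupBy.indices path, which built an Index of Python tuples and hashed every row, with a label-based counting sort. For orientation, a small sketch of the user-facing result this code path produces (the frame and column names are illustrative, not from the commit):

import numpy as np
import pandas as pd

df = pd.DataFrame({'k1': ['a', 'a', 'b', 'b'],
                   'k2': ['x', 'y', 'x', 'x'],
                   'v': np.arange(4.0)})

# indices maps each key tuple to the integer positions of its rows
print(df.groupby(['k1', 'k2']).indices)
# {('a', 'x'): array([0]), ('a', 'y'): array([1]), ('b', 'x'): array([2, 3])}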

pandas/core/algorithms.py (3 changes)

@@ -94,6 +94,7 @@ def _unique_generic(values, table_type, type_caster):
     uniques = table.unique(values)
     return uniques
 
+
 def factorize(values, sort=False, order=None, na_sentinel=-1):
     """
     Encode input values as an enumerated type or categorical variable
@@ -118,7 +119,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
     uniques = com._asarray_tuplesafe(uniques)
     if sort and len(counts) > 0:
         sorter = uniques.argsort()
-        reverse_indexer = np.empty(len(sorter), dtype=np.int32)
+        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
         reverse_indexer.put(sorter, np.arange(len(sorter)))
 
         mask = labels < 0
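Switching reverse_indexer from np.int32 to np.int_ keeps the indexer at the platform's native integer width, presumably to avoid overflow and casting concerns on 64-bit builds. A minimal sketch of the reverse-indexer trick, with made-up values:

import numpy as np

# Map each original label to its rank in sorted order.
uniques = np.array([30, 10, 20])
sorter = uniques.argsort()                        # [1, 2, 0]
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))
print(reverse_indexer)                            # [2 0 1]: 30->2, 10->0, 20->1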

pandas/core/groupby.py (21 changes)

@@ -536,10 +536,9 @@ def indices(self):
         if len(self.groupings) == 1:
             return self.groupings[0].indices
         else:
-            # TODO: this is massively inefficient
-            to_groupby = zip(*(ping.grouper for ping in self.groupings))
-            to_groupby = Index(to_groupby)
-            return lib.groupby_indices(to_groupby)
+            label_list = [ping.labels for ping in self.groupings]
+            keys = [ping.group_index for ping in self.groupings]
+            return _get_indices_dict(label_list, keys)
 
     @property
     def labels(self):
@@ -1972,6 +1971,20 @@ def get_key(self, comp_id):
         return tuple(level[table.get_item(comp_id)]
                      for table, level in zip(self.tables, self.levels))
 
+
+def _get_indices_dict(label_list, keys):
+    shape = [len(x) for x in keys]
+    group_index = get_group_index(label_list, shape)
+
+    sorter, _ = lib.groupsort_indexer(com._ensure_int64(group_index),
+                                      np.prod(shape))
+
+    sorted_labels = [lab.take(sorter) for lab in label_list]
+    group_index = group_index.take(sorter)
+    index = np.arange(len(group_index)).take(sorter)
+
+    return lib.indices_fast(index, group_index, keys, sorted_labels)
+
 #----------------------------------------------------------------------
 # sorting levels...cleverly?
 
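The new helper works in three steps: combine the per-grouping label arrays into a single mixed-radix group id (get_group_index), sort rows by that id in one pass (lib.groupsort_indexer is a counting sort), then slice each contiguous run into the result dict (lib.indices_fast). A rough pure-NumPy equivalent for two groupings, using a stable argsort as a stand-in for the counting sort (all names below are illustrative):

import numpy as np

labels1 = np.array([0, 1, 0, 1, 0])       # labels into keys1
labels2 = np.array([0, 0, 1, 1, 0])       # labels into keys2
keys1 = np.array(['a', 'b'])
keys2 = np.array(['x', 'y'])

# Mixed-radix combined group id, one integer per row
group_index = labels1 * len(keys2) + labels2

# Stable sort by group id (stand-in for lib.groupsort_indexer)
sorter = group_index.argsort(kind='mergesort')
sorted_ids = group_index.take(sorter)
positions = np.arange(len(group_index)).take(sorter)

# Slice each contiguous run into the result, keyed by the tuple of keys
result = {}
start = 0
for i in range(1, len(sorted_ids) + 1):
    if i == len(sorted_ids) or sorted_ids[i] != sorted_ids[start]:
        gid = sorted_ids[start]
        key = (keys1[gid // len(keys2)], keys2[gid % len(keys2)])
        result[key] = positions[start:i]
        start = i

print(result)
# {('a', 'x'): array([0, 4]), ('a', 'y'): array([2]),
#  ('b', 'x'): array([1]), ('b', 'y'): array([3])}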

pandas/src/groupby.pyx (113 changes)

@@ -746,7 +746,6 @@ def group_var(ndarray[float64_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-
 def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
                        object closed='left'):
     """
@@ -1107,8 +1106,8 @@ def group_ohlc(ndarray[float64_t, ndim=2] out,
             out[b, 3] = vclose
 
 
-# @cython.boundscheck(False)
-# @cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def group_mean_bin(ndarray[float64_t, ndim=2] out,
                    ndarray[int64_t] counts,
                    ndarray[float64_t, ndim=2] values,
@@ -1268,62 +1267,6 @@ def lookup_values(ndarray[object] values, dict mapping):
         result[i] = mapping[values[i]]
     return maybe_convert_objects(result)
 
-def reduce_mean(ndarray[object] indices,
-                ndarray[object] buckets,
-                ndarray[float64_t] values,
-                inclusive=False):
-    cdef:
-        Py_ssize_t i, j, nbuckets, nvalues
-        ndarray[float64_t] output
-        float64_t the_sum, val, nobs
-
-
-
-    nbuckets = len(buckets)
-    nvalues = len(indices)
-
-    assert(len(values) == len(indices))
-
-    output = np.empty(nbuckets, dtype=float)
-    output.fill(np.NaN)
-
-    j = 0
-    for i from 0 <= i < nbuckets:
-        next_bound = buckets[i]
-        the_sum = 0
-        nobs = 0
-        if inclusive:
-            while j < nvalues and indices[j] <= next_bound:
-                val = values[j]
-                # not NaN
-                if val == val:
-                    the_sum += val
-                    nobs += 1
-                j += 1
-        else:
-            while j < nvalues and indices[j] < next_bound:
-                val = values[j]
-                # not NaN
-                if val == val:
-                    the_sum += val
-                    nobs += 1
-                j += 1
-
-        if nobs > 0:
-            output[i] = the_sum / nobs
-
-        if j >= nvalues:
-            break
-
-    return output
-
-def _bucket_locs(index, buckets, inclusive=False):
-    if inclusive:
-        locs = index.searchsorted(buckets, side='left')
-    else:
-        locs = index.searchsorted(buckets, side='right')
-
-    return locs
 
 def count_level_1d(ndarray[uint8_t, cast=True] mask,
                    ndarray[int64_t] labels, Py_ssize_t max_bin):
@@ -1341,6 +1284,7 @@ def count_level_1d(ndarray[uint8_t, cast=True] mask,
 
     return counts
 
+
 def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
                    ndarray[int64_t] labels, Py_ssize_t max_bin):
     cdef:
@@ -1357,6 +1301,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
 
    return counts
 
+
 def duplicated(list values, take_last=False):
     cdef:
         Py_ssize_t i, n
@@ -1411,7 +1356,7 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     return starts, ends
 
 
-def groupby_arrays(ndarray index, ndarray[int64_t] labels):
+def groupby_arrays(ndarray index, ndarray[int64_t] labels, sort=True):
     cdef:
         Py_ssize_t i, lab, cur, start, n = len(index)
         dict result = {}
@@ -1419,10 +1364,11 @@ def groupby_arrays(ndarray index, ndarray[int64_t] labels):
     index = np.asarray(index)
 
     # this is N log N. If this is a bottleneck it may be worth fixing someday
-    indexer = labels.argsort(kind='mergesort')
+    if sort:
+        indexer = labels.argsort(kind='mergesort')
 
-    labels = labels.take(indexer)
-    index = index.take(indexer)
+        labels = labels.take(indexer)
+        index = index.take(indexer)
 
     if n == 0:
         return result
@@ -1438,4 +1384,45 @@ def groupby_arrays(ndarray index, ndarray[int64_t] labels):
             start = i
         cur = lab
 
+    result[cur] = index[start:]
+    return result
+
+
+def indices_fast(object index, ndarray[int64_t] labels, list keys,
+                 list sorted_labels):
+    cdef:
+        Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
+        dict result = {}
+        object tup
+
+    k = len(keys)
+
+    if n == 0:
+        return result
+
+    start = 0
+    cur = labels[0]
+    for i in range(1, n):
+        lab = labels[i]
+
+        if lab != cur:
+            if lab != -1:
+                tup = PyTuple_New(k)
+                for j in range(k):
+                    val = util.get_value_at(keys[j],
+                                            sorted_labels[j][i - 1])
+                    PyTuple_SET_ITEM(tup, j, val)
+                    Py_INCREF(val)
+
+                result[tup] = index[start:i]
+            start = i
+        cur = lab
+
+    tup = PyTuple_New(k)
+    for j in range(k):
+        val = util.get_value_at(keys[j],
+                                sorted_labels[j][n - 1])
+        PyTuple_SET_ITEM(tup, j, val)
+        Py_INCREF(val)
+    result[tup] = index[start:]
+
     return result
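indices_fast assumes labels is already sorted, so each group occupies a contiguous run; it builds each group's key tuple from the sorted per-grouping labels and slices index once per run, with a final flush after the loop for the last group. A hypothetical call, assuming the compiled routines are importable as pandas.lib, as the lib alias in groupby.py suggests for this era of the codebase:

import numpy as np
import pandas.lib as lib   # assumed module path, circa pandas 0.8

labels = np.array([0, 0, 1, 1, 2], dtype=np.int64)   # sorted group ids
index = np.arange(len(labels))                       # row positions
keys = [np.array(['a', 'b', 'c'], dtype=object)]     # distinct values per grouping
sorted_labels = [labels]                             # per-grouping labels, sorted

result = lib.indices_fast(index, labels, keys, sorted_labels)
# {('a',): array([0, 1]), ('b',): array([2, 3]), ('c',): array([4])}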

pandas/src/sandbox.pyx (151 changes)

@@ -421,117 +421,6 @@ def int64_unique(ndarray[int64_t] arr):
 
     return np.sort(uniques[:j])
 
-def group_add_bin(ndarray[float64_t, ndim=2] out,
-                  ndarray[int32_t] counts,
-                  ndarray[float64_t, ndim=2] values,
-                  ndarray[int32_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, count
-        ndarray[float64_t, ndim=2] sumx, nobs
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    ngroups = len(bins) + 1
-    N, K = (<object> values).shape
-
-    b = 0
-    if K > 1:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[b, j] += 1
-                    sumx[b, j] += val
-    else:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[b, 0] += 1
-                sumx[b, 0] += val
-            print i, b, counts, nobs.squeeze()
-
-    for i in range(ngroups):
-        print 'writing %d' % i
-        for j in range(K):
-            if nobs[i] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = sumx[i, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_add(ndarray[float64_t, ndim=2] out,
-              ndarray[int32_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int32_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        ndarray[float64_t, ndim=2] sumx, nobs
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    if K > 1:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    sumx[lab, j] += val
-    else:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[lab, 0] += 1
-                sumx[lab, 0] += val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = sumx[i, j]
-
-
-from datetime cimport getAbsTime
-
 
 # cdef extern from "kvec.h":
 
@@ -546,12 +435,6 @@ def test_foo(ndarray[int64_t] values):
     val = values[0]
     print val
 
-def get_abs_time(freq, dailyDate, originalDate):
-    return getAbsTime(freq, dailyDate, originalDate)
-
-have_pytz = 1
-import pytz
-
 # cdef extern from "foo.h":
 #     double add_things(double *a, double *b, double *c, int n)
 
@@ -581,3 +464,37 @@ def inner(ndarray[float64_t] x, ndarray[float64_t] y):
     for i in range(n):
         result += x[i] * y[i]
     return result
+
+def indices_fast(ndarray[int64_t] labels, list keys,
+                 list sorted_labels):
+    cdef:
+        Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
+        dict result = {}
+        object tup
+
+    index = np.arange(n)
+
+    k = len(keys)
+
+    if n == 0:
+        return result
+
+    start = 0
+    cur = labels[0]
+    for i in range(1, n):
+        lab = labels[i]
+
+        if lab != cur:
+            if lab != -1:
+                tup = PyTuple_New(k)
+                for j in range(k):
+                    val = util.get_value_at(keys[j],
+                                            sorted_labels[j][cur])
+                    PyTuple_SET_ITEM(tup, j, val)
+                    Py_INCREF(val)
+
+                result[tup] = index[start:i]
+            start = i
+        cur = lab
+
+    return result

vb_suite/groupby.py (20 changes)

@@ -172,3 +172,23 @@ def f():
 
 groupby_last = Benchmark('data.groupby(labels).last()', setup,
                          start_date=datetime(2012, 5, 1))
+
+
+#----------------------------------------------------------------------
+# groupby_indices replacement, chop up Series
+
+setup = common_setup + """
+try:
+    rng = date_range('1/1/2000', '12/31/2005', freq='H')
+    year, month, day = rng.year, rng.month, rng.day
+except:
+    rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour())
+    year = rng.map(lambda x: x.year)
+    month = rng.map(lambda x: x.month)
+    day = rng.map(lambda x: x.day)
+
+ts = Series(np.random.randn(len(rng)), index=rng)
+"""
+
+groupby_indices = Benchmark('len(ts.groupby([year, month, day]))',
+                            setup, start_date=datetime(2012, 1, 1))
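To try the measurement outside the vbench harness, a rough standalone equivalent, assuming a recent pandas (timings are not comparable to the 2012 setup):

import numpy as np
import pandas as pd
from timeit import timeit

rng = pd.date_range('1/1/2000', '12/31/2005', freq='h')   # 'H' on older pandas
year, month, day = rng.year, rng.month, rng.day
ts = pd.Series(np.random.randn(len(rng)), index=rng)

# len() of a GroupBy touches the group dictionary, which is what
# the new code path computes
print(timeit(lambda: len(ts.groupby([year, month, day])), number=10))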
