From 8cd419ee231ecbcdee1d81580c16cb53c6774eb5 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 30 Nov 2019 21:21:48 -0500 Subject: [PATCH 01/17] heapq.merge special case for few iterables --- Lib/heapq.py | 1317 +++++++++++++++++++++++++++----------------------- 1 file changed, 716 insertions(+), 601 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index fabefd87f8bf8c..7fd2d865dd34cd 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -1,601 +1,716 @@ -"""Heap queue algorithm (a.k.a. priority queue). - -Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for -all k, counting elements from 0. For the sake of comparison, -non-existing elements are considered to be infinite. The interesting -property of a heap is that a[0] is always its smallest element. - -Usage: - -heap = [] # creates an empty heap -heappush(heap, item) # pushes a new item on the heap -item = heappop(heap) # pops the smallest item from the heap -item = heap[0] # smallest item on the heap without popping it -heapify(x) # transforms list into a heap, in-place, in linear time -item = heapreplace(heap, item) # pops and returns smallest item, and adds - # new item; the heap size is unchanged - -Our API differs from textbook heap algorithms as follows: - -- We use 0-based indexing. This makes the relationship between the - index for a node and the indexes for its children slightly less - obvious, but is more suitable since Python uses 0-based indexing. - -- Our heappop() method returns the smallest item, not the largest. - -These two make it possible to view the heap as a regular Python list -without surprises: heap[0] is the smallest item, and heap.sort() -maintains the heap invariant! -""" - -# Original code by Kevin O'Connor, augmented by Tim Peters and Raymond Hettinger - -__about__ = """Heap queues - -[explanation by François Pinard] - -Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for -all k, counting elements from 0. For the sake of comparison, -non-existing elements are considered to be infinite. The interesting -property of a heap is that a[0] is always its smallest element. - -The strange invariant above is meant to be an efficient memory -representation for a tournament. The numbers below are `k', not a[k]: - - 0 - - 1 2 - - 3 4 5 6 - - 7 8 9 10 11 12 13 14 - - 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 - - -In the tree above, each cell `k' is topping `2*k+1' and `2*k+2'. In -a usual binary tournament we see in sports, each cell is the winner -over the two cells it tops, and we can trace the winner down the tree -to see all opponents s/he had. However, in many computer applications -of such tournaments, we do not need to trace the history of a winner. -To be more memory efficient, when a winner is promoted, we try to -replace it by something else at a lower level, and the rule becomes -that a cell and the two cells it tops contain three different items, -but the top cell "wins" over the two topped cells. - -If this heap invariant is protected at all time, index 0 is clearly -the overall winner. The simplest algorithmic way to remove it and -find the "next" winner is to move some loser (let's say cell 30 in the -diagram above) into the 0 position, and then percolate this new 0 down -the tree, exchanging values, until the invariant is re-established. -This is clearly logarithmic on the total number of items in the tree. -By iterating over all items, you get an O(n ln n) sort. - -A nice feature of this sort is that you can efficiently insert new -items while the sort is going on, provided that the inserted items are -not "better" than the last 0'th element you extracted. This is -especially useful in simulation contexts, where the tree holds all -incoming events, and the "win" condition means the smallest scheduled -time. When an event schedule other events for execution, they are -scheduled into the future, so they can easily go into the heap. So, a -heap is a good structure for implementing schedulers (this is what I -used for my MIDI sequencer :-). - -Various structures for implementing schedulers have been extensively -studied, and heaps are good for this, as they are reasonably speedy, -the speed is almost constant, and the worst case is not much different -than the average case. However, there are other representations which -are more efficient overall, yet the worst cases might be terrible. - -Heaps are also very useful in big disk sorts. You most probably all -know that a big sort implies producing "runs" (which are pre-sorted -sequences, which size is usually related to the amount of CPU memory), -followed by a merging passes for these runs, which merging is often -very cleverly organised[1]. It is very important that the initial -sort produces the longest runs possible. Tournaments are a good way -to that. If, using all the memory available to hold a tournament, you -replace and percolate items that happen to fit the current run, you'll -produce runs which are twice the size of the memory for random input, -and much better for input fuzzily ordered. - -Moreover, if you output the 0'th item on disk and get an input which -may not fit in the current tournament (because the value "wins" over -the last output value), it cannot fit in the heap, so the size of the -heap decreases. The freed memory could be cleverly reused immediately -for progressively building a second heap, which grows at exactly the -same rate the first heap is melting. When the first heap completely -vanishes, you switch heaps and start a new run. Clever and quite -effective! - -In a word, heaps are useful memory structures to know. I use them in -a few applications, and I think it is good to keep a `heap' module -around. :-) - --------------------- -[1] The disk balancing algorithms which are current, nowadays, are -more annoying than clever, and this is a consequence of the seeking -capabilities of the disks. On devices which cannot seek, like big -tape drives, the story was quite different, and one had to be very -clever to ensure (far in advance) that each tape movement will be the -most effective possible (that is, will best participate at -"progressing" the merge). Some tapes were even able to read -backwards, and this was also used to avoid the rewinding time. -Believe me, real good tape sorts were quite spectacular to watch! -From all times, sorting has always been a Great Art! :-) -""" - -__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', - 'nlargest', 'nsmallest', 'heappushpop'] - -def heappush(heap, item): - """Push item onto heap, maintaining the heap invariant.""" - heap.append(item) - _siftdown(heap, 0, len(heap)-1) - -def heappop(heap): - """Pop the smallest item off the heap, maintaining the heap invariant.""" - lastelt = heap.pop() # raises appropriate IndexError if heap is empty - if heap: - returnitem = heap[0] - heap[0] = lastelt - _siftup(heap, 0) - return returnitem - return lastelt - -def heapreplace(heap, item): - """Pop and return the current smallest value, and add the new item. - - This is more efficient than heappop() followed by heappush(), and can be - more appropriate when using a fixed-size heap. Note that the value - returned may be larger than item! That constrains reasonable uses of - this routine unless written as part of a conditional replacement: - - if item > heap[0]: - item = heapreplace(heap, item) - """ - returnitem = heap[0] # raises appropriate IndexError if heap is empty - heap[0] = item - _siftup(heap, 0) - return returnitem - -def heappushpop(heap, item): - """Fast version of a heappush followed by a heappop.""" - if heap and heap[0] < item: - item, heap[0] = heap[0], item - _siftup(heap, 0) - return item - -def heapify(x): - """Transform list into a heap, in-place, in O(len(x)) time.""" - n = len(x) - # Transform bottom-up. The largest index there's any point to looking at - # is the largest with a child index in-range, so must have 2*i + 1 < n, - # or i < (n-1)/2. If n is even = 2*j, this is (2*j-1)/2 = j-1/2 so - # j-1 is the largest, which is n//2 - 1. If n is odd = 2*j+1, this is - # (2*j+1-1)/2 = j so j-1 is the largest, and that's again n//2-1. - for i in reversed(range(n//2)): - _siftup(x, i) - -def _heappop_max(heap): - """Maxheap version of a heappop.""" - lastelt = heap.pop() # raises appropriate IndexError if heap is empty - if heap: - returnitem = heap[0] - heap[0] = lastelt - _siftup_max(heap, 0) - return returnitem - return lastelt - -def _heapreplace_max(heap, item): - """Maxheap version of a heappop followed by a heappush.""" - returnitem = heap[0] # raises appropriate IndexError if heap is empty - heap[0] = item - _siftup_max(heap, 0) - return returnitem - -def _heapify_max(x): - """Transform list into a maxheap, in-place, in O(len(x)) time.""" - n = len(x) - for i in reversed(range(n//2)): - _siftup_max(x, i) - -# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos -# is the index of a leaf with a possibly out-of-order value. Restore the -# heap invariant. -def _siftdown(heap, startpos, pos): - newitem = heap[pos] - # Follow the path to the root, moving parents down until finding a place - # newitem fits. - while pos > startpos: - parentpos = (pos - 1) >> 1 - parent = heap[parentpos] - if newitem < parent: - heap[pos] = parent - pos = parentpos - continue - break - heap[pos] = newitem - -# The child indices of heap index pos are already heaps, and we want to make -# a heap at index pos too. We do this by bubbling the smaller child of -# pos up (and so on with that child's children, etc) until hitting a leaf, -# then using _siftdown to move the oddball originally at index pos into place. -# -# We *could* break out of the loop as soon as we find a pos where newitem <= -# both its children, but turns out that's not a good idea, and despite that -# many books write the algorithm that way. During a heap pop, the last array -# element is sifted in, and that tends to be large, so that comparing it -# against values starting from the root usually doesn't pay (= usually doesn't -# get us out of the loop early). See Knuth, Volume 3, where this is -# explained and quantified in an exercise. -# -# Cutting the # of comparisons is important, since these routines have no -# way to extract "the priority" from an array element, so that intelligence -# is likely to be hiding in custom comparison methods, or in array elements -# storing (priority, record) tuples. Comparisons are thus potentially -# expensive. -# -# On random arrays of length 1000, making this change cut the number of -# comparisons made by heapify() a little, and those made by exhaustive -# heappop() a lot, in accord with theory. Here are typical results from 3 -# runs (3 just to demonstrate how small the variance is): -# -# Compares needed by heapify Compares needed by 1000 heappops -# -------------------------- -------------------------------- -# 1837 cut to 1663 14996 cut to 8680 -# 1855 cut to 1659 14966 cut to 8678 -# 1847 cut to 1660 15024 cut to 8703 -# -# Building the heap by using heappush() 1000 times instead required -# 2198, 2148, and 2219 compares: heapify() is more efficient, when -# you can use it. -# -# The total compares needed by list.sort() on the same lists were 8627, -# 8627, and 8632 (this should be compared to the sum of heapify() and -# heappop() compares): list.sort() is (unsurprisingly!) more efficient -# for sorting. - -def _siftup(heap, pos): - endpos = len(heap) - startpos = pos - newitem = heap[pos] - # Bubble up the smaller child until hitting a leaf. - childpos = 2*pos + 1 # leftmost child position - while childpos < endpos: - # Set childpos to index of smaller child. - rightpos = childpos + 1 - if rightpos < endpos and not heap[childpos] < heap[rightpos]: - childpos = rightpos - # Move the smaller child up. - heap[pos] = heap[childpos] - pos = childpos - childpos = 2*pos + 1 - # The leaf at pos is empty now. Put newitem there, and bubble it up - # to its final resting place (by sifting its parents down). - heap[pos] = newitem - _siftdown(heap, startpos, pos) - -def _siftdown_max(heap, startpos, pos): - 'Maxheap variant of _siftdown' - newitem = heap[pos] - # Follow the path to the root, moving parents down until finding a place - # newitem fits. - while pos > startpos: - parentpos = (pos - 1) >> 1 - parent = heap[parentpos] - if parent < newitem: - heap[pos] = parent - pos = parentpos - continue - break - heap[pos] = newitem - -def _siftup_max(heap, pos): - 'Maxheap variant of _siftup' - endpos = len(heap) - startpos = pos - newitem = heap[pos] - # Bubble up the larger child until hitting a leaf. - childpos = 2*pos + 1 # leftmost child position - while childpos < endpos: - # Set childpos to index of larger child. - rightpos = childpos + 1 - if rightpos < endpos and not heap[rightpos] < heap[childpos]: - childpos = rightpos - # Move the larger child up. - heap[pos] = heap[childpos] - pos = childpos - childpos = 2*pos + 1 - # The leaf at pos is empty now. Put newitem there, and bubble it up - # to its final resting place (by sifting its parents down). - heap[pos] = newitem - _siftdown_max(heap, startpos, pos) - -def merge(*iterables, key=None, reverse=False): - '''Merge multiple sorted inputs into a single sorted output. - - Similar to sorted(itertools.chain(*iterables)) but returns a generator, - does not pull the data into memory all at once, and assumes that each of - the input streams is already sorted (smallest to largest). - - >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) - [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] - - If *key* is not None, applies a key function to each element to determine - its sort order. - - >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len)) - ['dog', 'cat', 'fish', 'horse', 'kangaroo'] - - ''' - - h = [] - h_append = h.append - - if reverse: - _heapify = _heapify_max - _heappop = _heappop_max - _heapreplace = _heapreplace_max - direction = -1 - else: - _heapify = heapify - _heappop = heappop - _heapreplace = heapreplace - direction = 1 - - if key is None: - for order, it in enumerate(map(iter, iterables)): - try: - next = it.__next__ - h_append([next(), order * direction, next]) - except StopIteration: - pass - _heapify(h) - while len(h) > 1: - try: - while True: - value, order, next = s = h[0] - yield value - s[0] = next() # raises StopIteration when exhausted - _heapreplace(h, s) # restore heap condition - except StopIteration: - _heappop(h) # remove empty iterator - if h: - # fast case when only a single iterator remains - value, order, next = h[0] - yield value - yield from next.__self__ - return - - for order, it in enumerate(map(iter, iterables)): - try: - next = it.__next__ - value = next() - h_append([key(value), order * direction, value, next]) - except StopIteration: - pass - _heapify(h) - while len(h) > 1: - try: - while True: - key_value, order, value, next = s = h[0] - yield value - value = next() - s[0] = key(value) - s[2] = value - _heapreplace(h, s) - except StopIteration: - _heappop(h) - if h: - key_value, order, value, next = h[0] - yield value - yield from next.__self__ - - -# Algorithm notes for nlargest() and nsmallest() -# ============================================== -# -# Make a single pass over the data while keeping the k most extreme values -# in a heap. Memory consumption is limited to keeping k values in a list. -# -# Measured performance for random inputs: -# -# number of comparisons -# n inputs k-extreme values (average of 5 trials) % more than min() -# ------------- ---------------- --------------------- ----------------- -# 1,000 100 3,317 231.7% -# 10,000 100 14,046 40.5% -# 100,000 100 105,749 5.7% -# 1,000,000 100 1,007,751 0.8% -# 10,000,000 100 10,009,401 0.1% -# -# Theoretical number of comparisons for k smallest of n random inputs: -# -# Step Comparisons Action -# ---- -------------------------- --------------------------- -# 1 1.66 * k heapify the first k-inputs -# 2 n - k compare remaining elements to top of heap -# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap -# 4 k * lg2(k) - (k/2) final sort of the k most extreme values -# -# Combining and simplifying for a rough estimate gives: -# -# comparisons = n + k * (log(k, 2) * log(n/k) + log(k, 2) + log(n/k)) -# -# Computing the number of comparisons for step 3: -# ----------------------------------------------- -# * For the i-th new value from the iterable, the probability of being in the -# k most extreme values is k/i. For example, the probability of the 101st -# value seen being in the 100 most extreme values is 100/101. -# * If the value is a new extreme value, the cost of inserting it into the -# heap is 1 + log(k, 2). -# * The probability times the cost gives: -# (k/i) * (1 + log(k, 2)) -# * Summing across the remaining n-k elements gives: -# sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1)) -# * This reduces to: -# (H(n) - H(k)) * k * (1 + log(k, 2)) -# * Where H(n) is the n-th harmonic number estimated by: -# gamma = 0.5772156649 -# H(n) = log(n, e) + gamma + 1 / (2 * n) -# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence -# * Substituting the H(n) formula: -# comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2) -# -# Worst-case for step 3: -# ---------------------- -# In the worst case, the input data is reversed sorted so that every new element -# must be inserted in the heap: -# -# comparisons = 1.66 * k + log(k, 2) * (n - k) -# -# Alternative Algorithms -# ---------------------- -# Other algorithms were not used because they: -# 1) Took much more auxiliary memory, -# 2) Made multiple passes over the data. -# 3) Made more comparisons in common cases (small k, large n, semi-random input). -# See the more detailed comparison of approach at: -# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest - -def nsmallest(n, iterable, key=None): - """Find the n smallest elements in a dataset. - - Equivalent to: sorted(iterable, key=key)[:n] - """ - - # Short-cut for n==1 is to use min() - if n == 1: - it = iter(iterable) - sentinel = object() - result = min(it, default=sentinel, key=key) - return [] if result is sentinel else [result] - - # When n>=size, it's faster to use sorted() - try: - size = len(iterable) - except (TypeError, AttributeError): - pass - else: - if n >= size: - return sorted(iterable, key=key)[:n] - - # When key is none, use simpler decoration - if key is None: - it = iter(iterable) - # put the range(n) first so that zip() doesn't - # consume one too many elements from the iterator - result = [(elem, i) for i, elem in zip(range(n), it)] - if not result: - return result - _heapify_max(result) - top = result[0][0] - order = n - _heapreplace = _heapreplace_max - for elem in it: - if elem < top: - _heapreplace(result, (elem, order)) - top, _order = result[0] - order += 1 - result.sort() - return [elem for (elem, order) in result] - - # General case, slowest method - it = iter(iterable) - result = [(key(elem), i, elem) for i, elem in zip(range(n), it)] - if not result: - return result - _heapify_max(result) - top = result[0][0] - order = n - _heapreplace = _heapreplace_max - for elem in it: - k = key(elem) - if k < top: - _heapreplace(result, (k, order, elem)) - top, _order, _elem = result[0] - order += 1 - result.sort() - return [elem for (k, order, elem) in result] - -def nlargest(n, iterable, key=None): - """Find the n largest elements in a dataset. - - Equivalent to: sorted(iterable, key=key, reverse=True)[:n] - """ - - # Short-cut for n==1 is to use max() - if n == 1: - it = iter(iterable) - sentinel = object() - result = max(it, default=sentinel, key=key) - return [] if result is sentinel else [result] - - # When n>=size, it's faster to use sorted() - try: - size = len(iterable) - except (TypeError, AttributeError): - pass - else: - if n >= size: - return sorted(iterable, key=key, reverse=True)[:n] - - # When key is none, use simpler decoration - if key is None: - it = iter(iterable) - result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)] - if not result: - return result - heapify(result) - top = result[0][0] - order = -n - _heapreplace = heapreplace - for elem in it: - if top < elem: - _heapreplace(result, (elem, order)) - top, _order = result[0] - order -= 1 - result.sort(reverse=True) - return [elem for (elem, order) in result] - - # General case, slowest method - it = iter(iterable) - result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)] - if not result: - return result - heapify(result) - top = result[0][0] - order = -n - _heapreplace = heapreplace - for elem in it: - k = key(elem) - if top < k: - _heapreplace(result, (k, order, elem)) - top, _order, _elem = result[0] - order -= 1 - result.sort(reverse=True) - return [elem for (k, order, elem) in result] - -# If available, use C implementation -try: - from _heapq import * -except ImportError: - pass -try: - from _heapq import _heapreplace_max -except ImportError: - pass -try: - from _heapq import _heapify_max -except ImportError: - pass -try: - from _heapq import _heappop_max -except ImportError: - pass - - -if __name__ == "__main__": - - import doctest # pragma: no cover - print(doctest.testmod()) # pragma: no cover +"""Heap queue algorithm (a.k.a. priority queue). + +Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for +all k, counting elements from 0. For the sake of comparison, +non-existing elements are considered to be infinite. The interesting +property of a heap is that a[0] is always its smallest element. + +Usage: + +heap = [] # creates an empty heap +heappush(heap, item) # pushes a new item on the heap +item = heappop(heap) # pops the smallest item from the heap +item = heap[0] # smallest item on the heap without popping it +heapify(x) # transforms list into a heap, in-place, in linear time +item = heapreplace(heap, item) # pops and returns smallest item, and adds + # new item; the heap size is unchanged + +Our API differs from textbook heap algorithms as follows: + +- We use 0-based indexing. This makes the relationship between the + index for a node and the indexes for its children slightly less + obvious, but is more suitable since Python uses 0-based indexing. + +- Our heappop() method returns the smallest item, not the largest. + +These two make it possible to view the heap as a regular Python list +without surprises: heap[0] is the smallest item, and heap.sort() +maintains the heap invariant! +""" + +# Original code by Kevin O'Connor, augmented by Tim Peters and Raymond Hettinger + +__about__ = """Heap queues + +[explanation by François Pinard] + +Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for +all k, counting elements from 0. For the sake of comparison, +non-existing elements are considered to be infinite. The interesting +property of a heap is that a[0] is always its smallest element. + +The strange invariant above is meant to be an efficient memory +representation for a tournament. The numbers below are `k', not a[k]: + + 0 + + 1 2 + + 3 4 5 6 + + 7 8 9 10 11 12 13 14 + + 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 + + +In the tree above, each cell `k' is topping `2*k+1' and `2*k+2'. In +a usual binary tournament we see in sports, each cell is the winner +over the two cells it tops, and we can trace the winner down the tree +to see all opponents s/he had. However, in many computer applications +of such tournaments, we do not need to trace the history of a winner. +To be more memory efficient, when a winner is promoted, we try to +replace it by something else at a lower level, and the rule becomes +that a cell and the two cells it tops contain three different items, +but the top cell "wins" over the two topped cells. + +If this heap invariant is protected at all time, index 0 is clearly +the overall winner. The simplest algorithmic way to remove it and +find the "next" winner is to move some loser (let's say cell 30 in the +diagram above) into the 0 position, and then percolate this new 0 down +the tree, exchanging values, until the invariant is re-established. +This is clearly logarithmic on the total number of items in the tree. +By iterating over all items, you get an O(n ln n) sort. + +A nice feature of this sort is that you can efficiently insert new +items while the sort is going on, provided that the inserted items are +not "better" than the last 0'th element you extracted. This is +especially useful in simulation contexts, where the tree holds all +incoming events, and the "win" condition means the smallest scheduled +time. When an event schedule other events for execution, they are +scheduled into the future, so they can easily go into the heap. So, a +heap is a good structure for implementing schedulers (this is what I +used for my MIDI sequencer :-). + +Various structures for implementing schedulers have been extensively +studied, and heaps are good for this, as they are reasonably speedy, +the speed is almost constant, and the worst case is not much different +than the average case. However, there are other representations which +are more efficient overall, yet the worst cases might be terrible. + +Heaps are also very useful in big disk sorts. You most probably all +know that a big sort implies producing "runs" (which are pre-sorted +sequences, which size is usually related to the amount of CPU memory), +followed by a merging passes for these runs, which merging is often +very cleverly organised[1]. It is very important that the initial +sort produces the longest runs possible. Tournaments are a good way +to that. If, using all the memory available to hold a tournament, you +replace and percolate items that happen to fit the current run, you'll +produce runs which are twice the size of the memory for random input, +and much better for input fuzzily ordered. + +Moreover, if you output the 0'th item on disk and get an input which +may not fit in the current tournament (because the value "wins" over +the last output value), it cannot fit in the heap, so the size of the +heap decreases. The freed memory could be cleverly reused immediately +for progressively building a second heap, which grows at exactly the +same rate the first heap is melting. When the first heap completely +vanishes, you switch heaps and start a new run. Clever and quite +effective! + +In a word, heaps are useful memory structures to know. I use them in +a few applications, and I think it is good to keep a `heap' module +around. :-) + +-------------------- +[1] The disk balancing algorithms which are current, nowadays, are +more annoying than clever, and this is a consequence of the seeking +capabilities of the disks. On devices which cannot seek, like big +tape drives, the story was quite different, and one had to be very +clever to ensure (far in advance) that each tape movement will be the +most effective possible (that is, will best participate at +"progressing" the merge). Some tapes were even able to read +backwards, and this was also used to avoid the rewinding time. +Believe me, real good tape sorts were quite spectacular to watch! +From all times, sorting has always been a Great Art! :-) +""" + +__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', + 'nlargest', 'nsmallest', 'heappushpop'] + +def heappush(heap, item): + """Push item onto heap, maintaining the heap invariant.""" + heap.append(item) + _siftdown(heap, 0, len(heap)-1) + +def heappop(heap): + """Pop the smallest item off the heap, maintaining the heap invariant.""" + lastelt = heap.pop() # raises appropriate IndexError if heap is empty + if heap: + returnitem = heap[0] + heap[0] = lastelt + _siftup(heap, 0) + return returnitem + return lastelt + +def heapreplace(heap, item): + """Pop and return the current smallest value, and add the new item. + + This is more efficient than heappop() followed by heappush(), and can be + more appropriate when using a fixed-size heap. Note that the value + returned may be larger than item! That constrains reasonable uses of + this routine unless written as part of a conditional replacement: + + if item > heap[0]: + item = heapreplace(heap, item) + """ + returnitem = heap[0] # raises appropriate IndexError if heap is empty + heap[0] = item + _siftup(heap, 0) + return returnitem + +def heappushpop(heap, item): + """Fast version of a heappush followed by a heappop.""" + if heap and heap[0] < item: + item, heap[0] = heap[0], item + _siftup(heap, 0) + return item + +def heapify(x): + """Transform list into a heap, in-place, in O(len(x)) time.""" + n = len(x) + # Transform bottom-up. The largest index there's any point to looking at + # is the largest with a child index in-range, so must have 2*i + 1 < n, + # or i < (n-1)/2. If n is even = 2*j, this is (2*j-1)/2 = j-1/2 so + # j-1 is the largest, which is n//2 - 1. If n is odd = 2*j+1, this is + # (2*j+1-1)/2 = j so j-1 is the largest, and that's again n//2-1. + for i in reversed(range(n//2)): + _siftup(x, i) + +def _heappop_max(heap): + """Maxheap version of a heappop.""" + lastelt = heap.pop() # raises appropriate IndexError if heap is empty + if heap: + returnitem = heap[0] + heap[0] = lastelt + _siftup_max(heap, 0) + return returnitem + return lastelt + +def _heapreplace_max(heap, item): + """Maxheap version of a heappop followed by a heappush.""" + returnitem = heap[0] # raises appropriate IndexError if heap is empty + heap[0] = item + _siftup_max(heap, 0) + return returnitem + +def _heapify_max(x): + """Transform list into a maxheap, in-place, in O(len(x)) time.""" + n = len(x) + for i in reversed(range(n//2)): + _siftup_max(x, i) + +# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos +# is the index of a leaf with a possibly out-of-order value. Restore the +# heap invariant. +def _siftdown(heap, startpos, pos): + newitem = heap[pos] + # Follow the path to the root, moving parents down until finding a place + # newitem fits. + while pos > startpos: + parentpos = (pos - 1) >> 1 + parent = heap[parentpos] + if newitem < parent: + heap[pos] = parent + pos = parentpos + continue + break + heap[pos] = newitem + +# The child indices of heap index pos are already heaps, and we want to make +# a heap at index pos too. We do this by bubbling the smaller child of +# pos up (and so on with that child's children, etc) until hitting a leaf, +# then using _siftdown to move the oddball originally at index pos into place. +# +# We *could* break out of the loop as soon as we find a pos where newitem <= +# both its children, but turns out that's not a good idea, and despite that +# many books write the algorithm that way. During a heap pop, the last array +# element is sifted in, and that tends to be large, so that comparing it +# against values starting from the root usually doesn't pay (= usually doesn't +# get us out of the loop early). See Knuth, Volume 3, where this is +# explained and quantified in an exercise. +# +# Cutting the # of comparisons is important, since these routines have no +# way to extract "the priority" from an array element, so that intelligence +# is likely to be hiding in custom comparison methods, or in array elements +# storing (priority, record) tuples. Comparisons are thus potentially +# expensive. +# +# On random arrays of length 1000, making this change cut the number of +# comparisons made by heapify() a little, and those made by exhaustive +# heappop() a lot, in accord with theory. Here are typical results from 3 +# runs (3 just to demonstrate how small the variance is): +# +# Compares needed by heapify Compares needed by 1000 heappops +# -------------------------- -------------------------------- +# 1837 cut to 1663 14996 cut to 8680 +# 1855 cut to 1659 14966 cut to 8678 +# 1847 cut to 1660 15024 cut to 8703 +# +# Building the heap by using heappush() 1000 times instead required +# 2198, 2148, and 2219 compares: heapify() is more efficient, when +# you can use it. +# +# The total compares needed by list.sort() on the same lists were 8627, +# 8627, and 8632 (this should be compared to the sum of heapify() and +# heappop() compares): list.sort() is (unsurprisingly!) more efficient +# for sorting. + +def _siftup(heap, pos): + endpos = len(heap) + startpos = pos + newitem = heap[pos] + # Bubble up the smaller child until hitting a leaf. + childpos = 2*pos + 1 # leftmost child position + while childpos < endpos: + # Set childpos to index of smaller child. + rightpos = childpos + 1 + if rightpos < endpos and not heap[childpos] < heap[rightpos]: + childpos = rightpos + # Move the smaller child up. + heap[pos] = heap[childpos] + pos = childpos + childpos = 2*pos + 1 + # The leaf at pos is empty now. Put newitem there, and bubble it up + # to its final resting place (by sifting its parents down). + heap[pos] = newitem + _siftdown(heap, startpos, pos) + +def _siftdown_max(heap, startpos, pos): + 'Maxheap variant of _siftdown' + newitem = heap[pos] + # Follow the path to the root, moving parents down until finding a place + # newitem fits. + while pos > startpos: + parentpos = (pos - 1) >> 1 + parent = heap[parentpos] + if parent < newitem: + heap[pos] = parent + pos = parentpos + continue + break + heap[pos] = newitem + +def _siftup_max(heap, pos): + 'Maxheap variant of _siftup' + endpos = len(heap) + startpos = pos + newitem = heap[pos] + # Bubble up the larger child until hitting a leaf. + childpos = 2*pos + 1 # leftmost child position + while childpos < endpos: + # Set childpos to index of larger child. + rightpos = childpos + 1 + if rightpos < endpos and not heap[rightpos] < heap[childpos]: + childpos = rightpos + # Move the larger child up. + heap[pos] = heap[childpos] + pos = childpos + childpos = 2*pos + 1 + # The leaf at pos is empty now. Put newitem there, and bubble it up + # to its final resting place (by sifting its parents down). + heap[pos] = newitem + _siftdown_max(heap, startpos, pos) + +def merge(*iterables, key=None, reverse=False): + '''Merge multiple sorted inputs into a single sorted output. + + Similar to sorted(itertools.chain(*iterables)) but returns a generator, + does not pull the data into memory all at once, and assumes that each of + the input streams is already sorted (smallest to largest). + + >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) + [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] + + If *key* is not None, applies a key function to each element to determine + its sort order. + + >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len)) + ['dog', 'cat', 'fish', 'horse', 'kangaroo'] + + ''' + n = len(iterables) + if n == 0: + return + if n == 1: + yield from iterables[0] + return + if n == 2: + # special case for two iterables + a_iter = iter(iterables[0]) + b_iter = iter(iterables[1]) + + try: + a = next(a_iter) + except StopIteration: + yield from b_iter + return + try: + b = next(b_iter) + except StopIteration: + yield a + yield from a_iter + return + + if key is None: + if not reverse: + # no key, forward + while True: + if a <= b: + yield a + try: + a = next(a_iter) + except StopIteration: + yield b + yield from b_iter + return + else: + yield b + try: + b = next(b_iter) + except StopIteration: + yield a + yield from a_iter + return + else: + # no key, reverse + while True: + if a >= b: + yield a + try: + a = next(a_iter) + except StopIteration: + yield b + yield from b_iter + return + else: + yield b + try: + b = next(b_iter) + except StopIteration: + yield a + yield from a_iter + return + else: + ka = key(a) + kb = key(b) + if not reverse: + # using a key, forward + while True: + if ka <= kb: + yield a + try: + a = next(a_iter) + except StopIteration: + yield b + yield from b_iter + return + ka = key(a) + else: + yield b + try: + b = next(b_iter) + except StopIteration: + yield a + yield from a_iter + return + kb = key(b) + else: + # using a key, reverse + while True: + if ka >= kb: + yield a + try: + a = next(a_iter) + except StopIteration: + yield b + yield from b_iter + return + ka = key(a) + else: + yield b + try: + b = next(b_iter) + except StopIteration: + yield a + yield from a_iter + return + kb = key(b) + if n <= 6: + # Use the 2-case recursively for small numbers of iterables + n_2 = n // 2 + result_1 = merge(*iterables[:n_2], key=key, reverse=reverse) + result_2 = merge(*iterables[n_2:], key=key, reverse=reverse) + yield from merge(result_1, result_2, key=key, reverse=reverse) + return + + # For larger numbers, store the front of each iterable in a heap. + h = [] + h_append = h.append + + if reverse: + _heapify = _heapify_max + _heappop = _heappop_max + _heapreplace = _heapreplace_max + direction = -1 + else: + _heapify = heapify + _heappop = heappop + _heapreplace = heapreplace + direction = 1 + + if key is None: + for order, it in enumerate(map(iter, iterables)): + try: + next_func = it.__next__ + h_append([next_func(), order * direction, next_func]) + except StopIteration: + pass + _heapify(h) + while len(h) > 1: + try: + while True: + value, order, next_func = s = h[0] + yield value + s[0] = next_func() # raises StopIteration when exhausted + _heapreplace(h, s) # restore heap condition + except StopIteration: + _heappop(h) # remove empty iterator + if h: + # fast case when only a single iterator remains + value, order, next_func = h[0] + yield value + yield from next_func.__self__ + return + + for order, it in enumerate(map(iter, iterables)): + try: + next_func = it.__next__ + value = next_func() + h_append([key(value), order * direction, value, next_func]) + except StopIteration: + pass + _heapify(h) + while len(h) > 1: + try: + while True: + key_value, order, value, next_func = s = h[0] + yield value + value = next_func() + s[0] = key(value) + s[2] = value + _heapreplace(h, s) + except StopIteration: + _heappop(h) + if h: + key_value, order, value, next_func = h[0] + yield value + yield from next_func.__self__ + + +# Algorithm notes for nlargest() and nsmallest() +# ============================================== +# +# Make a single pass over the data while keeping the k most extreme values +# in a heap. Memory consumption is limited to keeping k values in a list. +# +# Measured performance for random inputs: +# +# number of comparisons +# n inputs k-extreme values (average of 5 trials) % more than min() +# ------------- ---------------- --------------------- ----------------- +# 1,000 100 3,317 231.7% +# 10,000 100 14,046 40.5% +# 100,000 100 105,749 5.7% +# 1,000,000 100 1,007,751 0.8% +# 10,000,000 100 10,009,401 0.1% +# +# Theoretical number of comparisons for k smallest of n random inputs: +# +# Step Comparisons Action +# ---- -------------------------- --------------------------- +# 1 1.66 * k heapify the first k-inputs +# 2 n - k compare remaining elements to top of heap +# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap +# 4 k * lg2(k) - (k/2) final sort of the k most extreme values +# +# Combining and simplifying for a rough estimate gives: +# +# comparisons = n + k * (log(k, 2) * log(n/k) + log(k, 2) + log(n/k)) +# +# Computing the number of comparisons for step 3: +# ----------------------------------------------- +# * For the i-th new value from the iterable, the probability of being in the +# k most extreme values is k/i. For example, the probability of the 101st +# value seen being in the 100 most extreme values is 100/101. +# * If the value is a new extreme value, the cost of inserting it into the +# heap is 1 + log(k, 2). +# * The probability times the cost gives: +# (k/i) * (1 + log(k, 2)) +# * Summing across the remaining n-k elements gives: +# sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1)) +# * This reduces to: +# (H(n) - H(k)) * k * (1 + log(k, 2)) +# * Where H(n) is the n-th harmonic number estimated by: +# gamma = 0.5772156649 +# H(n) = log(n, e) + gamma + 1 / (2 * n) +# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence +# * Substituting the H(n) formula: +# comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2) +# +# Worst-case for step 3: +# ---------------------- +# In the worst case, the input data is reversed sorted so that every new element +# must be inserted in the heap: +# +# comparisons = 1.66 * k + log(k, 2) * (n - k) +# +# Alternative Algorithms +# ---------------------- +# Other algorithms were not used because they: +# 1) Took much more auxiliary memory, +# 2) Made multiple passes over the data. +# 3) Made more comparisons in common cases (small k, large n, semi-random input). +# See the more detailed comparison of approach at: +# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest + +def nsmallest(n, iterable, key=None): + """Find the n smallest elements in a dataset. + + Equivalent to: sorted(iterable, key=key)[:n] + """ + + # Short-cut for n==1 is to use min() + if n == 1: + it = iter(iterable) + sentinel = object() + result = min(it, default=sentinel, key=key) + return [] if result is sentinel else [result] + + # When n>=size, it's faster to use sorted() + try: + size = len(iterable) + except (TypeError, AttributeError): + pass + else: + if n >= size: + return sorted(iterable, key=key)[:n] + + # When key is none, use simpler decoration + if key is None: + it = iter(iterable) + # put the range(n) first so that zip() doesn't + # consume one too many elements from the iterator + result = [(elem, i) for i, elem in zip(range(n), it)] + if not result: + return result + _heapify_max(result) + top = result[0][0] + order = n + _heapreplace = _heapreplace_max + for elem in it: + if elem < top: + _heapreplace(result, (elem, order)) + top, _order = result[0] + order += 1 + result.sort() + return [elem for (elem, order) in result] + + # General case, slowest method + it = iter(iterable) + result = [(key(elem), i, elem) for i, elem in zip(range(n), it)] + if not result: + return result + _heapify_max(result) + top = result[0][0] + order = n + _heapreplace = _heapreplace_max + for elem in it: + k = key(elem) + if k < top: + _heapreplace(result, (k, order, elem)) + top, _order, _elem = result[0] + order += 1 + result.sort() + return [elem for (k, order, elem) in result] + +def nlargest(n, iterable, key=None): + """Find the n largest elements in a dataset. + + Equivalent to: sorted(iterable, key=key, reverse=True)[:n] + """ + + # Short-cut for n==1 is to use max() + if n == 1: + it = iter(iterable) + sentinel = object() + result = max(it, default=sentinel, key=key) + return [] if result is sentinel else [result] + + # When n>=size, it's faster to use sorted() + try: + size = len(iterable) + except (TypeError, AttributeError): + pass + else: + if n >= size: + return sorted(iterable, key=key, reverse=True)[:n] + + # When key is none, use simpler decoration + if key is None: + it = iter(iterable) + result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)] + if not result: + return result + heapify(result) + top = result[0][0] + order = -n + _heapreplace = heapreplace + for elem in it: + if top < elem: + _heapreplace(result, (elem, order)) + top, _order = result[0] + order -= 1 + result.sort(reverse=True) + return [elem for (elem, order) in result] + + # General case, slowest method + it = iter(iterable) + result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)] + if not result: + return result + heapify(result) + top = result[0][0] + order = -n + _heapreplace = heapreplace + for elem in it: + k = key(elem) + if top < k: + _heapreplace(result, (k, order, elem)) + top, _order, _elem = result[0] + order -= 1 + result.sort(reverse=True) + return [elem for (k, order, elem) in result] + +# If available, use C implementation +try: + from _heapq import * +except ImportError: + pass +try: + from _heapq import _heapreplace_max +except ImportError: + pass +try: + from _heapq import _heapify_max +except ImportError: + pass +try: + from _heapq import _heappop_max +except ImportError: + pass + + +if __name__ == "__main__": + + import doctest # pragma: no cover + print(doctest.testmod()) # pragma: no cover \ No newline at end of file From d93f918c57a1eacabd2bfac405bcbc33620b35a3 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 30 Nov 2019 21:51:54 -0500 Subject: [PATCH 02/17] fixed spacing --- Lib/heapq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 7fd2d865dd34cd..a49c92ff1b43d7 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -713,4 +713,4 @@ def nlargest(n, iterable, key=None): if __name__ == "__main__": import doctest # pragma: no cover - print(doctest.testmod()) # pragma: no cover \ No newline at end of file + print(doctest.testmod()) # pragma: no cover From 6b286a39661b3241e33ffd73deae19ac7f93cf1f Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 30 Nov 2019 22:24:41 -0500 Subject: [PATCH 03/17] oops --- Lib/heapq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index a49c92ff1b43d7..7fd2d865dd34cd 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -713,4 +713,4 @@ def nlargest(n, iterable, key=None): if __name__ == "__main__": import doctest # pragma: no cover - print(doctest.testmod()) # pragma: no cover + print(doctest.testmod()) # pragma: no cover \ No newline at end of file From 31989bfd99ffd4498c4b5d92cce5f52c5ebc5c11 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 30 Nov 2019 22:26:11 -0500 Subject: [PATCH 04/17] oops --- Lib/heapq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 7fd2d865dd34cd..a49c92ff1b43d7 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -713,4 +713,4 @@ def nlargest(n, iterable, key=None): if __name__ == "__main__": import doctest # pragma: no cover - print(doctest.testmod()) # pragma: no cover \ No newline at end of file + print(doctest.testmod()) # pragma: no cover From abd79b46a6e5411cd5a51b576ca41b4b5e670f14 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 30 Nov 2019 22:29:51 -0500 Subject: [PATCH 05/17] oops --- Lib/heapq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index a49c92ff1b43d7..7fd2d865dd34cd 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -713,4 +713,4 @@ def nlargest(n, iterable, key=None): if __name__ == "__main__": import doctest # pragma: no cover - print(doctest.testmod()) # pragma: no cover + print(doctest.testmod()) # pragma: no cover \ No newline at end of file From 3c0a90263888609c179d44f592d7187d0f867d7d Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 30 Nov 2019 22:30:11 -0500 Subject: [PATCH 06/17] Revert "heapq.merge special case for few iterables" This reverts commit 8cd419ee231ecbcdee1d81580c16cb53c6774eb5. --- Lib/heapq.py | 1317 +++++++++++++++++++++++--------------------------- 1 file changed, 601 insertions(+), 716 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 7fd2d865dd34cd..fabefd87f8bf8c 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -1,716 +1,601 @@ -"""Heap queue algorithm (a.k.a. priority queue). - -Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for -all k, counting elements from 0. For the sake of comparison, -non-existing elements are considered to be infinite. The interesting -property of a heap is that a[0] is always its smallest element. - -Usage: - -heap = [] # creates an empty heap -heappush(heap, item) # pushes a new item on the heap -item = heappop(heap) # pops the smallest item from the heap -item = heap[0] # smallest item on the heap without popping it -heapify(x) # transforms list into a heap, in-place, in linear time -item = heapreplace(heap, item) # pops and returns smallest item, and adds - # new item; the heap size is unchanged - -Our API differs from textbook heap algorithms as follows: - -- We use 0-based indexing. This makes the relationship between the - index for a node and the indexes for its children slightly less - obvious, but is more suitable since Python uses 0-based indexing. - -- Our heappop() method returns the smallest item, not the largest. - -These two make it possible to view the heap as a regular Python list -without surprises: heap[0] is the smallest item, and heap.sort() -maintains the heap invariant! -""" - -# Original code by Kevin O'Connor, augmented by Tim Peters and Raymond Hettinger - -__about__ = """Heap queues - -[explanation by François Pinard] - -Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for -all k, counting elements from 0. For the sake of comparison, -non-existing elements are considered to be infinite. The interesting -property of a heap is that a[0] is always its smallest element. - -The strange invariant above is meant to be an efficient memory -representation for a tournament. The numbers below are `k', not a[k]: - - 0 - - 1 2 - - 3 4 5 6 - - 7 8 9 10 11 12 13 14 - - 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 - - -In the tree above, each cell `k' is topping `2*k+1' and `2*k+2'. In -a usual binary tournament we see in sports, each cell is the winner -over the two cells it tops, and we can trace the winner down the tree -to see all opponents s/he had. However, in many computer applications -of such tournaments, we do not need to trace the history of a winner. -To be more memory efficient, when a winner is promoted, we try to -replace it by something else at a lower level, and the rule becomes -that a cell and the two cells it tops contain three different items, -but the top cell "wins" over the two topped cells. - -If this heap invariant is protected at all time, index 0 is clearly -the overall winner. The simplest algorithmic way to remove it and -find the "next" winner is to move some loser (let's say cell 30 in the -diagram above) into the 0 position, and then percolate this new 0 down -the tree, exchanging values, until the invariant is re-established. -This is clearly logarithmic on the total number of items in the tree. -By iterating over all items, you get an O(n ln n) sort. - -A nice feature of this sort is that you can efficiently insert new -items while the sort is going on, provided that the inserted items are -not "better" than the last 0'th element you extracted. This is -especially useful in simulation contexts, where the tree holds all -incoming events, and the "win" condition means the smallest scheduled -time. When an event schedule other events for execution, they are -scheduled into the future, so they can easily go into the heap. So, a -heap is a good structure for implementing schedulers (this is what I -used for my MIDI sequencer :-). - -Various structures for implementing schedulers have been extensively -studied, and heaps are good for this, as they are reasonably speedy, -the speed is almost constant, and the worst case is not much different -than the average case. However, there are other representations which -are more efficient overall, yet the worst cases might be terrible. - -Heaps are also very useful in big disk sorts. You most probably all -know that a big sort implies producing "runs" (which are pre-sorted -sequences, which size is usually related to the amount of CPU memory), -followed by a merging passes for these runs, which merging is often -very cleverly organised[1]. It is very important that the initial -sort produces the longest runs possible. Tournaments are a good way -to that. If, using all the memory available to hold a tournament, you -replace and percolate items that happen to fit the current run, you'll -produce runs which are twice the size of the memory for random input, -and much better for input fuzzily ordered. - -Moreover, if you output the 0'th item on disk and get an input which -may not fit in the current tournament (because the value "wins" over -the last output value), it cannot fit in the heap, so the size of the -heap decreases. The freed memory could be cleverly reused immediately -for progressively building a second heap, which grows at exactly the -same rate the first heap is melting. When the first heap completely -vanishes, you switch heaps and start a new run. Clever and quite -effective! - -In a word, heaps are useful memory structures to know. I use them in -a few applications, and I think it is good to keep a `heap' module -around. :-) - --------------------- -[1] The disk balancing algorithms which are current, nowadays, are -more annoying than clever, and this is a consequence of the seeking -capabilities of the disks. On devices which cannot seek, like big -tape drives, the story was quite different, and one had to be very -clever to ensure (far in advance) that each tape movement will be the -most effective possible (that is, will best participate at -"progressing" the merge). Some tapes were even able to read -backwards, and this was also used to avoid the rewinding time. -Believe me, real good tape sorts were quite spectacular to watch! -From all times, sorting has always been a Great Art! :-) -""" - -__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', - 'nlargest', 'nsmallest', 'heappushpop'] - -def heappush(heap, item): - """Push item onto heap, maintaining the heap invariant.""" - heap.append(item) - _siftdown(heap, 0, len(heap)-1) - -def heappop(heap): - """Pop the smallest item off the heap, maintaining the heap invariant.""" - lastelt = heap.pop() # raises appropriate IndexError if heap is empty - if heap: - returnitem = heap[0] - heap[0] = lastelt - _siftup(heap, 0) - return returnitem - return lastelt - -def heapreplace(heap, item): - """Pop and return the current smallest value, and add the new item. - - This is more efficient than heappop() followed by heappush(), and can be - more appropriate when using a fixed-size heap. Note that the value - returned may be larger than item! That constrains reasonable uses of - this routine unless written as part of a conditional replacement: - - if item > heap[0]: - item = heapreplace(heap, item) - """ - returnitem = heap[0] # raises appropriate IndexError if heap is empty - heap[0] = item - _siftup(heap, 0) - return returnitem - -def heappushpop(heap, item): - """Fast version of a heappush followed by a heappop.""" - if heap and heap[0] < item: - item, heap[0] = heap[0], item - _siftup(heap, 0) - return item - -def heapify(x): - """Transform list into a heap, in-place, in O(len(x)) time.""" - n = len(x) - # Transform bottom-up. The largest index there's any point to looking at - # is the largest with a child index in-range, so must have 2*i + 1 < n, - # or i < (n-1)/2. If n is even = 2*j, this is (2*j-1)/2 = j-1/2 so - # j-1 is the largest, which is n//2 - 1. If n is odd = 2*j+1, this is - # (2*j+1-1)/2 = j so j-1 is the largest, and that's again n//2-1. - for i in reversed(range(n//2)): - _siftup(x, i) - -def _heappop_max(heap): - """Maxheap version of a heappop.""" - lastelt = heap.pop() # raises appropriate IndexError if heap is empty - if heap: - returnitem = heap[0] - heap[0] = lastelt - _siftup_max(heap, 0) - return returnitem - return lastelt - -def _heapreplace_max(heap, item): - """Maxheap version of a heappop followed by a heappush.""" - returnitem = heap[0] # raises appropriate IndexError if heap is empty - heap[0] = item - _siftup_max(heap, 0) - return returnitem - -def _heapify_max(x): - """Transform list into a maxheap, in-place, in O(len(x)) time.""" - n = len(x) - for i in reversed(range(n//2)): - _siftup_max(x, i) - -# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos -# is the index of a leaf with a possibly out-of-order value. Restore the -# heap invariant. -def _siftdown(heap, startpos, pos): - newitem = heap[pos] - # Follow the path to the root, moving parents down until finding a place - # newitem fits. - while pos > startpos: - parentpos = (pos - 1) >> 1 - parent = heap[parentpos] - if newitem < parent: - heap[pos] = parent - pos = parentpos - continue - break - heap[pos] = newitem - -# The child indices of heap index pos are already heaps, and we want to make -# a heap at index pos too. We do this by bubbling the smaller child of -# pos up (and so on with that child's children, etc) until hitting a leaf, -# then using _siftdown to move the oddball originally at index pos into place. -# -# We *could* break out of the loop as soon as we find a pos where newitem <= -# both its children, but turns out that's not a good idea, and despite that -# many books write the algorithm that way. During a heap pop, the last array -# element is sifted in, and that tends to be large, so that comparing it -# against values starting from the root usually doesn't pay (= usually doesn't -# get us out of the loop early). See Knuth, Volume 3, where this is -# explained and quantified in an exercise. -# -# Cutting the # of comparisons is important, since these routines have no -# way to extract "the priority" from an array element, so that intelligence -# is likely to be hiding in custom comparison methods, or in array elements -# storing (priority, record) tuples. Comparisons are thus potentially -# expensive. -# -# On random arrays of length 1000, making this change cut the number of -# comparisons made by heapify() a little, and those made by exhaustive -# heappop() a lot, in accord with theory. Here are typical results from 3 -# runs (3 just to demonstrate how small the variance is): -# -# Compares needed by heapify Compares needed by 1000 heappops -# -------------------------- -------------------------------- -# 1837 cut to 1663 14996 cut to 8680 -# 1855 cut to 1659 14966 cut to 8678 -# 1847 cut to 1660 15024 cut to 8703 -# -# Building the heap by using heappush() 1000 times instead required -# 2198, 2148, and 2219 compares: heapify() is more efficient, when -# you can use it. -# -# The total compares needed by list.sort() on the same lists were 8627, -# 8627, and 8632 (this should be compared to the sum of heapify() and -# heappop() compares): list.sort() is (unsurprisingly!) more efficient -# for sorting. - -def _siftup(heap, pos): - endpos = len(heap) - startpos = pos - newitem = heap[pos] - # Bubble up the smaller child until hitting a leaf. - childpos = 2*pos + 1 # leftmost child position - while childpos < endpos: - # Set childpos to index of smaller child. - rightpos = childpos + 1 - if rightpos < endpos and not heap[childpos] < heap[rightpos]: - childpos = rightpos - # Move the smaller child up. - heap[pos] = heap[childpos] - pos = childpos - childpos = 2*pos + 1 - # The leaf at pos is empty now. Put newitem there, and bubble it up - # to its final resting place (by sifting its parents down). - heap[pos] = newitem - _siftdown(heap, startpos, pos) - -def _siftdown_max(heap, startpos, pos): - 'Maxheap variant of _siftdown' - newitem = heap[pos] - # Follow the path to the root, moving parents down until finding a place - # newitem fits. - while pos > startpos: - parentpos = (pos - 1) >> 1 - parent = heap[parentpos] - if parent < newitem: - heap[pos] = parent - pos = parentpos - continue - break - heap[pos] = newitem - -def _siftup_max(heap, pos): - 'Maxheap variant of _siftup' - endpos = len(heap) - startpos = pos - newitem = heap[pos] - # Bubble up the larger child until hitting a leaf. - childpos = 2*pos + 1 # leftmost child position - while childpos < endpos: - # Set childpos to index of larger child. - rightpos = childpos + 1 - if rightpos < endpos and not heap[rightpos] < heap[childpos]: - childpos = rightpos - # Move the larger child up. - heap[pos] = heap[childpos] - pos = childpos - childpos = 2*pos + 1 - # The leaf at pos is empty now. Put newitem there, and bubble it up - # to its final resting place (by sifting its parents down). - heap[pos] = newitem - _siftdown_max(heap, startpos, pos) - -def merge(*iterables, key=None, reverse=False): - '''Merge multiple sorted inputs into a single sorted output. - - Similar to sorted(itertools.chain(*iterables)) but returns a generator, - does not pull the data into memory all at once, and assumes that each of - the input streams is already sorted (smallest to largest). - - >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) - [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] - - If *key* is not None, applies a key function to each element to determine - its sort order. - - >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len)) - ['dog', 'cat', 'fish', 'horse', 'kangaroo'] - - ''' - n = len(iterables) - if n == 0: - return - if n == 1: - yield from iterables[0] - return - if n == 2: - # special case for two iterables - a_iter = iter(iterables[0]) - b_iter = iter(iterables[1]) - - try: - a = next(a_iter) - except StopIteration: - yield from b_iter - return - try: - b = next(b_iter) - except StopIteration: - yield a - yield from a_iter - return - - if key is None: - if not reverse: - # no key, forward - while True: - if a <= b: - yield a - try: - a = next(a_iter) - except StopIteration: - yield b - yield from b_iter - return - else: - yield b - try: - b = next(b_iter) - except StopIteration: - yield a - yield from a_iter - return - else: - # no key, reverse - while True: - if a >= b: - yield a - try: - a = next(a_iter) - except StopIteration: - yield b - yield from b_iter - return - else: - yield b - try: - b = next(b_iter) - except StopIteration: - yield a - yield from a_iter - return - else: - ka = key(a) - kb = key(b) - if not reverse: - # using a key, forward - while True: - if ka <= kb: - yield a - try: - a = next(a_iter) - except StopIteration: - yield b - yield from b_iter - return - ka = key(a) - else: - yield b - try: - b = next(b_iter) - except StopIteration: - yield a - yield from a_iter - return - kb = key(b) - else: - # using a key, reverse - while True: - if ka >= kb: - yield a - try: - a = next(a_iter) - except StopIteration: - yield b - yield from b_iter - return - ka = key(a) - else: - yield b - try: - b = next(b_iter) - except StopIteration: - yield a - yield from a_iter - return - kb = key(b) - if n <= 6: - # Use the 2-case recursively for small numbers of iterables - n_2 = n // 2 - result_1 = merge(*iterables[:n_2], key=key, reverse=reverse) - result_2 = merge(*iterables[n_2:], key=key, reverse=reverse) - yield from merge(result_1, result_2, key=key, reverse=reverse) - return - - # For larger numbers, store the front of each iterable in a heap. - h = [] - h_append = h.append - - if reverse: - _heapify = _heapify_max - _heappop = _heappop_max - _heapreplace = _heapreplace_max - direction = -1 - else: - _heapify = heapify - _heappop = heappop - _heapreplace = heapreplace - direction = 1 - - if key is None: - for order, it in enumerate(map(iter, iterables)): - try: - next_func = it.__next__ - h_append([next_func(), order * direction, next_func]) - except StopIteration: - pass - _heapify(h) - while len(h) > 1: - try: - while True: - value, order, next_func = s = h[0] - yield value - s[0] = next_func() # raises StopIteration when exhausted - _heapreplace(h, s) # restore heap condition - except StopIteration: - _heappop(h) # remove empty iterator - if h: - # fast case when only a single iterator remains - value, order, next_func = h[0] - yield value - yield from next_func.__self__ - return - - for order, it in enumerate(map(iter, iterables)): - try: - next_func = it.__next__ - value = next_func() - h_append([key(value), order * direction, value, next_func]) - except StopIteration: - pass - _heapify(h) - while len(h) > 1: - try: - while True: - key_value, order, value, next_func = s = h[0] - yield value - value = next_func() - s[0] = key(value) - s[2] = value - _heapreplace(h, s) - except StopIteration: - _heappop(h) - if h: - key_value, order, value, next_func = h[0] - yield value - yield from next_func.__self__ - - -# Algorithm notes for nlargest() and nsmallest() -# ============================================== -# -# Make a single pass over the data while keeping the k most extreme values -# in a heap. Memory consumption is limited to keeping k values in a list. -# -# Measured performance for random inputs: -# -# number of comparisons -# n inputs k-extreme values (average of 5 trials) % more than min() -# ------------- ---------------- --------------------- ----------------- -# 1,000 100 3,317 231.7% -# 10,000 100 14,046 40.5% -# 100,000 100 105,749 5.7% -# 1,000,000 100 1,007,751 0.8% -# 10,000,000 100 10,009,401 0.1% -# -# Theoretical number of comparisons for k smallest of n random inputs: -# -# Step Comparisons Action -# ---- -------------------------- --------------------------- -# 1 1.66 * k heapify the first k-inputs -# 2 n - k compare remaining elements to top of heap -# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap -# 4 k * lg2(k) - (k/2) final sort of the k most extreme values -# -# Combining and simplifying for a rough estimate gives: -# -# comparisons = n + k * (log(k, 2) * log(n/k) + log(k, 2) + log(n/k)) -# -# Computing the number of comparisons for step 3: -# ----------------------------------------------- -# * For the i-th new value from the iterable, the probability of being in the -# k most extreme values is k/i. For example, the probability of the 101st -# value seen being in the 100 most extreme values is 100/101. -# * If the value is a new extreme value, the cost of inserting it into the -# heap is 1 + log(k, 2). -# * The probability times the cost gives: -# (k/i) * (1 + log(k, 2)) -# * Summing across the remaining n-k elements gives: -# sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1)) -# * This reduces to: -# (H(n) - H(k)) * k * (1 + log(k, 2)) -# * Where H(n) is the n-th harmonic number estimated by: -# gamma = 0.5772156649 -# H(n) = log(n, e) + gamma + 1 / (2 * n) -# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence -# * Substituting the H(n) formula: -# comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2) -# -# Worst-case for step 3: -# ---------------------- -# In the worst case, the input data is reversed sorted so that every new element -# must be inserted in the heap: -# -# comparisons = 1.66 * k + log(k, 2) * (n - k) -# -# Alternative Algorithms -# ---------------------- -# Other algorithms were not used because they: -# 1) Took much more auxiliary memory, -# 2) Made multiple passes over the data. -# 3) Made more comparisons in common cases (small k, large n, semi-random input). -# See the more detailed comparison of approach at: -# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest - -def nsmallest(n, iterable, key=None): - """Find the n smallest elements in a dataset. - - Equivalent to: sorted(iterable, key=key)[:n] - """ - - # Short-cut for n==1 is to use min() - if n == 1: - it = iter(iterable) - sentinel = object() - result = min(it, default=sentinel, key=key) - return [] if result is sentinel else [result] - - # When n>=size, it's faster to use sorted() - try: - size = len(iterable) - except (TypeError, AttributeError): - pass - else: - if n >= size: - return sorted(iterable, key=key)[:n] - - # When key is none, use simpler decoration - if key is None: - it = iter(iterable) - # put the range(n) first so that zip() doesn't - # consume one too many elements from the iterator - result = [(elem, i) for i, elem in zip(range(n), it)] - if not result: - return result - _heapify_max(result) - top = result[0][0] - order = n - _heapreplace = _heapreplace_max - for elem in it: - if elem < top: - _heapreplace(result, (elem, order)) - top, _order = result[0] - order += 1 - result.sort() - return [elem for (elem, order) in result] - - # General case, slowest method - it = iter(iterable) - result = [(key(elem), i, elem) for i, elem in zip(range(n), it)] - if not result: - return result - _heapify_max(result) - top = result[0][0] - order = n - _heapreplace = _heapreplace_max - for elem in it: - k = key(elem) - if k < top: - _heapreplace(result, (k, order, elem)) - top, _order, _elem = result[0] - order += 1 - result.sort() - return [elem for (k, order, elem) in result] - -def nlargest(n, iterable, key=None): - """Find the n largest elements in a dataset. - - Equivalent to: sorted(iterable, key=key, reverse=True)[:n] - """ - - # Short-cut for n==1 is to use max() - if n == 1: - it = iter(iterable) - sentinel = object() - result = max(it, default=sentinel, key=key) - return [] if result is sentinel else [result] - - # When n>=size, it's faster to use sorted() - try: - size = len(iterable) - except (TypeError, AttributeError): - pass - else: - if n >= size: - return sorted(iterable, key=key, reverse=True)[:n] - - # When key is none, use simpler decoration - if key is None: - it = iter(iterable) - result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)] - if not result: - return result - heapify(result) - top = result[0][0] - order = -n - _heapreplace = heapreplace - for elem in it: - if top < elem: - _heapreplace(result, (elem, order)) - top, _order = result[0] - order -= 1 - result.sort(reverse=True) - return [elem for (elem, order) in result] - - # General case, slowest method - it = iter(iterable) - result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)] - if not result: - return result - heapify(result) - top = result[0][0] - order = -n - _heapreplace = heapreplace - for elem in it: - k = key(elem) - if top < k: - _heapreplace(result, (k, order, elem)) - top, _order, _elem = result[0] - order -= 1 - result.sort(reverse=True) - return [elem for (k, order, elem) in result] - -# If available, use C implementation -try: - from _heapq import * -except ImportError: - pass -try: - from _heapq import _heapreplace_max -except ImportError: - pass -try: - from _heapq import _heapify_max -except ImportError: - pass -try: - from _heapq import _heappop_max -except ImportError: - pass - - -if __name__ == "__main__": - - import doctest # pragma: no cover - print(doctest.testmod()) # pragma: no cover \ No newline at end of file +"""Heap queue algorithm (a.k.a. priority queue). + +Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for +all k, counting elements from 0. For the sake of comparison, +non-existing elements are considered to be infinite. The interesting +property of a heap is that a[0] is always its smallest element. + +Usage: + +heap = [] # creates an empty heap +heappush(heap, item) # pushes a new item on the heap +item = heappop(heap) # pops the smallest item from the heap +item = heap[0] # smallest item on the heap without popping it +heapify(x) # transforms list into a heap, in-place, in linear time +item = heapreplace(heap, item) # pops and returns smallest item, and adds + # new item; the heap size is unchanged + +Our API differs from textbook heap algorithms as follows: + +- We use 0-based indexing. This makes the relationship between the + index for a node and the indexes for its children slightly less + obvious, but is more suitable since Python uses 0-based indexing. + +- Our heappop() method returns the smallest item, not the largest. + +These two make it possible to view the heap as a regular Python list +without surprises: heap[0] is the smallest item, and heap.sort() +maintains the heap invariant! +""" + +# Original code by Kevin O'Connor, augmented by Tim Peters and Raymond Hettinger + +__about__ = """Heap queues + +[explanation by François Pinard] + +Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for +all k, counting elements from 0. For the sake of comparison, +non-existing elements are considered to be infinite. The interesting +property of a heap is that a[0] is always its smallest element. + +The strange invariant above is meant to be an efficient memory +representation for a tournament. The numbers below are `k', not a[k]: + + 0 + + 1 2 + + 3 4 5 6 + + 7 8 9 10 11 12 13 14 + + 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 + + +In the tree above, each cell `k' is topping `2*k+1' and `2*k+2'. In +a usual binary tournament we see in sports, each cell is the winner +over the two cells it tops, and we can trace the winner down the tree +to see all opponents s/he had. However, in many computer applications +of such tournaments, we do not need to trace the history of a winner. +To be more memory efficient, when a winner is promoted, we try to +replace it by something else at a lower level, and the rule becomes +that a cell and the two cells it tops contain three different items, +but the top cell "wins" over the two topped cells. + +If this heap invariant is protected at all time, index 0 is clearly +the overall winner. The simplest algorithmic way to remove it and +find the "next" winner is to move some loser (let's say cell 30 in the +diagram above) into the 0 position, and then percolate this new 0 down +the tree, exchanging values, until the invariant is re-established. +This is clearly logarithmic on the total number of items in the tree. +By iterating over all items, you get an O(n ln n) sort. + +A nice feature of this sort is that you can efficiently insert new +items while the sort is going on, provided that the inserted items are +not "better" than the last 0'th element you extracted. This is +especially useful in simulation contexts, where the tree holds all +incoming events, and the "win" condition means the smallest scheduled +time. When an event schedule other events for execution, they are +scheduled into the future, so they can easily go into the heap. So, a +heap is a good structure for implementing schedulers (this is what I +used for my MIDI sequencer :-). + +Various structures for implementing schedulers have been extensively +studied, and heaps are good for this, as they are reasonably speedy, +the speed is almost constant, and the worst case is not much different +than the average case. However, there are other representations which +are more efficient overall, yet the worst cases might be terrible. + +Heaps are also very useful in big disk sorts. You most probably all +know that a big sort implies producing "runs" (which are pre-sorted +sequences, which size is usually related to the amount of CPU memory), +followed by a merging passes for these runs, which merging is often +very cleverly organised[1]. It is very important that the initial +sort produces the longest runs possible. Tournaments are a good way +to that. If, using all the memory available to hold a tournament, you +replace and percolate items that happen to fit the current run, you'll +produce runs which are twice the size of the memory for random input, +and much better for input fuzzily ordered. + +Moreover, if you output the 0'th item on disk and get an input which +may not fit in the current tournament (because the value "wins" over +the last output value), it cannot fit in the heap, so the size of the +heap decreases. The freed memory could be cleverly reused immediately +for progressively building a second heap, which grows at exactly the +same rate the first heap is melting. When the first heap completely +vanishes, you switch heaps and start a new run. Clever and quite +effective! + +In a word, heaps are useful memory structures to know. I use them in +a few applications, and I think it is good to keep a `heap' module +around. :-) + +-------------------- +[1] The disk balancing algorithms which are current, nowadays, are +more annoying than clever, and this is a consequence of the seeking +capabilities of the disks. On devices which cannot seek, like big +tape drives, the story was quite different, and one had to be very +clever to ensure (far in advance) that each tape movement will be the +most effective possible (that is, will best participate at +"progressing" the merge). Some tapes were even able to read +backwards, and this was also used to avoid the rewinding time. +Believe me, real good tape sorts were quite spectacular to watch! +From all times, sorting has always been a Great Art! :-) +""" + +__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', + 'nlargest', 'nsmallest', 'heappushpop'] + +def heappush(heap, item): + """Push item onto heap, maintaining the heap invariant.""" + heap.append(item) + _siftdown(heap, 0, len(heap)-1) + +def heappop(heap): + """Pop the smallest item off the heap, maintaining the heap invariant.""" + lastelt = heap.pop() # raises appropriate IndexError if heap is empty + if heap: + returnitem = heap[0] + heap[0] = lastelt + _siftup(heap, 0) + return returnitem + return lastelt + +def heapreplace(heap, item): + """Pop and return the current smallest value, and add the new item. + + This is more efficient than heappop() followed by heappush(), and can be + more appropriate when using a fixed-size heap. Note that the value + returned may be larger than item! That constrains reasonable uses of + this routine unless written as part of a conditional replacement: + + if item > heap[0]: + item = heapreplace(heap, item) + """ + returnitem = heap[0] # raises appropriate IndexError if heap is empty + heap[0] = item + _siftup(heap, 0) + return returnitem + +def heappushpop(heap, item): + """Fast version of a heappush followed by a heappop.""" + if heap and heap[0] < item: + item, heap[0] = heap[0], item + _siftup(heap, 0) + return item + +def heapify(x): + """Transform list into a heap, in-place, in O(len(x)) time.""" + n = len(x) + # Transform bottom-up. The largest index there's any point to looking at + # is the largest with a child index in-range, so must have 2*i + 1 < n, + # or i < (n-1)/2. If n is even = 2*j, this is (2*j-1)/2 = j-1/2 so + # j-1 is the largest, which is n//2 - 1. If n is odd = 2*j+1, this is + # (2*j+1-1)/2 = j so j-1 is the largest, and that's again n//2-1. + for i in reversed(range(n//2)): + _siftup(x, i) + +def _heappop_max(heap): + """Maxheap version of a heappop.""" + lastelt = heap.pop() # raises appropriate IndexError if heap is empty + if heap: + returnitem = heap[0] + heap[0] = lastelt + _siftup_max(heap, 0) + return returnitem + return lastelt + +def _heapreplace_max(heap, item): + """Maxheap version of a heappop followed by a heappush.""" + returnitem = heap[0] # raises appropriate IndexError if heap is empty + heap[0] = item + _siftup_max(heap, 0) + return returnitem + +def _heapify_max(x): + """Transform list into a maxheap, in-place, in O(len(x)) time.""" + n = len(x) + for i in reversed(range(n//2)): + _siftup_max(x, i) + +# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos +# is the index of a leaf with a possibly out-of-order value. Restore the +# heap invariant. +def _siftdown(heap, startpos, pos): + newitem = heap[pos] + # Follow the path to the root, moving parents down until finding a place + # newitem fits. + while pos > startpos: + parentpos = (pos - 1) >> 1 + parent = heap[parentpos] + if newitem < parent: + heap[pos] = parent + pos = parentpos + continue + break + heap[pos] = newitem + +# The child indices of heap index pos are already heaps, and we want to make +# a heap at index pos too. We do this by bubbling the smaller child of +# pos up (and so on with that child's children, etc) until hitting a leaf, +# then using _siftdown to move the oddball originally at index pos into place. +# +# We *could* break out of the loop as soon as we find a pos where newitem <= +# both its children, but turns out that's not a good idea, and despite that +# many books write the algorithm that way. During a heap pop, the last array +# element is sifted in, and that tends to be large, so that comparing it +# against values starting from the root usually doesn't pay (= usually doesn't +# get us out of the loop early). See Knuth, Volume 3, where this is +# explained and quantified in an exercise. +# +# Cutting the # of comparisons is important, since these routines have no +# way to extract "the priority" from an array element, so that intelligence +# is likely to be hiding in custom comparison methods, or in array elements +# storing (priority, record) tuples. Comparisons are thus potentially +# expensive. +# +# On random arrays of length 1000, making this change cut the number of +# comparisons made by heapify() a little, and those made by exhaustive +# heappop() a lot, in accord with theory. Here are typical results from 3 +# runs (3 just to demonstrate how small the variance is): +# +# Compares needed by heapify Compares needed by 1000 heappops +# -------------------------- -------------------------------- +# 1837 cut to 1663 14996 cut to 8680 +# 1855 cut to 1659 14966 cut to 8678 +# 1847 cut to 1660 15024 cut to 8703 +# +# Building the heap by using heappush() 1000 times instead required +# 2198, 2148, and 2219 compares: heapify() is more efficient, when +# you can use it. +# +# The total compares needed by list.sort() on the same lists were 8627, +# 8627, and 8632 (this should be compared to the sum of heapify() and +# heappop() compares): list.sort() is (unsurprisingly!) more efficient +# for sorting. + +def _siftup(heap, pos): + endpos = len(heap) + startpos = pos + newitem = heap[pos] + # Bubble up the smaller child until hitting a leaf. + childpos = 2*pos + 1 # leftmost child position + while childpos < endpos: + # Set childpos to index of smaller child. + rightpos = childpos + 1 + if rightpos < endpos and not heap[childpos] < heap[rightpos]: + childpos = rightpos + # Move the smaller child up. + heap[pos] = heap[childpos] + pos = childpos + childpos = 2*pos + 1 + # The leaf at pos is empty now. Put newitem there, and bubble it up + # to its final resting place (by sifting its parents down). + heap[pos] = newitem + _siftdown(heap, startpos, pos) + +def _siftdown_max(heap, startpos, pos): + 'Maxheap variant of _siftdown' + newitem = heap[pos] + # Follow the path to the root, moving parents down until finding a place + # newitem fits. + while pos > startpos: + parentpos = (pos - 1) >> 1 + parent = heap[parentpos] + if parent < newitem: + heap[pos] = parent + pos = parentpos + continue + break + heap[pos] = newitem + +def _siftup_max(heap, pos): + 'Maxheap variant of _siftup' + endpos = len(heap) + startpos = pos + newitem = heap[pos] + # Bubble up the larger child until hitting a leaf. + childpos = 2*pos + 1 # leftmost child position + while childpos < endpos: + # Set childpos to index of larger child. + rightpos = childpos + 1 + if rightpos < endpos and not heap[rightpos] < heap[childpos]: + childpos = rightpos + # Move the larger child up. + heap[pos] = heap[childpos] + pos = childpos + childpos = 2*pos + 1 + # The leaf at pos is empty now. Put newitem there, and bubble it up + # to its final resting place (by sifting its parents down). + heap[pos] = newitem + _siftdown_max(heap, startpos, pos) + +def merge(*iterables, key=None, reverse=False): + '''Merge multiple sorted inputs into a single sorted output. + + Similar to sorted(itertools.chain(*iterables)) but returns a generator, + does not pull the data into memory all at once, and assumes that each of + the input streams is already sorted (smallest to largest). + + >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) + [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] + + If *key* is not None, applies a key function to each element to determine + its sort order. + + >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len)) + ['dog', 'cat', 'fish', 'horse', 'kangaroo'] + + ''' + + h = [] + h_append = h.append + + if reverse: + _heapify = _heapify_max + _heappop = _heappop_max + _heapreplace = _heapreplace_max + direction = -1 + else: + _heapify = heapify + _heappop = heappop + _heapreplace = heapreplace + direction = 1 + + if key is None: + for order, it in enumerate(map(iter, iterables)): + try: + next = it.__next__ + h_append([next(), order * direction, next]) + except StopIteration: + pass + _heapify(h) + while len(h) > 1: + try: + while True: + value, order, next = s = h[0] + yield value + s[0] = next() # raises StopIteration when exhausted + _heapreplace(h, s) # restore heap condition + except StopIteration: + _heappop(h) # remove empty iterator + if h: + # fast case when only a single iterator remains + value, order, next = h[0] + yield value + yield from next.__self__ + return + + for order, it in enumerate(map(iter, iterables)): + try: + next = it.__next__ + value = next() + h_append([key(value), order * direction, value, next]) + except StopIteration: + pass + _heapify(h) + while len(h) > 1: + try: + while True: + key_value, order, value, next = s = h[0] + yield value + value = next() + s[0] = key(value) + s[2] = value + _heapreplace(h, s) + except StopIteration: + _heappop(h) + if h: + key_value, order, value, next = h[0] + yield value + yield from next.__self__ + + +# Algorithm notes for nlargest() and nsmallest() +# ============================================== +# +# Make a single pass over the data while keeping the k most extreme values +# in a heap. Memory consumption is limited to keeping k values in a list. +# +# Measured performance for random inputs: +# +# number of comparisons +# n inputs k-extreme values (average of 5 trials) % more than min() +# ------------- ---------------- --------------------- ----------------- +# 1,000 100 3,317 231.7% +# 10,000 100 14,046 40.5% +# 100,000 100 105,749 5.7% +# 1,000,000 100 1,007,751 0.8% +# 10,000,000 100 10,009,401 0.1% +# +# Theoretical number of comparisons for k smallest of n random inputs: +# +# Step Comparisons Action +# ---- -------------------------- --------------------------- +# 1 1.66 * k heapify the first k-inputs +# 2 n - k compare remaining elements to top of heap +# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap +# 4 k * lg2(k) - (k/2) final sort of the k most extreme values +# +# Combining and simplifying for a rough estimate gives: +# +# comparisons = n + k * (log(k, 2) * log(n/k) + log(k, 2) + log(n/k)) +# +# Computing the number of comparisons for step 3: +# ----------------------------------------------- +# * For the i-th new value from the iterable, the probability of being in the +# k most extreme values is k/i. For example, the probability of the 101st +# value seen being in the 100 most extreme values is 100/101. +# * If the value is a new extreme value, the cost of inserting it into the +# heap is 1 + log(k, 2). +# * The probability times the cost gives: +# (k/i) * (1 + log(k, 2)) +# * Summing across the remaining n-k elements gives: +# sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1)) +# * This reduces to: +# (H(n) - H(k)) * k * (1 + log(k, 2)) +# * Where H(n) is the n-th harmonic number estimated by: +# gamma = 0.5772156649 +# H(n) = log(n, e) + gamma + 1 / (2 * n) +# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence +# * Substituting the H(n) formula: +# comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2) +# +# Worst-case for step 3: +# ---------------------- +# In the worst case, the input data is reversed sorted so that every new element +# must be inserted in the heap: +# +# comparisons = 1.66 * k + log(k, 2) * (n - k) +# +# Alternative Algorithms +# ---------------------- +# Other algorithms were not used because they: +# 1) Took much more auxiliary memory, +# 2) Made multiple passes over the data. +# 3) Made more comparisons in common cases (small k, large n, semi-random input). +# See the more detailed comparison of approach at: +# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest + +def nsmallest(n, iterable, key=None): + """Find the n smallest elements in a dataset. + + Equivalent to: sorted(iterable, key=key)[:n] + """ + + # Short-cut for n==1 is to use min() + if n == 1: + it = iter(iterable) + sentinel = object() + result = min(it, default=sentinel, key=key) + return [] if result is sentinel else [result] + + # When n>=size, it's faster to use sorted() + try: + size = len(iterable) + except (TypeError, AttributeError): + pass + else: + if n >= size: + return sorted(iterable, key=key)[:n] + + # When key is none, use simpler decoration + if key is None: + it = iter(iterable) + # put the range(n) first so that zip() doesn't + # consume one too many elements from the iterator + result = [(elem, i) for i, elem in zip(range(n), it)] + if not result: + return result + _heapify_max(result) + top = result[0][0] + order = n + _heapreplace = _heapreplace_max + for elem in it: + if elem < top: + _heapreplace(result, (elem, order)) + top, _order = result[0] + order += 1 + result.sort() + return [elem for (elem, order) in result] + + # General case, slowest method + it = iter(iterable) + result = [(key(elem), i, elem) for i, elem in zip(range(n), it)] + if not result: + return result + _heapify_max(result) + top = result[0][0] + order = n + _heapreplace = _heapreplace_max + for elem in it: + k = key(elem) + if k < top: + _heapreplace(result, (k, order, elem)) + top, _order, _elem = result[0] + order += 1 + result.sort() + return [elem for (k, order, elem) in result] + +def nlargest(n, iterable, key=None): + """Find the n largest elements in a dataset. + + Equivalent to: sorted(iterable, key=key, reverse=True)[:n] + """ + + # Short-cut for n==1 is to use max() + if n == 1: + it = iter(iterable) + sentinel = object() + result = max(it, default=sentinel, key=key) + return [] if result is sentinel else [result] + + # When n>=size, it's faster to use sorted() + try: + size = len(iterable) + except (TypeError, AttributeError): + pass + else: + if n >= size: + return sorted(iterable, key=key, reverse=True)[:n] + + # When key is none, use simpler decoration + if key is None: + it = iter(iterable) + result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)] + if not result: + return result + heapify(result) + top = result[0][0] + order = -n + _heapreplace = heapreplace + for elem in it: + if top < elem: + _heapreplace(result, (elem, order)) + top, _order = result[0] + order -= 1 + result.sort(reverse=True) + return [elem for (elem, order) in result] + + # General case, slowest method + it = iter(iterable) + result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)] + if not result: + return result + heapify(result) + top = result[0][0] + order = -n + _heapreplace = heapreplace + for elem in it: + k = key(elem) + if top < k: + _heapreplace(result, (k, order, elem)) + top, _order, _elem = result[0] + order -= 1 + result.sort(reverse=True) + return [elem for (k, order, elem) in result] + +# If available, use C implementation +try: + from _heapq import * +except ImportError: + pass +try: + from _heapq import _heapreplace_max +except ImportError: + pass +try: + from _heapq import _heapify_max +except ImportError: + pass +try: + from _heapq import _heappop_max +except ImportError: + pass + + +if __name__ == "__main__": + + import doctest # pragma: no cover + print(doctest.testmod()) # pragma: no cover From 159f24567782b93ce85931cec9b96053c34e25de Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 30 Nov 2019 22:58:08 -0500 Subject: [PATCH 07/17] added special case for two iterables --- Lib/heapq.py | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/Lib/heapq.py b/Lib/heapq.py index fabefd87f8bf8c..7ec6771c5fe914 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -329,6 +329,116 @@ def merge(*iterables, key=None, reverse=False): ''' + n = len(iterables) + if n == 0: + return + if n == 1: + yield from iterables[0] + return + if n == 2: + # special case for two iterables + + a_iter = iter(iterables[0]) + b_iter = iter(iterables[1]) + next_a = a_iter.__next__ + next_b = b_iter.__next__ + + try: + a = next_a() + except StopIteration: + yield from b_iter + return + try: + b = next_b() + except StopIteration: + yield a + yield from a_iter + return + + if key is None: + if not reverse: + # no key, forward + while True: + if a <= b: + yield a + try: + a = next_a() + except StopIteration: + yield b + yield from b_iter + return + else: + yield b + try: + b = next_b() + except StopIteration: + yield a + yield from a_iter + return + else: + # no key, reverse + while True: + if a >= b: + yield a + try: + a = next_a() + except StopIteration: + yield b + yield from b_iter + return + else: + yield b + try: + b = next_b() + except StopIteration: + yield a + yield from a_iter + return + else: + ka = key(a) + kb = key(b) + if not reverse: + while True: + # using a key, forward + if ka <= kb: + yield a + try: + a = next_a() + except StopIteration: + yield b + yield from b_iter + return + ka = key(a) + else: + yield b + try: + b = next_b() + except StopIteration: + yield a + yield from a_iter + return + kb = key(b) + else: + # using a key, reverse + while True: + if ka <= kb: + yield a + try: + a = next_a() + except StopIteration: + yield b + yield from b_iter + return + ka = key(a) + else: + yield b + try: + b = next_b() + except StopIteration: + yield a + yield from a_iter + return + h = [] h_append = h.append From 40ccb5d91239e888c67e80feb361b078435daa5e Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sun, 1 Dec 2019 04:01:33 +0000 Subject: [PATCH 08/17] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../NEWS.d/next/Library/2019-12-01-04-01-32.bpo-38938.Mj9nKR.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2019-12-01-04-01-32.bpo-38938.Mj9nKR.rst diff --git a/Misc/NEWS.d/next/Library/2019-12-01-04-01-32.bpo-38938.Mj9nKR.rst b/Misc/NEWS.d/next/Library/2019-12-01-04-01-32.bpo-38938.Mj9nKR.rst new file mode 100644 index 00000000000000..88fbeec7e16759 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-12-01-04-01-32.bpo-38938.Mj9nKR.rst @@ -0,0 +1 @@ +For performance, added a special case to heapq.merge for two iterables. \ No newline at end of file From 9807e684b25a6d6d8c7cb630afac40a170eca2a5 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 1 Dec 2019 14:53:19 -0500 Subject: [PATCH 09/17] removed special cases for zero or one iterator --- Lib/heapq.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 7ec6771c5fe914..296102189b837f 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -329,14 +329,8 @@ def merge(*iterables, key=None, reverse=False): ''' - n = len(iterables) - if n == 0: - return - if n == 1: - yield from iterables[0] - return - if n == 2: - # special case for two iterables + if len(iterables) == 2: + # special case for performance a_iter = iter(iterables[0]) b_iter = iter(iterables[1]) From de8fb05d453cf6c92a6f0a26f0fa76fb0453fae3 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney Date: Sun, 1 Dec 2019 21:56:47 -0500 Subject: [PATCH 10/17] fixed missing key calculation. Now using only < --- Lib/heapq.py | 55 ++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 296102189b837f..6d881e67b10e76 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -353,7 +353,15 @@ def merge(*iterables, key=None, reverse=False): if not reverse: # no key, forward while True: - if a <= b: + if b < a: + yield b + try: + b = next_b() + except StopIteration: + yield a + yield from a_iter + return + else: yield a try: a = next_a() @@ -361,7 +369,10 @@ def merge(*iterables, key=None, reverse=False): yield b yield from b_iter return - else: + else: + # no key, reverse + while True: + if a < b: yield b try: b = next_b() @@ -369,10 +380,7 @@ def merge(*iterables, key=None, reverse=False): yield a yield from a_iter return - else: - # no key, reverse - while True: - if a >= b: + else: yield a try: a = next_a() @@ -380,7 +388,13 @@ def merge(*iterables, key=None, reverse=False): yield b yield from b_iter return - else: + else: + ka = key(a) + kb = key(b) + if not reverse: + while True: + # using a key, forward + if kb < ka: yield b try: b = next_b() @@ -388,13 +402,8 @@ def merge(*iterables, key=None, reverse=False): yield a yield from a_iter return - else: - ka = key(a) - kb = key(b) - if not reverse: - while True: - # using a key, forward - if ka <= kb: + kb = key(b) + else: yield a try: a = next_a() @@ -403,7 +412,10 @@ def merge(*iterables, key=None, reverse=False): yield from b_iter return ka = key(a) - else: + else: + # using a key, reverse + while True: + if ka < kb: yield b try: b = next_b() @@ -412,10 +424,7 @@ def merge(*iterables, key=None, reverse=False): yield from a_iter return kb = key(b) - else: - # using a key, reverse - while True: - if ka <= kb: + else: yield a try: a = next_a() @@ -424,14 +433,6 @@ def merge(*iterables, key=None, reverse=False): yield from b_iter return ka = key(a) - else: - yield b - try: - b = next_b() - except StopIteration: - yield a - yield from a_iter - return h = [] h_append = h.append From 02ede9eaac879863e8c8fbb4acecf587c95a93eb Mon Sep 17 00:00:00 2001 From: Dennis Sweeney Date: Sun, 1 Dec 2019 22:08:14 -0500 Subject: [PATCH 11/17] removed negation of reverse --- Lib/heapq.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 6d881e67b10e76..c5fc1de4905f19 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -350,10 +350,10 @@ def merge(*iterables, key=None, reverse=False): return if key is None: - if not reverse: - # no key, forward + if reverse: + # no key, reverse while True: - if b < a: + if a < b: yield b try: b = next_b() @@ -370,9 +370,9 @@ def merge(*iterables, key=None, reverse=False): yield from b_iter return else: - # no key, reverse + # no key, forward while True: - if a < b: + if b < a: yield b try: b = next_b() @@ -391,10 +391,10 @@ def merge(*iterables, key=None, reverse=False): else: ka = key(a) kb = key(b) - if not reverse: + if reverse: while True: - # using a key, forward - if kb < ka: + # using a key, reverse + if ka < kb: yield b try: b = next_b() @@ -413,9 +413,9 @@ def merge(*iterables, key=None, reverse=False): return ka = key(a) else: - # using a key, reverse + # using a key, forward while True: - if ka < kb: + if kb < ka: yield b try: b = next_b() From 208d0d96c59d5344a194fe9f51c5bda2308bbc64 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney Date: Sun, 1 Dec 2019 22:15:05 -0500 Subject: [PATCH 12/17] made comment placement consistent --- Lib/heapq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index c5fc1de4905f19..78dd527c979e3e 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -392,8 +392,8 @@ def merge(*iterables, key=None, reverse=False): ka = key(a) kb = key(b) if reverse: + # using a key, reverse while True: - # using a key, reverse if ka < kb: yield b try: From c373a76e75c16bec890138742c71311e8309a767 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney Date: Mon, 2 Dec 2019 13:32:33 -0500 Subject: [PATCH 13/17] changed test_merge case to check many different numbers of iterables --- Lib/test/test_heapq.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/Lib/test/test_heapq.py b/Lib/test/test_heapq.py index 861ba7540df2bc..bf786e37fd0b29 100644 --- a/Lib/test/test_heapq.py +++ b/Lib/test/test_heapq.py @@ -176,22 +176,23 @@ def test_heapsort(self): self.assertEqual(heap_sorted, sorted(data)) def test_merge(self): - inputs = [] - for i in range(random.randrange(25)): - row = [] - for j in range(random.randrange(100)): - tup = random.choice('ABC'), random.randrange(-500, 500) - row.append(tup) - inputs.append(row) - - for key in [None, itemgetter(0), itemgetter(1), itemgetter(1, 0)]: - for reverse in [False, True]: - seqs = [] - for seq in inputs: - seqs.append(sorted(seq, key=key, reverse=reverse)) - self.assertEqual(sorted(chain(*inputs), key=key, reverse=reverse), - list(self.module.merge(*seqs, key=key, reverse=reverse))) - self.assertEqual(list(self.module.merge()), []) + for n in range(26): + inputs = [] + for i in range(n): + row = [] + for j in range(random.randrange(100)): + tup = random.choice('ABC'), random.randrange(-500, 500) + row.append(tup) + inputs.append(row) + + for key in [None, itemgetter(0), itemgetter(1), itemgetter(1, 0)]: + for reverse in [False, True]: + seqs = [] + for seq in inputs: + seqs.append(sorted(seq, key=key, reverse=reverse)) + self.assertEqual(sorted(chain(*inputs), key=key, reverse=reverse), + list(self.module.merge(*seqs, key=key, reverse=reverse))) + self.assertEqual(list(self.module.merge()), []) def test_empty_merges(self): # Merging two empty lists (with or without a key) should produce From 937d3138f806c8a41b1b13356422a9d83742e863 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney Date: Wed, 4 Dec 2019 04:35:47 -0500 Subject: [PATCH 14/17] committed to the recursive binary tree approach --- Lib/heapq.py | 94 ++++++++++++++++++---------------------------------- 1 file changed, 32 insertions(+), 62 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 78dd527c979e3e..445588df817f7d 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -311,6 +311,12 @@ def _siftup_max(heap, pos): heap[pos] = newitem _siftdown_max(heap, startpos, pos) +def _key_from_key_val(key_val): + return key_val[0] + +def _val_from_key_val(key_val): + return key_val[1] + def merge(*iterables, key=None, reverse=False): '''Merge multiple sorted inputs into a single sorted output. @@ -328,8 +334,13 @@ def merge(*iterables, key=None, reverse=False): ['dog', 'cat', 'fish', 'horse', 'kangaroo'] ''' - - if len(iterables) == 2: + n = len(iterables) + if n == 0: + return + if n == 1: + yield from iterables[0] + return + if n == 2: # special case for performance a_iter = iter(iterables[0]) @@ -434,67 +445,26 @@ def merge(*iterables, key=None, reverse=False): return ka = key(a) - h = [] - h_append = h.append - - if reverse: - _heapify = _heapify_max - _heappop = _heappop_max - _heapreplace = _heapreplace_max - direction = -1 + # For more iterables, reduce recursively using the 2-iterable case. + n2 = n // 2 + if key is None or key is _key_from_key_val: + # If there is no key, make the recursive calls not have keys. + # OTOH, If the key is _key_from_key_val, then this is a recursive call, + # so the keys are already computed and so we should continue to use + # _key_from_key_val as the key for the deeper recursive calls. + left_half = merge(*iterables[:n2], key=key, reverse=reverse) + right_half = merge(*iterables[n2:], key=key, reverse=reverse) + yield from merge(left_half, right_half, key=key, reverse=reverse) else: - _heapify = heapify - _heappop = heappop - _heapreplace = heapreplace - direction = 1 - - if key is None: - for order, it in enumerate(map(iter, iterables)): - try: - next = it.__next__ - h_append([next(), order * direction, next]) - except StopIteration: - pass - _heapify(h) - while len(h) > 1: - try: - while True: - value, order, next = s = h[0] - yield value - s[0] = next() # raises StopIteration when exhausted - _heapreplace(h, s) # restore heap condition - except StopIteration: - _heappop(h) # remove empty iterator - if h: - # fast case when only a single iterator remains - value, order, next = h[0] - yield value - yield from next.__self__ - return - - for order, it in enumerate(map(iter, iterables)): - try: - next = it.__next__ - value = next() - h_append([key(value), order * direction, value, next]) - except StopIteration: - pass - _heapify(h) - while len(h) > 1: - try: - while True: - key_value, order, value, next = s = h[0] - yield value - value = next() - s[0] = key(value) - s[2] = value - _heapreplace(h, s) - except StopIteration: - _heappop(h) - if h: - key_value, order, value, next = h[0] - yield value - yield from next.__self__ + # We are using a key and this is a top-level call. + # To prevent re-computation of keys, we can feed into our algorithm precomputed + # (key, value) pairs, and make the recursive calls keyed by only the + # first entry of these pairs. + key_iterables = [zip(map(key, it), it) for it in iterables] + left_half = merge(*key_iterables[:n2], key=_key_from_key_val, reverse=reverse) + right_half = merge(*key_iterables[n2:], key=_key_from_key_val, reverse=reverse) + result = merge(left_half, right_half, key=_key_from_key_val, reverse=reverse) + yield from map(_val_from_key_val, result) # Algorithm notes for nlargest() and nsmallest() From 9f43541b1ad43d472963bb45b673bea57937d965 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney Date: Wed, 4 Dec 2019 21:17:36 -0500 Subject: [PATCH 15/17] Revert "committed to the recursive binary tree approach" This reverts commit 937d3138f806c8a41b1b13356422a9d83742e863. --- Lib/heapq.py | 94 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 445588df817f7d..78dd527c979e3e 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -311,12 +311,6 @@ def _siftup_max(heap, pos): heap[pos] = newitem _siftdown_max(heap, startpos, pos) -def _key_from_key_val(key_val): - return key_val[0] - -def _val_from_key_val(key_val): - return key_val[1] - def merge(*iterables, key=None, reverse=False): '''Merge multiple sorted inputs into a single sorted output. @@ -334,13 +328,8 @@ def merge(*iterables, key=None, reverse=False): ['dog', 'cat', 'fish', 'horse', 'kangaroo'] ''' - n = len(iterables) - if n == 0: - return - if n == 1: - yield from iterables[0] - return - if n == 2: + + if len(iterables) == 2: # special case for performance a_iter = iter(iterables[0]) @@ -445,26 +434,67 @@ def merge(*iterables, key=None, reverse=False): return ka = key(a) - # For more iterables, reduce recursively using the 2-iterable case. - n2 = n // 2 - if key is None or key is _key_from_key_val: - # If there is no key, make the recursive calls not have keys. - # OTOH, If the key is _key_from_key_val, then this is a recursive call, - # so the keys are already computed and so we should continue to use - # _key_from_key_val as the key for the deeper recursive calls. - left_half = merge(*iterables[:n2], key=key, reverse=reverse) - right_half = merge(*iterables[n2:], key=key, reverse=reverse) - yield from merge(left_half, right_half, key=key, reverse=reverse) + h = [] + h_append = h.append + + if reverse: + _heapify = _heapify_max + _heappop = _heappop_max + _heapreplace = _heapreplace_max + direction = -1 else: - # We are using a key and this is a top-level call. - # To prevent re-computation of keys, we can feed into our algorithm precomputed - # (key, value) pairs, and make the recursive calls keyed by only the - # first entry of these pairs. - key_iterables = [zip(map(key, it), it) for it in iterables] - left_half = merge(*key_iterables[:n2], key=_key_from_key_val, reverse=reverse) - right_half = merge(*key_iterables[n2:], key=_key_from_key_val, reverse=reverse) - result = merge(left_half, right_half, key=_key_from_key_val, reverse=reverse) - yield from map(_val_from_key_val, result) + _heapify = heapify + _heappop = heappop + _heapreplace = heapreplace + direction = 1 + + if key is None: + for order, it in enumerate(map(iter, iterables)): + try: + next = it.__next__ + h_append([next(), order * direction, next]) + except StopIteration: + pass + _heapify(h) + while len(h) > 1: + try: + while True: + value, order, next = s = h[0] + yield value + s[0] = next() # raises StopIteration when exhausted + _heapreplace(h, s) # restore heap condition + except StopIteration: + _heappop(h) # remove empty iterator + if h: + # fast case when only a single iterator remains + value, order, next = h[0] + yield value + yield from next.__self__ + return + + for order, it in enumerate(map(iter, iterables)): + try: + next = it.__next__ + value = next() + h_append([key(value), order * direction, value, next]) + except StopIteration: + pass + _heapify(h) + while len(h) > 1: + try: + while True: + key_value, order, value, next = s = h[0] + yield value + value = next() + s[0] = key(value) + s[2] = value + _heapreplace(h, s) + except StopIteration: + _heappop(h) + if h: + key_value, order, value, next = h[0] + yield value + yield from next.__self__ # Algorithm notes for nlargest() and nsmallest() From c149212a94b8bdce74f904c3c9f9a87ce1292440 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney Date: Fri, 6 Dec 2019 09:53:01 -0500 Subject: [PATCH 16/17] factored out try/except from loop for more performance gains --- Lib/heapq.py | 122 ++++++++++++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 64 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 78dd527c979e3e..c7b49f7078cc34 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -351,88 +351,82 @@ def merge(*iterables, key=None, reverse=False): if key is None: if reverse: - # no key, reverse - while True: - if a < b: - yield b - try: + # no key, biggest first + try: + while True: + if a < b: + yield b b = next_b() - except StopIteration: + else: yield a - yield from a_iter - return - else: - yield a - try: a = next_a() - except StopIteration: - yield b - yield from b_iter - return - else: - # no key, forward - while True: - if b < a: + except StopIteration: + if a < b: + yield a + yield from a_iter + else: yield b - try: + yield from b_iter + else: + # no key, smallest first + try: + while True: + if b < a: + yield b b = next_b() - except StopIteration: + else: yield a - yield from a_iter - return - else: - yield a - try: a = next_a() - except StopIteration: - yield b - yield from b_iter - return + except StopIteration: + if b < a: + yield a + yield from a_iter + else: + yield b + yield from b_iter else: ka = key(a) kb = key(b) if reverse: - # using a key, reverse - while True: - if ka < kb: - yield b - try: + # using a key, biggest first + try: + while True: + if ka < kb: + yield b b = next_b() - except StopIteration: + kb = key(b) + else: yield a - yield from a_iter - return - kb = key(b) - else: - yield a - try: a = next_a() - except StopIteration: - yield b - yield from b_iter - return - ka = key(a) - else: - # using a key, forward - while True: - if kb < ka: + ka = key(a) + except StopIteration: + if ka < kb: + yield a + yield from a_iter + else: yield b - try: + yield from b_iter + else: + # using a key, smallest first + try: + while True: + if kb < ka: + yield b b = next_b() - except StopIteration: + kb = key(b) + else: yield a - yield from a_iter - return - kb = key(b) - else: - yield a - try: a = next_a() - except StopIteration: - yield b - yield from b_iter - return - ka = key(a) + ka = key(a) + except StopIteration: + if kb < ka: + yield a + yield from a_iter + else: + yield b + yield from b_iter + # here ends the 2-iterable case + return h = [] h_append = h.append From b900aa1649b279e37184cfe362914fa0ddc8df2f Mon Sep 17 00:00:00 2001 From: Dennis Sweeney Date: Sun, 8 Dec 2019 19:55:23 -0500 Subject: [PATCH 17/17] Revert "factored out try/except from loop for more performance gains" to avoid masking StopIteration from key computation or comparison. This reverts commit c149212a94b8bdce74f904c3c9f9a87ce1292440. --- Lib/heapq.py | 122 +++++++++++++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index c7b49f7078cc34..78dd527c979e3e 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -351,82 +351,88 @@ def merge(*iterables, key=None, reverse=False): if key is None: if reverse: - # no key, biggest first - try: - while True: - if a < b: - yield b - b = next_b() - else: - yield a - a = next_a() - except StopIteration: + # no key, reverse + while True: if a < b: - yield a - yield from a_iter - else: yield b - yield from b_iter - else: - # no key, smallest first - try: - while True: - if b < a: - yield b + try: b = next_b() - else: + except StopIteration: yield a + yield from a_iter + return + else: + yield a + try: a = next_a() - except StopIteration: + except StopIteration: + yield b + yield from b_iter + return + else: + # no key, forward + while True: if b < a: - yield a - yield from a_iter - else: yield b - yield from b_iter + try: + b = next_b() + except StopIteration: + yield a + yield from a_iter + return + else: + yield a + try: + a = next_a() + except StopIteration: + yield b + yield from b_iter + return else: ka = key(a) kb = key(b) if reverse: - # using a key, biggest first - try: - while True: - if ka < kb: - yield b - b = next_b() - kb = key(b) - else: - yield a - a = next_a() - ka = key(a) - except StopIteration: + # using a key, reverse + while True: if ka < kb: - yield a - yield from a_iter - else: yield b - yield from b_iter - else: - # using a key, smallest first - try: - while True: - if kb < ka: - yield b + try: b = next_b() - kb = key(b) - else: + except StopIteration: yield a + yield from a_iter + return + kb = key(b) + else: + yield a + try: a = next_a() - ka = key(a) - except StopIteration: + except StopIteration: + yield b + yield from b_iter + return + ka = key(a) + else: + # using a key, forward + while True: if kb < ka: - yield a - yield from a_iter - else: yield b - yield from b_iter - # here ends the 2-iterable case - return + try: + b = next_b() + except StopIteration: + yield a + yield from a_iter + return + kb = key(b) + else: + yield a + try: + a = next_a() + except StopIteration: + yield b + yield from b_iter + return + ka = key(a) h = [] h_append = h.append