From e311b34dd52e22cfd8c1b6f40fe5975da736add6 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 31 Aug 2021 22:22:35 -0500 Subject: [PATCH 1/9] bpo-34561: Switch to Munro & Wild "powersort" merge strategy. --- Objects/listobject.c | 75 +++++++++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 898cbc20c5f81c..5a463e9eaeffc1 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1159,11 +1159,15 @@ sortslice_advance(sortslice *slice, Py_ssize_t n) */ struct s_slice { sortslice base; - Py_ssize_t len; + Py_ssize_t len; /* length of run */ + Py_ssize_t start; /* starting index of run */ + int power; /* node "level" for powersort merge strategy */ }; typedef struct s_MergeState MergeState; struct s_MergeState { + Py_ssize_t listlen; /* len(input_list) */ + /* This controls when we get *into* galloping mode. It's initialized * to MIN_GALLOP. merge_lo and merge_hi tend to nudge it higher for * random data, and lower for highly structured data. @@ -1538,6 +1542,7 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc) ms->a.keys = ms->temparray; ms->n = 0; ms->min_gallop = MIN_GALLOP; + ms->listlen = list_size; } /* Free all the temp memory owned by the MergeState. This must be called @@ -1920,6 +1925,36 @@ merge_at(MergeState *ms, Py_ssize_t i) return merge_hi(ms, ssa, na, ssb, nb); } +static int +powerloop(MergeState *ms, Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2) +{ + int result = 0; + assert(s1 >= 0); + assert(n1 > 0 && n2 > 0); + assert(s1 + n1 + n2 <= ms->listlen); + /* midpoints a and b: + * a = s1 + n1/2 + * b = s1 + n1 + n2/2 = a + (n1 + n2)/2 + */ + Py_ssize_t a = 2 * s1 + n1; /* 2*a */ + Py_ssize_t b = a + n1 + n2; /* 2*b */ + for (;;) { + ++result; + if (a >= ms->listlen) { + assert(b >= a); + a -= ms->listlen; + b -= ms->listlen; + } + else if (b >= ms->listlen) { + break; + } + assert(a < b && b < ms->listlen); + a <<= 1; + b <<= 1; + } + return result; +} + /* Examine the stack of runs waiting to be merged, merging adjacent runs * until the stack invariants are re-established: * @@ -1933,25 +1968,26 @@ merge_at(MergeState *ms, Py_ssize_t i) static int merge_collapse(MergeState *ms) { - struct s_slice *p = ms->pending; - assert(ms); - while (ms->n > 1) { - Py_ssize_t n = ms->n - 2; - if ((n > 0 && p[n-1].len <= p[n].len + p[n+1].len) || - (n > 1 && p[n-2].len <= p[n-1].len + p[n].len)) { - if (p[n-1].len < p[n+1].len) - --n; - if (merge_at(ms, n) < 0) - return -1; - } - else if (p[n].len <= p[n+1].len) { - if (merge_at(ms, n) < 0) - return -1; - } - else - break; + if (ms->n <= 1) + return 0; + struct s_slice *p = ms->pending; + --ms->n; + struct s_slice ss2 = p[ms->n]; + Py_ssize_t s2 = ss2.start; + Py_ssize_t n2 = ss2.len; + Py_ssize_t s1 = p[ms->n - 1].start; + Py_ssize_t n1 = p[ms->n - 1].len; + assert(s1 + n1 == s2); + int power = powerloop(ms, s1, n1, n2); + while (ms->n > 1 && p[ms->n - 2].power > power) { + if (merge_at(ms, ms->n - 2) < 0) + return -1; } + assert(ms->n < 2 || p[ms->n - 2].power < power); + p[ms->n - 1].power = power; + p[ms->n] = ss2; + ++ms->n; return 0; } @@ -2375,6 +2411,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) * and extending short natural runs to minrun elements. 
*/ minrun = merge_compute_minrun(nremaining); + Py_ssize_t start_index = 0; do { int descending; Py_ssize_t n; @@ -2396,6 +2433,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* Push run onto pending-runs stack, and maybe merge. */ assert(ms.n < MAX_MERGE_PENDING); ms.pending[ms.n].base = lo; + ms.pending[ms.n].start = start_index; ms.pending[ms.n].len = n; ++ms.n; if (merge_collapse(&ms) < 0) @@ -2403,6 +2441,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* Advance to find next run. */ sortslice_advance(&lo, n); nremaining -= n; + start_index += n; } while (nremaining); if (merge_force_collapse(&ms) < 0) From 1c6dfcf2770a4a56966f5e2d635b0715ee2cedb4 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 1 Sep 2021 11:01:00 -0500 Subject: [PATCH 2/9] Various changes. Most significantly, got rid of the new struct member recording a run's starting index - that can be computed cheaply on-the-fly, by storing instead the key array's base address just once in the MergeState struct. --- Objects/listobject.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 5a463e9eaeffc1..672509f231a2b9 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1160,20 +1160,20 @@ sortslice_advance(sortslice *slice, Py_ssize_t n) struct s_slice { sortslice base; Py_ssize_t len; /* length of run */ - Py_ssize_t start; /* starting index of run */ int power; /* node "level" for powersort merge strategy */ }; typedef struct s_MergeState MergeState; struct s_MergeState { - Py_ssize_t listlen; /* len(input_list) */ - /* This controls when we get *into* galloping mode. It's initialized * to MIN_GALLOP. merge_lo and merge_hi tend to nudge it higher for * random data, and lower for highly structured data. */ Py_ssize_t min_gallop; + Py_ssize_t listlen; /* len(input_list) - read only*/ + PyObject **basekeys; /* base address of keys array - read only */ + /* 'a' is temp storage to help with merges. It contains room for * alloced entries. */ @@ -1517,7 +1517,8 @@ gallop_right(MergeState *ms, PyObject *key, PyObject **a, Py_ssize_t n, Py_ssize /* Conceptually a MergeState's constructor. */ static void -merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc) +merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc, + sortslice *lo) { assert(ms != NULL); if (has_keyfunc) { @@ -1543,6 +1544,7 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc) ms->n = 0; ms->min_gallop = MIN_GALLOP; ms->listlen = list_size; + ms->basekeys = lo->keys; } /* Free all the temp memory owned by the MergeState. This must be called @@ -1926,29 +1928,30 @@ merge_at(MergeState *ms, Py_ssize_t i) } static int -powerloop(MergeState *ms, Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2) +powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n) { int result = 0; assert(s1 >= 0); assert(n1 > 0 && n2 > 0); - assert(s1 + n1 + n2 <= ms->listlen); + assert(s1 + n1 + n2 <= n); /* midpoints a and b: * a = s1 + n1/2 * b = s1 + n1 + n2/2 = a + (n1 + n2)/2 */ Py_ssize_t a = 2 * s1 + n1; /* 2*a */ Py_ssize_t b = a + n1 + n2; /* 2*b */ + /* Emulate a/n and b/n one bit a time, until bits differ. 
*/ for (;;) { ++result; - if (a >= ms->listlen) { + if (a >= n) { /* both quotient bits are 1 */ assert(b >= a); - a -= ms->listlen; - b -= ms->listlen; + a -= n; + b -= n; } - else if (b >= ms->listlen) { + else if (b >= n) { /* a/n bit is 0, b/n bit is 1 */ break; - } - assert(a < b && b < ms->listlen); + } /* else both quotient bits are 0 */ + assert(a < b && b < n); a <<= 1; b <<= 1; } @@ -1974,12 +1977,12 @@ merge_collapse(MergeState *ms) struct s_slice *p = ms->pending; --ms->n; struct s_slice ss2 = p[ms->n]; - Py_ssize_t s2 = ss2.start; + Py_ssize_t s2 = ss2.base.keys - ms->basekeys; Py_ssize_t n2 = ss2.len; - Py_ssize_t s1 = p[ms->n - 1].start; + Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; Py_ssize_t n1 = p[ms->n - 1].len; assert(s1 + n1 == s2); - int power = powerloop(ms, s1, n1, n2); + int power = powerloop(s1, n1, n2, ms->listlen); while (ms->n > 1 && p[ms->n - 2].power > power) { if (merge_at(ms, ms->n - 2) < 0) return -1; @@ -2393,7 +2396,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) } /* End of pre-sort check: ms is now set properly! */ - merge_init(&ms, saved_ob_size, keys != NULL); + merge_init(&ms, saved_ob_size, keys != NULL, &lo); nremaining = saved_ob_size; if (nremaining < 2) @@ -2411,7 +2414,6 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) * and extending short natural runs to minrun elements. */ minrun = merge_compute_minrun(nremaining); - Py_ssize_t start_index = 0; do { int descending; Py_ssize_t n; @@ -2433,7 +2435,6 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* Push run onto pending-runs stack, and maybe merge. */ assert(ms.n < MAX_MERGE_PENDING); ms.pending[ms.n].base = lo; - ms.pending[ms.n].start = start_index; ms.pending[ms.n].len = n; ++ms.n; if (merge_collapse(&ms) < 0) @@ -2441,7 +2442,6 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* Advance to find next run. */ sortslice_advance(&lo, n); nremaining -= n; - start_index += n; } while (nremaining); if (merge_force_collapse(&ms) < 0) From b253852912440417a0794d0906beacae6baef574 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 1 Sep 2021 13:58:46 -0500 Subject: [PATCH 3/9] Close to done! Some code fiddling, changed comments to match the code, and a new writeup for listsort.txt. --- Objects/listobject.c | 32 +++++----- Objects/listsort.txt | 139 +++++++++++++++++++++++++++---------------- 2 files changed, 107 insertions(+), 64 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 672509f231a2b9..a7ba0ab705da85 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1139,12 +1139,11 @@ sortslice_advance(sortslice *slice, Py_ssize_t n) if (k) /* The maximum number of entries in a MergeState's pending-runs stack. - * This is enough to sort arrays of size up to about - * 32 * phi ** MAX_MERGE_PENDING - * where phi ~= 1.618. 85 is ridiculouslylarge enough, good for an array - * with 2**64 elements. + * For a list with n elements, thia needs at most floor(log2(n)) + 1 entries + * even if we didn't force runs to a minimal length. So the number of bits + * in a Py_ssize_t is plenty large enough for all cases. */ -#define MAX_MERGE_PENDING 85 +#define MAX_MERGE_PENDING (SIZEOF_SIZE_T * 8) /* When we get into galloping mode, we stay there until both runs win less * often than MIN_GALLOP consecutive times. See listsort.txt for more info. 
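Aside (illustration only, not part of the patch): the powerloop() scheme is
easy to experiment with outside CPython. The standalone sketch below is this
aside's own code; the name demo_power and the use of long long in place of
Py_ssize_t are its inventions. Note how each trip around the loop consumes one
fractional bit of the quotients, so the result can't exceed about log2(n) + 1;
that is also why MAX_MERGE_PENDING above needs no more than one stack slot per
bit of a Py_ssize_t.

    #include <assert.h>
    #include <stdio.h>

    static int
    demo_power(long long s1, long long n1, long long n2, long long n)
    {
        int result = 0;
        assert(s1 >= 0 && n1 > 0 && n2 > 0 && s1 + n1 + n2 <= n);
        long long a = 2 * s1 + n1;  /* twice the first run's midpoint */
        long long b = a + n1 + n2;  /* twice the second run's midpoint */
        for (;;) {                  /* one fractional bit of a/n, b/n per pass */
            ++result;
            if (a >= n) {           /* both quotient bits are 1 */
                a -= n;
                b -= n;
            }
            else if (b >= n) {      /* bits differ: 0 for a/n, 1 for b/n */
                break;
            }
            a <<= 1;
            b <<= 1;
        }
        return result;
    }

    int
    main(void)
    {
        printf("%d\n", demo_power(0, 4, 4, 16));   /* prints 2 */
        printf("%d\n", demo_power(12, 2, 2, 16));  /* prints 3 */
        return 0;
    }

For instance, the first call asks for the power of two runs of length 4 at the
front of a 16-element list: the midpoint interval (2/16, 6/16] first captures
a fraction J/2**L at L == 2 (namely 1/4), so the power is 2.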
@@ -1171,7 +1170,7 @@ struct s_MergeState { */ Py_ssize_t min_gallop; - Py_ssize_t listlen; /* len(input_list) - read only*/ + Py_ssize_t listlen; /* len(input_list) - read only */ PyObject **basekeys; /* base address of keys array - read only */ /* 'a' is temp storage to help with merges. It contains room for @@ -1927,6 +1926,11 @@ merge_at(MergeState *ms, Py_ssize_t i) return merge_hi(ms, ssa, na, ssb, nb); } +/* Two adjacent runs begin at index s1. The first run has length n1, and + * the second run (starting at index s1+n1) has length n2. The list has total + * length n. + * Compute the "power" of the first run. See listsort.txt for details. + */ static int powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n) { @@ -1959,10 +1963,7 @@ powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n) } /* Examine the stack of runs waiting to be merged, merging adjacent runs - * until the stack invariants are re-established: - * - * 1. len[-3] > len[-2] + len[-1] - * 2. len[-2] > len[-1] + * according to the "powersort" merge strategy. * * See listsort.txt for more info. * @@ -1975,13 +1976,16 @@ merge_collapse(MergeState *ms) if (ms->n <= 1) return 0; struct s_slice *p = ms->pending; + /* The most recent run is never merged here, because we can't know its + * power before we see the run that follows it. So pop it off the stack + * and restore it later. + */ --ms->n; struct s_slice ss2 = p[ms->n]; - Py_ssize_t s2 = ss2.base.keys - ms->basekeys; - Py_ssize_t n2 = ss2.len; - Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; + Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; /* start index */ Py_ssize_t n1 = p[ms->n - 1].len; - assert(s1 + n1 == s2); + Py_ssize_t n2 = ss2.len; + assert(s1 + n1 == ss2.base.keys - ms->basekeys); int power = powerloop(s1, n1, n2, ms->listlen); while (ms->n > 1 && p[ms->n - 2].power > power) { if (merge_at(ms, ms->n - 2) < 0) diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 174777a2658dc6..f1be5723f9d16e 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -327,56 +327,95 @@ high in the memory hierarchy. We also can't delay merging "too long" because it consumes memory to remember the runs that are still unmerged, and the stack has a fixed size. -What turned out to be a good compromise maintains two invariants on the -stack entries, where A, B and C are the lengths of the three rightmost not-yet -merged slices: - -1. A > B+C -2. B > C - -Note that, by induction, #2 implies the lengths of pending runs form a -decreasing sequence. #1 implies that, reading the lengths right to left, -the pending-run lengths grow at least as fast as the Fibonacci numbers. -Therefore the stack can never grow larger than about log_base_phi(N) entries, -where phi = (1+sqrt(5))/2 ~= 1.618. Thus a small # of stack slots suffice -for very large arrays. - -If A <= B+C, the smaller of A and C is merged with B (ties favor C, for the -freshness-in-cache reason), and the new run replaces the A,B or B,C entries; -e.g., if the last 3 entries are - - A:30 B:20 C:10 - -then B is merged with C, leaving - - A:30 BC:30 - -on the stack. Or if they were - - A:500 B:400: C:1000 - -then A is merged with B, leaving - - AB:900 C:1000 - -on the stack. - -In both examples, the stack configuration after the merge still violates -invariant #2, and merge_collapse() goes on to continue merging runs until -both invariants are satisfied. 
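Aside (illustration only, not part of the patch): the two worked examples
above are easy to reproduce mechanically. The sketch below is this aside's own
invention (collapse_old() and its bare-lengths stack are not CPython code) and
simulates the old rules; fed A:30 B:20 C:10 it first merges B with C, and
then, because A:30 BC:30 still violates invariant #2, merges again.

    #include <stdio.h>

    /* Simulate the OLD merge_collapse() rules on bare run lengths. */
    static void
    collapse_old(long len[], int *np)
    {
        while (*np > 1) {
            int n = *np - 2;
            if ((n > 0 && len[n-1] <= len[n] + len[n+1]) ||
                (n > 1 && len[n-2] <= len[n-1] + len[n])) {
                if (len[n-1] < len[n+1])
                    --n;            /* merge the smaller of A and C with B */
            }
            else if (len[n] > len[n+1])
                break;              /* both invariants hold; stop merging */
            printf("merge %ld and %ld\n", len[n], len[n+1]);
            len[n] += len[n+1];     /* replace runs n and n+1 by their sum */
            for (int i = n + 1; i < *np - 1; ++i)
                len[i] = len[i+1];
            --*np;
        }
    }

    int
    main(void)
    {
        long len[] = {30, 20, 10};
        int n = 3;
        collapse_old(len, &n);  /* merge 20 and 10, then merge 30 and 30 */
        return 0;
    }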
-As an extreme case, suppose we didn't do the
-minrun gimmick, and natural runs were of lengths 128, 64, 32, 16, 8, 4, 2,
-and 2. Nothing would get merged until the final 2 was seen, and that would
-trigger 7 perfectly balanced merges.
-
-The thrust of these rules when they trigger merging is to balance the run
-lengths as closely as possible, while keeping a low bound on the number of
-runs we have to remember. This is maximally effective for random data,
-where all runs are likely to be of (artificially forced) length minrun, and
-then we get a sequence of perfectly balanced merges (with, perhaps, some
-oddballs at the end).
-
-OTOH, one reason this sort is so good for partly ordered data has to do
-with wildly unbalanced run lengths.
+The original version of this code used the first thing I made up that didn't
+obviously suck ;-) It was loosely based on invariants involving the Fibonacci
+sequence.
+
+It worked OK, but it was hard to reason about, and was subtle enough that the
+intended invariants weren't actually preserved. Researchers discovered that
+when trying to complete a computer-generated correctness proof. That was
+easily-enough repaired, but the discovery spurred quite a bit of academic
+interest in truly good ways to manage incremental merging on the fly.
+
+At least a dozen different approaches were developed, some provably having
+near-optimal worst case behavior with respect to the entropy of the
+distribution of run lengths. Some details can be found in bpo-34561.
+
+The code now uses the "powersort" merge strategy from:
+
+    "Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods
+    That Optimally Adapt to Existing Runs"
+    J. Ian Munro and Sebastian Wild
+
+The code is pretty simple, but the justification is quite involved, as it's
+based on fast approximations to optimal binary search trees, which are
+substantial topics on their own.
+
+Here we'll just cover some pragmatic details:
+
+The `powerloop()` function computes a run's "power". Say two adjacent runs
+begin at index s1. The first run has length n1, and the second run (starting
+at index s1+n1, called "s2" below) has length n2. The list has total length n.
+The "power" of the first run is a small integer, the depth of the node
+connecting the two runs in an ideal binary merge tree, where power 1 is the
+root node, and the power increases by 1 for each level deeper in the tree.
+
+The power is the least integer L such that the "midpoint interval" contains
+a rational number of the form J/2**L. The midpoint interval is the semi-
+closed interval:
+
+    ((s1 + n1/2)/n, (s2 + n2/2)/n]
+
+Yes, that's brain-busting at first ;-) Concretely, if (s1 + n1/2)/n and
+(s2 + n2/2)/n are computed to infinite precision in binary, the power L is
+the first position at which the 2**-L bit differs between the expansions.
+Since the left end of the interval is less than the right end, the first
+differing bit must be a 0 bit in the left quotient and a 1 bit in the right
+quotient.
+
+`powerloop()` emulates these divisions, 1 bit at a time, using comparisons,
+subtractions, and shifts in a loop.
+
+You'll notice the paper uses an O(1) method instead, but that relies on two
+things we don't have:
+
+- An O(1) "count leading zeroes" primitive. We can find such a thing as a C
+  extension on most platforms, but not all, and there's no uniform spelling
+  on the platforms that support it.
+
+- Integer division on an integer type twice as wide as needed to hold the
But the latter is Py_ssize_t for us, and is typically the + widest native signed integer type the platform supports. + +But since runs in our algorithm are almost never very short, the once-per-run +overhead of `powerloop()` seems lost in the noise. + +Detail: why is Py_ssize_t "wide enough" in `powerloop()`? We do, after all, +shift integers of that width left by 1. How do we know that won't spill into +the sign bit? The trick is that we have some slop. `n` (the total list +length) is the number of list elements, which is at most 4 times (on a 32-box, +with 4-byte pointers) smaller than than the largest size_t. So at least the +leading two bits of the integers we're using are clear. + +Since we can't compute a run's power before seeing the run that follows it, +the most-recently identified run is never merged by `merge_collapse()`. +Instead that run is only used to compute the 2nd-most-recent run's power. +Then adjacent runs are merged so long as their saved power (tree depth) is +greater than that newly computed power. + +A key invariant is that powers on the run stack are strictly decreasing +(starting from the run at the top of the stack). + +Note that even powersort's strategy isn't always truly optimal. It can't be. +Computing an optimal merge sequence can be done in time quadratic in the +number of runs, which is very much slower, and also requires finding & +remembering _all_ the runs lengths (of which there may be billions) in +advance. It's remarkable, though, how close to optimal this strategy gets. + +Curious factoid: of all the alternatives I've seen in the literature, +powersort's is the only one that's always truly optimal for a collection of 3 +run lengths (for three lengths A B C, it's always optimal to first merge the +shorter of A and C with B). Merge Memory From cbd2e6d7994db81023394cd2e0a9ce2ca2a36ea5 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 1 Sep 2021 14:10:02 -0500 Subject: [PATCH 4/9] Typo repair. --- Objects/listsort.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/listsort.txt b/Objects/listsort.txt index f1be5723f9d16e..6a2e7d00d603c8 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -409,7 +409,7 @@ A key invariant is that powers on the run stack are strictly decreasing Note that even powersort's strategy isn't always truly optimal. It can't be. Computing an optimal merge sequence can be done in time quadratic in the number of runs, which is very much slower, and also requires finding & -remembering _all_ the runs lengths (of which there may be billions) in +remembering _all_ the runs' lengths (of which there may be billions) in advance. It's remarkable, though, how close to optimal this strategy gets. 
Curious factoid: of all the alternatives I've seen in the literature, From c6b1806a5d874acfdbeeed94740abd75beea1dbb Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Wed, 1 Sep 2021 19:21:50 +0000 Subject: [PATCH 5/9] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst b/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst new file mode 100644 index 00000000000000..b2e2f30fa4459b --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst @@ -0,0 +1 @@ +List sorting now uses the merge-ordering strategy from Munro and Wild's ``powersort()``, Unlike the former strategy, this is provably near-optimal in the entropy of the distribution of run lengths. Most uses of ``list.sort()`` probably won't see a significant time difference, but may see significant improvements in cases where the former strategy was exceptionally poor. However, as these are all fast linear-time approximations to a problem that's inherently at best quadratic-time to solve truly optimally, it's also possible to contrive cases where the former strategy did better. \ No newline at end of file From 560a3f154627d4c5667e102b759189edaddda2b0 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 1 Sep 2021 14:25:32 -0500 Subject: [PATCH 6/9] Update Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst --- .../Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst b/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst index b2e2f30fa4459b..7c48cb39df1c5a 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2021-09-01-19-21-48.bpo-34561.uMAVA-.rst @@ -1 +1 @@ -List sorting now uses the merge-ordering strategy from Munro and Wild's ``powersort()``, Unlike the former strategy, this is provably near-optimal in the entropy of the distribution of run lengths. Most uses of ``list.sort()`` probably won't see a significant time difference, but may see significant improvements in cases where the former strategy was exceptionally poor. However, as these are all fast linear-time approximations to a problem that's inherently at best quadratic-time to solve truly optimally, it's also possible to contrive cases where the former strategy did better. \ No newline at end of file +List sorting now uses the merge-ordering strategy from Munro and Wild's ``powersort()``. Unlike the former strategy, this is provably near-optimal in the entropy of the distribution of run lengths. Most uses of ``list.sort()`` probably won't see a significant time difference, but may see significant improvements in cases where the former strategy was exceptionally poor. However, as these are all fast linear-time approximations to a problem that's inherently at best quadratic-time to solve truly optimally, it's also possible to contrive cases where the former strategy did better. 
\ No newline at end of file

From 70fd24e36b1295250c8e2a00c1f9eec67eca4315 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Fri, 3 Sep 2021 12:09:04 -0500
Subject: [PATCH 7/9] Comment changes to address good points raised in review.

---
 Objects/listobject.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index a7ba0ab705da85..255f6ee9d391c7 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1139,7 +1139,7 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
     if (k)
 
 /* The maximum number of entries in a MergeState's pending-runs stack.
- * For a list with n elements, thia needs at most floor(log2(n)) + 1 entries
+ * For a list with n elements, this needs at most floor(log2(n)) + 1 entries
  * even if we didn't force runs to a minimal length. So the number of bits
  * in a Py_ssize_t is plenty large enough for all cases.
  */
@@ -1941,6 +1941,11 @@ powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n)
     /* midpoints a and b:
      * a = s1 + n1/2
      * b = s1 + n1 + n2/2 = a + (n1 + n2)/2
+     *
+     * Those may not be integers, though, because of the "/2". So we work with
+     * 2*a and 2*b instead, which are necessarily integers. It makes no
+     * difference to the outcome, since the bits in the expansion of (2*i)/n
+     * are merely shifted one position from those of i/n.
      */
     Py_ssize_t a = 2 * s1 + n1; /* 2*a */
     Py_ssize_t b = a + n1 + n2; /* 2*b */

From 53e85deed6388913b7a060dd33bbee6f508cee69 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sat, 4 Sep 2021 15:12:51 -0500
Subject: [PATCH 8/9] Rewrote merge_collapse(), and renamed it to
 found_new_run(), to reflect that it never merges the most recent run anymore.

Putting the new run on the stack before that function is called is a waste of
time now, and just complicated the function with now-silly code to pop the run
off at the start and push it back at the end. Now the caller pushes it on the
stack only after that function returns.

---
 Objects/listobject.c | 53 +++++++++++++++++++++-----------------------
 Objects/listsort.txt | 22 +++++++++---------
 2 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 255f6ee9d391c7..881623a9d0070f 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1967,39 +1967,35 @@ powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n)
     return result;
 }
 
-/* Examine the stack of runs waiting to be merged, merging adjacent runs
- * according to the "powersort" merge strategy.
+/* The next run has been identified, starting at index s2 and of length n2.
+ * If there's already a run on the stack, apply the "powersort" merge strategy:
+ * compute the topmost run's "power" (depth in a conceptual binary merge tree)
+ * and merge adjacent runs on the stack with greater power. See listsort.txt
+ * for more info.
  *
- * See listsort.txt for more info.
+ * It's the caller's responsibility to push the new run on the stack when this
+ * returns.
  *
  * Returns 0 on success, -1 on error.
  */
 static int
-merge_collapse(MergeState *ms)
+found_new_run(MergeState *ms, Py_ssize_t s2, Py_ssize_t n2)
 {
     assert(ms);
-    if (ms->n <= 1)
-        return 0;
-    struct s_slice *p = ms->pending;
-    /* The most recent run is never merged here, because we can't know its
-     * power before we see the run that follows it. So pop it off the stack
-     * and restore it later.
- */ - --ms->n; - struct s_slice ss2 = p[ms->n]; - Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; /* start index */ - Py_ssize_t n1 = p[ms->n - 1].len; - Py_ssize_t n2 = ss2.len; - assert(s1 + n1 == ss2.base.keys - ms->basekeys); - int power = powerloop(s1, n1, n2, ms->listlen); - while (ms->n > 1 && p[ms->n - 2].power > power) { - if (merge_at(ms, ms->n - 2) < 0) - return -1; + if (ms->n) { + assert(ms->n > 0); + struct s_slice *p = ms->pending; + Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; /* start index */ + Py_ssize_t n1 = p[ms->n - 1].len; + assert(s1 + n1 == s2); + int power = powerloop(s1, n1, n2, ms->listlen); + while (ms->n > 1 && p[ms->n - 2].power > power) { + if (merge_at(ms, ms->n - 2) < 0) + return -1; + } + assert(ms->n < 2 || p[ms->n - 2].power < power); + p[ms->n - 1].power = power; } - assert(ms->n < 2 || p[ms->n - 2].power < power); - p[ms->n - 1].power = power; - p[ms->n] = ss2; - ++ms->n; return 0; } @@ -2441,13 +2437,14 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) goto fail; n = force; } - /* Push run onto pending-runs stack, and maybe merge. */ + /* Maybe merge pending runs. */ + if (found_new_run(&ms, lo.keys - ms.basekeys, n) < 0) + goto fail; + /* Push new run on stack. */ assert(ms.n < MAX_MERGE_PENDING); ms.pending[ms.n].base = lo; ms.pending[ms.n].len = n; ++ms.n; - if (merge_collapse(&ms) < 0) - goto fail; /* Advance to find next run. */ sortslice_advance(&lo, n); nremaining -= n; diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 6a2e7d00d603c8..5c2639abddcd4e 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -318,14 +318,13 @@ merging must be done as (A+B)+C or A+(B+C) instead. So merging is always done on two consecutive runs at a time, and in-place, although this may require some temp memory (more on that later). -When a run is identified, its base address and length are pushed on a stack -in the MergeState struct. merge_collapse() is then called to potentially -merge runs on that stack. We would like to delay merging as long as possible -in order to exploit patterns that may come up later, but we like even more to -do merging as soon as possible to exploit that the run just found is still -high in the memory hierarchy. We also can't delay merging "too long" because -it consumes memory to remember the runs that are still unmerged, and the -stack has a fixed size. +When a run is identified, its starting index and length are passed to +found_new_run() to potentially merge runs on a stack of pending runs. We +would like to delay merging as long as possible in order to exploit patterns +that may come up later, but we like even more to do merging as soon as +possible to exploit that the run just found is still high in the memory +hierarchy. We also can't delay merging "too long" because it consumes memory +to remember the runs that are still unmerged, and the stack has a fixed size. The original version of this code used the first thing I made up that didn't obviously suck ;-) It was loosely based on invariants involving the Fibonacci @@ -398,10 +397,11 @@ with 4-byte pointers) smaller than than the largest size_t. So at least the leading two bits of the integers we're using are clear. Since we can't compute a run's power before seeing the run that follows it, -the most-recently identified run is never merged by `merge_collapse()`. -Instead that run is only used to compute the 2nd-most-recent run's power. +the most-recently identified run is never merged by `found_new_run()`. 
+Instead a new run is only used to compute the 2nd-most-recent run's power. Then adjacent runs are merged so long as their saved power (tree depth) is -greater than that newly computed power. +greater than that newly computed power. When found_new_run() returns, only +then is a new run pushed on to the stack of pending runs. A key invariant is that powers on the run stack are strictly decreasing (starting from the run at the top of the stack). From 56f8ccfec7ee8a637a776be45262092867426e3e Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 4 Sep 2021 16:37:18 -0500 Subject: [PATCH 9/9] The starting-index argument to found_new_run wasn't actually used, except in an assert. So get rid of it. Assert the run is adjacent in its caller instead. --- Objects/listobject.c | 9 +++++---- Objects/listsort.txt | 14 +++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 881623a9d0070f..565c11e7f384fe 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1967,7 +1967,7 @@ powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n) return result; } -/* The next run has been identified, starting at index s2 and of length n2. +/* The next run has been identified, of length n2. * If there's already a run on the stack, apply the "powersort" merge strategy: * compute the topmost run's "power" (depth in a conceptual binary merge tree) * and merge adjacent runs on the stack with greater power. See listsort.txt @@ -1979,7 +1979,7 @@ powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n) * Returns 0 on success, -1 on error. */ static int -found_new_run(MergeState *ms, Py_ssize_t s2, Py_ssize_t n2) +found_new_run(MergeState *ms, Py_ssize_t n2) { assert(ms); if (ms->n) { @@ -1987,7 +1987,6 @@ found_new_run(MergeState *ms, Py_ssize_t s2, Py_ssize_t n2) struct s_slice *p = ms->pending; Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; /* start index */ Py_ssize_t n1 = p[ms->n - 1].len; - assert(s1 + n1 == s2); int power = powerloop(s1, n1, n2, ms->listlen); while (ms->n > 1 && p[ms->n - 2].power > power) { if (merge_at(ms, ms->n - 2) < 0) @@ -2438,7 +2437,9 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) n = force; } /* Maybe merge pending runs. */ - if (found_new_run(&ms, lo.keys - ms.basekeys, n) < 0) + assert(ms.n == 0 || ms.pending[ms.n -1].base.keys + + ms.pending[ms.n-1].len == lo.keys); + if (found_new_run(&ms, n) < 0) goto fail; /* Push new run on stack. */ assert(ms.n < MAX_MERGE_PENDING); diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 5c2639abddcd4e..306e5e44d2ecfe 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -318,13 +318,13 @@ merging must be done as (A+B)+C or A+(B+C) instead. So merging is always done on two consecutive runs at a time, and in-place, although this may require some temp memory (more on that later). -When a run is identified, its starting index and length are passed to -found_new_run() to potentially merge runs on a stack of pending runs. We -would like to delay merging as long as possible in order to exploit patterns -that may come up later, but we like even more to do merging as soon as -possible to exploit that the run just found is still high in the memory -hierarchy. We also can't delay merging "too long" because it consumes memory -to remember the runs that are still unmerged, and the stack has a fixed size. 
+When a run is identified, its length is passed to found_new_run() to +potentially merge runs on a stack of pending runs. We would like to delay +merging as long as possible in order to exploit patterns that may come up +later, but we like even more to do merging as soon as possible to exploit +that the run just found is still high in the memory hierarchy. We also can't +delay merging "too long" because it consumes memory to remember the runs that +are still unmerged, and the stack has a fixed size. The original version of this code used the first thing I made up that didn't obviously suck ;-) It was loosely based on invariants involving the Fibonacci
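Aside (illustration only, not part of the patch): to watch the whole strategy
end to end, the sketch below simulates found_new_run() scheduling on bare run
lengths; run_power(), the fixed-size arrays, and the simplified final collapse
are this aside's own inventions (merge_force_collapse() also prefers the
smaller neighbor, which makes no difference for this input). Fed natural runs
of lengths 128, 64, 32, 16, 8, 4, 2, 2, it merges nothing until the runs are
exhausted, then prints 7 perfectly balanced merges.

    #include <stdio.h>

    #define NRUNS 8

    static int
    run_power(long s1, long n1, long n2, long n)
    {
        int result = 0;
        long a = 2 * s1 + n1;       /* doubled midpoints, as in powerloop() */
        long b = a + n1 + n2;
        for (;;) {
            ++result;
            if (a >= n) {
                a -= n;
                b -= n;
            }
            else if (b >= n)
                break;
            a <<= 1;
            b <<= 1;
        }
        return result;
    }

    int
    main(void)
    {
        long runs[NRUNS] = {128, 64, 32, 16, 8, 4, 2, 2};
        long start[NRUNS], len[NRUNS], s = 0, n = 0;
        int pw[NRUNS], top = 0;

        for (int i = 0; i < NRUNS; ++i)
            n += runs[i];                   /* total list length: 256 */
        for (int i = 0; i < NRUNS; ++i) {
            if (top > 0) {                  /* as in found_new_run() */
                int p = run_power(start[top-1], len[top-1], runs[i], n);
                while (top > 1 && pw[top-2] > p) {
                    printf("merge %ld and %ld\n", len[top-2], len[top-1]);
                    len[top-2] += len[top-1];
                    --top;
                }
                pw[top-1] = p;
            }
            start[top] = s;                 /* push the new run */
            len[top] = runs[i];
            ++top;
            s += runs[i];
        }
        while (top > 1) {                   /* simplified final collapse */
            printf("merge %ld and %ld\n", len[top-2], len[top-1]);
            len[top-2] += len[top-1];
            --top;
        }
        return 0;
    }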