gh-112354: Add executor for less-taken branch #112902

Closed
wants to merge 41 commits into from

Commits (41)
36feeb1
Skip ENTER_EXECUTOR as deopt target (use vm_data)
gvanrossum Nov 24, 2023
d12533b
Add an array of 'extras' to UOpExecutor
gvanrossum Nov 23, 2023
f21f2d8
Count side exits per uop loc and print if >= 10
gvanrossum Nov 24, 2023
8463965
Add _PyOptimizer_Anywhere (not yet used)
gvanrossum Nov 25, 2023
6403752
Only jump in ENTER_EXECUTOR if overwriting JUMP_BACKWARD
gvanrossum Nov 25, 2023
329dead
Assert base opcode in _Py_Specialize_ForIter
gvanrossum Nov 25, 2023
f1998c0
Disable curses tests in --fast-ci mode (make test)
gvanrossum Nov 25, 2023
b0944e6
Improve (?) check for executor recursion
gvanrossum Nov 25, 2023
26b5f89
Only generate extra executors for branches
gvanrossum Nov 25, 2023
649581c
Fix Uop -> UOp
gvanrossum Dec 1, 2023
835bf13
WIP
gvanrossum Dec 1, 2023
256b156
Fix where next_instr points upon E_E avoidance
gvanrossum Dec 2, 2023
75c7c32
Allow executors with oparg >= 256
gvanrossum Dec 2, 2023
a94c7f1
Don't try to optimize with default optimizer
gvanrossum Dec 2, 2023
747a3f0
Use separate 'counters' and 'executors' arrays
gvanrossum Dec 11, 2023
682cf5a
Jump directly to side-exit executors
gvanrossum Dec 12, 2023
359c6fc
Remove progress check; clean up the rest a bit
gvanrossum Dec 13, 2023
ca6ed3a
Ensure array of executor pointers is 64-bit aligned
gvanrossum Dec 13, 2023
e2a26b5
Check at least two uops; further cleanup
gvanrossum Dec 13, 2023
38c7aab
Move exit_trace up, since it is smaller
gvanrossum Dec 13, 2023
0f64231
Use configured threshold and exp. backoff for counter
gvanrossum Dec 13, 2023
83297df
Add API to access sub-interpreters
gvanrossum Dec 13, 2023
d065a94
Move optimizer/executor tests to new file test_capi/test_opt.py
gvanrossum Dec 13, 2023
0f71a03
Merge branch 'test-opt' into uops-extras
gvanrossum Dec 13, 2023
075ab91
In _PyOptimizer_Unanchored, assert not ENTER_EXECUTOR, accept JUMP_BA…
gvanrossum Dec 13, 2023
c54daef
Call DISPATCH() directly from exit_trace
gvanrossum Dec 13, 2023
934a115
Correct comment on deoptimize
gvanrossum Dec 13, 2023
52c49eb
Merge branch 'main' into uops-extras
gvanrossum Dec 13, 2023
8f5e623
Remove enter_tier_one label
gvanrossum Dec 13, 2023
10b98f1
Add test
gvanrossum Dec 13, 2023
1450ca6
Fix memory leak
gvanrossum Dec 13, 2023
dcde4d3
Clear sub-executors array upon dealloc
gvanrossum Dec 13, 2023
15df63f
Add blurb
gvanrossum Dec 13, 2023
c786418
Avoid redundant stack frame saves/restores
gvanrossum Dec 13, 2023
ee0734b
Revert "Disable curses tests in --fast-ci mode (make test)"
gvanrossum Dec 14, 2023
655a841
Merge branch 'main' into uops-extras
gvanrossum Dec 14, 2023
32e36fa
Merge branch 'main' into uops-extras
gvanrossum Dec 14, 2023
f5b317a
Fix compiler warning about int/Py_ssize_t
gvanrossum Dec 14, 2023
4804a3c
Be less casual about incref/decref current executor
gvanrossum Dec 16, 2023
46c7d26
Slightly nicer way to handle refcounts
gvanrossum Dec 16, 2023
b991279
Silence compiler warning
gvanrossum Dec 17, 2023
4 changes: 2 additions & 2 deletions Include/cpython/optimizer.h
@@ -60,8 +60,8 @@ PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void);

PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset);

int
_PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer);
int _PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer);
int _PyOptimizer_Unanchored(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **pexecutor, PyObject **stack_pointer);

extern _PyOptimizerObject _PyOptimizer_Default;

3 changes: 3 additions & 0 deletions Include/internal/pycore_uops.h
@@ -21,6 +21,9 @@ typedef struct {

typedef struct {
_PyExecutorObject base;
// Auxiliary arrays, allocated after trace[base.ob_size]
uint16_t *counters; // An array of counters
_PyExecutorObject **executors; // An array of executors
_PyUOpInstruction trace[1];
} _PyUOpExecutorObject;
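
A minimal sketch of the layout these fields imply (an editorial reading of the tp_itemsize change and the pointer setup in Python/optimizer.c further down, not text from the patch): for an executor holding N = Py_SIZE(self) uops, a single variable-size allocation is assumed to contain

/* layout sketch only -- exact sizes and alignment are not spelled out in the diff */
_PyExecutorObject    base;           /* object header */
_PyUOpInstruction    trace[N];       /* the uop trace itself */
_PyExecutorObject   *executors[N];   /* per-uop side-exit sub-executor, or NULL */
uint16_t             counters[N];    /* per-uop side-exit counter */

so 'counters' and 'executors' simply point into the tail of the same allocation, which is why tp_itemsize grows by sizeof(uint16_t) + sizeof(_PyExecutorObject *) per uop.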

21 changes: 21 additions & 0 deletions Lib/test/test_capi/test_opt.py
@@ -539,6 +539,27 @@ def testfunc(n):
# too much already.
self.assertEqual(count, 1)

def test_side_exits(self):
def testfunc():
for _ in range(100):
for i in range(100):
if i >= 70:
i = 0

opt = _testinternalcapi.get_uop_optimizer()
with temporary_optimizer(opt):
testfunc()

ex = get_first_executor(testfunc)
self.assertIsNotNone(ex)
uops = {opname for opname, _, _ in ex}
self.assertIn("_GUARD_IS_FALSE_POP", uops)
subs = [sub for sub in ex.sub_executors() if sub is not None]
self.assertGreater(len(subs), 0)
sub = subs[0]
sub_uops = {opname for opname, _, _ in sub}
self.assertIn("_GUARD_IS_TRUE_POP", sub_uops)


if __name__ == "__main__":
unittest.main()
@@ -0,0 +1,2 @@
In the Tier 2 interpreter, add side exits to sub-executors for certain
micro-opcodes (currently only conditional branches).
113 changes: 110 additions & 3 deletions Python/ceval.c
@@ -755,6 +755,7 @@
next_instr = frame->instr_ptr;
resume_frame:
stack_pointer = _PyFrame_GetStackPointer(frame);
resume_frame_using_stack_pointer:

#ifdef LLTRACE
lltrace = maybe_lltrace_resume_frame(frame, &entry_frame, GLOBALS());
@@ -1063,17 +1064,123 @@

// Jump here from DEOPT_IF()
deoptimize:
next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame));
frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame));
DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d @ %d -> %s]\n",
uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, next_uop[-1].target,
(int)(next_uop - current_executor->trace - 1),
_PyOpcode_OpName[frame->instr_ptr->op.code]);
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
UOP_STAT_INC(uopcode, miss);
Py_DECREF(current_executor);
DISPATCH();
frame->return_offset = 0; // Don't leave this random

// Check if there is a side-exit executor here already.
int pc = next_uop - 1 - current_executor->trace;

Check warning on line 1077 in Python/ceval.c (GitHub Actions: Windows and Windows free-threaded, x64 build-and-test and arm64 build):
'initializing': conversion from '__int64' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\pythoncore.vcxproj]
_PyExecutorObject **pexecutor = current_executor->executors + pc;
if (*pexecutor != NULL) {
#ifdef Py_DEBUG
PyCodeObject *code = _PyFrame_GetCode(frame);
DPRINTF(2, "Jumping to new executor for %s (%s:%d) at byte offset %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame))));
#endif
_PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(*pexecutor);
Py_DECREF(current_executor);
current_executor = new_executor;
goto enter_tier_two;
}

// Increment and check side exit counter.
// (Even though we only need it for certain opcodes.)
next_instr = frame->instr_ptr;
uint16_t *pcounter = current_executor->counters + pc;
*pcounter += 1 << OPTIMIZER_BITS_IN_COUNTER;
/* We are using unsigned values, but we really want signed values, so
* do the 2s complement comparison manually */
uint16_t ucounter = *pcounter + (1 << 15);
uint16_t threshold = tstate->interp->optimizer_resume_threshold + (1 << 15);
if (ucounter <= threshold)
{
Py_DECREF(current_executor);
goto resume_frame_using_stack_pointer;
}

// Decode instruction to look past EXTENDED_ARG.
opcode = next_instr[0].op.code;
if (opcode == EXTENDED_ARG) {
opcode = next_instr[1].op.code;
}

// For selected opcodes build a new executor and enter it now.

Member:
Why "selected opcodes", why not everywhere?

Member Author (gvanrossum):
In an earlier version that somehow didn't work. Right now the check that the new trace isn't going to immediately deopt again relies on these opcodes. I figured that once we have the side-exit machinery working we could gradually widen the scope to other deoptimizations. Also, not all deoptimizations are worth the effort (e.g. the PEP 523 test).

Member:
No special cases, please; it just makes the code more complicated and slower.
If we want to treat some exits differently, let's do it properly (faster-cpython/ideas#638), not here.

Member Author (gvanrossum):
There are several reasons. First, as I explain below, for bytecodes other than branches I can't promise an exact check for whether the newly created sub-executor doesn't just repeat the same deoptimizing uop that triggered its creation (in which case the sub-executor would always deopt immediately, if it is entered at all).

Second, for most bytecodes other than branches, deoptimization paths are relatively rare (IIRC this is apparent from the pystats data, with the exception of some LOAD_ATTR specializations).

For branches, we expect many cases where the "common" path is not much more common than the "uncommon" path (e.g. 60/40 or 70/30). Now, it might make sense to have a different special case here, where if e.g. _GUARD_IS_TRUE_POP has a hot side exit, we know that the branch goes the other way, so we can simply create a sub-executor starting at the less-common branch target. The current approach doesn't do this (mostly because I'd have to thread the special case all the way through the various optimizer functions) but just creates a new executor starting at the original Tier 1 branch bytecode, in the expectation that, if the counters are tuned just right, we will have executed the less-common branch in Tier 1 while taking the common branch in Tier 2, so that Tier 1's shift register has changed state and now indicates that the less-common branch is actually taken more frequently. The checks below (at L1161 and ff.) are a safeguard in case that hasn't happened yet (there are all kinds of interesting scenarios, e.g. loops that don't iterate much; remember that the first iteration each time we enter a loop is done in Tier 1, where we stay until we hit the JUMP_BACKWARD bytecode at the end of the loop).

I propose this PR as a starting point for further iterations, not as the ultimate design for side exits. Let's discuss this Monday.
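
[Editorial note] For orientation while reading the interleaved diff and comments, the overall shape of the new deoptimize handling can be sketched as follows. This is an editorial summary using the diff's own variable names, not part of the patch:

/*
 * Sketch of the deoptimize path (summary of the surrounding diff):
 *
 *   pc = next_uop - 1 - current_executor->trace        // which uop deopted
 *   if executors[pc] != NULL:
 *       enter that sub-executor directly (Tier 2 -> Tier 2 hand-off)
 *   else if counters[pc], bumped by one exit, is still <= the resume threshold:
 *       return to Tier 1 at frame->instr_ptr
 *   else if the Tier 1 opcode here is POP_JUMP_IF_{FALSE,TRUE,NONE,NOT_NONE}:
 *       project a fresh, unanchored trace starting at that branch;
 *       if its leading guard differs from the uop that just deopted,
 *           store it in executors[pc] and enter it;
 *       otherwise discard it
 *   if nothing was entered:
 *       apply exponential backoff to counters[pc] and resume Tier 1
 */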

if (opcode == POP_JUMP_IF_FALSE ||
opcode == POP_JUMP_IF_TRUE ||
opcode == POP_JUMP_IF_NONE ||
opcode == POP_JUMP_IF_NOT_NONE)
{
DPRINTF(2, "--> %s @ %d in %p has %d side exits\n",
_PyUOpName(uopcode), pc, current_executor, (int)(*pcounter));
DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]);

_PyExecutorObject *tmp_executor = NULL;
int optimized = _PyOptimizer_Unanchored(frame, next_instr, &tmp_executor, stack_pointer);
if (optimized < 0) {
goto error_tier_two;
}

if (!optimized) {
DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n",
_PyUOpName(uopcode), pc, current_executor);
}
else {
#ifdef Py_DEBUG
DPRINTF(1, "--> Optimized %s @ %d in %p\n",
_PyUOpName(uopcode), pc, current_executor);
PyCodeObject *code = _PyFrame_GetCode(frame);
DPRINTF(2, "Jumping to fresh executor for %s (%s:%d) at byte offset %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame))));
#endif
_PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(tmp_executor);

// Reject trace if it repeats the uop that just deoptimized.

Member:
Why?

Member Author (gvanrossum):
This test may be a bit imprecise (*), but it tries to discard the case where, even though the counter in the executor indicated that this side exit is "hot", the Tier 1 bytecode hasn't been re-specialized yet. In that case the trace projection will just repeat the uop that just took a deopt side exit, causing it to immediately deopt again. This seems a waste of time and executors -- eventually the sub-executor's deopt counter will also indicate it is hot, and then we'll try again, but it seems better (if we can catch it) to avoid creating the sub-executor in the first place, relying instead on exponential backoff for the side-exit counter (implemented below at L1180 and ff.).

For various reasons, the side-exit counters and the Tier 1 deopt counters don't run in sync, so it's possible that the side-exit counter triggers before the Tier 1 counter has re-specialized. This check gives that another chance.

The test I would really like to use here is to check whether the Tier 1 opcode is still unchanged (i.e., not re-specialized), but the executor doesn't record that information (and it would take up a lot of space -- we'd need an extra byte for each uop that can deoptimize, at least).

(*) The test I wrote is exact for the conditional branches I special-cased above (that's why there's a further special case here for _IS_NONE). For other opcodes it may miss a few cases, e.g. when a single Tier 1 bytecode translates to multiple guards and the failing guard is not the first uop in the translation (this would always happen for calls, whose translation always starts with _PEP_523, which never deopts in the cases we care about). In those cases we can produce a sub-executor that immediately deoptimizes. (And we never try to re-create executors, no matter how often one deoptimizes -- that's a general flaw in the current executor architecture that we should probably file separately.)

int jump_opcode = new_executor->trace[0].opcode;
if (jump_opcode == _IS_NONE) {
jump_opcode = new_executor->trace[1].opcode;
}
if (jump_opcode != uopcode) {
*pexecutor = tmp_executor;
*pcounter &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
Py_DECREF(current_executor);
current_executor = new_executor;
goto enter_tier_two; // All systems go!
}

// The trace is guaranteed to deopt again; forget about it.

Member:
Is it? Why?

Member Author (gvanrossum):
See explanation above.

DPRINTF(2, "Alas, it's the same uop again (%s) -- discarding trace\n",
_PyUOpName(jump_opcode));
Py_DECREF(tmp_executor);
Py_DECREF(new_executor);
}
}

// Exponential backoff if we didn't optimize.
int backoff = *pcounter & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
if (backoff < MINIMUM_TIER2_BACKOFF) {
backoff = MINIMUM_TIER2_BACKOFF;
}
else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
backoff++;
}
assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER);
*pcounter = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff;

Py_DECREF(current_executor);
goto resume_frame_using_stack_pointer;
}
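
To make the counter arithmetic above concrete, here is a small, self-contained editorial sketch (not part of the patch). It assumes OPTIMIZER_BITS_IN_COUNTER == 4, a resume threshold of 16 << 4 (matching what PyUnstable_Optimizer_NewUOpOptimizer sets further down), and a minimum backoff of 4; these constants are stand-ins, not values quoted from this diff. The real code keeps the backoff exponent in the low bits of the per-uop counter; a separate variable is used here for clarity.

#include <stdint.h>
#include <stdio.h>

#define BITS 4                          /* stand-in for OPTIMIZER_BITS_IN_COUNTER */
#define RESUME_THRESHOLD (16 << BITS)   /* stand-in for optimizer_resume_threshold */

int main(void)
{
    uint16_t counter = 0;               /* upper bits: exit count; low bits: backoff */
    int backoff = 4;                    /* stand-in for MINIMUM_TIER2_BACKOFF */
    for (int attempt = 1; attempt <= 3; attempt++) {
        int exits = 0;
        for (;;) {
            counter += 1 << BITS;       /* one more side exit */
            exits++;
            /* Same trick as the diff: adding 1 << 15 turns the unsigned
             * 16-bit comparison into a signed one. */
            uint16_t ucounter = (uint16_t)(counter + (1 << 15));
            uint16_t threshold = (uint16_t)(RESUME_THRESHOLD + (1 << 15));
            if (ucounter > threshold) {
                break;                  /* hot enough: attempt to optimize */
            }
        }
        printf("attempt %d after %d side exits\n", attempt, exits);
        /* Pretend the attempt failed and apply the exponential backoff,
         * mirroring the reset in the diff above. */
        counter = (uint16_t)(((1 << 16) - ((1 << BITS) << backoff)) | backoff);
        if (backoff < 15 - BITS) {
            backoff++;
        }
    }
    return 0;
}

Compiled on its own, this prints attempts after 17, 32 and 48 side exits: with these stand-in values the first optimization attempt needs about 17 exits, and each failed attempt adds roughly 16 + 2**backoff more, which is the exponential backoff the reset formula encodes.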

#if defined(__GNUC__)
# pragma GCC diagnostic pop
#elif defined(_MSC_VER) /* MS_WINDOWS */
69 changes: 62 additions & 7 deletions Python/optimizer.c
@@ -159,7 +159,7 @@ int
_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer)
{
assert(src->op.code == JUMP_BACKWARD);
PyCodeObject *code = (PyCodeObject *)frame->f_executable;
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(PyCode_Check(code));
PyInterpreterState *interp = _PyInterpreterState_GET();
if (!has_space_for_executor(code, src)) {
@@ -189,6 +189,27 @@ _PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNI
return 1;
}

// Return an unanchored executor. The caller owns the executor when returning 1.
// No ENTER_EXECUTOR is inserted, nor is the executor added to the code object.
int
_PyOptimizer_Unanchored(
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **pexecutor,
PyObject **stack_pointer)
{
assert(instr->op.code != ENTER_EXECUTOR);
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(PyCode_Check(code));
PyInterpreterState *interp = _PyInterpreterState_GET();
_PyOptimizerObject *opt = interp->optimizer;
if (strcmp(opt->ob_base.ob_type->tp_name, "uop_optimizer") != 0) {
return 0;
}
*pexecutor = NULL;
return opt->optimize(opt, code, instr, pexecutor, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
}
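
For clarity, a minimal caller sketch follows (editorial, not part of the patch; the real caller is the deoptimize path in Python/ceval.c above, and the helper name here is hypothetical). It illustrates the ownership contract stated in the comment: on a return of 1 the caller owns the new reference in *pexecutor and must either stash it somewhere that owns it (such as current_executor->executors[pc]) or Py_DECREF it.

/* Caller sketch only; assumes the declarations from Include/cpython/optimizer.h. */
static int
try_project_side_exit(_PyInterpreterFrame *frame, _Py_CODEUNIT *instr,
                      PyObject **stack_pointer, _PyExecutorObject **out)
{
    _PyExecutorObject *tmp = NULL;
    int optimized = _PyOptimizer_Unanchored(frame, instr, &tmp, stack_pointer);
    if (optimized <= 0) {
        return optimized;   /* -1: error, exception set; 0: no trace produced */
    }
    *out = tmp;             /* transfer the new strong reference to the caller */
    return 1;
}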

_PyExecutorObject *
PyUnstable_GetExecutor(PyCodeObject *code, int offset)
{
@@ -321,6 +342,11 @@ PyUnstable_Optimizer_NewCounter(void)
static void
uop_dealloc(_PyUOpExecutorObject *self) {
_Py_ExecutorClear((_PyExecutorObject *)self);
if (self->executors != NULL) {
for (Py_ssize_t i = Py_SIZE(self); --i >= 0; ) {
Py_XDECREF(self->executors[i]);
}
}
PyObject_Free(self);
}

@@ -375,15 +401,41 @@ PySequenceMethods uop_as_sequence = {
.sq_item = (ssizeargfunc)uop_item,
};

static PyObject *
sub_executors(PyObject *self, PyObject *Py_UNUSED(ignored))
{
_PyUOpExecutorObject *executor = (_PyUOpExecutorObject *)self;
Py_ssize_t len = uop_len(executor);
PyObject *list = PyList_New(len);
if (list == NULL) {
return NULL;
}
for (Py_ssize_t i = 0; i < len; i++) {
PyObject *sub = (PyObject *)executor->executors[i];
if (sub == NULL) {
sub = Py_None;
}
Py_INCREF(sub);
PyList_SET_ITEM(list, i, (PyObject *)sub);
}
return list;
}

static PyMethodDef uop_executor_methods[] = {
{ "is_valid", is_valid, METH_NOARGS, NULL },
{ "sub_executors", sub_executors, METH_NOARGS, NULL },
{ NULL, NULL },
};

PyTypeObject _PyUOpExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_executor",
.tp_basicsize = sizeof(_PyUOpExecutorObject) - sizeof(_PyUOpInstruction),
.tp_itemsize = sizeof(_PyUOpInstruction),
.tp_itemsize = sizeof(_PyUOpInstruction) + sizeof(uint16_t) + sizeof(_PyExecutorObject *),
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
.tp_dealloc = (destructor)uop_dealloc,
.tp_as_sequence = &uop_as_sequence,
.tp_methods = executor_methods,
.tp_methods = uop_executor_methods,
};

/* TO DO -- Generate these tables */
@@ -499,7 +551,7 @@ translate_bytecode_to_trace(
code = trace_stack[trace_stack_depth].code; \
instr = trace_stack[trace_stack_depth].instr;

DPRINTF(4,
DPRINTF(2,
"Optimizing %s (%s:%d) at byte offset %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
@@ -825,6 +877,10 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
if (executor == NULL) {
return NULL;
}
executor->executors = (_PyExecutorObject **)(&executor->trace[length]);
executor->counters = (uint16_t *)(&executor->executors[length]);
memset(executor->executors, 0, sizeof(_PyExecutorObject *) * length);
memset(executor->counters, 0, sizeof(uint16_t) * length);
int dest = length - 1;
/* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
@@ -933,9 +989,8 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
return NULL;
}
opt->optimize = uop_optimize;
opt->resume_threshold = INT16_MAX;
// Need at least 3 iterations to settle specializations.
// A few lower bits of the counter are reserved for other flags.
// The lower bits are reserved for exponential backoff.
opt->resume_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER;
opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER;
return (PyObject *)opt;
}
1 change: 1 addition & 0 deletions Python/specialize.c
@@ -2353,6 +2353,7 @@ int
void
_Py_Specialize_ForIter(PyObject *iter, _Py_CODEUNIT *instr, int oparg)
{
assert(_PyOpcode_Deopt[instr->op.code] == FOR_ITER);

Member Author (gvanrossum):
We should really add such asserts to many specialization functions; I ran into this one during an intense debugging session.

Member:
The assert can be instr->op.code == FOR_ITER and it shouldn't be necessary, as _Py_Specialize_ForIter is only called from FOR_ITER.

Member Author (gvanrossum):
I tried that and I get a Bus error. And of course it's not supposed to be called with something else! But a logic error in my early prototype caused that to happen, and it took me quite a while to track it down.

assert(ENABLE_SPECIALIZATION);
assert(_PyOpcode_Caches[FOR_ITER] == INLINE_CACHE_ENTRIES_FOR_ITER);
_PyForIterCache *cache = (_PyForIterCache *)(instr + 1);