From 811001216582bbe5869b4b2ceeef85ae4ba1e7f5 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Wed, 15 Oct 2025 17:54:06 +0100 Subject: [PATCH 1/2] JIT: Improve machine code for loading smaller constants on AArch64. * Use movz and movk instructions for loading 16 and 32 bit operands and oparg. * Loading of 64 bit operands is unchanged. --- Python/bytecodes.c | 2 +- Python/ceval_macros.h | 8 +- Python/executor_cases.c.h | 110 +++++++++---------- Python/generated_cases.c.h | 2 +- Tools/cases_generator/tier2_generator.py | 7 +- Tools/jit/_optimizers.py | 129 ++++++++++++++++++++++- Tools/jit/_schema.py | 3 + Tools/jit/_stencils.py | 19 +++- Tools/jit/_targets.py | 1 + Tools/jit/template.c | 46 ++++++-- 10 files changed, 256 insertions(+), 71 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 6411049796bf12..5fcfcb60d180f3 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -810,7 +810,7 @@ dummy_func( assert(next_instr->op.code == STORE_FAST); next_oparg = next_instr->op.arg; #else - next_oparg = (int)CURRENT_OPERAND0(); + next_oparg = (int)CURRENT_OPERAND0_16(); #endif _PyStackRef *target_local = &GETLOCAL(next_oparg); assert(PyUnicode_CheckExact(left_o)); diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index edf8fc9a57d74e..c88df6ad8a010b 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -443,8 +443,12 @@ do { \ } while (0) #define CURRENT_OPARG() (next_uop[-1].oparg) -#define CURRENT_OPERAND0() (next_uop[-1].operand0) -#define CURRENT_OPERAND1() (next_uop[-1].operand1) +#define CURRENT_OPERAND0_64() (next_uop[-1].operand0) +#define CURRENT_OPERAND1_64() (next_uop[-1].operand1) +#define CURRENT_OPERAND0_32() (next_uop[-1].operand0) +#define CURRENT_OPERAND1_32() (next_uop[-1].operand1) +#define CURRENT_OPERAND0_16() (next_uop[-1].operand0) +#define CURRENT_OPERAND1_16() (next_uop[-1].operand1) #define CURRENT_TARGET() (next_uop[-1].target) #define JUMP_TO_JUMP_TARGET() goto jump_to_jump_target diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 079d31da6c1b7a..117604b78a44c9 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -1226,7 +1226,7 @@ assert(next_instr->op.code == STORE_FAST); next_oparg = next_instr->op.arg; #else - next_oparg = (int)CURRENT_OPERAND0(); + next_oparg = (int)CURRENT_OPERAND0_16(); #endif _PyStackRef *target_local = &GETLOCAL(next_oparg); assert(PyUnicode_CheckExact(left_o)); @@ -1264,7 +1264,7 @@ _PyStackRef left; right = stack_pointer[-1]; left = stack_pointer[-2]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); + PyObject *descr = (PyObject *)CURRENT_OPERAND0_64(); PyObject *left_o = PyStackRef_AsPyObjectBorrow(left); PyObject *right_o = PyStackRef_AsPyObjectBorrow(right); _PyBinaryOpSpecializationDescr *d = (_PyBinaryOpSpecializationDescr*)descr; @@ -1286,7 +1286,7 @@ _PyStackRef res; right = stack_pointer[-1]; left = stack_pointer[-2]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); + PyObject *descr = (PyObject *)CURRENT_OPERAND0_64(); PyObject *left_o = PyStackRef_AsPyObjectBorrow(left); PyObject *right_o = PyStackRef_AsPyObjectBorrow(right); assert(INLINE_CACHE_ENTRIES_BINARY_OP == 5); @@ -2495,7 +2495,7 @@ } case _GUARD_GLOBALS_VERSION: { - uint16_t version = (uint16_t)CURRENT_OPERAND0(); + uint16_t version = (uint16_t)CURRENT_OPERAND0_16(); PyDictObject *dict = (PyDictObject *)GLOBALS(); if (!PyDict_CheckExact(dict)) { UOP_STAT_INC(uopcode, miss); @@ -2512,8 +2512,8 @@ case _LOAD_GLOBAL_MODULE: { _PyStackRef res; - uint16_t version = (uint16_t)CURRENT_OPERAND0(); - uint16_t index = (uint16_t)CURRENT_OPERAND1(); + uint16_t version = (uint16_t)CURRENT_OPERAND0_16(); + uint16_t index = (uint16_t)CURRENT_OPERAND1_16(); PyDictObject *dict = (PyDictObject *)GLOBALS(); if (!PyDict_CheckExact(dict)) { UOP_STAT_INC(uopcode, miss); @@ -2550,8 +2550,8 @@ case _LOAD_GLOBAL_BUILTINS: { _PyStackRef res; - uint16_t version = (uint16_t)CURRENT_OPERAND0(); - uint16_t index = (uint16_t)CURRENT_OPERAND1(); + uint16_t version = (uint16_t)CURRENT_OPERAND0_16(); + uint16_t index = (uint16_t)CURRENT_OPERAND1_16(); PyDictObject *dict = (PyDictObject *)BUILTINS(); if (!PyDict_CheckExact(dict)) { UOP_STAT_INC(uopcode, miss); @@ -3301,7 +3301,7 @@ case _GUARD_TYPE_VERSION: { _PyStackRef owner; owner = stack_pointer[-1]; - uint32_t type_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t type_version = (uint32_t)CURRENT_OPERAND0_32(); PyTypeObject *tp = Py_TYPE(PyStackRef_AsPyObjectBorrow(owner)); assert(type_version != 0); if (FT_ATOMIC_LOAD_UINT_RELAXED(tp->tp_version_tag) != type_version) { @@ -3314,7 +3314,7 @@ case _GUARD_TYPE_VERSION_AND_LOCK: { _PyStackRef owner; owner = stack_pointer[-1]; - uint32_t type_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t type_version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); assert(type_version != 0); if (!LOCK_OBJECT(owner_o)) { @@ -3349,7 +3349,7 @@ _PyStackRef owner; _PyStackRef attr; owner = stack_pointer[-1]; - uint16_t offset = (uint16_t)CURRENT_OPERAND0(); + uint16_t offset = (uint16_t)CURRENT_OPERAND0_16(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); PyObject **value_ptr = (PyObject**)(((char *)owner_o) + offset); PyObject *attr_o = FT_ATOMIC_LOAD_PTR_ACQUIRE(*value_ptr); @@ -3380,8 +3380,8 @@ _PyStackRef owner; _PyStackRef attr; owner = stack_pointer[-1]; - uint32_t dict_version = (uint32_t)CURRENT_OPERAND0(); - uint16_t index = (uint16_t)CURRENT_OPERAND1(); + uint32_t dict_version = (uint32_t)CURRENT_OPERAND0_32(); + uint16_t index = (uint16_t)CURRENT_OPERAND1_16(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); if (Py_TYPE(owner_o)->tp_getattro != PyModule_Type.tp_getattro) { UOP_STAT_INC(uopcode, miss); @@ -3426,7 +3426,7 @@ _PyStackRef attr; oparg = CURRENT_OPARG(); owner = stack_pointer[-1]; - uint16_t hint = (uint16_t)CURRENT_OPERAND0(); + uint16_t hint = (uint16_t)CURRENT_OPERAND0_16(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); assert(Py_TYPE(owner_o)->tp_flags & Py_TPFLAGS_MANAGED_DICT); PyDictObject *dict = _PyObject_GetManagedDict(owner_o); @@ -3493,7 +3493,7 @@ _PyStackRef owner; _PyStackRef attr; owner = stack_pointer[-1]; - uint16_t index = (uint16_t)CURRENT_OPERAND0(); + uint16_t index = (uint16_t)CURRENT_OPERAND0_16(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); PyObject **addr = (PyObject **)((char *)owner_o + index); PyObject *attr_o = FT_ATOMIC_LOAD_PTR(*addr); @@ -3523,7 +3523,7 @@ case _CHECK_ATTR_CLASS: { _PyStackRef owner; owner = stack_pointer[-1]; - uint32_t type_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t type_version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); if (!PyType_Check(owner_o)) { UOP_STAT_INC(uopcode, miss); @@ -3541,7 +3541,7 @@ _PyStackRef owner; _PyStackRef attr; owner = stack_pointer[-1]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); + PyObject *descr = (PyObject *)CURRENT_OPERAND0_64(); STAT_INC(LOAD_ATTR, hit); assert(descr != NULL); attr = PyStackRef_FromPyObjectNew(descr); @@ -3559,7 +3559,7 @@ _PyStackRef new_frame; oparg = CURRENT_OPARG(); owner = stack_pointer[-1]; - PyObject *fget = (PyObject *)CURRENT_OPERAND0(); + PyObject *fget = (PyObject *)CURRENT_OPERAND0_64(); assert((oparg & 1) == 0); assert(Py_IS_TYPE(fget, &PyFunction_Type)); PyFunctionObject *f = (PyFunctionObject *)fget; @@ -3612,7 +3612,7 @@ _PyStackRef value; owner = stack_pointer[-1]; value = stack_pointer[-2]; - uint16_t offset = (uint16_t)CURRENT_OPERAND0(); + uint16_t offset = (uint16_t)CURRENT_OPERAND0_16(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); STAT_INC(STORE_ATTR, hit); assert(_PyObject_GetManagedDict(owner_o) == NULL); @@ -3640,7 +3640,7 @@ oparg = CURRENT_OPARG(); owner = stack_pointer[-1]; value = stack_pointer[-2]; - uint16_t hint = (uint16_t)CURRENT_OPERAND0(); + uint16_t hint = (uint16_t)CURRENT_OPERAND0_16(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); assert(Py_TYPE(owner_o)->tp_flags & Py_TPFLAGS_MANAGED_DICT); PyDictObject *dict = _PyObject_GetManagedDict(owner_o); @@ -3698,7 +3698,7 @@ _PyStackRef value; owner = stack_pointer[-1]; value = stack_pointer[-2]; - uint16_t index = (uint16_t)CURRENT_OPERAND0(); + uint16_t index = (uint16_t)CURRENT_OPERAND0_16(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); if (!LOCK_OBJECT(owner_o)) { UOP_STAT_INC(uopcode, miss); @@ -4689,7 +4689,7 @@ case _GUARD_KEYS_VERSION: { _PyStackRef owner; owner = stack_pointer[-1]; - uint32_t keys_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t keys_version = (uint32_t)CURRENT_OPERAND0_32(); PyTypeObject *owner_cls = Py_TYPE(PyStackRef_AsPyObjectBorrow(owner)); PyHeapTypeObject *owner_heap_type = (PyHeapTypeObject *)owner_cls; PyDictKeysObject *keys = owner_heap_type->ht_cached_keys; @@ -4706,7 +4706,7 @@ _PyStackRef self; oparg = CURRENT_OPARG(); owner = stack_pointer[-1]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); + PyObject *descr = (PyObject *)CURRENT_OPERAND0_64(); assert(oparg & 1); STAT_INC(LOAD_ATTR, hit); assert(descr != NULL); @@ -4726,7 +4726,7 @@ _PyStackRef self; oparg = CURRENT_OPARG(); owner = stack_pointer[-1]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); + PyObject *descr = (PyObject *)CURRENT_OPERAND0_64(); assert(oparg & 1); assert(Py_TYPE(PyStackRef_AsPyObjectBorrow(owner))->tp_dictoffset == 0); STAT_INC(LOAD_ATTR, hit); @@ -4746,7 +4746,7 @@ _PyStackRef attr; oparg = CURRENT_OPARG(); owner = stack_pointer[-1]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); + PyObject *descr = (PyObject *)CURRENT_OPERAND0_64(); assert((oparg & 1) == 0); STAT_INC(LOAD_ATTR, hit); assert(descr != NULL); @@ -4767,7 +4767,7 @@ _PyStackRef attr; oparg = CURRENT_OPARG(); owner = stack_pointer[-1]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); + PyObject *descr = (PyObject *)CURRENT_OPERAND0_64(); assert((oparg & 1) == 0); assert(Py_TYPE(PyStackRef_AsPyObjectBorrow(owner))->tp_dictoffset == 0); STAT_INC(LOAD_ATTR, hit); @@ -4787,7 +4787,7 @@ case _CHECK_ATTR_METHOD_LAZY_DICT: { _PyStackRef owner; owner = stack_pointer[-1]; - uint16_t dictoffset = (uint16_t)CURRENT_OPERAND0(); + uint16_t dictoffset = (uint16_t)CURRENT_OPERAND0_16(); char *ptr = ((char *)PyStackRef_AsPyObjectBorrow(owner)) + MANAGED_DICT_OFFSET + dictoffset; PyObject *dict = FT_ATOMIC_LOAD_PTR_ACQUIRE(*(PyObject **)ptr); if (dict != NULL) { @@ -4803,7 +4803,7 @@ _PyStackRef self; oparg = CURRENT_OPARG(); owner = stack_pointer[-1]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); + PyObject *descr = (PyObject *)CURRENT_OPERAND0_64(); assert(oparg & 1); STAT_INC(LOAD_ATTR, hit); assert(descr != NULL); @@ -4885,7 +4885,7 @@ _PyStackRef callable; oparg = CURRENT_OPARG(); callable = stack_pointer[-2 - oparg]; - uint32_t func_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t func_version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); if (!PyFunction_Check(callable_o)) { UOP_STAT_INC(uopcode, miss); @@ -4900,8 +4900,8 @@ } case _CHECK_FUNCTION_VERSION_INLINE: { - uint32_t func_version = (uint32_t)CURRENT_OPERAND0(); - PyObject *callable_o = (PyObject *)CURRENT_OPERAND1(); + uint32_t func_version = (uint32_t)CURRENT_OPERAND0_32(); + PyObject *callable_o = (PyObject *)CURRENT_OPERAND1_64(); assert(PyFunction_Check(callable_o)); PyFunctionObject *func = (PyFunctionObject *)callable_o; if (func->func_version != func_version) { @@ -4917,7 +4917,7 @@ oparg = CURRENT_OPARG(); null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; - uint32_t func_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t func_version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); if (Py_TYPE(callable_o) != &PyMethod_Type) { UOP_STAT_INC(uopcode, miss); @@ -5422,7 +5422,7 @@ oparg = CURRENT_OPARG(); self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; - uint32_t type_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t type_version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); if (!PyStackRef_IsNull(self_or_null)) { UOP_STAT_INC(uopcode, miss); @@ -6197,7 +6197,7 @@ _PyStackRef callable; oparg = CURRENT_OPARG(); callable = stack_pointer[-3 - oparg]; - uint32_t func_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t func_version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); if (!PyFunction_Check(callable_o)) { UOP_STAT_INC(uopcode, miss); @@ -6217,7 +6217,7 @@ oparg = CURRENT_OPARG(); null = stack_pointer[-2 - oparg]; callable = stack_pointer[-3 - oparg]; - uint32_t func_version = (uint32_t)CURRENT_OPERAND0(); + uint32_t func_version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); if (Py_TYPE(callable_o) != &PyMethod_Type) { UOP_STAT_INC(uopcode, miss); @@ -6743,13 +6743,13 @@ } case _SET_IP: { - PyObject *instr_ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *instr_ptr = (PyObject *)CURRENT_OPERAND0_64(); frame->instr_ptr = (_Py_CODEUNIT *)instr_ptr; break; } case _CHECK_STACK_SPACE_OPERAND: { - uint32_t framesize = (uint32_t)CURRENT_OPERAND0(); + uint32_t framesize = (uint32_t)CURRENT_OPERAND0_32(); assert(framesize <= INT_MAX); if (!_PyThreadState_HasStackSpace(tstate, framesize)) { UOP_STAT_INC(uopcode, miss); @@ -6774,7 +6774,7 @@ } case _EXIT_TRACE: { - PyObject *exit_p = (PyObject *)CURRENT_OPERAND0(); + PyObject *exit_p = (PyObject *)CURRENT_OPERAND0_64(); _PyExitData *exit = (_PyExitData *)exit_p; #if defined(Py_DEBUG) && !defined(_Py_JIT) const _Py_CODEUNIT *target = ((frame->owner == FRAME_OWNED_BY_INTERPRETER) @@ -6798,7 +6798,7 @@ } case _DYNAMIC_EXIT: { - PyObject *exit_p = (PyObject *)CURRENT_OPERAND0(); + PyObject *exit_p = (PyObject *)CURRENT_OPERAND0_64(); #if defined(Py_DEBUG) && !defined(_Py_JIT) _PyExitData *exit = (_PyExitData *)exit_p; _Py_CODEUNIT *target = frame->instr_ptr; @@ -6829,7 +6829,7 @@ case _LOAD_CONST_INLINE: { _PyStackRef value; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); value = PyStackRef_FromPyObjectNew(ptr); stack_pointer[0] = value; stack_pointer += 1; @@ -6841,7 +6841,7 @@ _PyStackRef pop; _PyStackRef value; pop = stack_pointer[-1]; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); stack_pointer += -1; ASSERT_WITHIN_STACK_BOUNDS(__FILE__, __LINE__); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -6856,7 +6856,7 @@ case _LOAD_CONST_INLINE_BORROW: { _PyStackRef value; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); value = PyStackRef_FromPyObjectBorrow(ptr); stack_pointer[0] = value; stack_pointer += 1; @@ -6931,7 +6931,7 @@ _PyStackRef pop; _PyStackRef value; pop = stack_pointer[-1]; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); stack_pointer += -1; ASSERT_WITHIN_STACK_BOUNDS(__FILE__, __LINE__); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -6950,7 +6950,7 @@ _PyStackRef value; pop2 = stack_pointer[-1]; pop1 = stack_pointer[-2]; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); stack_pointer += -1; ASSERT_WITHIN_STACK_BOUNDS(__FILE__, __LINE__); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -6974,7 +6974,7 @@ _PyStackRef value; null = stack_pointer[-1]; callable = stack_pointer[-2]; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); (void)null; stack_pointer += -2; ASSERT_WITHIN_STACK_BOUNDS(__FILE__, __LINE__); @@ -6996,7 +6996,7 @@ pop = stack_pointer[-1]; null = stack_pointer[-2]; callable = stack_pointer[-3]; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); stack_pointer += -1; ASSERT_WITHIN_STACK_BOUNDS(__FILE__, __LINE__); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -7025,7 +7025,7 @@ pop1 = stack_pointer[-2]; null = stack_pointer[-3]; callable = stack_pointer[-4]; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); stack_pointer += -1; ASSERT_WITHIN_STACK_BOUNDS(__FILE__, __LINE__); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -7054,7 +7054,7 @@ _PyStackRef value; _PyStackRef new; old = stack_pointer[-1]; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); new = old; value = PyStackRef_FromPyObjectNew(ptr); stack_pointer[-1] = value; @@ -7069,7 +7069,7 @@ _PyStackRef value; _PyStackRef new; old = stack_pointer[-1]; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); new = old; value = PyStackRef_FromPyObjectBorrow(ptr); stack_pointer[-1] = value; @@ -7080,7 +7080,7 @@ } case _START_EXECUTOR: { - PyObject *executor = (PyObject *)CURRENT_OPERAND0(); + PyObject *executor = (PyObject *)CURRENT_OPERAND0_64(); #ifndef _Py_JIT assert(current_executor == (_PyExecutorObject*)executor); #endif @@ -7127,7 +7127,7 @@ case _ERROR_POP_N: { oparg = CURRENT_OPARG(); - uint32_t target = (uint32_t)CURRENT_OPERAND0(); + uint32_t target = (uint32_t)CURRENT_OPERAND0_32(); assert(oparg == 0); frame->instr_ptr = _PyFrame_GetBytecode(frame) + target; GOTO_TIER_ONE(NULL); @@ -7192,7 +7192,7 @@ case _GUARD_IP__PUSH_FRAME: { #define OFFSET_OF__PUSH_FRAME ((0)) - PyObject *ip = (PyObject *)CURRENT_OPERAND0(); + PyObject *ip = (PyObject *)CURRENT_OPERAND0_64(); _Py_CODEUNIT *target = frame->instr_ptr + OFFSET_OF__PUSH_FRAME; if (target != (_Py_CODEUNIT *)ip) { frame->instr_ptr += OFFSET_OF__PUSH_FRAME; @@ -7207,7 +7207,7 @@ case _GUARD_IP_YIELD_VALUE: { #define OFFSET_OF_YIELD_VALUE ((1+INLINE_CACHE_ENTRIES_SEND)) - PyObject *ip = (PyObject *)CURRENT_OPERAND0(); + PyObject *ip = (PyObject *)CURRENT_OPERAND0_64(); _Py_CODEUNIT *target = frame->instr_ptr + OFFSET_OF_YIELD_VALUE; if (target != (_Py_CODEUNIT *)ip) { frame->instr_ptr += OFFSET_OF_YIELD_VALUE; @@ -7222,7 +7222,7 @@ case _GUARD_IP_RETURN_VALUE: { #define OFFSET_OF_RETURN_VALUE ((frame->return_offset)) - PyObject *ip = (PyObject *)CURRENT_OPERAND0(); + PyObject *ip = (PyObject *)CURRENT_OPERAND0_64(); _Py_CODEUNIT *target = frame->instr_ptr + OFFSET_OF_RETURN_VALUE; if (target != (_Py_CODEUNIT *)ip) { frame->instr_ptr += OFFSET_OF_RETURN_VALUE; @@ -7237,7 +7237,7 @@ case _GUARD_IP_RETURN_GENERATOR: { #define OFFSET_OF_RETURN_GENERATOR ((frame->return_offset)) - PyObject *ip = (PyObject *)CURRENT_OPERAND0(); + PyObject *ip = (PyObject *)CURRENT_OPERAND0_64(); _Py_CODEUNIT *target = frame->instr_ptr + OFFSET_OF_RETURN_GENERATOR; if (target != (_Py_CODEUNIT *)ip) { frame->instr_ptr += OFFSET_OF_RETURN_GENERATOR; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 3d5bf75ac0acae..53dcf2cd78ceaf 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -371,7 +371,7 @@ assert(next_instr->op.code == STORE_FAST); next_oparg = next_instr->op.arg; #else - next_oparg = (int)CURRENT_OPERAND0(); + next_oparg = (int)CURRENT_OPERAND0_16(); #endif _PyStackRef *target_local = &GETLOCAL(next_oparg); assert(PyUnicode_CheckExact(left_o)); diff --git a/Tools/cases_generator/tier2_generator.py b/Tools/cases_generator/tier2_generator.py index ac3e6b94afe49e..be343dbd1f0ba2 100644 --- a/Tools/cases_generator/tier2_generator.py +++ b/Tools/cases_generator/tier2_generator.py @@ -170,12 +170,13 @@ def write_uop(uop: Uop, emitter: Emitter, stack: Stack, offset_strs: dict[str, t idx = 0 for cache in uop.caches: if cache.name != "unused": + bits = cache.size*16 if cache.size == 4: type = cast = "PyObject *" else: - type = f"uint{cache.size*16}_t " - cast = f"uint{cache.size*16}_t" - emitter.emit(f"{type}{cache.name} = ({cast})CURRENT_OPERAND{idx}();\n") + type = f"uint{bits}_t " + cast = f"uint{bits}_t" + emitter.emit(f"{type}{cache.name} = ({cast})CURRENT_OPERAND{idx}_{bits}();\n") idx += 1 _, storage = emitter.emit_tokens(uop, storage, None, False) storage.flush(emitter.out) diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py index 297b9517f6a27a..08d4e700652deb 100644 --- a/Tools/jit/_optimizers.py +++ b/Tools/jit/_optimizers.py @@ -96,6 +96,8 @@ class InstructionKind(enum.Enum): LONG_BRANCH = enum.auto() SHORT_BRANCH = enum.auto() RETURN = enum.auto() + SMALL_CONST_1 = enum.auto() + SMALL_CONST_2 = enum.auto() OTHER = enum.auto() @@ -172,6 +174,7 @@ class Optimizer: ) # Override everything that follows in subclasses: _supports_external_relocations = True + supports_small_constants = False _branches: typing.ClassVar[dict[str, tuple[str | None, str | None]]] = {} # Short branches are instructions that can branch within a micro-op, # but might not have the reach to branch anywhere within a trace. @@ -184,6 +187,9 @@ class Optimizer: _re_return: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH text: str = "" globals: set[str] = dataclasses.field(default_factory=set) + _re_small_const_1 = _RE_NEVER_MATCH + _re_small_const_2 = _RE_NEVER_MATCH + const_reloc = "" def __post_init__(self) -> None: # Split the code into a linked list of basic blocks. A basic block is an @@ -253,6 +259,14 @@ def _parse_instruction(self, line: str) -> Instruction: elif match := self._re_return.match(line): name = line kind = InstructionKind.RETURN + elif match := self._re_small_const_1.match(line): + target = match["value"] + name = match["instruction"] + kind = InstructionKind.SMALL_CONST_1 + elif match := self._re_small_const_2.match(line): + target = match["value"] + name = match["instruction"] + kind = InstructionKind.SMALL_CONST_2 else: name, *_ = line.split(" ") kind = InstructionKind.OTHER @@ -385,7 +399,7 @@ def _remove_redundant_jumps(self) -> None: block.fallthrough = True block.instructions.pop() # Before: - # br ? FOO: + # branch FOO: # ... # FOO: # jump BAR @@ -461,6 +475,70 @@ def _fixup_external_labels(self) -> None: ) block.instructions.append(branch.update_target("0")) + def _make_temp_label(self, index: int) -> Instruction: + marker = f"jit_temp_{index}:" + return Instruction(InstructionKind.OTHER, "", marker, None) + + def _fixup_constants(self) -> None: + if not self.supports_small_constants: + return + index = 0 + for block in self._blocks(): + fixed: list[Instruction] = [] + small_const_index = -1 + for inst in block.instructions: + if inst.kind == InstructionKind.SMALL_CONST_1: + marker = f"jit_pending_{inst.target}{index}:" + fixed.append(self._make_temp_label(index)) + index += 1 + small_const_index = len(fixed) + fixed.append(inst) + elif inst.kind == InstructionKind.SMALL_CONST_2: + if small_const_index < 0: + fixed.append(inst) + continue + small_const_1 = fixed[small_const_index] + if not self._small_consts_match(small_const_1, inst): + small_const_index = -1 + fixed.append(inst) + continue + assert small_const_1.target is not None + if small_const_1.target.endswith("16"): + fixed[small_const_index] = self._make_temp_label(index) + index += 1 + else: + assert small_const_1.target.endswith("32") + patch_kind, replacement = self._small_const_1(small_const_1) + if replacement is not None: + label = f"{self.const_reloc}{patch_kind}_JIT_RELOCATION_CONST{small_const_1.target[:-3]}_JIT_RELOCATION_{index}:" + index += 1 + fixed[small_const_index - 1] = Instruction( + InstructionKind.OTHER, "", label, None + ) + fixed[small_const_index] = replacement + patch_kind, replacement = self._small_const_2(inst) + if replacement is not None: + assert inst.target is not None + label = f"{self.const_reloc}{patch_kind}_JIT_RELOCATION_CONST{inst.target[:-3]}_JIT_RELOCATION_{index}:" + index += 1 + fixed.append( + Instruction(InstructionKind.OTHER, "", label, None) + ) + fixed.append(replacement) + small_const_index = -1 + else: + fixed.append(inst) + block.instructions = fixed + + def _small_const_1(self, inst: Instruction) -> tuple[str, Instruction | None]: + raise NotImplementedError() + + def _small_const_2(self, inst: Instruction) -> tuple[str, Instruction | None]: + raise NotImplementedError() + + def _small_consts_match(self, inst1: Instruction, inst2: Instruction) -> bool: + raise NotImplementedError() + def run(self) -> None: """Run this optimizer.""" self._insert_continue_label() @@ -472,6 +550,7 @@ def run(self) -> None: self._remove_redundant_jumps() self._remove_unreachable() self._fixup_external_labels() + self._fixup_constants() self.path.write_text(self._body()) @@ -492,6 +571,54 @@ class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods # https://developer.arm.com/documentation/ddi0602/2025-09/Base-Instructions/RET--Return-from-subroutine- _re_return = re.compile(r"\s*ret\b") + supports_small_constants = True + _re_small_const_1 = re.compile( + r"\s*(?Padrp)\s+.*(?P_JIT_OP(ARG|ERAND(0|1))_(16|32)).*" + ) + _re_small_const_2 = re.compile( + r"\s*(?Pldr)\s+.*(?P_JIT_OP(ARG|ERAND(0|1))_(16|32)).*" + ) + const_reloc = "CUSTOM_AARCH64_CONST" + + def _get_reg(self, inst: Instruction) -> str: + _, rest = inst.text.split(inst.name) + reg, *_ = rest.split(",") + return reg.strip() + + def _small_const_1(self, inst: Instruction) -> tuple[str, Instruction | None]: + assert inst.kind is InstructionKind.SMALL_CONST_1 + assert inst.target is not None + if "16" in inst.target: + return "", None + pre, _ = inst.text.split(inst.name) + return "16a", Instruction( + InstructionKind.OTHER, "movz", f"{pre}movz {self._get_reg(inst)}, 0", None + ) + + def _small_const_2(self, inst: Instruction) -> tuple[str, Instruction | None]: + assert inst.kind is InstructionKind.SMALL_CONST_2 + assert inst.target is not None + pre, _ = inst.text.split(inst.name) + if "16" in inst.target: + return "16a", Instruction( + InstructionKind.OTHER, + "movz", + f"{pre}movz {self._get_reg(inst)}, 0", + None, + ) + else: + return "16b", Instruction( + InstructionKind.OTHER, + "movk", + f"{pre}movk {self._get_reg(inst)}, 0, lsl #16", + None, + ) + + def _small_consts_match(self, inst1: Instruction, inst2: Instruction) -> bool: + reg1 = self._get_reg(inst1) + reg2 = self._get_reg(inst2) + return reg1 == reg2 + class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods """i686-pc-windows-msvc/x86_64-apple-darwin/x86_64-unknown-linux-gnu""" diff --git a/Tools/jit/_schema.py b/Tools/jit/_schema.py index 4e86abe604972e..964e8bdbdc1010 100644 --- a/Tools/jit/_schema.py +++ b/Tools/jit/_schema.py @@ -9,6 +9,9 @@ "ARM64_RELOC_PAGE21", "ARM64_RELOC_PAGEOFF12", "ARM64_RELOC_UNSIGNED", + "CUSTOM_AARCH64_BRANCH19", + "CUSTOM_AARCH64_CONST_16", + "CUSTOM_AARCH64_CONST_32", "IMAGE_REL_AMD64_REL32", "IMAGE_REL_ARM64_BRANCH19", "IMAGE_REL_ARM64_BRANCH26", diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 5c45ab930a4ac4..6c16af80083d70 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -32,6 +32,12 @@ class HoleValue(enum.Enum): # The current uop's operand0 on 32-bit platforms (exposed as _JIT_OPERAND0_HI/LO): OPERAND0_HI = enum.auto() OPERAND0_LO = enum.auto() + # 16 and 32 bit versions of OPARG, OPERAND0 and OPERAND1 + OPARG_16 = enum.auto() + OPERAND0_16 = enum.auto() + OPERAND1_16 = enum.auto() + OPERAND0_32 = enum.auto() + OPERAND1_32 = enum.auto() # The current uop's operand1 on 64-bit platforms (exposed as _JIT_OPERAND1): OPERAND1 = enum.auto() # The current uop's operand1 on 32-bit platforms (exposed as _JIT_OPERAND1_HI/LO): @@ -59,6 +65,8 @@ class HoleValue(enum.Enum): "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12", "ARM64_RELOC_UNSIGNED": "patch_64", "CUSTOM_AARCH64_BRANCH19": "patch_aarch64_19r", + "CUSTOM_AARCH64_CONST16a": "patch_aarch64_16a", + "CUSTOM_AARCH64_CONST16b": "patch_aarch64_16b", # x86_64-pc-windows-msvc: "IMAGE_REL_AMD64_REL32": "patch_x86_64_32rx", # aarch64-pc-windows-msvc: @@ -95,6 +103,7 @@ class HoleValue(enum.Enum): "X86_64_RELOC_SIGNED": "patch_32r", "X86_64_RELOC_UNSIGNED": "patch_64", } + # Translate HoleValues to C expressions: _HOLE_EXPRS = { HoleValue.CODE: "(uintptr_t)code", @@ -103,10 +112,15 @@ class HoleValue(enum.Enum): # These should all have been turned into DATA values by process_relocations: # HoleValue.GOT: "", HoleValue.OPARG: "instruction->oparg", + HoleValue.OPARG_16: "instruction->oparg", HoleValue.OPERAND0: "instruction->operand0", + HoleValue.OPERAND0_16: "instruction->operand0", + HoleValue.OPERAND0_32: "instruction->operand0", HoleValue.OPERAND0_HI: "(instruction->operand0 >> 32)", HoleValue.OPERAND0_LO: "(instruction->operand0 & UINT32_MAX)", HoleValue.OPERAND1: "instruction->operand1", + HoleValue.OPERAND1_16: "instruction->operand1", + HoleValue.OPERAND1_32: "instruction->operand1", HoleValue.OPERAND1_HI: "(instruction->operand1 >> 32)", HoleValue.OPERAND1_LO: "(instruction->operand1 & UINT32_MAX)", HoleValue.TARGET: "instruction->target", @@ -175,7 +189,10 @@ def as_c(self, where: str) -> str: if self.symbol: if value: value += " + " - value += f"(uintptr_t)&{self.symbol}" + if self.symbol.startswith("CONST"): + value += f"instruction->{self.symbol[10:].lower()}" + else: + value += f"(uintptr_t)&{self.symbol}" if _signed(self.addend) or not value: if value: value += " + " diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index f92f3eac13bde5..5895e91c3c44ce 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -138,6 +138,7 @@ async def _compile( f"--target={self.triple}", "-DPy_BUILD_CORE_MODULE", "-D_DEBUG" if self.debug else "-DNDEBUG", + f"-DSUPPORTS_SMALL_CONSTS={1 if self.optimizer.supports_small_constants else 0}", f"-D_JIT_OPCODE={opname}", "-D_PyJIT_ACTIVE", "-D_Py_JIT", diff --git a/Tools/jit/template.c b/Tools/jit/template.c index 0167f1b0ae5e37..a339511c3e01a7 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -34,14 +34,38 @@ #include "jit.h" + +#undef CURRENT_OPERAND0_64 +#define CURRENT_OPERAND0_64() (_operand0_64) + +#undef CURRENT_OPERAND1_64 +#define CURRENT_OPERAND1_64() (_operand1_64) + + #undef CURRENT_OPARG +#undef CURRENT_OPERAND0_16 +#undef CURRENT_OPERAND0_32 +#undef CURRENT_OPERAND1_16 +#undef CURRENT_OPERAND1_32 + +#if SUPPORTS_SMALL_CONSTS + +#define CURRENT_OPARG() (_oparg_16) +#define CURRENT_OPERAND0_32() (_operand0_32) +#define CURRENT_OPERAND0_16() (_operand0_16) +#define CURRENT_OPERAND1_32() (_operand1_32) +#define CURRENT_OPERAND1_16() (_operand1_16) + +#else + #define CURRENT_OPARG() (_oparg) +#define CURRENT_OPERAND0_32() (_operand0_64) +#define CURRENT_OPERAND0_16() (_operand0_64) +#define CURRENT_OPERAND1_32() (_operand1_64) +#define CURRENT_OPERAND1_16() (_operand1_64) -#undef CURRENT_OPERAND0 -#define CURRENT_OPERAND0() (_operand0) +#endif -#undef CURRENT_OPERAND1 -#define CURRENT_OPERAND1() (_operand1) #undef CURRENT_TARGET #define CURRENT_TARGET() (_target) @@ -101,10 +125,9 @@ _JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState int uopcode = _JIT_OPCODE; _Py_CODEUNIT *next_instr; // Other stuff we need handy: - PATCH_VALUE(uint16_t, _oparg, _JIT_OPARG) #if SIZEOF_VOID_P == 8 - PATCH_VALUE(uint64_t, _operand0, _JIT_OPERAND0) - PATCH_VALUE(uint64_t, _operand1, _JIT_OPERAND1) + PATCH_VALUE(uint64_t, _operand0_64, _JIT_OPERAND0) + PATCH_VALUE(uint64_t, _operand1_64, _JIT_OPERAND1) #else assert(SIZEOF_VOID_P == 4); PATCH_VALUE(uint32_t, _operand0_hi, _JIT_OPERAND0_HI) @@ -113,6 +136,15 @@ _JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState PATCH_VALUE(uint32_t, _operand1_hi, _JIT_OPERAND1_HI) PATCH_VALUE(uint32_t, _operand1_lo, _JIT_OPERAND1_LO) uint64_t _operand1 = ((uint64_t)_operand1_hi << 32) | _operand1_lo; +#endif +#if SUPPORTS_SMALL_CONSTS + PATCH_VALUE(uint32_t, _operand0_32, _JIT_OPERAND0_32) + PATCH_VALUE(uint32_t, _operand1_32, _JIT_OPERAND1_32) + PATCH_VALUE(uint16_t, _operand0_16, _JIT_OPERAND0_16) + PATCH_VALUE(uint16_t, _operand1_16, _JIT_OPERAND1_16) + PATCH_VALUE(uint16_t, _oparg_16, _JIT_OPARG_16) +#else + PATCH_VALUE(uint16_t, _oparg, _JIT_OPARG) #endif PATCH_VALUE(uint32_t, _target, _JIT_TARGET) OPT_STAT_INC(uops_executed); From abcd82c14e5454f498d54aadaf7989541011c871 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Wed, 10 Dec 2025 15:29:04 +0000 Subject: [PATCH 2/2] Update constant names for 32 bit builds --- Tools/jit/template.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tools/jit/template.c b/Tools/jit/template.c index a339511c3e01a7..d0666dfec64201 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -132,10 +132,10 @@ _JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState assert(SIZEOF_VOID_P == 4); PATCH_VALUE(uint32_t, _operand0_hi, _JIT_OPERAND0_HI) PATCH_VALUE(uint32_t, _operand0_lo, _JIT_OPERAND0_LO) - uint64_t _operand0 = ((uint64_t)_operand0_hi << 32) | _operand0_lo; + uint64_t _operand0_64 = ((uint64_t)_operand0_hi << 32) | _operand0_lo; PATCH_VALUE(uint32_t, _operand1_hi, _JIT_OPERAND1_HI) PATCH_VALUE(uint32_t, _operand1_lo, _JIT_OPERAND1_LO) - uint64_t _operand1 = ((uint64_t)_operand1_hi << 32) | _operand1_lo; + uint64_t _operand1_64 = ((uint64_t)_operand1_hi << 32) | _operand1_lo; #endif #if SUPPORTS_SMALL_CONSTS PATCH_VALUE(uint32_t, _operand0_32, _JIT_OPERAND0_32)