From 6a3bd75f5f9d0f520e9f59ff84192556a5900345 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Thu, 13 Nov 2025 21:17:05 +0000 Subject: [PATCH 01/40] Move all JIT fields to thread state --- Include/internal/pycore_interp_structs.h | 8 -- Include/internal/pycore_optimizer.h | 6 +- Include/internal/pycore_tstate.h | 13 +++ Python/bytecodes.c | 6 +- Python/ceval_gil.c | 2 +- Python/executor_cases.c.h | 2 +- Python/generated_cases.c.h | 4 +- Python/optimizer.c | 110 +++++++++++------------ Python/pylifecycle.c | 6 +- Python/pystate.c | 69 ++++++++------ Python/sysmodule.c | 2 +- 11 files changed, 124 insertions(+), 104 deletions(-) diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 9e4504479cd9f0..7147f016f9cb07 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -930,14 +930,6 @@ struct _is { struct types_state types; struct callable_cache callable_cache; PyObject *common_consts[NUM_COMMON_CONSTANTS]; - bool jit; - bool compiling; - struct _PyExecutorObject *executor_list_head; - struct _PyExecutorObject *executor_deletion_list_head; - struct _PyExecutorObject *cold_executor; - struct _PyExecutorObject *cold_dynamic_executor; - int executor_deletion_list_remaining_capacity; - size_t executor_creation_counter; _rare_events rare_events; PyDict_WatchCallback builtins_dict_watcher; diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 653285a2c6b79b..406afd95065868 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -24,7 +24,11 @@ typedef struct _PyExecutorLinkListNode { typedef struct { uint8_t opcode; uint8_t oparg; +#ifdef Py_GIL_DISABLED + uint8_t valid; +#else uint8_t valid:1; +#endif uint8_t linked:1; uint8_t chain_depth:6; // Must be big enough for MAX_CHAIN_DEPTH - 1. bool warm; @@ -359,7 +363,7 @@ extern void _PyExecutor_Free(_PyExecutorObject *self); PyAPI_FUNC(int) _PyDumpExecutors(FILE *out); #ifdef _Py_TIER2 -extern void _Py_ClearExecutorDeletionList(PyInterpreterState *interp); +extern void _Py_ClearExecutorDeletionList(PyThreadState *tstate); #endif int _PyJit_translate_single_bytecode_to_trace(PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *next_instr, bool stop_tracing); diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 50048801b2e4ee..1b54daf47f9920 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -52,6 +52,18 @@ typedef struct _PyJitTracerState { _PyJitTracerInitialState initial_state; _PyJitTracerPreviousState prev_state; } _PyJitTracerState; + +typedef struct _PyJitExecutorState { + bool jit; + bool compiling; + struct _PyExecutorObject *executor_list_head; + struct _PyExecutorObject *executor_deletion_list_head; + struct _PyExecutorObject *cold_executor; + struct _PyExecutorObject *cold_dynamic_executor; + int executor_deletion_list_remaining_capacity; + size_t executor_creation_counter; +} _PyJitExecutorState; + #endif // Every PyThreadState is actually allocated as a _PyThreadStateImpl. 
The @@ -120,6 +132,7 @@ typedef struct _PyThreadStateImpl { #endif #if _Py_TIER2 _PyJitTracerState jit_tracer_state; + _PyJitExecutorState jit_executor_state; #endif } _PyThreadStateImpl; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 2c798855a71f55..fdab19131c6873 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2941,7 +2941,7 @@ dummy_func( specializing tier1 op(_SPECIALIZE_JUMP_BACKWARD, (--)) { #if ENABLE_SPECIALIZATION if (this_instr->op.code == JUMP_BACKWARD) { - uint8_t desired = tstate->interp->jit ? JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; + uint8_t desired = ((_PyThreadStateImpl*)tstate)->jit_executor_state.jit ? JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, desired); // Need to re-dispatch so the warmup counter isn't off by one: next_instr = this_instr; @@ -3035,7 +3035,7 @@ dummy_func( } DISPATCH_GOTO(); } - assert(executor != tstate->interp->cold_executor); + assert(executor != ((_PyThreadStateImpl *)tstate)->jit_executor_state.cold_executor); tstate->jit_exit = NULL; TIER1_TO_TIER2(executor); #else @@ -5282,7 +5282,7 @@ dummy_func( } tier2 op(_CHECK_VALIDITY, (--)) { - DEOPT_IF(!current_executor->vm_data.valid); + DEOPT_IF(!FT_ATOMIC_LOAD_UINT8(current_executor->vm_data.valid)); } tier2 pure op(_LOAD_CONST_INLINE, (ptr/4 -- value)) { diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c index 9b6506ac3326b3..1dafba7ebf3bfb 100644 --- a/Python/ceval_gil.c +++ b/Python/ceval_gil.c @@ -1398,7 +1398,7 @@ _Py_HandlePending(PyThreadState *tstate) if ((breaker & _PY_EVAL_JIT_INVALIDATE_COLD_BIT) != 0) { _Py_unset_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT); _Py_Executors_InvalidateCold(tstate->interp); - tstate->interp->executor_creation_counter = JIT_CLEANUP_THRESHOLD; + ((_PyThreadStateImpl*)tstate)->jit_executor_state.executor_creation_counter = JIT_CLEANUP_THRESHOLD; } /* GIL drop request */ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 7ba2e9d0d92999..3491b9136dd31b 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -7153,7 +7153,7 @@ } case _CHECK_VALIDITY: { - if (!current_executor->vm_data.valid) { + if (!FT_ATOMIC_LOAD_UINT8(current_executor->vm_data.valid)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index a984da6dc912a2..e0bf9b611f0c6d 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -5495,7 +5495,7 @@ } DISPATCH_GOTO(); } - assert(executor != tstate->interp->cold_executor); + assert(executor != ((_PyThreadStateImpl *)tstate)->jit_executor_state.cold_executor); tstate->jit_exit = NULL; TIER1_TO_TIER2(executor); #else @@ -7595,7 +7595,7 @@ { #if ENABLE_SPECIALIZATION if (this_instr->op.code == JUMP_BACKWARD) { - uint8_t desired = tstate->interp->jit ? JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; + uint8_t desired = ((_PyThreadStateImpl*)tstate)->jit_executor_state.jit ? 
JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, desired); next_instr = this_instr; DISPATCH_SAME_OPARG(); diff --git a/Python/optimizer.c b/Python/optimizer.c index 65007a256d0c3b..edd7bf4ffe63ce 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -130,19 +130,18 @@ _PyOptimizer_Optimize( { _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; int chain_depth = _tstate->jit_tracer_state.initial_state.chain_depth; - PyInterpreterState *interp = _PyInterpreterState_GET(); - if (!interp->jit) { + if (!_tstate->jit_executor_state.jit) { // gh-140936: It is possible that interp->jit will become false during // interpreter finalization. However, the specialized JUMP_BACKWARD_JIT // instruction may still be present. In this case, we should // return immediately without optimization. return 0; } - assert(!interp->compiling); + assert(!_tstate->jit_executor_state.compiling); assert(_tstate->jit_tracer_state.initial_state.stack_depth >= 0); #ifndef Py_GIL_DISABLED assert(_tstate->jit_tracer_state.initial_state.func != NULL); - interp->compiling = true; + _tstate->jit_executor_state.compiling = true; // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must* // make progress in order to avoid infinite loops or excessively-long // side-exit chains. We can only insert the executor into the bytecode if @@ -152,18 +151,18 @@ _PyOptimizer_Optimize( PyCodeObject *code = (PyCodeObject *)_tstate->jit_tracer_state.initial_state.code; _Py_CODEUNIT *start = _tstate->jit_tracer_state.initial_state.start_instr; if (progress_needed && !has_space_for_executor(code, start)) { - interp->compiling = false; + _tstate->jit_executor_state.compiling = false; return 0; } // One of our dependencies while tracing was invalidated. Not worth compiling. if (!_tstate->jit_tracer_state.prev_state.dependencies_still_valid) { - interp->compiling = false; + _tstate->jit_executor_state.compiling = false; return 0; } _PyExecutorObject *executor; int err = uop_optimize(frame, tstate, &executor, progress_needed); if (err <= 0) { - interp->compiling = false; + _tstate->jit_executor_state.compiling = false; return err; } assert(executor != NULL); @@ -177,7 +176,7 @@ _PyOptimizer_Optimize( * it might get confused by the executor disappearing, * but there is not much we can do about that here. */ Py_DECREF(executor); - interp->compiling = false; + _tstate->jit_executor_state.compiling = false; return 0; } insert_executor(code, start, index, executor); @@ -191,7 +190,7 @@ _PyOptimizer_Optimize( } executor->vm_data.chain_depth = chain_depth; assert(executor->vm_data.valid); - interp->compiling = false; + _tstate->jit_executor_state.compiling = false; return 1; #else return 0; @@ -255,29 +254,20 @@ _PyExecutor_Free(_PyExecutorObject *self) #ifdef _Py_JIT _PyJIT_Free(self); #endif + PyObject_GC_Del(self); } void -_Py_ClearExecutorDeletionList(PyInterpreterState *interp) +_Py_ClearExecutorDeletionList(PyThreadState *tstate) { - _PyRuntimeState *runtime = &_PyRuntime; - HEAD_LOCK(runtime); - PyThreadState* ts = PyInterpreterState_ThreadHead(interp); - HEAD_UNLOCK(runtime); - while (ts) { - _PyExecutorObject *current = (_PyExecutorObject *)ts->current_executor; - if (current != NULL) { - /* Anything in this list will be unlinked, so we can reuse the - * linked field as a reachability marker. 
*/ - current->vm_data.linked = 1; - } - HEAD_LOCK(runtime); - ts = PyThreadState_Next(ts); - HEAD_UNLOCK(runtime); - } - _PyExecutorObject **prev_to_next_ptr = &interp->executor_deletion_list_head; + _PyThreadStateImpl* ts = (_PyThreadStateImpl* )tstate; + _PyExecutorObject **prev_to_next_ptr = &ts->jit_executor_state.executor_deletion_list_head; _PyExecutorObject *exec = *prev_to_next_ptr; + // Mark the currently executing executor. + if (tstate->current_executor != NULL) { + ((_PyExecutorObject *)tstate->current_executor)->vm_data.linked = 1; + } while (exec != NULL) { if (exec->vm_data.linked) { // This executor is currently executing @@ -286,24 +276,26 @@ _Py_ClearExecutorDeletionList(PyInterpreterState *interp) } else { *prev_to_next_ptr = exec->vm_data.links.next; + assert(exec != _PyExecutor_GetColdDynamicExecutor()); + assert(exec != _PyExecutor_GetColdExecutor()); _PyExecutor_Free(exec); } exec = *prev_to_next_ptr; } - interp->executor_deletion_list_remaining_capacity = EXECUTOR_DELETE_LIST_MAX; + ts->jit_executor_state.executor_deletion_list_remaining_capacity = EXECUTOR_DELETE_LIST_MAX; } static void add_to_pending_deletion_list(_PyExecutorObject *self) { - PyInterpreterState *interp = PyInterpreterState_Get(); - self->vm_data.links.next = interp->executor_deletion_list_head; - interp->executor_deletion_list_head = self; - if (interp->executor_deletion_list_remaining_capacity > 0) { - interp->executor_deletion_list_remaining_capacity--; + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); + self->vm_data.links.next = _tstate->jit_executor_state.executor_deletion_list_head; + _tstate->jit_executor_state.executor_deletion_list_head = self; + if (_tstate->jit_executor_state.executor_deletion_list_remaining_capacity > 0) { + _tstate->jit_executor_state.executor_deletion_list_remaining_capacity--; } else { - _Py_ClearExecutorDeletionList(interp); + _Py_ClearExecutorDeletionList(&_tstate->base); } } @@ -1403,7 +1395,7 @@ uop_optimize( // Check executor coldness // It's okay if this ends up going negative. 
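// (When executor_creation_counter reaches zero, the eval-breaker bit
// _PY_EVAL_JIT_INVALIDATE_COLD_BIT is set; _Py_HandlePending() in ceval_gil.c
// then runs _Py_Executors_InvalidateCold() and resets the counter to
// JIT_CLEANUP_THRESHOLD.)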
- if (--tstate->interp->executor_creation_counter == 0) { + if (--((_PyThreadStateImpl *)tstate)->jit_executor_state.executor_creation_counter == 0) { _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT); } @@ -1504,11 +1496,11 @@ bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes) static void link_executor(_PyExecutorObject *executor) { - PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); _PyExecutorLinkListNode *links = &executor->vm_data.links; - _PyExecutorObject *head = interp->executor_list_head; + _PyExecutorObject *head = _tstate->jit_executor_state.executor_list_head; if (head == NULL) { - interp->executor_list_head = executor; + _tstate->jit_executor_state.executor_list_head = executor; links->previous = NULL; links->next = NULL; } @@ -1517,11 +1509,11 @@ link_executor(_PyExecutorObject *executor) links->previous = NULL; links->next = head; head->vm_data.links.previous = executor; - interp->executor_list_head = executor; + _tstate->jit_executor_state.executor_list_head = executor; } executor->vm_data.linked = true; /* executor_list_head must be first in list */ - assert(interp->executor_list_head->vm_data.links.previous == NULL); + assert(_tstate->jit_executor_state.executor_list_head->vm_data.links.previous == NULL); } static void @@ -1542,9 +1534,9 @@ unlink_executor(_PyExecutorObject *executor) } else { // prev == NULL implies that executor is the list head - PyInterpreterState *interp = PyInterpreterState_Get(); - assert(interp->executor_list_head == executor); - interp->executor_list_head = next; + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + assert(_tstate->jit_executor_state.executor_list_head == executor); + _tstate->jit_executor_state.executor_list_head = next; } executor->vm_data.linked = false; } @@ -1563,9 +1555,9 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s _PyExecutorObject * _PyExecutor_GetColdExecutor(void) { - PyInterpreterState *interp = _PyInterpreterState_GET(); - if (interp->cold_executor != NULL) { - return interp->cold_executor; + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); + if (_tstate->jit_executor_state.cold_executor != NULL) { + return _tstate->jit_executor_state.cold_executor; } _PyExecutorObject *cold = allocate_executor(0, 1); if (cold == NULL) { @@ -1584,17 +1576,17 @@ _PyExecutor_GetColdExecutor(void) } #endif _Py_SetImmortal((PyObject *)cold); - interp->cold_executor = cold; + _tstate->jit_executor_state.cold_executor = cold; return cold; } _PyExecutorObject * _PyExecutor_GetColdDynamicExecutor(void) { - PyInterpreterState *interp = _PyInterpreterState_GET(); - if (interp->cold_dynamic_executor != NULL) { - assert(interp->cold_dynamic_executor->trace[0].opcode == _COLD_DYNAMIC_EXIT); - return interp->cold_dynamic_executor; + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); + if (_tstate->jit_executor_state.cold_dynamic_executor != NULL) { + assert(_tstate->jit_executor_state.cold_dynamic_executor->trace[0].opcode == _COLD_DYNAMIC_EXIT); + return _tstate->jit_executor_state.cold_dynamic_executor; } _PyExecutorObject *cold = allocate_executor(0, 1); if (cold == NULL) { @@ -1613,7 +1605,7 @@ _PyExecutor_GetColdDynamicExecutor(void) } #endif _Py_SetImmortal((PyObject *)cold); - interp->cold_dynamic_executor = cold; + _tstate->jit_executor_state.cold_dynamic_executor = cold; return cold; } @@ -1703,9 +1695,10 @@ 
_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is if (invalidate == NULL) { goto error; } + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ - for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + for (_PyExecutorObject *exec = _tstate->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->vm_data.valid); _PyExecutorObject *next = exec->vm_data.links.next; if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter) && @@ -1747,8 +1740,9 @@ _PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) void _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) { - while (interp->executor_list_head) { - _PyExecutorObject *executor = interp->executor_list_head; + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); + while (_tstate->jit_executor_state.executor_list_head) { + _PyExecutorObject *executor = _tstate->jit_executor_state.executor_list_head; assert(executor->vm_data.valid == 1 && executor->vm_data.linked == 1); if (executor->vm_data.code) { // Clear the entire code object so its co_executors array be freed: @@ -1772,10 +1766,10 @@ _Py_Executors_InvalidateCold(PyInterpreterState *interp) if (invalidate == NULL) { goto error; } - + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ - for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + for (_PyExecutorObject *exec = _tstate->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->vm_data.valid); _PyExecutorObject *next = exec->vm_data.links.next; @@ -1912,8 +1906,8 @@ _PyDumpExecutors(FILE *out) { fprintf(out, "digraph ideal {\n\n"); fprintf(out, " rankdir = \"LR\"\n\n"); - PyInterpreterState *interp = PyInterpreterState_Get(); - for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); + for (_PyExecutorObject *exec = _tstate->jit_executor_state.executor_list_head; exec != NULL;) { executor_to_gv(exec, out); exec = exec->vm_data.links.next; } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 805805ef188e83..0fa9c6ee73d3e1 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1361,7 +1361,7 @@ init_interp_main(PyThreadState *tstate) } else #endif { - interp->jit = true; + ((_PyThreadStateImpl *)(tstate))->jit_executor_state.jit = true; } } } @@ -1718,8 +1718,8 @@ finalize_modules(PyThreadState *tstate) PyInterpreterState *interp = tstate->interp; // Invalidate all executors and turn off JIT: - interp->jit = false; - interp->compiling = false; + ((_PyThreadStateImpl *)tstate)->jit_executor_state.jit = false; + ((_PyThreadStateImpl *)tstate)->jit_executor_state.compiling = false; #ifdef _Py_TIER2 _Py_Executors_InvalidateAll(interp, 0); #endif diff --git a/Python/pystate.c b/Python/pystate.c index c12a1418e74309..de61970041c690 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -566,12 +566,6 @@ init_interpreter(PyInterpreterState *interp, interp->monitoring_tool_versions[t] = 0; } interp->_code_object_generation = 0; - interp->jit = false; - interp->compiling = false; - interp->executor_list_head = NULL; - interp->executor_deletion_list_head = NULL; - 
interp->executor_deletion_list_remaining_capacity = 0; - interp->executor_creation_counter = JIT_CLEANUP_THRESHOLD; if (interp != &runtime->_main_interpreter) { /* Fix the self-referential, statically initialized fields. */ interp->dtoa = (struct _dtoa_state)_dtoa_state_INIT(interp); @@ -798,39 +792,46 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) Py_CLEAR(interp->after_forkers_child); #endif - -#ifdef _Py_TIER2 - _Py_ClearExecutorDeletionList(interp); -#endif _PyAST_Fini(interp); _PyAtExit_Fini(interp); - // All Python types must be destroyed before the last GC collection. Python // types create a reference cycle to themselves in their // PyTypeObject.tp_mro member (the tuple contains the type). /* Last garbage collection on this interpreter */ _PyGC_CollectNoFail(tstate); + +#ifdef _Py_TIER2 + // This can only be done after the last GC of the interpreter, as we may + // have pending executors that point to cold executors and expect them + // not to have been freed. + _Py_FOR_EACH_TSTATE_BEGIN(interp, p) { + HEAD_UNLOCK(runtime); + struct _PyExecutorObject *cold = ((_PyThreadStateImpl *)(p))->jit_executor_state.cold_executor; + if (cold != NULL) { + ((_PyThreadStateImpl *)(p))->jit_executor_state.cold_executor = NULL; + assert(cold->vm_data.valid); + assert(cold->vm_data.warm); + _PyExecutor_Free(cold); + } + + struct _PyExecutorObject *cold_dynamic = ((_PyThreadStateImpl *)(p))->jit_executor_state.cold_dynamic_executor; + if (cold_dynamic != NULL) { + ((_PyThreadStateImpl *)(p))->jit_executor_state.cold_dynamic_executor = NULL; + assert(cold_dynamic->vm_data.valid); + assert(cold_dynamic->vm_data.warm); + _PyExecutor_Free(cold_dynamic); + } + HEAD_LOCK(runtime); + } + _Py_FOR_EACH_TSTATE_END(interp); +#endif + _PyGC_Fini(interp); // Finalize warnings after last gc so that any finalizers can // access warnings state _PyWarnings_Fini(interp); - struct _PyExecutorObject *cold = interp->cold_executor; - if (cold != NULL) { - interp->cold_executor = NULL; - assert(cold->vm_data.valid); - assert(cold->vm_data.warm); - _PyExecutor_Free(cold); - } - - struct _PyExecutorObject *cold_dynamic = interp->cold_dynamic_executor; - if (cold_dynamic != NULL) { - interp->cold_dynamic_executor = NULL; - assert(cold_dynamic->vm_data.valid); - assert(cold_dynamic->vm_data.warm); - _PyExecutor_Free(cold_dynamic); - } /* We don't clear sysdict and builtins until the end of this function. Because clearing other attributes can execute arbitrary Python code which requires sysdict and builtins. */ @@ -1503,6 +1504,14 @@ init_threadstate(_PyThreadStateImpl *_tstate, #ifdef _Py_TIER2 _tstate->jit_tracer_state.code_buffer = NULL; + _tstate->jit_executor_state.jit = false; + _tstate->jit_executor_state.compiling = false; + _tstate->jit_executor_state.executor_list_head = NULL; + _tstate->jit_executor_state.executor_deletion_list_head = NULL; + _tstate->jit_executor_state.executor_deletion_list_remaining_capacity = 0; + _tstate->jit_executor_state.executor_creation_counter = JIT_CLEANUP_THRESHOLD; + _tstate->jit_executor_state.cold_executor = NULL; + _tstate->jit_executor_state.cold_dynamic_executor = NULL; #endif tstate->delete_later = NULL; @@ -1736,6 +1745,14 @@ PyThreadState_Clear(PyThreadState *tstate) Py_CLEAR(tstate->context); +#ifdef _Py_TIER2 + ((_PyThreadStateImpl *)(tstate))->jit_executor_state.jit = false; + _Py_ClearExecutorDeletionList(tstate); + + // We can't clear the cold executors here until the interpreter + // clear process starts.
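+ // Instead, interpreter_clear() walks every thread state after the final
+ // GC pass and frees each thread's cold_executor and cold_dynamic_executor,
+ // once no pending executor can still point at them.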
+#endif + #ifdef Py_GIL_DISABLED // Each thread should clear own freelists in free-threading builds. struct _Py_freelists *freelists = _Py_freelists_GET(); diff --git a/Python/sysmodule.c b/Python/sysmodule.c index a611844f76e090..3a57dd0d8fa544 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -4114,7 +4114,7 @@ _jit_is_enabled_impl(PyObject *module) /*[clinic end generated code: output=55865f8de993fe42 input=0524151e857f4f3a]*/ { (void)module; - return _PyInterpreterState_GET()->jit; + return ((_PyThreadStateImpl*)_PyThreadState_GET())->jit_executor_state.jit; } /*[clinic input] From ba9a65ad539d29b856a0085a08ad84ba8cae45ab Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Thu, 13 Nov 2025 23:32:36 +0000 Subject: [PATCH 02/40] fix a bug with traversing states --- Include/internal/pycore_optimizer.h | 3 +- Modules/_testinternalcapi.c | 3 +- Objects/codeobject.c | 2 +- Objects/frameobject.c | 1 - Objects/funcobject.c | 1 - Python/ceval_gil.c | 2 +- Python/instrumentation.c | 3 +- Python/optimizer.c | 100 +++++++++++++++++++--------- Python/pylifecycle.c | 2 +- 9 files changed, 75 insertions(+), 42 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 406afd95065868..a1817134105a3c 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -49,6 +49,7 @@ typedef struct _PyExitData { typedef struct _PyExecutorObject { PyObject_VAR_HEAD + PyThreadState *tstate; const _PyUOpInstruction *trace; _PyVMData vm_data; /* Used by the VM, but opaque to the optimizer */ uint32_t exit_count; @@ -78,7 +79,7 @@ PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj); #ifdef _Py_TIER2 PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation); -PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp); +PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyThreadState *tstate); #else # define _Py_Executors_InvalidateDependency(A, B, C) ((void)0) diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 89e558b0fe8933..31900a16b9e044 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1240,8 +1240,7 @@ add_executor_dependency(PyObject *self, PyObject *args) static PyObject * invalidate_executors(PyObject *self, PyObject *obj) { - PyInterpreterState *interp = PyInterpreterState_Get(); - _Py_Executors_InvalidateDependency(interp, obj, 1); + _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), obj, 1); Py_RETURN_NONE; } diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 3aea2038fd17e7..a78772b2ddf781 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2432,7 +2432,7 @@ code_dealloc(PyObject *self) PyMem_Free(co_extra); } #ifdef _Py_TIER2 - _PyJit_Tracer_InvalidateDependency(tstate, self); + _Py_Executors_InvalidateDependency(tstate->interp, self, 1); if (co->co_executors != NULL) { clear_executors(co); } diff --git a/Objects/frameobject.c b/Objects/frameobject.c index b652973600c17d..05ecf74d7d1e2f 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -263,7 +263,6 @@ framelocalsproxy_setitem(PyObject *self, PyObject *key, PyObject *value) #if _Py_TIER2 _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), co, 1); - _PyJit_Tracer_InvalidateDependency(_PyThreadState_GET(), co); #endif _PyLocals_Kind kind = 
_PyLocals_GetKind(co->co_localspluskinds, i); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index b659ac8023373b..e5d8a8faf11e81 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -1153,7 +1153,6 @@ func_dealloc(PyObject *self) } #if _Py_TIER2 _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), self, 1); - _PyJit_Tracer_InvalidateDependency(_PyThreadState_GET(), self); #endif _PyObject_GC_UNTRACK(op); FT_CLEAR_WEAKREFS(self, op->func_weakreflist); diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c index 1dafba7ebf3bfb..6fc9f2f5b4981b 100644 --- a/Python/ceval_gil.c +++ b/Python/ceval_gil.c @@ -1397,7 +1397,7 @@ _Py_HandlePending(PyThreadState *tstate) if ((breaker & _PY_EVAL_JIT_INVALIDATE_COLD_BIT) != 0) { _Py_unset_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT); - _Py_Executors_InvalidateCold(tstate->interp); + _Py_Executors_InvalidateCold(tstate); ((_PyThreadStateImpl*)tstate)->jit_executor_state.executor_creation_counter = JIT_CLEANUP_THRESHOLD; } diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 81e46a331e0b9e..1185a706a45095 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1785,8 +1785,7 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != NULL) { _PyCode_Clear_Executors(code); } - _Py_Executors_InvalidateDependency(interp, code, 1); - _PyJit_Tracer_InvalidateDependency(PyThreadState_GET(), code); + _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), code, 1); #endif int code_len = (int)Py_SIZE(code); /* Exit early to avoid creating instrumentation diff --git a/Python/optimizer.c b/Python/optimizer.c index edd7bf4ffe63ce..f2b9a8bc188a5e 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1172,6 +1172,7 @@ allocate_executor(int exit_count, int length) res->trace = (_PyUOpInstruction *)(res->exits + exit_count); res->code_size = length; res->exit_count = exit_count; + res->tstate = _PyThreadState_GET(); return res; } @@ -1534,7 +1535,7 @@ unlink_executor(_PyExecutorObject *executor) } else { // prev == NULL implies that executor is the list head - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)executor->tstate; assert(_tstate->jit_executor_state.executor_list_head == executor); _tstate->jit_executor_state.executor_list_head = next; } @@ -1680,6 +1681,19 @@ _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj) _Py_BloomFilter_Add(&executor->vm_data.bloom, obj); } +void +_PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) +{ + _PyBloomFilter obj_filter; + _Py_BloomFilter_Init(&obj_filter); + _Py_BloomFilter_Add(&obj_filter, obj); + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; + if (bloom_filter_may_contain(&_tstate->jit_tracer_state.prev_state.dependencies, &obj_filter)) + { + _tstate->jit_tracer_state.prev_state.dependencies_still_valid = false; + } +} + /* Invalidate all executors that depend on `obj` * May cause other executors to be invalidated as well */ @@ -1695,19 +1709,26 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is if (invalidate == NULL) { goto error; } - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ - for (_PyExecutorObject *exec = _tstate->jit_executor_state.executor_list_head; exec != NULL;) { - 
assert(exec->vm_data.valid); - _PyExecutorObject *next = exec->vm_data.links.next; - if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter) && - PyList_Append(invalidate, (PyObject *)exec)) - { - goto error; + _PyEval_StopTheWorldAll(&_PyRuntime); + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { + _PyJit_Tracer_InvalidateDependency(p, obj); + for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { + assert(exec->vm_data.valid); + if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) { + if (PyList_Append(invalidate, (PyObject *)exec) < 0) { + PyErr_Clear(); + Py_DECREF(invalidate); + _PyEval_StartTheWorldAll(&_PyRuntime); + return; + } + } + _PyExecutorObject *next = exec->vm_data.links.next; + exec = next; } - exec = next; } + _PyEval_StartTheWorldAll(&_PyRuntime); for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { PyObject *exec = PyList_GET_ITEM(invalidate, i); executor_clear(exec); @@ -1724,41 +1745,56 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is _Py_Executors_InvalidateAll(interp, is_invalidation); } -void -_PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) -{ - _PyBloomFilter obj_filter; - _Py_BloomFilter_Init(&obj_filter); - _Py_BloomFilter_Add(&obj_filter, obj); - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; - if (bloom_filter_may_contain(&_tstate->jit_tracer_state.prev_state.dependencies, &obj_filter)) - { - _tstate->jit_tracer_state.prev_state.dependencies_still_valid = false; - } -} /* Invalidate all executors */ void _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) { - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); - while (_tstate->jit_executor_state.executor_list_head) { - _PyExecutorObject *executor = _tstate->jit_executor_state.executor_list_head; - assert(executor->vm_data.valid == 1 && executor->vm_data.linked == 1); + PyObject *invalidate = PyList_New(0); + if (invalidate == NULL) { + PyErr_Clear(); + return; + } + /* Clearing an executor can deallocate others, so we need to make a list of + * executors to invalidate first */ + _PyEval_StopTheWorldAll(&_PyRuntime); + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { + for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { + assert(exec->vm_data.valid); + assert(exec->tstate == p); + _PyExecutorObject *next = exec->vm_data.links.next; + if (PyList_Append(invalidate, (PyObject *)exec) < 0) { + PyErr_Clear(); + Py_DECREF(invalidate); + _PyEval_StartTheWorldAll(&_PyRuntime); + return; + } + exec = next; + } + } + _PyEval_StartTheWorldAll(&_PyRuntime); + Py_ssize_t list_len = PyList_GET_SIZE(invalidate); + for (Py_ssize_t i = 0; i < list_len; i++) { + _PyExecutorObject *executor = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i); + + assert(executor->vm_data.valid == 1); if (executor->vm_data.code) { // Clear the entire code object so its co_executors array can be freed: _PyCode_Clear_Executors(executor->vm_data.code); } - else { - executor_clear((PyObject *)executor); - } + executor_clear((PyObject *)executor); if (is_invalidation) { OPT_STAT_INC(executors_invalidated); } } + Py_DECREF(invalidate); } + +// Unlike _Py_Executors_InvalidateDependency, this is not for correctness but memory savings. // Thus there is no need to lock the runtime or traverse everything. We simply make a // best-effort attempt to clean things up.
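// A minimal sketch, assuming only fields shown in this series
// (executor_list_head, vm_data.links, vm_data.valid, vm_data.warm), of the
// per-thread list walk the function below performs; walk_executors is a
// hypothetical name, not a CPython API:
static void
walk_executors(_PyThreadStateImpl *ts)
{
    _PyExecutorObject *exec = ts->jit_executor_state.executor_list_head;
    while (exec != NULL) {
        assert(exec->vm_data.valid);
        // A coldness scan would inspect exec->vm_data.warm here.
        exec = exec->vm_data.links.next;
    }
}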
void -_Py_Executors_InvalidateCold(PyInterpreterState *interp) +_Py_Executors_InvalidateCold(PyThreadState *tstate) { /* Walk the list of executors */ /* TO DO -- Use a tree to avoid traversing as many objects */ @@ -1766,7 +1802,7 @@ _Py_Executors_InvalidateCold(PyInterpreterState *interp) if (invalidate == NULL) { goto error; } - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)tstate; /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ for (_PyExecutorObject *exec = _tstate->jit_executor_state.executor_list_head; exec != NULL;) { @@ -1792,7 +1828,7 @@ _Py_Executors_InvalidateCold(PyInterpreterState *interp) PyErr_Clear(); Py_XDECREF(invalidate); // If we're truly out of memory, wiping out everything is a fine fallback - _Py_Executors_InvalidateAll(interp, 0); + _Py_Executors_InvalidateAll(_PyInterpreterState_GET(), 0); } static void diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0fa9c6ee73d3e1..e1db7167292947 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -606,7 +606,7 @@ builtins_dict_watcher(PyDict_WatchEvent event, PyObject *dict, PyObject *key, Py PyInterpreterState *interp = _PyInterpreterState_GET(); #ifdef _Py_TIER2 if (interp->rare_events.builtin_dict < _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) { - _Py_Executors_InvalidateAll(interp, 1); + _Py_Executors_InvalidateAll(_PyInterpreterState_GET(), 1); } #endif RARE_EVENT_INTERP_INC(interp, builtin_dict); From 725894d0c73dd673071c1df98a1e6ad784546499 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 00:12:22 +0000 Subject: [PATCH 03/40] fix JIT invalidation mechanism for FT --- Include/cpython/code.h | 1 + Include/internal/pycore_optimizer.h | 2 ++ Objects/codeobject.c | 7 +++++++ Python/instrumentation.c | 4 ++-- Python/optimizer.c | 31 ++++++++++++++++++++--------- 5 files changed, 34 insertions(+), 11 deletions(-) diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 84456a709a6abe..bbb12951601c53 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -16,6 +16,7 @@ typedef struct { } _PyCoCached; typedef struct { + uint8_t is_finalizing; int size; int capacity; struct _PyExecutorObject *executors[1]; diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index a1817134105a3c..e396a6085ea13b 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -77,7 +77,9 @@ PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj); #define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6 #ifdef _Py_TIER2 +PyAPI_FUNC(void) _Py_Executors_InvalidateDependencyWorldStopped(PyInterpreterState *interp, void *obj, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation); +PyAPI_FUNC(void) _Py_Executors_InvalidateAllWorldStopped(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyThreadState *tstate); diff --git a/Objects/codeobject.c b/Objects/codeobject.c index a78772b2ddf781..d6a18b3e319722 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2185,6 +2185,13 @@ _PyCode_ReturnsOnlyNone(PyCodeObject *co) static void clear_executors(PyCodeObject *co) { +#ifdef Py_GIL_DISABLED + uint8_t expected = 0; + if 
(!_Py_atomic_compare_exchange_uint8(&co->co_executors->is_finalizing, &expected, 1)) { + // Another thread is already finalizing this. + return; + } +#endif assert(co->co_executors); for (int i = 0; i < co->co_executors->size; i++) { if (co->co_executors->executors[i]) { diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 1185a706a45095..ee2aa36281c63c 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1785,7 +1785,7 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != NULL) { _PyCode_Clear_Executors(code); } - _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), code, 1); + _Py_Executors_InvalidateDependencyWorldStopped(_PyInterpreterState_GET(), code, 1); #endif int code_len = (int)Py_SIZE(code); /* Exit early to avoid creating instrumentation @@ -2028,7 +2028,7 @@ _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events) } set_global_version(tstate, new_version); #ifdef _Py_TIER2 - _Py_Executors_InvalidateAll(interp, 1); + _Py_Executors_InvalidateAllWorldStopped(interp, 1); #endif return instrument_all_executing_code_objects(interp); } diff --git a/Python/optimizer.c b/Python/optimizer.c index f2b9a8bc188a5e..4b5bce48b6fcca 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -81,6 +81,7 @@ get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr) if (new == NULL) { return -1; } + new->is_finalizing = 0; new->capacity = new_capacity; new->size = size; code->co_executors = new; @@ -1698,7 +1699,7 @@ _PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) * May cause other executors to be invalidated as well */ void -_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) +_Py_Executors_InvalidateDependencyWorldStopped(PyInterpreterState *interp, void *obj, int is_invalidation) { _PyBloomFilter obj_filter; _Py_BloomFilter_Init(&obj_filter); @@ -1711,7 +1712,6 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is } /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ - _PyEval_StopTheWorldAll(&_PyRuntime); _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { _PyJit_Tracer_InvalidateDependency(p, obj); for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { @@ -1728,7 +1728,6 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is exec = next; } } - _PyEval_StartTheWorldAll(&_PyRuntime); for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { PyObject *exec = PyList_GET_ITEM(invalidate, i); executor_clear(exec); @@ -1741,13 +1740,19 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is error: PyErr_Clear(); Py_XDECREF(invalidate); - // If we're truly out of memory, wiping out everything is a fine fallback: - _Py_Executors_InvalidateAll(interp, is_invalidation); + // If we're truly out of memory, DO NOT wipe everything as the world is stopped, and we might deadlock. 
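+ // (The removed fallback above called _Py_Executors_InvalidateAll(); with
+ // the world already stopped, re-entering a stop-the-world operation could
+ // deadlock, so the out-of-memory path now simply gives up.)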
} -/* Invalidate all executors */ void -_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) +_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) +{ + _PyEval_StopTheWorld(interp); + _Py_Executors_InvalidateDependencyWorldStopped(interp, obj, is_invalidation); + _PyEval_StartTheWorld(interp); +} + +void +_Py_Executors_InvalidateAllWorldStopped(PyInterpreterState *interp, int is_invalidation) { PyObject *invalidate = PyList_New(0); if (invalidate == NULL) { @@ -1756,7 +1761,6 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) } /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ - _PyEval_StopTheWorldAll(&_PyRuntime); _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->vm_data.valid); @@ -1771,7 +1775,6 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) exec = next; } } - _PyEval_StartTheWorldAll(&_PyRuntime); Py_ssize_t list_len = PyList_GET_SIZE(invalidate); for (Py_ssize_t i = 0; i < list_len; i++) { _PyExecutorObject *executor = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i); @@ -1787,6 +1790,16 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) } } Py_DECREF(invalidate); + // If we're truly out of memory, DO NOT wipe everything as the world is stopped, and we might deadlock. +} + +/* Invalidate all executors */ +void +_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) +{ + _PyEval_StopTheWorld(interp); + _Py_Executors_InvalidateAllWorldStopped(interp, is_invalidation); + _PyEval_StartTheWorld(interp); } From 1e5713debbaed34de8b967344bc1a8936d6192c1 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 01:19:05 +0000 Subject: [PATCH 04/40] fix re-entrant finalizers --- Include/internal/pycore_optimizer.h | 4 +- Python/instrumentation.c | 4 +- Python/optimizer.c | 96 ++++++++--------------------- 3 files changed, 28 insertions(+), 76 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index e396a6085ea13b..24d7f04123b5d6 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -77,12 +77,10 @@ PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj); #define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6 #ifdef _Py_TIER2 -PyAPI_FUNC(void) _Py_Executors_InvalidateDependencyWorldStopped(PyInterpreterState *interp, void *obj, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation); -PyAPI_FUNC(void) _Py_Executors_InvalidateAllWorldStopped(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyThreadState *tstate); - +PyAPI_FUNC(void) _Py_Executors_ClearExecutorList(PyObject *invalidate, int is_invalidation); #else # define _Py_Executors_InvalidateDependency(A, B, C) ((void)0) # define _Py_Executors_InvalidateAll(A, B) ((void)0) diff --git a/Python/instrumentation.c b/Python/instrumentation.c index ee2aa36281c63c..1185a706a45095 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1785,7 +1785,7 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != 
NULL) { _PyCode_Clear_Executors(code); } - _Py_Executors_InvalidateDependencyWorldStopped(_PyInterpreterState_GET(), code, 1); + _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), code, 1); #endif int code_len = (int)Py_SIZE(code); /* Exit early to avoid creating instrumentation @@ -2028,7 +2028,7 @@ _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events) } set_global_version(tstate, new_version); #ifdef _Py_TIER2 - _Py_Executors_InvalidateAllWorldStopped(interp, 1); + _Py_Executors_InvalidateAll(interp, 1); #endif return instrument_all_executing_code_objects(interp); } diff --git a/Python/optimizer.c index 4b5bce48b6fcca..f9a559a3a8d2d0 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1021,7 +1021,8 @@ _PyJit_TryInitializeTracing( _tstate->jit_tracer_state.initial_state.jump_backward_instr = curr_instr; if (_PyOpcode_Caches[_PyOpcode_Deopt[close_loop_instr->op.code]]) { - close_loop_instr[1].counter = trigger_backoff_counter(); + _Py_BackoffCounter zero = trigger_backoff_counter(); + FT_ATOMIC_STORE_UINT16_RELAXED(close_loop_instr[1].counter.value_and_backoff, zero.value_and_backoff); } _Py_BloomFilter_Init(&_tstate->jit_tracer_state.prev_state.dependencies); return 1; @@ -1694,22 +1695,34 @@ _PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) _tstate->jit_tracer_state.prev_state.dependencies_still_valid = false; } } +void +_Py_Executors_ClearExecutorList(PyObject *invalidate, int is_invalidation) +{ + if (invalidate == NULL) { + return; + } + for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { + PyObject *exec = PyList_GET_ITEM(invalidate, i); + executor_clear(exec); + if (is_invalidation) { + OPT_STAT_INC(executors_invalidated); + } + } + Py_DECREF(invalidate); +} /* Invalidate all executors that depend on `obj` * May cause other executors to be invalidated as well + * To avoid deadlocks due to stop the world, we just invalidate the executors but leave them to be freed + * on their own later.
*/ void -_Py_Executors_InvalidateDependencyWorldStopped(PyInterpreterState *interp, void *obj, int is_invalidation) +_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) { _PyBloomFilter obj_filter; _Py_BloomFilter_Init(&obj_filter); _Py_BloomFilter_Add(&obj_filter, obj); /* Walk the list of executors */ - /* TO DO -- Use a tree to avoid traversing as many objects */ - PyObject *invalidate = PyList_New(0); - if (invalidate == NULL) { - goto error; - } /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { @@ -1717,89 +1730,30 @@ _Py_Executors_InvalidateDependencyWorldStopped(PyInterpreterState *interp, void for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->vm_data.valid); if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) { - if (PyList_Append(invalidate, (PyObject *)exec) < 0) { - PyErr_Clear(); - Py_DECREF(invalidate); - _PyEval_StartTheWorldAll(&_PyRuntime); - return; - } + exec->vm_data.valid = 0; } _PyExecutorObject *next = exec->vm_data.links.next; exec = next; } } - for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { - PyObject *exec = PyList_GET_ITEM(invalidate, i); - executor_clear(exec); - if (is_invalidation) { - OPT_STAT_INC(executors_invalidated); - } - } - Py_DECREF(invalidate); - return; -error: - PyErr_Clear(); - Py_XDECREF(invalidate); - // If we're truly out of memory, DO NOT wipe everything as the world is stopped, and we might deadlock. -} - -void -_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) -{ - _PyEval_StopTheWorld(interp); - _Py_Executors_InvalidateDependencyWorldStopped(interp, obj, is_invalidation); - _PyEval_StartTheWorld(interp); } +// To avoid deadlocks due to stop the world, we just invalidate the executors but leave them to be freed +// on their own later. void -_Py_Executors_InvalidateAllWorldStopped(PyInterpreterState *interp, int is_invalidation) +_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) { - PyObject *invalidate = PyList_New(0); - if (invalidate == NULL) { - PyErr_Clear(); - return; - } /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->vm_data.valid); assert(exec->tstate == p); + exec->vm_data.valid = 0; _PyExecutorObject *next = exec->vm_data.links.next; - if (PyList_Append(invalidate, (PyObject *)exec) < 0) { - PyErr_Clear(); - Py_DECREF(invalidate); - _PyEval_StartTheWorldAll(&_PyRuntime); - return; - } exec = next; } } - Py_ssize_t list_len = PyList_GET_SIZE(invalidate); - for (Py_ssize_t i = 0; i < list_len; i++) { - _PyExecutorObject *executor = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i); - - assert(executor->vm_data.valid == 1); - if (executor->vm_data.code) { - // Clear the entire code object so its co_executors array be freed: - _PyCode_Clear_Executors(executor->vm_data.code); - } - executor_clear((PyObject *)executor); - if (is_invalidation) { - OPT_STAT_INC(executors_invalidated); - } - } - Py_DECREF(invalidate); - // If we're truly out of memory, DO NOT wipe everything as the world is stopped, and we might deadlock. 
-} - -/* Invalidate all executors */ -void -_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) -{ - _PyEval_StopTheWorld(interp); - _Py_Executors_InvalidateAllWorldStopped(interp, is_invalidation); - _PyEval_StartTheWorld(interp); } From f0d4c576fea04773cbf71797b2088a4c11a1ebd7 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 03:41:21 +0000 Subject: [PATCH 05/40] Re-enable the JIT --- Include/internal/pycore_opcode_metadata.h | 2 +- Include/internal/pycore_optimizer.h | 2 + Include/internal/pycore_tstate.h | 3 +- Lib/test/test_capi/test_opt.py | 6 -- Objects/codeobject.c | 10 ++- Python/bytecodes.c | 27 ++++++-- Python/ceval_macros.h | 4 ++ Python/generated_cases.c.h | 28 ++++++-- Python/instrumentation.c | 4 +- Python/optimizer.c | 84 +++++++++++++++-------- Python/pylifecycle.c | 1 - Python/pystate.c | 13 +++- 12 files changed, 132 insertions(+), 52 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 548627dc7982ec..437508be4dd8b1 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1150,7 +1150,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[267] = { [END_ASYNC_FOR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG | HAS_UNPREDICTABLE_JUMP_FLAG | HAS_NEEDS_GUARD_IP_FLAG }, [END_FOR] = { true, INSTR_FMT_IX, HAS_ESCAPES_FLAG | HAS_NO_SAVE_IP_FLAG }, [END_SEND] = { true, INSTR_FMT_IX, HAS_ESCAPES_FLAG | HAS_PURE_FLAG }, - [ENTER_EXECUTOR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, + [ENTER_EXECUTOR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ESCAPES_FLAG }, [EXIT_INIT_CHECK] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [EXTENDED_ARG] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, [FORMAT_SIMPLE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 24d7f04123b5d6..1ab4b84f3184e0 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -78,7 +78,9 @@ PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj); #ifdef _Py_TIER2 PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation); +PyAPI_FUNC(void) _Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState *interp, void *obj, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation); +PyAPI_FUNC(void) _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyThreadState *tstate); PyAPI_FUNC(void) _Py_Executors_ClearExecutorList(PyObject *invalidate, int is_invalidation); #else diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 1b54daf47f9920..b2a3047304af28 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -54,8 +54,7 @@ typedef struct _PyJitTracerState { } _PyJitTracerState; typedef struct _PyJitExecutorState { - bool jit; - bool compiling; + char jit; struct _PyExecutorObject *executor_list_head; struct _PyExecutorObject *executor_deletion_list_head; struct _PyExecutorObject *cold_executor; diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index f06c6cbda2976c..629c753d494455 100644 --- a/Lib/test/test_capi/test_opt.py +++ 
b/Lib/test/test_capi/test_opt.py @@ -50,8 +50,6 @@ def get_opnames(ex): return list(iter_opnames(ex)) -@requires_specialization -@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @requires_jit_enabled class TestExecutorInvalidation(unittest.TestCase): @@ -115,12 +113,8 @@ def f(): self.assertTrue(exe.is_valid()) sys._clear_internal_caches() self.assertFalse(exe.is_valid()) - exe = get_first_executor(f) - self.assertIsNone(exe) -@requires_specialization -@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @requires_jit_enabled @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") class TestUops(unittest.TestCase): diff --git a/Objects/codeobject.c b/Objects/codeobject.c index d6a18b3e319722..c6b132138a7e91 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2441,7 +2441,9 @@ code_dealloc(PyObject *self) #ifdef _Py_TIER2 _Py_Executors_InvalidateDependency(tstate->interp, self, 1); if (co->co_executors != NULL) { + Py_BEGIN_CRITICAL_SECTION(co); clear_executors(co); + Py_END_CRITICAL_SECTION(); } #endif @@ -3370,8 +3372,12 @@ deopt_code_unit(PyCodeObject *code, int i) inst.op.code = _PyOpcode_Deopt[opcode]; assert(inst.op.code < MIN_SPECIALIZED_OPCODE); } - // JIT should not be enabled with free-threading - assert(inst.op.code != ENTER_EXECUTOR); + if (inst.op.code == ENTER_EXECUTOR) { + _PyExecutorObject *exec = code->co_executors->executors[inst.op.arg]; + assert(exec != NULL); + inst.op.code = exec->vm_data.opcode; + inst.op.arg = exec->vm_data.oparg; + } return inst; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index fdab19131c6873..ab3f9c349c44af 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2939,9 +2939,9 @@ dummy_func( }; specializing tier1 op(_SPECIALIZE_JUMP_BACKWARD, (--)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (this_instr->op.code == JUMP_BACKWARD) { - uint8_t desired = ((_PyThreadStateImpl*)tstate)->jit_executor_state.jit ? JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; + uint8_t desired = FT_ATOMIC_LOAD_CHAR_RELAXED(((_PyThreadStateImpl*)tstate)->jit_executor_state.jit) ? JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, desired); // Need to re-dispatch so the warmup counter isn't off by one: next_instr = this_instr; @@ -3018,11 +3018,25 @@ dummy_func( goto stop_tracing; } PyCodeObject *code = _PyFrame_GetCode(frame); +#ifdef Py_GIL_DISABLED + LOCK_OBJECT_SLOW(code); + _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; + // On FT, we are responsible for cleaning up after ourselves. + if (!_Py_atomic_load_uint8_relaxed(&executor->vm_data.valid)) { + opcode = executor->vm_data.opcode; + oparg = (oparg & ~255) | executor->vm_data.oparg; + next_instr = this_instr; + _Py_ExecutorDetach(executor); + UNLOCK_OBJECT_SLOW(code); + DISPATCH_GOTO(); + } +#else _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - assert(executor->vm_data.index == INSTR_OFFSET() - 1); - assert(executor->vm_data.code == code); assert(executor->vm_data.valid); +#endif assert(tstate->current_executor == NULL); + assert(executor->vm_data.index == INSTR_OFFSET() - 1); + assert(executor->vm_data.code == code); /* If the eval breaker is set then stay in tier 1. 
* This avoids any potentially infinite loops * involving _RESUME_CHECK */ @@ -3033,10 +3047,15 @@ dummy_func( if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) { PAUSE_ADAPTIVE_COUNTER(this_instr[1].counter); } +#ifdef Py_GIL_DISABLED + UNLOCK_OBJECT_SLOW(code); +#endif DISPATCH_GOTO(); } assert(executor != ((_PyThreadStateImpl *)tstate)->jit_executor_state.cold_executor); tstate->jit_exit = NULL; + Py_INCREF(executor); + UNLOCK_OBJECT_SLOW(code); TIER1_TO_TIER2(executor); #else Py_FatalError("ENTER_EXECUTOR is not supported in this build"); diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 05a2760671e847..87b344c8c6b627 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -294,9 +294,13 @@ GETITEM(PyObject *v, Py_ssize_t i) { #ifdef Py_GIL_DISABLED # define LOCK_OBJECT(op) PyMutex_LockFast(&(_PyObject_CAST(op))->ob_mutex) # define UNLOCK_OBJECT(op) PyMutex_Unlock(&(_PyObject_CAST(op))->ob_mutex) +# define LOCK_OBJECT_SLOW(op) PyMutex_LockFast(&(_PyObject_CAST(op))->ob_mutex) +# define UNLOCK_OBJECT_SLOW(op) PyMutex_Unlock(&(_PyObject_CAST(op))->ob_mutex) #else # define LOCK_OBJECT(op) (1) # define UNLOCK_OBJECT(op) ((void)0) +# define LOCK_OBJECT_SLOW(op) ((void)0) +# define UNLOCK_OBJECT_SLOW(op) ((void)0) #endif #define GLOBALS() frame->f_globals diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index e0bf9b611f0c6d..d39235ea5c801f 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -5481,11 +5481,26 @@ JUMP_TO_LABEL(stop_tracing); } PyCodeObject *code = _PyFrame_GetCode(frame); + #ifdef Py_GIL_DISABLED + LOCK_OBJECT_SLOW(code); + _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; + if (!_Py_atomic_load_uint8_relaxed(&executor->vm_data.valid)) { + opcode = executor->vm_data.opcode; + oparg = (oparg & ~255) | executor->vm_data.oparg; + next_instr = this_instr; + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_ExecutorDetach(executor); + stack_pointer = _PyFrame_GetStackPointer(frame); + UNLOCK_OBJECT_SLOW(code); + DISPATCH_GOTO(); + } + #else _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - assert(executor->vm_data.index == INSTR_OFFSET() - 1); - assert(executor->vm_data.code == code); assert(executor->vm_data.valid); + #endif assert(tstate->current_executor == NULL); + assert(executor->vm_data.index == INSTR_OFFSET() - 1); + assert(executor->vm_data.code == code); if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; @@ -5493,10 +5508,15 @@ if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) { PAUSE_ADAPTIVE_COUNTER(this_instr[1].counter); } + #ifdef Py_GIL_DISABLED + UNLOCK_OBJECT_SLOW(code); + #endif DISPATCH_GOTO(); } assert(executor != ((_PyThreadStateImpl *)tstate)->jit_executor_state.cold_executor); tstate->jit_exit = NULL; + Py_INCREF(executor); + UNLOCK_OBJECT_SLOW(code); TIER1_TO_TIER2(executor); #else Py_FatalError("ENTER_EXECUTOR is not supported in this build"); @@ -7593,9 +7613,9 @@ /* Skip 1 cache entry */ // _SPECIALIZE_JUMP_BACKWARD { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (this_instr->op.code == JUMP_BACKWARD) { - uint8_t desired = ((_PyThreadStateImpl*)tstate)->jit_executor_state.jit ? JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; + uint8_t desired = FT_ATOMIC_LOAD_CHAR_RELAXED(((_PyThreadStateImpl*)tstate)->jit_executor_state.jit) ? 
JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, desired); next_instr = this_instr; DISPATCH_SAME_OPARG(); diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 1185a706a45095..cc3381e461e755 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1785,7 +1785,7 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != NULL) { _PyCode_Clear_Executors(code); } - _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), code, 1); + _Py_Executors_InvalidateDependencyLockHeld(_PyInterpreterState_GET(), code, 1); #endif int code_len = (int)Py_SIZE(code); /* Exit early to avoid creating instrumentation @@ -2028,7 +2028,7 @@ _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events) } set_global_version(tstate, new_version); #ifdef _Py_TIER2 - _Py_Executors_InvalidateAll(interp, 1); + _Py_Executors_InvalidateAllLockHeld(interp, 1); #endif return instrument_all_executing_code_objects(interp); } diff --git a/Python/optimizer.c b/Python/optimizer.c index f9a559a3a8d2d0..67efa0a27220f4 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -121,6 +121,14 @@ uop_optimize(_PyInterpreterFrame *frame, PyThreadState *tstate, _PyExecutorObject **exec_ptr, bool progress_needed); +#if Py_GIL_DISABLED +#define LOCK_CODE(code) PyMutex_Lock(&((PyObject *)code)->ob_mutex) +#define UNLOCK_CODE(code) PyMutex_Unlock(&((PyObject *)code)->ob_mutex) +#else +#define LOCK_CODE(code) ((void)(code)) +#define UNLOCK_CODE(code) ((void)(code)) +#endif + /* Returns 1 if optimized, 0 if not optimized, and -1 for an error. * If optimized, *executor_ptr contains a new reference to the executor */ @@ -131,18 +139,15 @@ _PyOptimizer_Optimize( { _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; int chain_depth = _tstate->jit_tracer_state.initial_state.chain_depth; - if (!_tstate->jit_executor_state.jit) { + if (!FT_ATOMIC_LOAD_CHAR_RELAXED(_tstate->jit_executor_state.jit)) { // gh-140936: It is possible that interp->jit will become false during // interpreter finalization. However, the specialized JUMP_BACKWARD_JIT // instruction may still be present. In this case, we should // return immediately without optimization. return 0; } - assert(!_tstate->jit_executor_state.compiling); assert(_tstate->jit_tracer_state.initial_state.stack_depth >= 0); -#ifndef Py_GIL_DISABLED assert(_tstate->jit_tracer_state.initial_state.func != NULL); - _tstate->jit_executor_state.compiling = true; // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must* // make progress in order to avoid infinite loops or excessively-long // side-exit chains. We can only insert the executor into the bytecode if @@ -150,20 +155,21 @@ _PyOptimizer_Optimize( chain_depth %= MAX_CHAIN_DEPTH; bool progress_needed = chain_depth == 0; PyCodeObject *code = (PyCodeObject *)_tstate->jit_tracer_state.initial_state.code; + LOCK_CODE(code); _Py_CODEUNIT *start = _tstate->jit_tracer_state.initial_state.start_instr; if (progress_needed && !has_space_for_executor(code, start)) { - _tstate->jit_executor_state.compiling = false; + UNLOCK_CODE(code); return 0; } // One of our dependencies while tracing was invalidated. Not worth compiling. 
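// (The dependency set is a per-thread Bloom filter filled in while the
// trace is recorded; _Py_Executors_InvalidateDependency tests that filter
// and clears dependencies_still_valid. The filter can produce false
// positives, so at worst a still-valid trace is discarded; a trace whose
// dependencies really changed is never compiled.)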
if (!_tstate->jit_tracer_state.prev_state.dependencies_still_valid) { - _tstate->jit_executor_state.compiling = false; + UNLOCK_CODE(code); return 0; } _PyExecutorObject *executor; int err = uop_optimize(frame, tstate, &executor, progress_needed); if (err <= 0) { - _tstate->jit_executor_state.compiling = false; + UNLOCK_CODE(code); return err; } assert(executor != NULL); @@ -177,7 +183,7 @@ _PyOptimizer_Optimize( * it might get confused by the executor disappearing, * but there is not much we can do about that here. */ Py_DECREF(executor); - _tstate->jit_executor_state.compiling = false; + UNLOCK_CODE(code); return 0; } insert_executor(code, start, index, executor); @@ -191,11 +197,8 @@ _PyOptimizer_Optimize( } executor->vm_data.chain_depth = chain_depth; assert(executor->vm_data.valid); - _tstate->jit_executor_state.compiling = false; + UNLOCK_CODE(code); return 1; -#else - return 0; -#endif } static _PyExecutorObject * @@ -444,7 +447,11 @@ static PyMethodDef uop_executor_methods[] = { static int executor_is_gc(PyObject *o) { +#ifdef Py_GIL_DISABLED + return 1; +#else return !_Py_IsImmortal(o); +#endif } PyTypeObject _PyUOpExecutor_Type = { @@ -1315,6 +1322,9 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil } #endif _PyObject_GC_TRACK(executor); + // Executors are cleared by coldness or invalidation. + // Thus there's no point for them to be refcounted. + _Py_SetImmortal((PyObject *)executor); return executor; } @@ -1637,6 +1647,7 @@ _Py_ExecutorDetach(_PyExecutorObject *executor) if (code == NULL) { return; } + assert(_PyInterpreterState_GET()->stoptheworld.world_stopped || PyMutex_IsLocked(&((PyObject *)code)->ob_mutex)); _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index]; assert(instruction->op.code == ENTER_EXECUTOR); int index = instruction->op.arg; @@ -1645,7 +1656,6 @@ _Py_ExecutorDetach(_PyExecutorObject *executor) instruction->op.arg = executor->vm_data.oparg; executor->vm_data.code = NULL; code->co_executors->executors[index] = NULL; - Py_DECREF(executor); } static int @@ -1695,20 +1705,21 @@ _PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) _tstate->jit_tracer_state.prev_state.dependencies_still_valid = false; } } + void -_Py_Executors_ClearExecutorList(PyObject *invalidate, int is_invalidation) +invalidate_sub_executors(_PyThreadStateImpl *tstate, _PyExecutorObject *executor) { - if (invalidate == NULL) { + if (!executor->vm_data.valid) { return; } - for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { - PyObject *exec = PyList_GET_ITEM(invalidate, i); - executor_clear(exec); - if (is_invalidation) { - OPT_STAT_INC(executors_invalidated); + executor->vm_data.valid = 0; + executor_clear((PyObject *)executor); + for (uint32_t i = 0; i < executor->exit_count; i++) { + _PyExecutorObject *next = executor->exits[i].executor; + if (next != tstate->jit_executor_state.cold_dynamic_executor && next != tstate->jit_executor_state.cold_executor) { + invalidate_sub_executors(tstate, next); } } - Py_DECREF(invalidate); } /* Invalidate all executors that depend on `obj` @@ -1717,7 +1728,7 @@ _Py_Executors_ClearExecutorList(PyObject *invalidate, int is_invalidation) * on their own later. 
*/ void -_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) +_Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState *interp, void *obj, int is_invalidation) { _PyBloomFilter obj_filter; _Py_BloomFilter_Init(&obj_filter); @@ -1728,9 +1739,8 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { _PyJit_Tracer_InvalidateDependency(p, obj); for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { - assert(exec->vm_data.valid); if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) { - exec->vm_data.valid = 0; + invalidate_sub_executors((_PyThreadStateImpl *)p, exec); } _PyExecutorObject *next = exec->vm_data.links.next; exec = next; @@ -1738,16 +1748,24 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is } } +void +_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) +{ + _PyEval_StopTheWorld(interp); + _Py_Executors_InvalidateDependencyLockHeld(interp, obj, is_invalidation); + _PyEval_StartTheWorld(interp); +} + // To avoid deadlocks due to stop the world, we just invalidate the executors but leave them to be freed // on their own later. void -_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) +_Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidation) { /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { + ((_PyThreadStateImpl *)p)->jit_tracer_state.prev_state.dependencies_still_valid = false; for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { - assert(exec->vm_data.valid); assert(exec->tstate == p); exec->vm_data.valid = 0; _PyExecutorObject *next = exec->vm_data.links.next; @@ -1756,6 +1774,13 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) } } +void +_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) +{ + _PyEval_StopTheWorld(interp); + _Py_Executors_InvalidateAllLockHeld(interp, is_invalidation); + _PyEval_StartTheWorld(interp); +} // Unlike _PyExecutor_InvalidateDependency, this is not for correctness but memory savings. // Thus there is no need to lock the runtime or traverse everything. 
We simply make a @@ -1773,11 +1798,12 @@ _Py_Executors_InvalidateCold(PyThreadState *tstate) /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ for (_PyExecutorObject *exec = _tstate->jit_executor_state.executor_list_head; exec != NULL;) { - assert(exec->vm_data.valid); _PyExecutorObject *next = exec->vm_data.links.next; - if (!exec->vm_data.warm && PyList_Append(invalidate, (PyObject *)exec) < 0) { - goto error; + if (!exec->vm_data.warm || !exec->vm_data.valid) { + if (PyList_Append(invalidate, (PyObject *)exec) < 0) { + goto error; + } } else { exec->vm_data.warm = false; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index e1db7167292947..a8f9e6d979509e 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1719,7 +1719,6 @@ finalize_modules(PyThreadState *tstate) // Invalidate all executors and turn off JIT: ((_PyThreadStateImpl *)tstate)->jit_executor_state.jit = false; - ((_PyThreadStateImpl *)tstate)->jit_executor_state.compiling = false; #ifdef _Py_TIER2 _Py_Executors_InvalidateAll(interp, 0); #endif diff --git a/Python/pystate.c b/Python/pystate.c index de61970041c690..7290c4775809d9 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1505,7 +1505,6 @@ init_threadstate(_PyThreadStateImpl *_tstate, #ifdef _Py_TIER2 _tstate->jit_tracer_state.code_buffer = NULL; _tstate->jit_executor_state.jit = false; - _tstate->jit_executor_state.compiling = false; _tstate->jit_executor_state.executor_list_head = NULL; _tstate->jit_executor_state.executor_deletion_list_head = NULL; _tstate->jit_executor_state.executor_deletion_list_remaining_capacity = 0; @@ -1537,6 +1536,18 @@ add_threadstate(PyInterpreterState *interp, PyThreadState *tstate, tstate->next = next; assert(tstate->prev == NULL); interp->threads.head = tstate; +#if defined(_Py_TIER2) && defined(Py_GIL_DISABLED) + // There's more than one thread. In FT mode, + // disable the JIT completely for now. 
+ if (next != NULL) { + _Py_Executors_InvalidateAllLockHeld(interp, 1); + } + PyThreadState *curr = interp->threads.head; + while (curr != NULL) { + _Py_atomic_store_char_relaxed(&((_PyThreadStateImpl*)curr)->jit_executor_state.jit, false); + curr = curr->next; + } +#endif } static PyThreadState * From 67de7d6a14f4c20443396a16c6adc97912bf3cb5 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 03:57:54 +0000 Subject: [PATCH 06/40] cleanup a little --- Include/internal/pycore_optimizer.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 1ab4b84f3184e0..a09d2e85792a8a 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -82,7 +82,6 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState * PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyThreadState *tstate); -PyAPI_FUNC(void) _Py_Executors_ClearExecutorList(PyObject *invalidate, int is_invalidation); #else # define _Py_Executors_InvalidateDependency(A, B, C) ((void)0) # define _Py_Executors_InvalidateAll(A, B) ((void)0) From 7f0bc57474ed9fde1c043ba5091358d71102e623 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 04:12:22 +0000 Subject: [PATCH 07/40] fix weird GC bugs --- Python/optimizer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 67efa0a27220f4..aabc0906dd7b52 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1751,9 +1751,9 @@ _Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState *interp, void *obj void _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) { - _PyEval_StopTheWorld(interp); + HEAD_LOCK(&_PyRuntime); _Py_Executors_InvalidateDependencyLockHeld(interp, obj, is_invalidation); - _PyEval_StartTheWorld(interp); + HEAD_UNLOCK(&_PyRuntime); } // To avoid deadlocks due to stop the world, we just invalidate the executors but leave them to be freed @@ -1777,9 +1777,9 @@ _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidat void _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) { - _PyEval_StopTheWorld(interp); + HEAD_LOCK(&_PyRuntime); _Py_Executors_InvalidateAllLockHeld(interp, is_invalidation); - _PyEval_StartTheWorld(interp); + HEAD_UNLOCK(&_PyRuntime); } // Unlike _PyExecutor_InvalidateDependency, this is not for correctness but memory savings. 
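Taken together, the hunks above give the free-threaded build a deliberately conservative policy: as soon as a second thread state is attached, every executor is invalidated and the per-thread `jit` flag is cleared on all threads; the next patch re-enables the flag in `tstate_delete_common` once the thread count drops back to one. A minimal sketch of that policy as a single helper (the helper name is hypothetical; the real logic stays inlined in `add_threadstate` and `tstate_delete_common`):

    #if defined(_Py_TIER2) && defined(Py_GIL_DISABLED)
    // Hypothetical consolidation of the inlined logic: recompute every
    // thread's JIT flag after the thread list changes. The caller must hold
    // the runtime HEAD lock so the list cannot change underneath us.
    static void
    recompute_jit_flags(PyInterpreterState *interp)
    {
        PyThreadState *head = interp->threads.head;
        // The JIT stays enabled only while exactly one thread state exists.
        char enable = (head != NULL && head->next == NULL);
        for (PyThreadState *p = head; p != NULL; p = p->next) {
            _Py_atomic_store_char_relaxed(
                &((_PyThreadStateImpl *)p)->jit_executor_state.jit, enable);
        }
    }
    #endif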
From f05e61c4a2a793a3753dcd28a6bbf6f0a4ed849a Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 04:26:53 +0000 Subject: [PATCH 08/40] re-enable jit on some stuff --- Lib/test/test_capi/test_opt.py | 2 -- Python/pystate.c | 7 +++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 629c753d494455..8c2b550bdf04a3 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -417,8 +417,6 @@ def testfunc(n, m): self.assertIn("_FOR_ITER_TIER_TWO", uops) -@requires_specialization -@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @requires_jit_enabled @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") class TestUopsOptimization(unittest.TestCase): diff --git a/Python/pystate.c b/Python/pystate.c index 7290c4775809d9..fc22e224affdc8 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1844,6 +1844,13 @@ tstate_delete_common(PyThreadState *tstate, int release_gil) _PyObject_VirtualFree(_tstate->jit_tracer_state.code_buffer, UOP_BUFFER_SIZE); _tstate->jit_tracer_state.code_buffer = NULL; } +#ifdef Py_GIL_DISABLED + // There's only one thread. Re-enable JIT. + PyThreadState *curr = interp->threads.head; + if (curr != NULL && curr->prev == NULL && curr->next == NULL) { + _Py_atomic_store_char_relaxed(&((_PyThreadStateImpl*)curr)->jit_executor_state.jit, true); + } +#endif #endif HEAD_UNLOCK(runtime); From f78e8c8af4bc6a31a1650448cac59820c5fc4aab Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 16:22:03 +0000 Subject: [PATCH 09/40] fix test, more locks! --- Lib/test/test_capi/test_opt.py | 9 +++++++++ Python/optimizer.c | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 8c2b550bdf04a3..39e94ef4b8a105 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -1167,6 +1167,7 @@ def test_modified_local_is_seen_by_optimized_code(self): self.assertIs(type(s), float) self.assertEqual(s, 1024.0) + @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions/classes") def test_guard_type_version_removed(self): def thing(a): x = 0 @@ -1185,6 +1186,7 @@ class Foo: guard_type_version_count = opnames.count("_GUARD_TYPE_VERSION") self.assertEqual(guard_type_version_count, 1) + @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions/classes") def test_guard_type_version_removed_inlined(self): """ Verify that the guard type version if we have an inlined function @@ -1211,6 +1213,7 @@ class Foo: guard_type_version_count = opnames.count("_GUARD_TYPE_VERSION") self.assertEqual(guard_type_version_count, 1) + @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions") def test_guard_type_version_removed_invalidation(self): def thing(a): @@ -1241,6 +1244,7 @@ class Bar: self.assertEqual(opnames[:load_attr_top].count("_GUARD_TYPE_VERSION"), 1) self.assertEqual(opnames[call:load_attr_bottom].count("_CHECK_VALIDITY"), 2) + @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions") def test_guard_type_version_removed_escaping(self): def thing(a): @@ -1264,6 +1268,7 @@ class Foo: self.assertEqual(opnames[:load_attr_top].count("_GUARD_TYPE_VERSION"), 1) self.assertEqual(opnames[call:load_attr_bottom].count("_CHECK_VALIDITY"), 2) + @unittest.skipIf(Py_GIL_DISABLED, "FT build 
only specializes on deferred methods/functions/classes") def test_guard_type_version_executor_invalidated(self): """ Verify that the executor is invalidated on a type change. @@ -2033,6 +2038,7 @@ def testfunc(n): self.assertNotIn("_GUARD_NOS_INT", uops) self.assertNotIn("_GUARD_TOS_INT", uops) + @unittest.skipIf(Py_GIL_DISABLED, "FT build immortalizes constants") def test_call_len_known_length_small_int(self): # Make sure that len(t) is optimized for a tuple of length 5. # See https://github.com/python/cpython/issues/139393. @@ -2057,6 +2063,7 @@ def testfunc(n): self.assertNotIn("_POP_CALL_LOAD_CONST_INLINE_BORROW", uops) self.assertNotIn("_POP_TOP_LOAD_CONST_INLINE_BORROW", uops) + @unittest.skipIf(Py_GIL_DISABLED, "FT build immortalizes constants") def test_call_len_known_length(self): # Make sure that len(t) is not optimized for a tuple of length 2048. # See https://github.com/python/cpython/issues/139393. @@ -2308,6 +2315,7 @@ def testfunc(n): self.assertNotIn("_TO_BOOL_BOOL", uops) self.assertIn("_GUARD_IS_TRUE_POP", uops) + @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions/classes") def test_set_type_version_sets_type(self): class C: A = 1 @@ -2340,6 +2348,7 @@ def testfunc(n): self.assertNotIn("_LOAD_SMALL_INT", uops) self.assertIn("_LOAD_CONST_INLINE_BORROW", uops) + @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions") def test_cached_attributes(self): class C: A = 1 diff --git a/Python/optimizer.c b/Python/optimizer.c index aabc0906dd7b52..4553b24074f9b1 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1317,7 +1317,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil // from being immediately detected as cold and invalidated.
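// (Lifecycle note: executors are born warm; each sweep in
// _Py_Executors_InvalidateCold flips warm back to false, so an executor
// that has not been marked warm again by the time the next sweep runs is
// collected then.)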
executor->vm_data.warm = true; if (_PyJIT_Compile(executor, executor->trace, length)) { - Py_DECREF(executor); + executor->vm_data.warm = false; return NULL; } #endif @@ -1399,8 +1399,10 @@ uop_optimize( OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist); length = prepare_for_execution(buffer, length); assert(length <= UOP_MAX_TRACE_LENGTH); + HEAD_LOCK(&_PyRuntime); _PyExecutorObject *executor = make_executor_from_uops( buffer, length, dependencies, _tstate->jit_tracer_state.initial_state.chain_depth); + HEAD_UNLOCK(&_PyRuntime); if (executor == NULL) { return -1; } From e1f1b307c085213dc3e3e17b1776e3ac48409488 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 16:28:15 +0000 Subject: [PATCH 10/40] fix JIT builds --- Python/sysmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 3a57dd0d8fa544..636f6362cbcfa4 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2365,7 +2365,7 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend) { #ifdef PY_HAVE_PERF_TRAMPOLINE #ifdef _Py_JIT - if (_PyInterpreterState_GET()->jit) { + if (((_PyThreadStateImpl *)_PyThreadState_GET())->jit_executor_state.jit) { PyErr_SetString(PyExc_ValueError, "Cannot activate the perf trampoline if the JIT is active"); return NULL; } From 53c5e1d278ed7f3131b1e1e62265b0758aa5dd65 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 17:24:53 +0000 Subject: [PATCH 11/40] only clear at end --- Python/optimizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 4553b24074f9b1..de44aefcef2487 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1715,13 +1715,13 @@ invalidate_sub_executors(_PyThreadStateImpl *tstate, _PyExecutorObject *executor return; } executor->vm_data.valid = 0; - executor_clear((PyObject *)executor); for (uint32_t i = 0; i < executor->exit_count; i++) { _PyExecutorObject *next = executor->exits[i].executor; if (next != tstate->jit_executor_state.cold_dynamic_executor && next != tstate->jit_executor_state.cold_executor) { invalidate_sub_executors(tstate, next); } } + executor_clear((PyObject *)executor); } /* Invalidate all executors that depend on `obj` From cc38ee4afea40e8e4b1f06459fbc80d05e294bee Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 18:01:08 +0000 Subject: [PATCH 12/40] remove locks in JIT code --- Python/executor_cases.c.h | 38 ++++++++++---------- Tools/cases_generator/generators_common.py | 26 +++++++++++++- Tools/cases_generator/optimizer_generator.py | 23 ------------ Tools/cases_generator/tier2_generator.py | 29 +++++++++++++-- 4 files changed, 71 insertions(+), 45 deletions(-) diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 3491b9136dd31b..4c625d84bc50e5 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -1780,12 +1780,12 @@ JUMP_TO_JUMP_TARGET(); } Py_ssize_t index = ((PyLongObject*)sub)->long_value.ob_digit[0]; - if (!LOCK_OBJECT(list)) { + if (!1 && (list)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } if (index >= PyList_GET_SIZE(list)) { - UNLOCK_OBJECT(list); + (void)(list); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -1796,7 +1796,7 @@ FT_ATOMIC_STORE_PTR_RELEASE(_PyList_ITEMS(list)[index], PyStackRef_AsPyObjectSteal(value)); assert(old_value != NULL); - UNLOCK_OBJECT(list); + (void)(list); PyStackRef_CLOSE_SPECIALIZED(sub_st, _PyLong_ExactDealloc); stack_pointer += -3; 
assert(WITHIN_STACK_BOUNDS()); @@ -2299,12 +2299,12 @@ values = &stack_pointer[-1]; PyObject *seq_o = PyStackRef_AsPyObjectBorrow(seq); assert(PyList_CheckExact(seq_o)); - if (!LOCK_OBJECT(seq_o)) { + if (!1 && (seq_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } if (PyList_GET_SIZE(seq_o) != oparg) { - UNLOCK_OBJECT(seq_o); + (void)(seq_o); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -2315,7 +2315,7 @@ for (int i = oparg; --i >= 0; ) { *values++ = PyStackRef_FromPyObjectNew(items[i]); } - UNLOCK_OBJECT(seq_o); + (void)(seq_o); stack_pointer += -1 + oparg; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -3363,13 +3363,13 @@ uint32_t type_version = (uint32_t)CURRENT_OPERAND0(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); assert(type_version != 0); - if (!LOCK_OBJECT(owner_o)) { + if (!1 && (owner_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } PyTypeObject *tp = Py_TYPE(owner_o); if (FT_ATOMIC_LOAD_UINT_RELAXED(tp->tp_version_tag) != type_version) { - UNLOCK_OBJECT(owner_o); + (void)(owner_o); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3644,7 +3644,7 @@ assert(Py_TYPE(owner_o)->tp_flags & Py_TPFLAGS_INLINE_VALUES); if (_PyObject_GetManagedDict(owner_o) || !FT_ATOMIC_LOAD_UINT8(_PyObject_InlineValues(owner_o)->valid)) { - UNLOCK_OBJECT(owner_o); + (void)(owner_o); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3670,7 +3670,7 @@ Py_ssize_t index = value_ptr - values->values; _PyDictValues_AddToInsertionOrder(values, index); } - UNLOCK_OBJECT(owner_o); + (void)(owner_o); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -3694,7 +3694,7 @@ UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - if (!LOCK_OBJECT(dict)) { + if (!1 && (dict)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } @@ -3702,7 +3702,7 @@ PyObject *name = GETITEM(FRAME_CO_NAMES, oparg); if (hint >= (size_t)dict->ma_keys->dk_nentries || !DK_IS_UNICODE(dict->ma_keys)) { - UNLOCK_OBJECT(dict); + (void)(dict); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3710,7 +3710,7 @@ } PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dict->ma_keys) + hint; if (ep->me_key != name) { - UNLOCK_OBJECT(dict); + (void)(dict); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3718,7 +3718,7 @@ } PyObject *old_value = ep->me_value; if (old_value == NULL) { - UNLOCK_OBJECT(dict); + (void)(dict); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3728,7 +3728,7 @@ _PyDict_NotifyEvent(tstate->interp, PyDict_EVENT_MODIFIED, dict, name, PyStackRef_AsPyObjectBorrow(value)); stack_pointer = _PyFrame_GetStackPointer(frame); FT_ATOMIC_STORE_PTR_RELEASE(ep->me_value, PyStackRef_AsPyObjectSteal(value)); - UNLOCK_OBJECT(dict); + (void)(dict); STAT_INC(STORE_ATTR, hit); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); @@ -3746,7 +3746,7 @@ value = stack_pointer[-2]; uint16_t index = (uint16_t)CURRENT_OPERAND0(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); - if (!LOCK_OBJECT(owner_o)) { + if (!1 && (owner_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } @@ -3754,7 +3754,7 @@ STAT_INC(STORE_ATTR, hit); PyObject *old_value = *(PyObject **)addr; FT_ATOMIC_STORE_PTR_RELEASE(*(PyObject **)addr, PyStackRef_AsPyObjectSteal(value)); - UNLOCK_OBJECT(owner_o); + (void)(owner_o); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -6041,13 +6041,13 
@@ UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - if (!LOCK_OBJECT(self_o)) { + if (!1 && (self_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } STAT_INC(CALL, hit); int err = _PyList_AppendTakeRef((PyListObject *)self_o, PyStackRef_AsPyObjectSteal(arg)); - UNLOCK_OBJECT(self_o); + (void)(self_o); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); diff --git a/Tools/cases_generator/generators_common.py b/Tools/cases_generator/generators_common.py index 0b5f764ec52b45..4bcf54df037a16 100644 --- a/Tools/cases_generator/generators_common.py +++ b/Tools/cases_generator/generators_common.py @@ -135,6 +135,30 @@ def __init__(self, out: CWriter, labels: dict[str, Label], cannot_escape: bool = self.cannot_escape = cannot_escape self.jump_prefix = jump_prefix + def emit_to_with_replacement( + self, + out: CWriter, + tkn_iter: TokenIterator, + end: str, + uop: CodeSection, + storage: Storage, + inst: Instruction | None + ) -> Token: + parens = 0 + for tkn in tkn_iter: + if tkn.kind == end and parens == 0: + return tkn + if tkn.kind == "LPAREN": + parens += 1 + if tkn.kind == "RPAREN": + parens -= 1 + if tkn.text in self._replacers: + self._replacers[tkn.text](tkn, tkn_iter, uop, storage, inst) + else: + out.emit(tkn) + raise analysis_error(f"Expecting {end}. Reached end of file", tkn) + + def dispatch( self, tkn: Token, @@ -162,7 +186,7 @@ def deopt_if( lparen = next(tkn_iter) assert lparen.kind == "LPAREN" first_tkn = tkn_iter.peek() - emit_to(self.out, tkn_iter, "RPAREN") + self.emit_to_with_replacement(self.out, tkn_iter, "RPAREN", uop, storage, inst) self.emit(") {\n") next(tkn_iter) # Semi colon assert inst is not None diff --git a/Tools/cases_generator/optimizer_generator.py b/Tools/cases_generator/optimizer_generator.py index 41df073cf6df23..d29b5406aad1ec 100644 --- a/Tools/cases_generator/optimizer_generator.py +++ b/Tools/cases_generator/optimizer_generator.py @@ -271,29 +271,6 @@ def __init__(self, out: CWriter, labels: dict[str, Label], original_uop: Uop, st self._replacers = {**self._replacers, **overrides} self.cannot_escape = True - def emit_to_with_replacement( - self, - out: CWriter, - tkn_iter: TokenIterator, - end: str, - uop: CodeSection, - storage: Storage, - inst: Instruction | None - ) -> Token: - parens = 0 - for tkn in tkn_iter: - if tkn.kind == end and parens == 0: - return tkn - if tkn.kind == "LPAREN": - parens += 1 - if tkn.kind == "RPAREN": - parens -= 1 - if tkn.text in self._replacers: - self._replacers[tkn.text](tkn, tkn_iter, uop, storage, inst) - else: - out.emit(tkn) - raise analysis_error(f"Expecting {end}. Reached end of file", tkn) - def emit_stackref_override( self, tkn: Token, diff --git a/Tools/cases_generator/tier2_generator.py b/Tools/cases_generator/tier2_generator.py index ac3e6b94afe49e..950fad708a38d3 100644 --- a/Tools/cases_generator/tier2_generator.py +++ b/Tools/cases_generator/tier2_generator.py @@ -64,6 +64,8 @@ def __init__(self, out: CWriter, labels: dict[str, Label]): super().__init__(out, labels) self._replacers["oparg"] = self.oparg self._replacers["IP_OFFSET_OF"] = self.ip_offset_of + self._replacers["LOCK_OBJECT"] = self.lock_object + self._replacers["UNLOCK_OBJECT"] = self.unlock_object def goto_error(self, offset: int, storage: Storage) -> str: # To do: Add jump targets for popping values. 
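For reference, this is what the two replacers added in this patch do to the generated tier-2 code; the expansions below match the regenerated executor_cases.c.h earlier in this commit, while the tier-1 LOCK_OBJECT/UNLOCK_OBJECT macros in ceval_macros.h are left untouched:

    // Tier 1, via ceval_macros.h (unchanged): takes the per-object mutex
    // and deopts if it cannot be acquired.
    if (!LOCK_OBJECT(list)) { /* deopt */ }
    /* ... */
    UNLOCK_OBJECT(list);
    // Tier 2, as emitted by lock_object()/unlock_object() below:
    // LOCK_OBJECT(x) becomes "1 && (x)", so the !... guard is
    // constant-false dead code, and UNLOCK_OBJECT(x) becomes a no-op cast.
    if (!1 && (list)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); }
    /* ... */
    (void)(list);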
@@ -84,7 +86,7 @@ def deopt_if( self.emit(lparen) assert lparen.kind == "LPAREN" first_tkn = tkn_iter.peek() - emit_to(self.out, tkn_iter, "RPAREN") + self.emit_to_with_replacement(self.out, tkn_iter, "RPAREN", uop, storage, inst) next(tkn_iter) # Semi colon self.emit(") {\n") self.emit("UOP_STAT_INC(uopcode, miss);\n") @@ -104,7 +106,7 @@ def exit_if( lparen = next(tkn_iter) self.emit(lparen) first_tkn = tkn_iter.peek() - emit_to(self.out, tkn_iter, "RPAREN") + self.emit_to_with_replacement(self.out, tkn_iter, "RPAREN", uop, storage, inst) next(tkn_iter) # Semi colon self.emit(") {\n") self.emit("UOP_STAT_INC(uopcode, miss);\n") @@ -154,6 +156,29 @@ def ip_offset_of( next(tkn_iter) return True + def lock_object( + self, + tkn: Token, + tkn_iter: TokenIterator, + uop: CodeSection, + storage: Storage, + inst: Instruction | None, + ) -> bool: + self.emit("1 && ") + return True + + def unlock_object( + self, + tkn: Token, + tkn_iter: TokenIterator, + uop: CodeSection, + storage: Storage, + inst: Instruction | None, + ) -> bool: + self.out.start_line() + self.emit("(void)") + return True + def write_uop(uop: Uop, emitter: Emitter, stack: Stack, offset_strs: dict[str, tuple[str, str]]) -> Stack: locals: dict[str, Local] = {} try: From f8fefb334d65e9d0087cd32a556dd0946638d74f Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 14 Nov 2025 18:46:35 +0000 Subject: [PATCH 13/40] fix a few bugs --- Include/internal/pycore_optimizer.h | 2 +- Python/bytecodes.c | 6 ++---- Python/ceval.c | 2 +- Python/executor_cases.c.h | 2 -- Python/generated_cases.c.h | 4 ++-- Python/optimizer.c | 6 +++--- 6 files changed, 9 insertions(+), 13 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index a09d2e85792a8a..2e5f42bdf3bd13 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -368,7 +368,7 @@ PyAPI_FUNC(int) _PyDumpExecutors(FILE *out); extern void _Py_ClearExecutorDeletionList(PyThreadState *tstate); #endif -int _PyJit_translate_single_bytecode_to_trace(PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *next_instr, bool stop_tracing); +int _PyJit_translate_single_bytecode_to_trace(PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *next_instr, int stop_tracing_exit); int _PyJit_TryInitializeTracing(PyThreadState *tstate, _PyInterpreterFrame *frame, diff --git a/Python/bytecodes.c b/Python/bytecodes.c index ab3f9c349c44af..f51016d366e315 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3303,11 +3303,9 @@ dummy_func( // Only used by Tier 2 op(_GUARD_NOT_EXHAUSTED_LIST, (iter, null_or_index -- iter, null_or_index)) { -#ifndef Py_GIL_DISABLED PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); assert(Py_TYPE(list_o) == &PyList_Type); EXIT_IF((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyList_GET_SIZE(list_o)); -#endif } replaced op(_ITER_NEXT_LIST, (iter, null_or_index -- iter, null_or_index, next)) { @@ -5663,7 +5661,7 @@ dummy_func( bool stop_tracing = (opcode == WITH_EXCEPT_START || opcode == RERAISE || opcode == CLEANUP_THROW || opcode == PUSH_EXC_INFO || opcode == INTERPRETER_EXIT); - int full = !_PyJit_translate_single_bytecode_to_trace(tstate, frame, next_instr, stop_tracing); + int full = !_PyJit_translate_single_bytecode_to_trace(tstate, frame, next_instr, stop_tracing ? 
_DEOPT : 0); if (full) { LEAVE_TRACING(); int err = stop_tracing_and_jit(tstate, frame); @@ -5703,7 +5701,7 @@ dummy_func( #if _Py_TIER2 assert(IS_JIT_TRACING()); int opcode = next_instr->op.code; - _PyJit_translate_single_bytecode_to_trace(tstate, frame, NULL, true); + _PyJit_translate_single_bytecode_to_trace(tstate, frame, NULL, _EXIT_TRACE); LEAVE_TRACING(); int err = stop_tracing_and_jit(tstate, frame); ERROR_IF(err < 0); diff --git a/Python/ceval.c b/Python/ceval.c index b76c9ec28119d5..1844bc0f834c22 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1307,7 +1307,7 @@ _PyTier2Interpreter( for (;;) { uopcode = next_uop->opcode; #ifdef Py_DEBUG - if (frame->lltrace >= 3) { + if (frame->lltrace >= 4) { dump_stack(frame, stack_pointer); if (next_uop->opcode == _START_EXECUTOR) { printf("%4d uop: ", 0); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 4c625d84bc50e5..6ff638b29f8a43 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -4436,14 +4436,12 @@ _PyStackRef iter; null_or_index = stack_pointer[-1]; iter = stack_pointer[-2]; - #ifndef Py_GIL_DISABLED PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); assert(Py_TYPE(list_o) == &PyList_Type); if ((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyList_GET_SIZE(list_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - #endif break; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index d39235ea5c801f..1b5c88444a645a 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -12288,7 +12288,7 @@ JUMP_TO_LABEL(error); opcode == RERAISE || opcode == CLEANUP_THROW || opcode == PUSH_EXC_INFO || opcode == INTERPRETER_EXIT); _PyFrame_SetStackPointer(frame, stack_pointer); - int full = !_PyJit_translate_single_bytecode_to_trace(tstate, frame, next_instr, stop_tracing); + int full = !_PyJit_translate_single_bytecode_to_trace(tstate, frame, next_instr, stop_tracing ? 
_DEOPT : 0); stack_pointer = _PyFrame_GetStackPointer(frame); if (full) { LEAVE_TRACING(); @@ -12334,7 +12334,7 @@ JUMP_TO_LABEL(error); assert(IS_JIT_TRACING()); int opcode = next_instr->op.code; _PyFrame_SetStackPointer(frame, stack_pointer); - _PyJit_translate_single_bytecode_to_trace(tstate, frame, NULL, true); + _PyJit_translate_single_bytecode_to_trace(tstate, frame, NULL, _EXIT_TRACE); stack_pointer = _PyFrame_GetStackPointer(frame); LEAVE_TRACING(); _PyFrame_SetStackPointer(frame, stack_pointer); diff --git a/Python/optimizer.c b/Python/optimizer.c index de44aefcef2487..93f64448064e55 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -574,7 +574,7 @@ _PyJit_translate_single_bytecode_to_trace( PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *next_instr, - bool stop_tracing) + int stop_tracing_exit) { #ifdef Py_DEBUG @@ -637,8 +637,8 @@ _PyJit_translate_single_bytecode_to_trace( goto full; } - if (stop_tracing) { - ADD_TO_TRACE(_DEOPT, 0, 0, target); + if (stop_tracing_exit) { + ADD_TO_TRACE(stop_tracing_exit, 0, 0, target); goto done; } From d76a24b3de5b23514f8ed019852d61cbc0c41b52 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 16:15:26 +0000 Subject: [PATCH 14/40] Improve tracer thread safety --- Include/internal/pycore_tstate.h | 3 ++- Objects/listobject.c | 3 ++- Python/optimizer.c | 26 ++++++++++++++++++-------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index b2a3047304af28..783793716fb82c 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -35,7 +35,7 @@ typedef struct _PyJitTracerInitialState { } _PyJitTracerInitialState; typedef struct _PyJitTracerPreviousState { - bool dependencies_still_valid; + uint8_t dependencies_still_valid; bool instr_is_super; int code_max_size; int code_curr_size; @@ -48,6 +48,7 @@ typedef struct _PyJitTracerPreviousState { } _PyJitTracerPreviousState; typedef struct _PyJitTracerState { + PyMutex lock; _PyUOpInstruction *code_buffer; _PyJitTracerInitialState initial_state; _PyJitTracerPreviousState prev_state; diff --git a/Objects/listobject.c b/Objects/listobject.c index 1722ea60cdc68f..9022d1fe12c068 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -79,7 +79,8 @@ ensure_shared_on_resize(PyListObject *self) // We can't use _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED here because // the `CALL_LIST_APPEND` bytecode handler may lock the list without // a critical section. - assert(Py_REFCNT(self) == 1 || PyMutex_IsLocked(&_PyObject_CAST(self)->ob_mutex)); + assert((_Py_IsOwnedByCurrentThread((PyObject *)self) && !_PyObject_GC_IS_SHARED(self)) || + PyMutex_IsLocked(&_PyObject_CAST(self)->ob_mutex)); // Ensure that the list array is freed using QSBR if we are not the // owning thread. diff --git a/Python/optimizer.c b/Python/optimizer.c index 93f64448064e55..1707ad87931591 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -603,7 +603,7 @@ _PyJit_translate_single_bytecode_to_trace( // Rewind EXTENDED_ARG so that we see the whole thing. // We must point to the first EXTENDED_ARG when deopting. 
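// (Concretely: an oparg of 0x0102 is encoded as EXTENDED_ARG 0x01 followed
// by the real opcode with arg 0x02. The rewind loop below steps back one
// code unit for every extra oparg byte, so the deopt target recorded in
// the trace is the first EXTENDED_ARG rather than the final opcode.)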
int oparg = _tstate->jit_tracer_state.prev_state.instr_oparg; - int opcode = this_instr->op.code; + int opcode = FT_ATOMIC_LOAD_UINT8_RELAXED(this_instr->op.code); int rewind_oparg = oparg; while (rewind_oparg > 255) { rewind_oparg >>= 8; @@ -977,17 +977,24 @@ _PyJit_TryInitializeTracing( if (oparg > 0xFFFF) { return 0; } + + PyObject *func = PyStackRef_AsPyObjectBorrow(frame->f_funcobj); + if (func == NULL) { + return 0; + } + + if (!PyMutex_LockFast(&_tstate->jit_tracer_state.lock)) { + return 0; + } + if (_tstate->jit_tracer_state.code_buffer == NULL) { _tstate->jit_tracer_state.code_buffer = (_PyUOpInstruction *)_PyObject_VirtualAlloc(UOP_BUFFER_SIZE); if (_tstate->jit_tracer_state.code_buffer == NULL) { // Don't error, just go to next instruction. + PyMutex_Unlock(&_tstate->jit_tracer_state.lock); return 0; } } - PyObject *func = PyStackRef_AsPyObjectBorrow(frame->f_funcobj); - if (func == NULL) { - return 0; - } PyCodeObject *code = _PyFrame_GetCode(frame); #ifdef Py_DEBUG char *python_lltrace = Py_GETENV("PYTHON_LLTRACE"); @@ -1017,7 +1024,7 @@ _PyJit_TryInitializeTracing( _tstate->jit_tracer_state.initial_state.stack_depth = curr_stackdepth; _tstate->jit_tracer_state.initial_state.chain_depth = chain_depth; _tstate->jit_tracer_state.prev_state.instr_frame = frame; - _tstate->jit_tracer_state.prev_state.dependencies_still_valid = true; + _tstate->jit_tracer_state.prev_state.dependencies_still_valid = 1; _tstate->jit_tracer_state.prev_state.instr_code = (PyCodeObject *)Py_NewRef(_PyFrame_GetCode(frame)); _tstate->jit_tracer_state.prev_state.instr = curr_instr; _tstate->jit_tracer_state.prev_state.instr_frame = frame; @@ -1032,6 +1039,7 @@ _PyJit_TryInitializeTracing( FT_ATOMIC_STORE_UINT16_RELAXED(close_loop_instr[1].counter.value_and_backoff, zero.value_and_backoff); } _Py_BloomFilter_Init(&_tstate->jit_tracer_state.prev_state.dependencies); + PyMutex_Unlock(&_tstate->jit_tracer_state.lock); return 1; } @@ -1702,10 +1710,12 @@ _PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) _Py_BloomFilter_Init(&obj_filter); _Py_BloomFilter_Add(&obj_filter, obj); _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; + PyMutex_Lock(&_tstate->jit_tracer_state.lock); if (bloom_filter_may_contain(&_tstate->jit_tracer_state.prev_state.dependencies, &obj_filter)) { - _tstate->jit_tracer_state.prev_state.dependencies_still_valid = false; + FT_ATOMIC_STORE_UINT8(_tstate->jit_tracer_state.prev_state.dependencies_still_valid, 0); } + PyMutex_Unlock(&_tstate->jit_tracer_state.lock); } void @@ -1766,7 +1776,7 @@ _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidat /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { - ((_PyThreadStateImpl *)p)->jit_tracer_state.prev_state.dependencies_still_valid = false; + FT_ATOMIC_STORE_UINT8(((_PyThreadStateImpl *)p)->jit_tracer_state.prev_state.dependencies_still_valid, 0); for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->tstate == p); exec->vm_data.valid = 0; From fa9910826b350c1147923554c37213a587e20725 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 16:19:20 +0000 Subject: [PATCH 15/40] set immortal before GC --- Python/optimizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 1707ad87931591..448e48cb5ec9ac 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c 
@@ -1329,10 +1329,10 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil return NULL; } #endif - _PyObject_GC_TRACK(executor); // Executors are cleared by coldness or invalidation. // Thus there's no point for them to be refcounted. _Py_SetImmortal((PyObject *)executor); + _PyObject_GC_TRACK(executor); return executor; } From fcfed9670245f839cfee3302d155ad4b8dcf68c0 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sat, 15 Nov 2025 16:30:50 +0000 Subject: [PATCH 16/40] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst new file mode 100644 index 00000000000000..181d8232fb8e1e --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst @@ -0,0 +1 @@ +Add free-threading support to the JIT. Patch by Ken Jin. From ba67ab707dd3a9850f656a995b24048be934e6d0 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 16:43:46 +0000 Subject: [PATCH 17/40] fix default build --- Lib/test/test_capi/test_opt.py | 2 ++ Python/bytecodes.c | 8 ++------ Python/generated_cases.c.h | 7 +------ Python/optimizer.c | 2 ++ 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 39e94ef4b8a105..8284f006c7b215 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -2487,6 +2487,7 @@ def testfunc(n): self.assertIn("_POP_TOP_NOP", uops) + @unittest.skipIf(Py_GIL_DISABLED, "FT build immortalizes constants") def test_pop_top_specialize_int(self): def testfunc(n): for _ in range(n): @@ -2500,6 +2501,7 @@ def testfunc(n): self.assertIn("_POP_TOP_INT", uops) + @unittest.skipIf(Py_GIL_DISABLED, "FT build immortalizes constants") def test_pop_top_specialize_float(self): def testfunc(n): for _ in range(n): diff --git a/Python/bytecodes.c b/Python/bytecodes.c index f51016d366e315..123e5fb230ff3f 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3018,11 +3018,11 @@ dummy_func( goto stop_tracing; } PyCodeObject *code = _PyFrame_GetCode(frame); -#ifdef Py_GIL_DISABLED + LOCK_OBJECT_SLOW(code); _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; // On FT, we are responsible for cleaning up after ourselves.
- if (!_Py_atomic_load_uint8_relaxed(&executor->vm_data.valid)) { + if (!FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid)) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; next_instr = this_instr; @@ -3030,10 +3030,6 @@ dummy_func( UNLOCK_OBJECT_SLOW(code); DISPATCH_GOTO(); } -#else - _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - assert(executor->vm_data.valid); -#endif assert(tstate->current_executor == NULL); assert(executor->vm_data.index == INSTR_OFFSET() - 1); assert(executor->vm_data.code == code); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 1b5c88444a645a..e03691459185ff 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -5481,10 +5481,9 @@ JUMP_TO_LABEL(stop_tracing); } PyCodeObject *code = _PyFrame_GetCode(frame); - #ifdef Py_GIL_DISABLED LOCK_OBJECT_SLOW(code); _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - if (!_Py_atomic_load_uint8_relaxed(&executor->vm_data.valid)) { + if (!FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid)) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; next_instr = this_instr; @@ -5494,10 +5493,6 @@ UNLOCK_OBJECT_SLOW(code); DISPATCH_GOTO(); } - #else - _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - assert(executor->vm_data.valid); - #endif assert(tstate->current_executor == NULL); assert(executor->vm_data.index == INSTR_OFFSET() - 1); assert(executor->vm_data.code == code); diff --git a/Python/optimizer.c b/Python/optimizer.c index 448e48cb5ec9ac..b4e8c10c33ca81 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1657,7 +1657,9 @@ _Py_ExecutorDetach(_PyExecutorObject *executor) if (code == NULL) { return; } +#ifdef Py_GIL_DISABLED assert(_PyInterpreterState_GET()->stoptheworld.world_stopped || PyMutex_IsLocked(&((PyObject *)code)->ob_mutex)); +#endif _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index]; assert(instruction->op.code == ENTER_EXECUTOR); int index = instruction->op.arg; From 8ce83cd6aab765c1462ce6d4482f72a06b7a2032 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 20:26:26 +0000 Subject: [PATCH 18/40] fix comment --- Python/bytecodes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 69b862c5fc15c3..b702c8dbe24586 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3021,7 +3021,7 @@ dummy_func( LOCK_OBJECT_SLOW(code); _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - // On FT, we are responsible for cleaning up after ourselves. + // We are responsible for cleaning up after ourselves. 
if (!FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid)) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; From fa556439eeeffb2d58e26427773742a39a730ab2 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 20:50:03 +0000 Subject: [PATCH 19/40] lint --- Lib/test/test_capi/test_opt.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 4c321cf5aad32f..a440832d025662 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -9,9 +9,13 @@ import _opcode -from test.support import (script_helper, requires_specialization, - import_helper, Py_GIL_DISABLED, requires_jit_enabled, - reset_code) +from test.support import ( + script_helper, + import_helper, + Py_GIL_DISABLED, + requires_jit_enabled, + reset_code +) _testinternalcapi = import_helper.import_module("_testinternalcapi") From 92f3dbf98764b9cc9b702ff52229d716d643e53b Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 20:55:06 +0000 Subject: [PATCH 20/40] fix nojit builds --- Include/internal/pycore_tstate.h | 8 +++++--- Python/ceval_gil.c | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 783793716fb82c..bb4ac5bc2af7c0 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -54,17 +54,19 @@ typedef struct _PyJitTracerState { _PyJitTracerPreviousState prev_state; } _PyJitTracerState; +#endif + typedef struct _PyJitExecutorState { char jit; +#if _Py_TIER2 struct _PyExecutorObject *executor_list_head; struct _PyExecutorObject *executor_deletion_list_head; struct _PyExecutorObject *cold_executor; struct _PyExecutorObject *cold_dynamic_executor; int executor_deletion_list_remaining_capacity; size_t executor_creation_counter; -} _PyJitExecutorState; - #endif +} _PyJitExecutorState; // Every PyThreadState is actually allocated as a _PyThreadStateImpl. 
The // PyThreadState fields are exposed as part of the C API, although most fields @@ -132,8 +134,8 @@ typedef struct _PyThreadStateImpl { #endif #if _Py_TIER2 _PyJitTracerState jit_tracer_state; - _PyJitExecutorState jit_executor_state; #endif + _PyJitExecutorState jit_executor_state; } _PyThreadStateImpl; #ifdef __cplusplus diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c index 6fc9f2f5b4981b..f012d0d777e362 100644 --- a/Python/ceval_gil.c +++ b/Python/ceval_gil.c @@ -1395,11 +1395,13 @@ _Py_HandlePending(PyThreadState *tstate) _Py_RunGC(tstate); } +#ifdef _Py_TIER2 if ((breaker & _PY_EVAL_JIT_INVALIDATE_COLD_BIT) != 0) { _Py_unset_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT); _Py_Executors_InvalidateCold(tstate); ((_PyThreadStateImpl*)tstate)->jit_executor_state.executor_creation_counter = JIT_CLEANUP_THRESHOLD; } +#endif /* GIL drop request */ if ((breaker & _PY_GIL_DROP_REQUEST_BIT) != 0) { From 7240b15b35e1da70b0d7aa06d004ea682a5fecb4 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 21:03:00 +0000 Subject: [PATCH 21/40] Remove heavyweight locking--not needed --- Python/bytecodes.c | 9 ++------- Python/ceval_macros.h | 4 ---- Python/generated_cases.c.h | 9 ++------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index b702c8dbe24586..d4f3a035a71549 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3019,15 +3019,15 @@ dummy_func( } PyCodeObject *code = _PyFrame_GetCode(frame); - LOCK_OBJECT_SLOW(code);
_PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; if (!FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid)) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); + Py_BEGIN_CRITICAL_SECTION(code); _Py_ExecutorDetach(executor); + Py_END_CRITICAL_SECTION(); stack_pointer = _PyFrame_GetStackPointer(frame); - UNLOCK_OBJECT_SLOW(code); DISPATCH_GOTO(); } assert(tstate->current_executor == NULL); @@ -5498,15 +5498,10 @@ if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) { PAUSE_ADAPTIVE_COUNTER(this_instr[1].counter); } - #ifdef Py_GIL_DISABLED - UNLOCK_OBJECT_SLOW(code); - #endif DISPATCH_GOTO(); } assert(executor != ((_PyThreadStateImpl *)tstate)->jit_executor_state.cold_executor); tstate->jit_exit = NULL; - Py_INCREF(executor); - UNLOCK_OBJECT_SLOW(code); TIER1_TO_TIER2(executor); #else Py_FatalError("ENTER_EXECUTOR is not supported in this build"); From cb876766bf04abe305fb67875c61a538fbd3f8bd Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 21:58:30 +0000 Subject: [PATCH 22/40] address review, fix bug --- Include/internal/pycore_optimizer.h | 2 -- Objects/funcobject.c | 2 +- Objects/listobject.c | 3 ++- Python/optimizer.c | 35 ++++++++++++++--------------- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 7c81c71c5720d4..201f5e09a8bf1b 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -378,8 +378,6 @@ _PyJit_TryInitializeTracing(PyThreadState *tstate, _PyInterpreterFrame *frame, void _PyJit_FinalizeTracing(PyThreadState *tstate); -void _PyJit_Tracer_InvalidateDependency(PyThreadState *old_tstate, void *obj); - #ifdef __cplusplus } #endif diff --git a/Objects/funcobject.c b/Objects/funcobject.c index e5d8a8faf11e81..fb503df328782a 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -11,7 +11,7 @@ #include "pycore_setobject.h" // _PySet_NextEntry() #include "pycore_stats.h" #include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() -#include "pycore_optimizer.h" // _PyJit_Tracer_InvalidateDependency +#include "pycore_optimizer.h" // _Py_Executors_InvalidateDependency static const char * func_event_name(PyFunction_WatchEvent event) { diff --git a/Objects/listobject.c b/Objects/listobject.c index 9022d1fe12c068..62ba2ba641740e 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -79,7 +79,8 @@ ensure_shared_on_resize(PyListObject *self) // We can't use _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED here because // the `CALL_LIST_APPEND` bytecode handler may lock the list without // a critical section. 
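// (The assertion below accepts any of three proofs that the caller has
// exclusive access: the list has a single reference and so cannot be
// reachable from another thread yet, it is owned by the current thread and
// has never been marked shared by the GC, or its per-object mutex really
// is held.)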
- assert((_Py_IsOwnedByCurrentThread((PyObject *)self) && !_PyObject_GC_IS_SHARED(self)) || + assert(Py_REFCNT(self) == 1 || + (_Py_IsOwnedByCurrentThread((PyObject *)self) && !_PyObject_GC_IS_SHARED(self)) || PyMutex_IsLocked(&_PyObject_CAST(self)->ob_mutex)); // Ensure that the list array is freed using QSBR if we are not the diff --git a/Python/optimizer.c b/Python/optimizer.c index 499381af0eae74..795f95c4782d99 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -140,7 +140,7 @@ _PyOptimizer_Optimize( _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; int chain_depth = _tstate->jit_tracer_state.initial_state.chain_depth; if (!FT_ATOMIC_LOAD_CHAR_RELAXED(_tstate->jit_executor_state.jit)) { - // gh-140936: It is possible that interp->jit will become false during + // gh-140936: It is possible that jit_executor_state.jit will become false during // interpreter finalization. However, the specialized JUMP_BACKWARD_JIT // instruction may still be present. In this case, we should // return immediately without optimization. @@ -1546,7 +1546,7 @@ unlink_executor(_PyExecutorObject *executor) return; } _PyExecutorLinkListNode *links = &executor->vm_data.links; - assert(executor->vm_data.valid); + assert(FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid) == 0); _PyExecutorObject *next = links->next; _PyExecutorObject *prev = links->previous; if (next != NULL) { @@ -1657,9 +1657,7 @@ _Py_ExecutorDetach(_PyExecutorObject *executor) if (code == NULL) { return; } -#ifdef Py_GIL_DISABLED - assert(_PyInterpreterState_GET()->stoptheworld.world_stopped || PyMutex_IsLocked(&((PyObject *)code)->ob_mutex)); -#endif + Py_BEGIN_CRITICAL_SECTION(code); _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index]; assert(instruction->op.code == ENTER_EXECUTOR); int index = instruction->op.arg; @@ -1668,33 +1666,27 @@ _Py_ExecutorDetach(_PyExecutorObject *executor) instruction->op.arg = executor->vm_data.oparg; executor->vm_data.code = NULL; code->co_executors->executors[index] = NULL; + Py_END_CRITICAL_SECTION(); } static int executor_clear(PyObject *op) { _PyExecutorObject *executor = _PyExecutorObject_CAST(op); - if (!executor->vm_data.valid) { - return 0; - } - assert(executor->vm_data.valid == 1); + assert(FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid) == 0); unlink_executor(executor); - executor->vm_data.valid = 0; /* It is possible for an executor to form a reference * cycle with itself, so decref'ing a side exit could * free the executor unless we hold a strong reference to it */ _PyExecutorObject *cold = _PyExecutor_GetColdExecutor(); - Py_INCREF(executor); for (uint32_t i = 0; i < executor->exit_count; i++) { executor->exits[i].temperature = initial_unreachable_backoff_counter(); _PyExecutorObject *e = executor->exits[i].executor; executor->exits[i].executor = cold; - Py_DECREF(e); } _Py_ExecutorDetach(executor); - Py_DECREF(executor); return 0; } @@ -1705,8 +1697,8 @@ _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj) _Py_BloomFilter_Add(&executor->vm_data.bloom, obj); } -void -_PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) +static void +jit_tracer_invalidate_dependency(PyThreadState *tstate, void *obj) { _PyBloomFilter obj_filter; _Py_BloomFilter_Init(&obj_filter); @@ -1726,7 +1718,7 @@ invalidate_sub_executors(_PyThreadStateImpl *tstate, _PyExecutorObject *executor if (!executor->vm_data.valid) { return; } - executor->vm_data.valid = 0; + FT_ATOMIC_STORE_UINT8(executor->vm_data.valid, 0); for (uint32_t i = 0; i < 
executor->exit_count; i++) { _PyExecutorObject *next = executor->exits[i].executor; if (next != tstate->jit_executor_state.cold_dynamic_executor && next != tstate->jit_executor_state.cold_executor) { @@ -1751,10 +1743,13 @@ _Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState *interp, void *obj /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { - _PyJit_Tracer_InvalidateDependency(p, obj); + jit_tracer_invalidate_dependency(p, obj); for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) { invalidate_sub_executors((_PyThreadStateImpl *)p, exec); + if (is_invalidation) { + OPT_STAT_INC(executors_invalidated); + } } _PyExecutorObject *next = exec->vm_data.links.next; exec = next; @@ -1781,7 +1776,10 @@ _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidat FT_ATOMIC_STORE_UINT8(((_PyThreadStateImpl *)p)->jit_tracer_state.prev_state.dependencies_still_valid, 0); for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->tstate == p); - exec->vm_data.valid = 0; + FT_ATOMIC_STORE_UINT8(exec->vm_data.valid, 0); + if (is_invalidation) { + OPT_STAT_INC(executors_invalidated); + } _PyExecutorObject *next = exec->vm_data.links.next; exec = next; } @@ -1816,6 +1814,7 @@ _Py_Executors_InvalidateCold(PyThreadState *tstate) if (!exec->vm_data.warm || !exec->vm_data.valid) { if (PyList_Append(invalidate, (PyObject *)exec) < 0) { + FT_ATOMIC_STORE_UINT8(exec->vm_data.valid, 0); goto error; } } From f9726375101e4a6653aec89c3eff7cc0a9c98b04 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 15 Nov 2025 23:03:47 +0000 Subject: [PATCH 23/40] GC executors --- Include/internal/pycore_optimizer.h | 3 +- Python/gc.c | 2 + Python/gc_free_threading.c | 3 ++ Python/optimizer.c | 63 ++++++++++++++++++++++++++--- 4 files changed, 64 insertions(+), 7 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 201f5e09a8bf1b..eb9e6e623c687e 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -82,11 +82,12 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState * PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyThreadState *tstate); +PyAPI_FUNC(void) _Py_Executors_InvalidateColdGC(PyInterpreterState *interp); #else # define _Py_Executors_InvalidateDependency(A, B, C) ((void)0) # define _Py_Executors_InvalidateAll(A, B) ((void)0) # define _Py_Executors_InvalidateCold(A) ((void)0) - +# define _Py_Executors_InvalidateColdGC(A) ((void)0) #endif // Used as the threshold to trigger executor invalidation when diff --git a/Python/gc.c b/Python/gc.c index 03a5d7366ea6c9..a8226e69277312 100644 --- a/Python/gc.c +++ b/Python/gc.c @@ -12,6 +12,7 @@ #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_tuple.h" // _PyTuple_MaybeUntrack() #include "pycore_weakref.h" // _PyWeakref_ClearRef() +#include "pycore_optimizer.h" // _Py_Executors_InvalidateColdGC #include "pydtrace.h" @@ -1735,6 +1736,7 @@ gc_collect_full(PyThreadState *tstate, gcstate->old[1].count = 0; completed_scavenge(gcstate); 
_PyGC_ClearAllFreeLists(tstate->interp); + _Py_Executors_InvalidateColdGC(tstate->interp); validate_spaces(gcstate); add_stats(gcstate, 2, stats); } diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index b183062eff7952..e49175bbf128ff 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -10,6 +10,7 @@ #include "pycore_interp.h" // PyInterpreterState.gc #include "pycore_interpframe.h" // _PyFrame_GetLocalsArray() #include "pycore_object_alloc.h" // _PyObject_MallocWithType() +#include "pycore_optimizer.h" #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_tstate.h" // _PyThreadStateImpl #include "pycore_tuple.h" // _PyTuple_MaybeUntrack() @@ -2310,6 +2311,8 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, } _PyEval_StartTheWorld(interp); + _Py_Executors_InvalidateColdGC(interp); + if (err < 0) { cleanup_worklist(&state->unreachable); cleanup_worklist(&state->legacy_finalizers); diff --git a/Python/optimizer.c b/Python/optimizer.c index 795f95c4782d99..202bcd517852c4 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1797,8 +1797,8 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) // Unlike _PyExecutor_InvalidateDependency, this is not for correctness but memory savings. // Thus there is no need to lock the runtime or traverse everything. We simply make a // best-effort attempt to clean things up. -void -_Py_Executors_InvalidateCold(PyThreadState *tstate) +PyObject * +_Py_Executors_CollectCold(PyThreadState *tstate) { /* Walk the list of executors */ /* TO DO -- Use a tree to avoid traversing as many objects */ @@ -1813,8 +1813,8 @@ _Py_Executors_InvalidateCold(PyThreadState *tstate) _PyExecutorObject *next = exec->vm_data.links.next; if (!exec->vm_data.warm || !exec->vm_data.valid) { + FT_ATOMIC_STORE_UINT8(exec->vm_data.valid, 0); if (PyList_Append(invalidate, (PyObject *)exec) < 0) { - FT_ATOMIC_STORE_UINT8(exec->vm_data.valid, 0); goto error; } } @@ -1824,19 +1824,70 @@ _Py_Executors_InvalidateCold(PyThreadState *tstate) exec = next; } + return invalidate; +error: + PyErr_Clear(); + Py_XDECREF(invalidate); + return NULL; +} + +void +_Py_Executors_ClearColdList(PyObject *invalidate) +{ + if (invalidate == NULL) { + return; + } for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { PyObject *exec = PyList_GET_ITEM(invalidate, i); executor_clear(exec); } Py_DECREF(invalidate); +} + +void +_Py_Executors_InvalidateCold(PyThreadState *tstate) +{ + _Py_Executors_ClearColdList(_Py_Executors_CollectCold(tstate)); +} + +void +_Py_Executors_InvalidateColdGC(PyInterpreterState *interp) +{ + PyObject *invalidate = PyList_New(0); + if (invalidate == NULL) { + PyErr_Clear(); + return; + } + _PyEval_StopTheWorld(interp); + HEAD_LOCK(interp->runtime); + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) + { + PyObject *more_invalidate = _Py_Executors_CollectCold(p); + if (more_invalidate == NULL) { + goto error; + } + int err = PyList_Extend(invalidate, more_invalidate); + Py_DECREF(more_invalidate); + if (err) { + goto error; + } + } + HEAD_UNLOCK(interp->runtime); + _PyEval_StartTheWorld(interp); + // We can only clear the list after unlocking the runtime. + // Otherwise, it may deadlock. 
+ _Py_Executors_ClearColdList(invalidate); return; error: PyErr_Clear(); - Py_XDECREF(invalidate); - // If we're truly out of memory, wiping out everything is a fine fallback - _Py_Executors_InvalidateAll(_PyInterpreterState_GET(), 0); + Py_DECREF(invalidate); + // Invalidate all the executors if we run out of memory. + // This means the next run will remove all executors. + _Py_Executors_InvalidateAllLockHeld(interp, 0); + HEAD_UNLOCK(interp->runtime); } + static void write_str(PyObject *str, FILE *out) { From 3ef237b6ec9e83cb29860bb2ede68db4d2ee40af Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 01:36:06 +0000 Subject: [PATCH 24/40] Change the name --- Lib/test/test_capi/test_opt.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index a440832d025662..643c8cd1fb0368 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -64,6 +64,7 @@ def iter_opnames(ex): def get_opnames(ex): return list(iter_opnames(ex)) +FT_TEST_DISABLED_REVERSE_TYPE_CACHE = "FT build reverse type cache is disabled for now" @requires_jit_enabled class TestExecutorInvalidation(unittest.TestCase): @@ -1182,7 +1183,7 @@ def test_modified_local_is_seen_by_optimized_code(self): self.assertIs(type(s), float) self.assertEqual(s, 1024.0) - @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions/classes") + @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_guard_type_version_removed(self): def thing(a): x = 0 @@ -1201,7 +1202,7 @@ class Foo: guard_type_version_count = opnames.count("_GUARD_TYPE_VERSION") self.assertEqual(guard_type_version_count, 1) - @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions/classes") + @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_guard_type_version_removed_inlined(self): """ Verify that the guard type version if we have an inlined function @@ -1228,7 +1229,7 @@ class Foo: guard_type_version_count = opnames.count("_GUARD_TYPE_VERSION") self.assertEqual(guard_type_version_count, 1) - @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions") + @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_guard_type_version_removed_invalidation(self): def thing(a): @@ -1259,7 +1260,7 @@ class Bar: self.assertEqual(opnames[:load_attr_top].count("_GUARD_TYPE_VERSION"), 1) self.assertEqual(opnames[call:load_attr_bottom].count("_CHECK_VALIDITY"), 2) - @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions") + @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_guard_type_version_removed_escaping(self): def thing(a): @@ -2363,7 +2364,7 @@ def testfunc(n): self.assertNotIn("_LOAD_SMALL_INT", uops) self.assertIn("_LOAD_CONST_INLINE_BORROW", uops) - @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions") + @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_cached_attributes(self): class C: A = 1 From 44356d608612e7e22b5e3ce8923f5da6f611d90b Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 12:57:46 +0000 Subject: [PATCH 25/40] Address review, remove more refcounting --- .../2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst | 4 +++- Python/optimizer.c | 5 ----- Python/pylifecycle.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git 
a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst
index 181d8232fb8e1e..5db6330ca540b0 100644
--- a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-15-16-30-46.gh-issue-141594.PSsC5J.rst
@@ -1 +1,3 @@
-Add free-threading support to the JIT. Patch by Ken Jin.
+Add free-threading support to the JIT. The JIT is only enabled for
+single-threaded code on the free-threaded build, and is disabled once
+multiple threads are spawned. Patch by Ken Jin.
diff --git a/Python/optimizer.c b/Python/optimizer.c
index 202bcd517852c4..ef53605dfff677 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -93,7 +93,6 @@ get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
 static void
 insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor)
 {
-    Py_INCREF(executor);
     if (instr->op.code == ENTER_EXECUTOR) {
         assert(index == instr->op.arg);
         _Py_ExecutorDetach(code->co_executors->executors[index]);
@@ -209,7 +208,6 @@ get_executor_lock_held(PyCodeObject *code, int offset)
         if (_PyCode_CODE(code)[i].op.code == ENTER_EXECUTOR && i*2 == offset) {
             int oparg = _PyCode_CODE(code)[i].op.arg;
             _PyExecutorObject *res = code->co_executors->executors[oparg];
-            Py_INCREF(res);
             return res;
         }
         i += _PyInstruction_GetLength(code, i);
@@ -1638,14 +1636,12 @@ _PyExecutor_ClearExit(_PyExitData *exit)
     if (exit == NULL) {
         return;
     }
-    _PyExecutorObject *old = exit->executor;
     if (exit->is_dynamic) {
         exit->executor = _PyExecutor_GetColdDynamicExecutor();
     }
     else {
         exit->executor = _PyExecutor_GetColdExecutor();
     }
-    Py_DECREF(old);
 }
 
 /* Detaches the executor from the code object (if any) that
@@ -1683,7 +1679,6 @@ executor_clear(PyObject *op)
     _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
     for (uint32_t i = 0; i < executor->exit_count; i++) {
         executor->exits[i].temperature = initial_unreachable_backoff_counter();
-        _PyExecutorObject *e = executor->exits[i].executor;
         executor->exits[i].executor = cold;
     }
     _Py_ExecutorDetach(executor);
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 7e973ed8ed5e15..24da1ecf8fc8fa 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -606,7 +606,7 @@ builtins_dict_watcher(PyDict_WatchEvent event, PyObject *dict, PyObject *key, Py
     PyInterpreterState *interp = _PyInterpreterState_GET();
 #ifdef _Py_TIER2
     if (interp->rare_events.builtin_dict < _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) {
-        _Py_Executors_InvalidateAll(_PyInterpreterState_GET(), 1);
+        _Py_Executors_InvalidateAll(interp, 1);
     }
 #endif
     RARE_EVENT_INTERP_INC(interp, builtin_dict);

From b8190536883373074a8a6a581cdcf69cef6bfbb4 Mon Sep 17 00:00:00 2001
From: Ken Jin
Date: Sun, 16 Nov 2025 14:24:58 +0000
Subject: [PATCH 26/40] Fix TSAN races

---
 Include/internal/pycore_optimizer.h |  4 +-
 Lib/test/test_capi/test_opt.py      |  3 ++
 Python/instrumentation.c            |  9 ++++-
 Python/optimizer.c                  | 61 +++++------------------------
 Python/pystate.c                    |  3 +-
 5 files changed, 22 insertions(+), 58 deletions(-)

diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index eb9e6e623c687e..3dcdc5c95a31f6 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -365,9 +365,7 @@ static inline int is_terminator(const _PyUOpInstruction *uop)
 extern void _PyExecutor_Free(_PyExecutorObject *self);
 PyAPI_FUNC(int) _PyDumpExecutors(FILE *out);
-#ifdef
_Py_TIER2 -extern void _Py_ClearExecutorDeletionList(PyThreadState *tstate); -#endif + int _PyJit_translate_single_bytecode_to_trace(PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *next_instr, int stop_tracing_opcode); diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 643c8cd1fb0368..ef7e8f63be023c 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -129,6 +129,9 @@ def f(): self.assertTrue(exe.is_valid()) sys._clear_internal_caches() self.assertFalse(exe.is_valid()) + gc.collect() + exe = get_first_executor(f) + self.assertIsNone(exe) @requires_jit_enabled diff --git a/Python/instrumentation.c b/Python/instrumentation.c index cc3381e461e755..2cf0581651f6f1 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1785,7 +1785,7 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != NULL) { _PyCode_Clear_Executors(code); } - _Py_Executors_InvalidateDependencyLockHeld(_PyInterpreterState_GET(), code, 1); + _Py_Executors_InvalidateDependencyLockHeld(interp, code, 1); #endif int code_len = (int)Py_SIZE(code); /* Exit early to avoid creating instrumentation @@ -1928,7 +1928,9 @@ _Py_Instrument(PyCodeObject *code, PyInterpreterState *interp) { int res; LOCK_CODE(code); + HEAD_LOCK(interp->runtime); res = instrument_lock_held(code, interp); + HEAD_UNLOCK(interp->runtime); UNLOCK_CODE(); return res; } @@ -2062,7 +2064,10 @@ _PyMonitoring_SetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEvent } set_local_events(local, tool_id, events); - return force_instrument_lock_held(code, interp); + HEAD_LOCK(interp->runtime); + int res = force_instrument_lock_held(code, interp); + HEAD_UNLOCK(interp->runtime); + return res; } int diff --git a/Python/optimizer.c b/Python/optimizer.c index ef53605dfff677..9a2c8341617cf8 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -260,46 +260,6 @@ _PyExecutor_Free(_PyExecutorObject *self) PyObject_GC_Del(self); } -void -_Py_ClearExecutorDeletionList(PyThreadState *tstate) -{ - _PyThreadStateImpl* ts = (_PyThreadStateImpl* )tstate; - _PyExecutorObject **prev_to_next_ptr = &ts->jit_executor_state.executor_deletion_list_head; - _PyExecutorObject *exec = *prev_to_next_ptr; - // Mark the currently executing executor. 
- if (tstate->current_executor != NULL) { - ((_PyExecutorObject *)tstate->current_executor)->vm_data.linked = 1; - } - while (exec != NULL) { - if (exec->vm_data.linked) { - // This executor is currently executing - exec->vm_data.linked = 0; - prev_to_next_ptr = &exec->vm_data.links.next; - } - else { - *prev_to_next_ptr = exec->vm_data.links.next; - assert(exec != _PyExecutor_GetColdDynamicExecutor()); - assert(exec != _PyExecutor_GetColdExecutor()); - _PyExecutor_Free(exec); - } - exec = *prev_to_next_ptr; - } - ts->jit_executor_state.executor_deletion_list_remaining_capacity = EXECUTOR_DELETE_LIST_MAX; -} - -static void -add_to_pending_deletion_list(_PyExecutorObject *self) -{ - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); - self->vm_data.links.next = _tstate->jit_executor_state.executor_deletion_list_head; - _tstate->jit_executor_state.executor_deletion_list_head = self; - if (_tstate->jit_executor_state.executor_deletion_list_remaining_capacity > 0) { - _tstate->jit_executor_state.executor_deletion_list_remaining_capacity--; - } - else { - _Py_ClearExecutorDeletionList(&_tstate->base); - } -} static void uop_dealloc(PyObject *op) { @@ -309,7 +269,7 @@ uop_dealloc(PyObject *op) { unlink_executor(self); // Once unlinked it becomes impossible to invalidate an executor, so do it here. self->vm_data.valid = 0; - add_to_pending_deletion_list(self); + PyObject_GC_Del(self); } const char * @@ -1672,10 +1632,6 @@ executor_clear(PyObject *op) assert(FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid) == 0); unlink_executor(executor); - /* It is possible for an executor to form a reference - * cycle with itself, so decref'ing a side exit could - * free the executor unless we hold a strong reference to it - */ _PyExecutorObject *cold = _PyExecutor_GetColdExecutor(); for (uint32_t i = 0; i < executor->exit_count; i++) { executor->exits[i].temperature = initial_unreachable_backoff_counter(); @@ -1720,7 +1676,8 @@ invalidate_sub_executors(_PyThreadStateImpl *tstate, _PyExecutorObject *executor invalidate_sub_executors(tstate, next); } } - executor_clear((PyObject *)executor); + // Let it be cleared up by cold executor invalidation later. 
+ executor->vm_data.warm = 0; } /* Invalidate all executors that depend on `obj` @@ -1737,6 +1694,7 @@ _Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState *interp, void *obj /* Walk the list of executors */ /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ + assert(PyMutex_IsLocked(&(interp->runtime)->interpreters.mutex)); _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { jit_tracer_invalidate_dependency(p, obj); for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { @@ -1755,9 +1713,9 @@ _Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState *interp, void *obj void _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) { - HEAD_LOCK(&_PyRuntime); + HEAD_LOCK(interp->runtime); _Py_Executors_InvalidateDependencyLockHeld(interp, obj, is_invalidation); - HEAD_UNLOCK(&_PyRuntime); + HEAD_UNLOCK(interp->runtime); } // To avoid deadlocks due to stop the world, we just invalidate the executors but leave them to be freed @@ -1772,6 +1730,8 @@ _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidat for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->tstate == p); FT_ATOMIC_STORE_UINT8(exec->vm_data.valid, 0); + // Let it be cleared up by cold executor invalidation later. + exec->vm_data.warm = 0; if (is_invalidation) { OPT_STAT_INC(executors_invalidated); } @@ -1854,8 +1814,7 @@ _Py_Executors_InvalidateColdGC(PyInterpreterState *interp) return; } _PyEval_StopTheWorld(interp); - HEAD_LOCK(interp->runtime); - _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) + _Py_FOR_EACH_TSTATE_BEGIN(interp, p) { PyObject *more_invalidate = _Py_Executors_CollectCold(p); if (more_invalidate == NULL) { @@ -1867,7 +1826,7 @@ _Py_Executors_InvalidateColdGC(PyInterpreterState *interp) goto error; } } - HEAD_UNLOCK(interp->runtime); + _Py_FOR_EACH_TSTATE_END(interp); _PyEval_StartTheWorld(interp); // We can only clear the list after unlocking the runtime. // Otherwise, it may deadlock. diff --git a/Python/pystate.c b/Python/pystate.c index fc22e224affdc8..ca19146321d092 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1757,8 +1757,7 @@ PyThreadState_Clear(PyThreadState *tstate) Py_CLEAR(tstate->context); #ifdef _Py_TIER2 - ((_PyThreadStateImpl *)(tstate))->jit_executor_state.jit = false; - _Py_ClearExecutorDeletionList(tstate); + FT_ATOMIC_STORE_CHAR_RELAXED(((_PyThreadStateImpl *)(tstate))->jit_executor_state.jit, false); // We can't clear the cold executors here until the interpreter // clear process starts. From 527aac16fda441fb689c2782ef2ff3af3fb4d860 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 15:28:36 +0000 Subject: [PATCH 27/40] Remove atomics from _CHECK_VALIDITY --- Python/bytecodes.c | 16 ++++++++++++++-- Python/executor_cases.c.h | 2 +- Python/generated_cases.c.h | 2 +- Python/optimizer.c | 10 +++++----- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index d4f3a035a71549..981c40960c97ba 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3021,7 +3021,7 @@ dummy_func( _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; // We are responsible for cleaning up after ourselves. 
- if (!FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid)) { + if (!executor->vm_data.valid) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; next_instr = this_instr; @@ -5289,7 +5289,19 @@ dummy_func( } tier2 op(_CHECK_VALIDITY, (--)) { - DEOPT_IF(!FT_ATOMIC_LOAD_UINT8(current_executor->vm_data.valid)); + // This doesn't need atomics (for now) as there is only a single time + // where a write from another thread is possible: + // when a new thread is spawned and it invalidates all current + // executors. + // The new thread can only be created by an executing uop prior to the + // _CHECK_VALIDITY check. New thread creation is synchronized by + // locking of the runtime, and the current thread is naturally + // paused/waiting for the new thread to be created. Thus, + // there is a strict happens-before relation between that + // uop's invalidation of validity and this check. + // So for now, while the JIT does not run on multiple threads, + // it is safe for this to be nona-atomic. + DEOPT_IF(!current_executor->vm_data.valid); } tier2 pure op(_LOAD_CONST_INLINE, (ptr/4 -- value)) { diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index f639651b22227f..b2d413ca6d64db 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -7147,7 +7147,7 @@ } case _CHECK_VALIDITY: { - if (!FT_ATOMIC_LOAD_UINT8(current_executor->vm_data.valid)) { + if (!current_executor->vm_data.valid) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index c9107837da22a7..8f6ca4c738483f 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -5477,7 +5477,7 @@ } PyCodeObject *code = _PyFrame_GetCode(frame); _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - if (!FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid)) { + if (!executor->vm_data.valid) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; next_instr = this_instr; diff --git a/Python/optimizer.c b/Python/optimizer.c index 9a2c8341617cf8..d09c09316d780d 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1504,7 +1504,7 @@ unlink_executor(_PyExecutorObject *executor) return; } _PyExecutorLinkListNode *links = &executor->vm_data.links; - assert(FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid) == 0); + assert(executor->vm_data.valid == 0); _PyExecutorObject *next = links->next; _PyExecutorObject *prev = links->previous; if (next != NULL) { @@ -1629,7 +1629,7 @@ static int executor_clear(PyObject *op) { _PyExecutorObject *executor = _PyExecutorObject_CAST(op); - assert(FT_ATOMIC_LOAD_UINT8_RELAXED(executor->vm_data.valid) == 0); + assert(executor->vm_data.valid == 0); unlink_executor(executor); _PyExecutorObject *cold = _PyExecutor_GetColdExecutor(); @@ -1669,7 +1669,7 @@ invalidate_sub_executors(_PyThreadStateImpl *tstate, _PyExecutorObject *executor if (!executor->vm_data.valid) { return; } - FT_ATOMIC_STORE_UINT8(executor->vm_data.valid, 0); + executor->vm_data.valid = 0; for (uint32_t i = 0; i < executor->exit_count; i++) { _PyExecutorObject *next = executor->exits[i].executor; if (next != tstate->jit_executor_state.cold_dynamic_executor && next != tstate->jit_executor_state.cold_executor) { @@ -1729,7 +1729,7 @@ _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidat FT_ATOMIC_STORE_UINT8(((_PyThreadStateImpl *)p)->jit_tracer_state.prev_state.dependencies_still_valid, 0); for (_PyExecutorObject 
*exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { assert(exec->tstate == p); - FT_ATOMIC_STORE_UINT8(exec->vm_data.valid, 0); + exec->vm_data.valid = 0; // Let it be cleared up by cold executor invalidation later. exec->vm_data.warm = 0; if (is_invalidation) { @@ -1768,7 +1768,7 @@ _Py_Executors_CollectCold(PyThreadState *tstate) _PyExecutorObject *next = exec->vm_data.links.next; if (!exec->vm_data.warm || !exec->vm_data.valid) { - FT_ATOMIC_STORE_UINT8(exec->vm_data.valid, 0); + exec->vm_data.valid = 0; if (PyList_Append(invalidate, (PyObject *)exec) < 0) { goto error; } From b80c02e6d434dc73a871962946b0404b32915d52 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 15:52:20 +0000 Subject: [PATCH 28/40] fix comment --- Python/bytecodes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 981c40960c97ba..e46ac093d53a1e 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -5289,6 +5289,7 @@ dummy_func( } tier2 op(_CHECK_VALIDITY, (--)) { + // For FT: // This doesn't need atomics (for now) as there is only a single time // where a write from another thread is possible: // when a new thread is spawned and it invalidates all current From 4278c9d50a420d8e888bd1bbbcc6392ac5f8726f Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 20:08:08 +0000 Subject: [PATCH 29/40] fix typo --- Python/bytecodes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index e46ac093d53a1e..eb6b91e2b36284 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -5301,7 +5301,7 @@ dummy_func( // there is a strict happens-before relation between that // uop's invalidation of validity and this check. // So for now, while the JIT does not run on multiple threads, - // it is safe for this to be nona-atomic. + // it is safe for this to be non-atomic. 
DEOPT_IF(!current_executor->vm_data.valid); } From d84215d375854e24574a1e47d4dd634f6beddcb1 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 23:01:08 +0000 Subject: [PATCH 30/40] Re-enable the type/function reverse cache --- Lib/test/test_capi/test_opt.py | 10 ---------- Objects/funcobject.c | 24 +++++++++++++++++------- Objects/typeobject.c | 13 ++++++++++--- Python/optimizer.c | 4 ++-- 4 files changed, 29 insertions(+), 22 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index ef7e8f63be023c..c3aa3eb658ba04 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -64,7 +64,6 @@ def iter_opnames(ex): def get_opnames(ex): return list(iter_opnames(ex)) -FT_TEST_DISABLED_REVERSE_TYPE_CACHE = "FT build reverse type cache is disabled for now" @requires_jit_enabled class TestExecutorInvalidation(unittest.TestCase): @@ -1186,7 +1185,6 @@ def test_modified_local_is_seen_by_optimized_code(self): self.assertIs(type(s), float) self.assertEqual(s, 1024.0) - @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_guard_type_version_removed(self): def thing(a): x = 0 @@ -1205,7 +1203,6 @@ class Foo: guard_type_version_count = opnames.count("_GUARD_TYPE_VERSION") self.assertEqual(guard_type_version_count, 1) - @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_guard_type_version_removed_inlined(self): """ Verify that the guard type version if we have an inlined function @@ -1232,7 +1229,6 @@ class Foo: guard_type_version_count = opnames.count("_GUARD_TYPE_VERSION") self.assertEqual(guard_type_version_count, 1) - @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_guard_type_version_removed_invalidation(self): def thing(a): @@ -1263,7 +1259,6 @@ class Bar: self.assertEqual(opnames[:load_attr_top].count("_GUARD_TYPE_VERSION"), 1) self.assertEqual(opnames[call:load_attr_bottom].count("_CHECK_VALIDITY"), 2) - @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_guard_type_version_removed_escaping(self): def thing(a): @@ -1287,7 +1282,6 @@ class Foo: self.assertEqual(opnames[:load_attr_top].count("_GUARD_TYPE_VERSION"), 1) self.assertEqual(opnames[call:load_attr_bottom].count("_CHECK_VALIDITY"), 2) - @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions/classes") def test_guard_type_version_executor_invalidated(self): """ Verify that the executor is invalided on a type change. 
@@ -2334,7 +2328,6 @@ def testfunc(n): self.assertNotIn("_TO_BOOL_BOOL", uops) self.assertIn("_GUARD_IS_TRUE_POP", uops) - @unittest.skipIf(Py_GIL_DISABLED, "FT build only specializes on deferred methods/functions/classes") def test_set_type_version_sets_type(self): class C: A = 1 @@ -2367,7 +2360,6 @@ def testfunc(n): self.assertNotIn("_LOAD_SMALL_INT", uops) self.assertIn("_LOAD_CONST_INLINE_BORROW", uops) - @unittest.skipIf(Py_GIL_DISABLED, FT_TEST_DISABLED_REVERSE_TYPE_CACHE) def test_cached_attributes(self): class C: A = 1 @@ -2506,7 +2498,6 @@ def testfunc(n): self.assertIn("_POP_TOP_NOP", uops) - @unittest.skipIf(Py_GIL_DISABLED, "FT build immortalizes constants") def test_pop_top_specialize_int(self): def testfunc(n): for _ in range(n): @@ -2520,7 +2511,6 @@ def testfunc(n): self.assertIn("_POP_TOP_INT", uops) - @unittest.skipIf(Py_GIL_DISABLED, "FT build immortalizes constants") def test_pop_top_specialize_float(self): def testfunc(n): for _ in range(n): diff --git a/Objects/funcobject.c b/Objects/funcobject.c index fb503df328782a..f8a1b72c9f23e4 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -298,7 +298,7 @@ functions is running. */ -#ifndef Py_GIL_DISABLED +#if _Py_TIER2 static inline struct _func_version_cache_item * get_cache_item(PyInterpreterState *interp, uint32_t version) { @@ -315,11 +315,13 @@ _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) // This should only be called from MAKE_FUNCTION. No code is specialized // based on the version, so we do not need to stop the world to set it. func->func_version = version; -#ifndef Py_GIL_DISABLED +#if _Py_TIER2 PyInterpreterState *interp = _PyInterpreterState_GET(); + FT_MUTEX_LOCK(&interp->func_state.mutex); struct _func_version_cache_item *slot = get_cache_item(interp, version); slot->func = func; slot->code = func->func_code; + FT_MUTEX_UNLOCK(&interp->func_state.mutex); #endif } @@ -330,13 +332,15 @@ func_clear_version(PyInterpreterState *interp, PyFunctionObject *func) // Version was never set or has already been cleared. return; } -#ifndef Py_GIL_DISABLED +#if _Py_TIER2 + FT_MUTEX_LOCK(&interp->func_state.mutex); struct _func_version_cache_item *slot = get_cache_item(interp, func->func_version); if (slot->func == func) { slot->func = NULL; // Leave slot->code alone, there may be use for it. } + FT_MUTEX_UNLOCK(&interp->func_state.mutex); #endif func->func_version = FUNC_VERSION_CLEARED; } @@ -358,8 +362,9 @@ _PyFunction_ClearVersion(PyFunctionObject *func) void _PyFunction_ClearCodeByVersion(uint32_t version) { -#ifndef Py_GIL_DISABLED +#if _Py_TIER2 PyInterpreterState *interp = _PyInterpreterState_GET(); + FT_MUTEX_LOCK(&interp->func_state.mutex); struct _func_version_cache_item *slot = get_cache_item(interp, version); if (slot->code) { assert(PyCode_Check(slot->code)); @@ -369,15 +374,17 @@ _PyFunction_ClearCodeByVersion(uint32_t version) slot->func = NULL; } } + FT_MUTEX_UNLOCK(&interp->func_state.mutex); #endif } PyFunctionObject * _PyFunction_LookupByVersion(uint32_t version, PyObject **p_code) { -#ifdef Py_GIL_DISABLED - return NULL; -#else +#if _Py_TIER2 + // This function does not need locking/atomics as it can only be + // called from the optimizer, which is currently disabled + // when there are multiple threads. 
PyInterpreterState *interp = _PyInterpreterState_GET(); struct _func_version_cache_item *slot = get_cache_item(interp, version); if (slot->code) { @@ -401,6 +408,9 @@ _PyFunction_LookupByVersion(uint32_t version, PyObject **p_code) uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) { + // This function does not need locking/atomics as it can only be + // called from the optimizer, which is currently disabled + // when there are multiple threads. return func->func_version; } diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 61bcc21ce13d47..33a94ca17e74ad 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -1148,8 +1148,9 @@ static void set_version_unlocked(PyTypeObject *tp, unsigned int version) { assert(version == 0 || (tp->tp_versions_used != _Py_ATTR_CACHE_UNUSED)); -#ifndef Py_GIL_DISABLED +#if _Py_TIER2 PyInterpreterState *interp = _PyInterpreterState_GET(); + BEGIN_TYPE_LOCK(); // lookup the old version and set to null if (tp->tp_version_tag != 0) { PyTypeObject **slot = @@ -1157,6 +1158,8 @@ set_version_unlocked(PyTypeObject *tp, unsigned int version) + (tp->tp_version_tag % TYPE_VERSION_CACHE_SIZE); *slot = NULL; } +#endif +#ifndef Py_GIL_DISABLED if (version) { tp->tp_versions_used++; } @@ -1166,13 +1169,14 @@ set_version_unlocked(PyTypeObject *tp, unsigned int version) } #endif FT_ATOMIC_STORE_UINT_RELAXED(tp->tp_version_tag, version); -#ifndef Py_GIL_DISABLED +#if _Py_TIER2 if (version != 0) { PyTypeObject **slot = interp->types.type_version_cache + (version % TYPE_VERSION_CACHE_SIZE); *slot = tp; } + END_TYPE_LOCK(); #endif } @@ -1357,9 +1361,12 @@ _PyType_SetVersion(PyTypeObject *tp, unsigned int version) PyTypeObject * _PyType_LookupByVersion(unsigned int version) { -#ifdef Py_GIL_DISABLED +#ifndef _Py_TIER2 return NULL; #else + // This function does not need locking/atomics as it can only be + // called from the optimizer, which is currently disabled + // when there are multiple threads. 
PyInterpreterState *interp = _PyInterpreterState_GET();
     PyTypeObject **slot
         = interp->types.type_version_cache
diff --git a/Python/optimizer.c b/Python/optimizer.c
index d09c09316d780d..fdf90dbc4a17b5 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -1655,12 +1655,12 @@ jit_tracer_invalidate_dependency(PyThreadState *tstate, void *obj)
     _Py_BloomFilter_Init(&obj_filter);
     _Py_BloomFilter_Add(&obj_filter, obj);
     _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
-    PyMutex_Lock(&_tstate->jit_tracer_state.lock);
+    FT_MUTEX_LOCK(&_tstate->jit_tracer_state.lock);
     if (bloom_filter_may_contain(&_tstate->jit_tracer_state.prev_state.dependencies,
                                  &obj_filter)) {
         FT_ATOMIC_STORE_UINT8(_tstate->jit_tracer_state.prev_state.dependencies_still_valid, 0);
-    }
-    PyMutex_Unlock(&_tstate->jit_tracer_state.lock);
+    }
+    FT_MUTEX_UNLOCK(&_tstate->jit_tracer_state.lock);
 }
 
 void

From b08ef603fe3e0d12cbc4e3f3456e9fc45b2d5a20 Mon Sep 17 00:00:00 2001
From: Ken Jin
Date: Sun, 16 Nov 2025 23:12:19 +0000
Subject: [PATCH 31/40] allow deferred things to be borrowed too

---
 Python/optimizer_analysis.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c
index 8d7b734e17cb0b..e08deef4746b49 100644
--- a/Python/optimizer_analysis.c
+++ b/Python/optimizer_analysis.c
@@ -114,7 +114,7 @@ convert_global_to_const(_PyUOpInstruction *inst, PyObject *obj, bool pop)
     if (res == NULL) {
         return NULL;
     }
-    if (_Py_IsImmortal(res)) {
+    if (_Py_IsImmortal(res) || _PyObject_HasDeferredRefcount(res)) {
         inst->opcode = pop ? _POP_TOP_LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE_BORROW;
     }
     else {
@@ -233,7 +233,7 @@ lookup_attr(JitOptContext *ctx, _PyUOpInstruction *this_instr,
     if (type && PyType_Check(type)) {
         PyObject *lookup = _PyType_Lookup(type, name);
         if (lookup) {
-            int opcode = _Py_IsImmortal(lookup) ? immortal : mortal;
+            int opcode = _Py_IsImmortal(lookup) || _PyObject_HasDeferredRefcount(lookup) ?
immortal : mortal; REPLACE_OP(this_instr, opcode, 0, (uintptr_t)lookup); return sym_new_const(ctx, lookup); } From c2c8fbe5dc99f2b2fef66acc2414f6814d4fe2fa Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 23:13:32 +0000 Subject: [PATCH 32/40] fix warnings --- Objects/funcobject.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Objects/funcobject.c b/Objects/funcobject.c index f8a1b72c9f23e4..45b8315d294056 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -402,6 +402,8 @@ _PyFunction_LookupByVersion(uint32_t version, PyObject **p_code) return slot->func; } return NULL; +#else + return NULL; #endif } From 162b1ec5ea5c6684dd0511e2b9958bc2f72b8457 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 23:15:40 +0000 Subject: [PATCH 33/40] skip bad tests --- Lib/test/test_capi/test_opt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index c3aa3eb658ba04..7bdcb739e2771d 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -2498,6 +2498,7 @@ def testfunc(n): self.assertIn("_POP_TOP_NOP", uops) + @unittest.skipIf(Py_GIL_DISABLED, "FT might immortalize this.") def test_pop_top_specialize_int(self): def testfunc(n): for _ in range(n): @@ -2511,6 +2512,7 @@ def testfunc(n): self.assertIn("_POP_TOP_INT", uops) + @unittest.skipIf(Py_GIL_DISABLED, "FT might immortalize this.") def test_pop_top_specialize_float(self): def testfunc(n): for _ in range(n): From e0890ff8bd2f4734bfb23ce5f874ffb9a62cb8f7 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 16 Nov 2025 23:15:40 +0000 Subject: [PATCH 34/40] skip bad tests From 197e9f4dbf1797747c03d62940f5400971e169a8 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Mon, 17 Nov 2025 12:08:17 +0000 Subject: [PATCH 35/40] reduce diff --- Include/internal/pycore_optimizer.h | 4 ---- Include/internal/pycore_tstate.h | 1 - Python/bytecodes.c | 14 ++------------ Python/optimizer.c | 9 +-------- 4 files changed, 3 insertions(+), 25 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 3dcdc5c95a31f6..cb6a9758a41535 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -24,11 +24,7 @@ typedef struct _PyExecutorLinkListNode { typedef struct { uint8_t opcode; uint8_t oparg; -#ifdef Py_GIL_DISABLED - uint8_t valid; -#else uint8_t valid:1; -#endif uint8_t linked:1; uint8_t chain_depth:6; // Must be big enough for MAX_CHAIN_DEPTH - 1. bool warm; diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index bb4ac5bc2af7c0..ebdba5c79ff8bc 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -48,7 +48,6 @@ typedef struct _PyJitTracerPreviousState { } _PyJitTracerPreviousState; typedef struct _PyJitTracerState { - PyMutex lock; _PyUOpInstruction *code_buffer; _PyJitTracerInitialState initial_state; _PyJitTracerPreviousState prev_state; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index eb6b91e2b36284..1f36e873487d15 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3020,19 +3020,9 @@ dummy_func( PyCodeObject *code = _PyFrame_GetCode(frame); _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - // We are responsible for cleaning up after ourselves. 
-            if (!executor->vm_data.valid) {
-                opcode = executor->vm_data.opcode;
-                oparg = (oparg & ~255) | executor->vm_data.oparg;
-                next_instr = this_instr;
-                Py_BEGIN_CRITICAL_SECTION(code);
-                _Py_ExecutorDetach(executor);
-                Py_END_CRITICAL_SECTION();
-                DISPATCH_GOTO();
-            }
-            assert(tstate->current_executor == NULL);
             assert(executor->vm_data.index == INSTR_OFFSET() - 1);
             assert(executor->vm_data.code == code);
+            assert(tstate->current_executor == NULL);
             /* If the eval breaker is set then stay in tier 1.
              * This avoids any potentially infinite loops
              * involving _RESUME_CHECK */
@@ -3045,7 +3035,7 @@ dummy_func(
                 }
                 DISPATCH_GOTO();
             }
-            assert(executor != ((_PyThreadStateImpl *)tstate)->jit_executor_state.cold_executor);
+            assert(executor != tstate->interp->cold_executor);
             tstate->jit_exit = NULL;
             TIER1_TO_TIER2(executor);
             #else
diff --git a/Python/optimizer.c b/Python/optimizer.c
index fdf90dbc4a17b5..b3938354de5b44 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -941,15 +941,11 @@ _PyJit_TryInitializeTracing(
         return 0;
     }
-    if (!PyMutex_LockFast(&_tstate->jit_tracer_state.lock)) {
-        return 0;
-    }
     if (_tstate->jit_tracer_state.code_buffer == NULL) {
         _tstate->jit_tracer_state.code_buffer = (_PyUOpInstruction *)_PyObject_VirtualAlloc(UOP_BUFFER_SIZE);
         if (_tstate->jit_tracer_state.code_buffer == NULL) {
             // Don't error, just go to next instruction.
-            PyMutex_Unlock(&_tstate->jit_tracer_state.lock);
             return 0;
         }
     }
@@ -997,7 +993,6 @@ _PyJit_TryInitializeTracing(
         FT_ATOMIC_STORE_UINT16_RELAXED(close_loop_instr[1].counter.value_and_backoff, zero.value_and_backoff);
     }
     _Py_BloomFilter_Init(&_tstate->jit_tracer_state.prev_state.dependencies);
-    PyMutex_Unlock(&_tstate->jit_tracer_state.lock);
     return 1;
 }
 
@@ -1655,12 +1650,10 @@ jit_tracer_invalidate_dependency(PyThreadState *tstate, void *obj)
     _Py_BloomFilter_Init(&obj_filter);
     _Py_BloomFilter_Add(&obj_filter, obj);
     _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
-    FT_MUTEX_LOCK(&_tstate->jit_tracer_state.lock);
     if (bloom_filter_may_contain(&_tstate->jit_tracer_state.prev_state.dependencies,
                                  &obj_filter)) {
         FT_ATOMIC_STORE_UINT8(_tstate->jit_tracer_state.prev_state.dependencies_still_valid, 0);
     }
-    FT_MUTEX_UNLOCK(&_tstate->jit_tracer_state.lock);
 }
 
 void

From 46413cf25a16608c9dfe6a65969e0bf1089af5cf Mon Sep 17 00:00:00 2001
From: Ken Jin
Date: Mon, 17 Nov 2025 12:40:14 +0000
Subject: [PATCH 36/40] Reduce diff to minimal

---
 Include/cpython/code.h                    |   1 -
 Include/internal/pycore_interp_structs.h  |   8 +
 Include/internal/pycore_opcode_metadata.h |   2 +-
 Include/internal/pycore_optimizer.h       |  15 +-
 Include/internal/pycore_tstate.h          |  14 -
 Objects/codeobject.c                      |   7 -
 Python/bytecodes.c                        |   2 +-
 Python/ceval_gil.c                        |   4 +-
 Python/gc.c                               |   2 -
 Python/gc_free_threading.c                |   2 -
 Python/generated_cases.c.h                |  17 +-
 Python/instrumentation.c                  |   4 +-
 Python/optimizer.c                        | 381 ++++++++++------------
 Python/pylifecycle.c                      |   5 +-
 Python/pystate.c                          |  77 ++---
 Python/sysmodule.c                        |   2 +-
 16 files changed, 236 insertions(+), 307 deletions(-)

diff --git a/Include/cpython/code.h b/Include/cpython/code.h
index bbb12951601c53..84456a709a6abe 100644
--- a/Include/cpython/code.h
+++ b/Include/cpython/code.h
@@ -16,7 +16,6 @@ typedef struct {
 } _PyCoCached;
 
 typedef struct {
-    uint8_t is_finalizing;
     int size;
     int capacity;
     struct _PyExecutorObject *executors[1];
diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
index 7147f016f9cb07..9671f3457e6272 100644
--- a/Include/internal/pycore_interp_structs.h
+++
b/Include/internal/pycore_interp_structs.h @@ -930,6 +930,14 @@ struct _is { struct types_state types; struct callable_cache callable_cache; PyObject *common_consts[NUM_COMMON_CONSTANTS]; + uint8_t jit; + bool compiling; + struct _PyExecutorObject *executor_list_head; + struct _PyExecutorObject *executor_deletion_list_head; + struct _PyExecutorObject *cold_executor; + struct _PyExecutorObject *cold_dynamic_executor; + int executor_deletion_list_remaining_capacity; + size_t executor_creation_counter; _rare_events rare_events; PyDict_WatchCallback builtins_dict_watcher; diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 437508be4dd8b1..548627dc7982ec 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1150,7 +1150,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[267] = { [END_ASYNC_FOR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG | HAS_UNPREDICTABLE_JUMP_FLAG | HAS_NEEDS_GUARD_IP_FLAG }, [END_FOR] = { true, INSTR_FMT_IX, HAS_ESCAPES_FLAG | HAS_NO_SAVE_IP_FLAG }, [END_SEND] = { true, INSTR_FMT_IX, HAS_ESCAPES_FLAG | HAS_PURE_FLAG }, - [ENTER_EXECUTOR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ESCAPES_FLAG }, + [ENTER_EXECUTOR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, [EXIT_INIT_CHECK] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [EXTENDED_ARG] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, [FORMAT_SIMPLE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index cb6a9758a41535..ad038b090c4024 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -45,7 +45,6 @@ typedef struct _PyExitData { typedef struct _PyExecutorObject { PyObject_VAR_HEAD - PyThreadState *tstate; const _PyUOpInstruction *trace; _PyVMData vm_data; /* Used by the VM, but opaque to the optimizer */ uint32_t exit_count; @@ -74,16 +73,14 @@ PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj); #ifdef _Py_TIER2 PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation); -PyAPI_FUNC(void) _Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState *interp, void *obj, int is_invalidation); PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation); -PyAPI_FUNC(void) _Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidation); -PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyThreadState *tstate); -PyAPI_FUNC(void) _Py_Executors_InvalidateColdGC(PyInterpreterState *interp); +PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp); + #else # define _Py_Executors_InvalidateDependency(A, B, C) ((void)0) # define _Py_Executors_InvalidateAll(A, B) ((void)0) # define _Py_Executors_InvalidateCold(A) ((void)0) -# define _Py_Executors_InvalidateColdGC(A) ((void)0) + #endif // Used as the threshold to trigger executor invalidation when @@ -361,7 +358,9 @@ static inline int is_terminator(const _PyUOpInstruction *uop) extern void _PyExecutor_Free(_PyExecutorObject *self); PyAPI_FUNC(int) _PyDumpExecutors(FILE *out); - +#ifdef _Py_TIER2 +extern void _Py_ClearExecutorDeletionList(PyInterpreterState *interp); +#endif int _PyJit_translate_single_bytecode_to_trace(PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *next_instr, 
int stop_tracing_opcode); @@ -376,4 +375,4 @@ void _PyJit_FinalizeTracing(PyThreadState *tstate); #ifdef __cplusplus } #endif -#endif /* !Py_INTERNAL_OPTIMIZER_H */ +#endif /* !Py_INTERNAL_OPTIMIZER_H */ \ No newline at end of file diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index ebdba5c79ff8bc..042f4d77fb19c1 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -52,20 +52,7 @@ typedef struct _PyJitTracerState { _PyJitTracerInitialState initial_state; _PyJitTracerPreviousState prev_state; } _PyJitTracerState; - -#endif - -typedef struct _PyJitExecutorState { - char jit; -#if _Py_TIER2 - struct _PyExecutorObject *executor_list_head; - struct _PyExecutorObject *executor_deletion_list_head; - struct _PyExecutorObject *cold_executor; - struct _PyExecutorObject *cold_dynamic_executor; - int executor_deletion_list_remaining_capacity; - size_t executor_creation_counter; #endif -} _PyJitExecutorState; // Every PyThreadState is actually allocated as a _PyThreadStateImpl. The // PyThreadState fields are exposed as part of the C API, although most fields @@ -134,7 +121,6 @@ typedef struct _PyThreadStateImpl { #if _Py_TIER2 _PyJitTracerState jit_tracer_state; #endif - _PyJitExecutorState jit_executor_state; } _PyThreadStateImpl; #ifdef __cplusplus diff --git a/Objects/codeobject.c b/Objects/codeobject.c index c6b132138a7e91..db9d891f9217c0 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2185,13 +2185,6 @@ _PyCode_ReturnsOnlyNone(PyCodeObject *co) static void clear_executors(PyCodeObject *co) { -#ifdef Py_GIL_DISABLED - uint8_t expected = 0; - if (!_Py_atomic_compare_exchange_uint8(&co->co_executors->is_finalizing, &expected, 1)) { - // Another thread is already finalizing this. - return; - } -#endif assert(co->co_executors); for (int i = 0; i < co->co_executors->size; i++) { if (co->co_executors->executors[i]) { diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 1f36e873487d15..ef3195086e95f6 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2941,7 +2941,7 @@ dummy_func( specializing tier1 op(_SPECIALIZE_JUMP_BACKWARD, (--)) { #if ENABLE_SPECIALIZATION_FT if (this_instr->op.code == JUMP_BACKWARD) { - uint8_t desired = FT_ATOMIC_LOAD_CHAR_RELAXED(((_PyThreadStateImpl*)tstate)->jit_executor_state.jit) ? JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; + uint8_t desired = FT_ATOMIC_LOAD_UINT8(tstate->interp->jit) ? 
JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, desired); // Need to re-dispatch so the warmup counter isn't off by one: next_instr = this_instr; diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c index f012d0d777e362..e42c5f237e45b3 100644 --- a/Python/ceval_gil.c +++ b/Python/ceval_gil.c @@ -1398,8 +1398,8 @@ _Py_HandlePending(PyThreadState *tstate) #ifdef _Py_TIER2 if ((breaker & _PY_EVAL_JIT_INVALIDATE_COLD_BIT) != 0) { _Py_unset_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT); - _Py_Executors_InvalidateCold(tstate); - ((_PyThreadStateImpl*)tstate)->jit_executor_state.executor_creation_counter = JIT_CLEANUP_THRESHOLD; + _Py_Executors_InvalidateCold(tstate->interp); + tstate->interp->executor_creation_counter = JIT_CLEANUP_THRESHOLD; } #endif diff --git a/Python/gc.c b/Python/gc.c index a8226e69277312..03a5d7366ea6c9 100644 --- a/Python/gc.c +++ b/Python/gc.c @@ -12,7 +12,6 @@ #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_tuple.h" // _PyTuple_MaybeUntrack() #include "pycore_weakref.h" // _PyWeakref_ClearRef() -#include "pycore_optimizer.h" // _Py_Executors_InvalidateColdGC #include "pydtrace.h" @@ -1736,7 +1735,6 @@ gc_collect_full(PyThreadState *tstate, gcstate->old[1].count = 0; completed_scavenge(gcstate); _PyGC_ClearAllFreeLists(tstate->interp); - _Py_Executors_InvalidateColdGC(tstate->interp); validate_spaces(gcstate); add_stats(gcstate, 2, stats); } diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index e49175bbf128ff..5e9d14b8c4acd1 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -2311,8 +2311,6 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, } _PyEval_StartTheWorld(interp); - _Py_Executors_InvalidateColdGC(interp); - if (err < 0) { cleanup_worklist(&state->unreachable); cleanup_worklist(&state->legacy_finalizers); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 8f6ca4c738483f..9b54c5a8df73e2 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -5477,20 +5477,9 @@ } PyCodeObject *code = _PyFrame_GetCode(frame); _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; - if (!executor->vm_data.valid) { - opcode = executor->vm_data.opcode; - oparg = (oparg & ~255) | executor->vm_data.oparg; - next_instr = this_instr; - _PyFrame_SetStackPointer(frame, stack_pointer); - Py_BEGIN_CRITICAL_SECTION(code); - _Py_ExecutorDetach(executor); - Py_END_CRITICAL_SECTION(); - stack_pointer = _PyFrame_GetStackPointer(frame); - DISPATCH_GOTO(); - } - assert(tstate->current_executor == NULL); assert(executor->vm_data.index == INSTR_OFFSET() - 1); assert(executor->vm_data.code == code); + assert(tstate->current_executor == NULL); if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; @@ -5500,7 +5489,7 @@ } DISPATCH_GOTO(); } - assert(executor != ((_PyThreadStateImpl *)tstate)->jit_executor_state.cold_executor); + assert(executor != tstate->interp->cold_executor); tstate->jit_exit = NULL; TIER1_TO_TIER2(executor); #else @@ -7600,7 +7589,7 @@ { #if ENABLE_SPECIALIZATION_FT if (this_instr->op.code == JUMP_BACKWARD) { - uint8_t desired = FT_ATOMIC_LOAD_CHAR_RELAXED(((_PyThreadStateImpl*)tstate)->jit_executor_state.jit) ? JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; + uint8_t desired = FT_ATOMIC_LOAD_UINT8(tstate->interp->jit) ? 
JUMP_BACKWARD_JIT : JUMP_BACKWARD_NO_JIT; FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, desired); next_instr = this_instr; DISPATCH_SAME_OPARG(); diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 2cf0581651f6f1..26aac602260313 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1785,7 +1785,7 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != NULL) { _PyCode_Clear_Executors(code); } - _Py_Executors_InvalidateDependencyLockHeld(interp, code, 1); + _Py_Executors_InvalidateDependency(interp, code, 1); #endif int code_len = (int)Py_SIZE(code); /* Exit early to avoid creating instrumentation @@ -2030,7 +2030,7 @@ _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events) } set_global_version(tstate, new_version); #ifdef _Py_TIER2 - _Py_Executors_InvalidateAllLockHeld(interp, 1); + _Py_Executors_InvalidateAll(interp, 1); #endif return instrument_all_executing_code_objects(interp); } diff --git a/Python/optimizer.c b/Python/optimizer.c index b3938354de5b44..8f29c850fa5ae5 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -81,7 +81,6 @@ get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr) if (new == NULL) { return -1; } - new->is_finalizing = 0; new->capacity = new_capacity; new->size = size; code->co_executors = new; @@ -93,6 +92,7 @@ get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr) static void insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor) { + Py_INCREF(executor); if (instr->op.code == ENTER_EXECUTOR) { assert(index == instr->op.arg); _Py_ExecutorDetach(code->co_executors->executors[index]); @@ -120,14 +120,6 @@ uop_optimize(_PyInterpreterFrame *frame, PyThreadState *tstate, _PyExecutorObject **exec_ptr, bool progress_needed); -#if Py_GIL_DISABLED -#define LOCK_CODE(code) PyMutex_Lock(&((PyObject *)code)->ob_mutex) -#define UNLOCK_CODE(code) PyMutex_Unlock(&((PyObject *)code)->ob_mutex) -#else -#define LOCK_CODE(code) ((void)(code)) -#define UNLOCK_CODE(code) ((void)(code)) -#endif - /* Returns 1 if optimized, 0 if not optimized, and -1 for an error. * If optimized, *executor_ptr contains a new reference to the executor */ @@ -138,15 +130,18 @@ _PyOptimizer_Optimize( { _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; int chain_depth = _tstate->jit_tracer_state.initial_state.chain_depth; - if (!FT_ATOMIC_LOAD_CHAR_RELAXED(_tstate->jit_executor_state.jit)) { - // gh-140936: It is possible that jit_executor_state.jit will become false during + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (!interp->jit) { + // gh-140936: It is possible that interp->jit will become false during // interpreter finalization. However, the specialized JUMP_BACKWARD_JIT // instruction may still be present. In this case, we should // return immediately without optimization. return 0; } + assert(!interp->compiling); assert(_tstate->jit_tracer_state.initial_state.stack_depth >= 0); assert(_tstate->jit_tracer_state.initial_state.func != NULL); + interp->compiling = true; // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must* // make progress in order to avoid infinite loops or excessively-long // side-exit chains. 
We can only insert the executor into the bytecode if @@ -154,21 +149,20 @@ _PyOptimizer_Optimize( chain_depth %= MAX_CHAIN_DEPTH; bool progress_needed = chain_depth == 0; PyCodeObject *code = (PyCodeObject *)_tstate->jit_tracer_state.initial_state.code; - LOCK_CODE(code); _Py_CODEUNIT *start = _tstate->jit_tracer_state.initial_state.start_instr; if (progress_needed && !has_space_for_executor(code, start)) { - UNLOCK_CODE(code); + interp->compiling = false; return 0; } // One of our dependencies while tracing was invalidated. Not worth compiling. if (!_tstate->jit_tracer_state.prev_state.dependencies_still_valid) { - UNLOCK_CODE(code); + interp->compiling = false; return 0; } _PyExecutorObject *executor; int err = uop_optimize(frame, tstate, &executor, progress_needed); if (err <= 0) { - UNLOCK_CODE(code); + interp->compiling = false; return err; } assert(executor != NULL); @@ -182,7 +176,7 @@ _PyOptimizer_Optimize( * it might get confused by the executor disappearing, * but there is not much we can do about that here. */ Py_DECREF(executor); - UNLOCK_CODE(code); + interp->compiling = false; return 0; } insert_executor(code, start, index, executor); @@ -196,7 +190,7 @@ _PyOptimizer_Optimize( } executor->vm_data.chain_depth = chain_depth; assert(executor->vm_data.valid); - UNLOCK_CODE(code); + interp->compiling = false; return 1; } @@ -208,6 +202,7 @@ get_executor_lock_held(PyCodeObject *code, int offset) if (_PyCode_CODE(code)[i].op.code == ENTER_EXECUTOR && i*2 == offset) { int oparg = _PyCode_CODE(code)[i].op.arg; _PyExecutorObject *res = code->co_executors->executors[oparg]; + Py_INCREF(res); return res; } i += _PyInstruction_GetLength(code, i); @@ -256,10 +251,57 @@ _PyExecutor_Free(_PyExecutorObject *self) #ifdef _Py_JIT _PyJIT_Free(self); #endif - PyObject_GC_Del(self); } +void +_Py_ClearExecutorDeletionList(PyInterpreterState *interp) +{ + _PyRuntimeState *runtime = &_PyRuntime; + HEAD_LOCK(runtime); + PyThreadState* ts = PyInterpreterState_ThreadHead(interp); + HEAD_UNLOCK(runtime); + while (ts) { + _PyExecutorObject *current = (_PyExecutorObject *)ts->current_executor; + if (current != NULL) { + /* Anything in this list will be unlinked, so we can reuse the + * linked field as a reachability marker. */ + current->vm_data.linked = 1; + } + HEAD_LOCK(runtime); + ts = PyThreadState_Next(ts); + HEAD_UNLOCK(runtime); + } + _PyExecutorObject **prev_to_next_ptr = &interp->executor_deletion_list_head; + _PyExecutorObject *exec = *prev_to_next_ptr; + while (exec != NULL) { + if (exec->vm_data.linked) { + // This executor is currently executing + exec->vm_data.linked = 0; + prev_to_next_ptr = &exec->vm_data.links.next; + } + else { + *prev_to_next_ptr = exec->vm_data.links.next; + _PyExecutor_Free(exec); + } + exec = *prev_to_next_ptr; + } + interp->executor_deletion_list_remaining_capacity = EXECUTOR_DELETE_LIST_MAX; +} + +static void +add_to_pending_deletion_list(_PyExecutorObject *self) +{ + PyInterpreterState *interp = PyInterpreterState_Get(); + self->vm_data.links.next = interp->executor_deletion_list_head; + interp->executor_deletion_list_head = self; + if (interp->executor_deletion_list_remaining_capacity > 0) { + interp->executor_deletion_list_remaining_capacity--; + } + else { + _Py_ClearExecutorDeletionList(interp); + } +} static void uop_dealloc(PyObject *op) { @@ -269,7 +311,7 @@ uop_dealloc(PyObject *op) { unlink_executor(self); // Once unlinked it becomes impossible to invalidate an executor, so do it here. 
self->vm_data.valid = 0; - PyObject_GC_Del(self); + add_to_pending_deletion_list(self); } const char * @@ -561,7 +603,7 @@ _PyJit_translate_single_bytecode_to_trace( // Rewind EXTENDED_ARG so that we see the whole thing. // We must point to the first EXTENDED_ARG when deopting. int oparg = _tstate->jit_tracer_state.prev_state.instr_oparg; - int opcode = FT_ATOMIC_LOAD_UINT8_RELAXED(this_instr->op.code); + int opcode = this_instr->op.code; int rewind_oparg = oparg; while (rewind_oparg > 255) { rewind_oparg >>= 8; @@ -935,13 +977,6 @@ _PyJit_TryInitializeTracing( if (oparg > 0xFFFF) { return 0; } - - PyObject *func = PyStackRef_AsPyObjectBorrow(frame->f_funcobj); - if (func == NULL) { - return 0; - } - - if (_tstate->jit_tracer_state.code_buffer == NULL) { _tstate->jit_tracer_state.code_buffer = (_PyUOpInstruction *)_PyObject_VirtualAlloc(UOP_BUFFER_SIZE); if (_tstate->jit_tracer_state.code_buffer == NULL) { @@ -949,6 +984,10 @@ _PyJit_TryInitializeTracing( return 0; } } + PyObject *func = PyStackRef_AsPyObjectBorrow(frame->f_funcobj); + if (func == NULL) { + return 0; + } PyCodeObject *code = _PyFrame_GetCode(frame); #ifdef Py_DEBUG char *python_lltrace = Py_GETENV("PYTHON_LLTRACE"); @@ -978,7 +1017,7 @@ _PyJit_TryInitializeTracing( _tstate->jit_tracer_state.initial_state.stack_depth = curr_stackdepth; _tstate->jit_tracer_state.initial_state.chain_depth = chain_depth; _tstate->jit_tracer_state.prev_state.instr_frame = frame; - _tstate->jit_tracer_state.prev_state.dependencies_still_valid = 1; + _tstate->jit_tracer_state.prev_state.dependencies_still_valid = true; _tstate->jit_tracer_state.prev_state.instr_code = (PyCodeObject *)Py_NewRef(_PyFrame_GetCode(frame)); _tstate->jit_tracer_state.prev_state.instr = curr_instr; _tstate->jit_tracer_state.prev_state.instr_frame = frame; @@ -989,8 +1028,7 @@ _PyJit_TryInitializeTracing( _tstate->jit_tracer_state.initial_state.jump_backward_instr = curr_instr; if (_PyOpcode_Caches[_PyOpcode_Deopt[close_loop_instr->op.code]]) { - _Py_BackoffCounter zero = trigger_backoff_counter();; - FT_ATOMIC_STORE_UINT16_RELAXED(close_loop_instr[1].counter.value_and_backoff, zero.value_and_backoff); + close_loop_instr[1].counter = trigger_backoff_counter(); } _Py_BloomFilter_Init(&_tstate->jit_tracer_state.prev_state.dependencies); return 1; @@ -1142,7 +1180,6 @@ allocate_executor(int exit_count, int length) res->trace = (_PyUOpInstruction *)(res->exits + exit_count); res->code_size = length; res->exit_count = exit_count; - res->tstate = _PyThreadState_GET(); return res; } @@ -1278,13 +1315,10 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil // from being immediately detected as cold and invalidated. executor->vm_data.warm = true; if (_PyJIT_Compile(executor, executor->trace, length)) { - executor->vm_data.warm = false; + Py_DECREF(executor); return NULL; } #endif - // Executors are cleared by coldness or invalidation. - // Thus there's no point for them to be refcounted. 
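
Stepping back to the trigger_backoff_counter() call in _PyJit_TryInitializeTracing above: tier-1 warmup counters pack an exponential backoff and a countdown value into one uint16_t, and "triggering" a counter just zeroes the value so the next decrement fires immediately. A minimal sketch of that packing, assuming the 12-bit value / 4-bit backoff split described in pycore_backoff.h (treat the exact widths as an assumption of this sketch, not a stable API):

    #include <stdbool.h>
    #include <stdint.h>

    #define BACKOFF_BITS 4
    #define MAX_BACKOFF  12

    typedef struct { uint16_t value_and_backoff; } BackoffCounter;

    static BackoffCounter
    make_counter(uint16_t value, uint16_t backoff)
    {
        BackoffCounter c;
        c.value_and_backoff = (uint16_t)((value << BACKOFF_BITS) | backoff);
        return c;
    }

    /* A zero value fires on the very next check; this is what the
     * JUMP_BACKWARD cache gets so tracing restarts promptly. */
    static BackoffCounter
    trigger_counter(void)
    {
        return make_counter(0, 0);
    }

    /* Decrement-and-test, as run on every backward jump. */
    static bool
    counter_triggers(BackoffCounter *c)
    {
        if ((uint16_t)(c->value_and_backoff >> BACKOFF_BITS) == 0) {
            return true;
        }
        c->value_and_backoff -= (uint16_t)(1 << BACKOFF_BITS);
        return false;
    }

    /* After firing, wait exponentially longer before firing again. */
    static BackoffCounter
    restart_counter(BackoffCounter c)
    {
        uint16_t backoff = c.value_and_backoff & ((1 << BACKOFF_BITS) - 1);
        if (backoff < MAX_BACKOFF) {
            backoff++;
        }
        return make_counter((uint16_t)((1 << backoff) - 1), backoff);
    }
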
- _Py_SetImmortal((PyObject *)executor); _PyObject_GC_TRACK(executor); return executor; } @@ -1360,10 +1394,8 @@ uop_optimize( OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist); length = prepare_for_execution(buffer, length); assert(length <= UOP_MAX_TRACE_LENGTH); - HEAD_LOCK(&_PyRuntime); _PyExecutorObject *executor = make_executor_from_uops( buffer, length, dependencies, _tstate->jit_tracer_state.initial_state.chain_depth); - HEAD_UNLOCK(&_PyRuntime); if (executor == NULL) { return -1; } @@ -1371,7 +1403,7 @@ uop_optimize( // Check executor coldness // It's okay if this ends up going negative. - if (--((_PyThreadStateImpl *)tstate)->jit_executor_state.executor_creation_counter == 0) { + if (--tstate->interp->executor_creation_counter == 0) { _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT); } @@ -1472,11 +1504,11 @@ bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes) static void link_executor(_PyExecutorObject *executor) { - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + PyInterpreterState *interp = _PyInterpreterState_GET(); _PyExecutorLinkListNode *links = &executor->vm_data.links; - _PyExecutorObject *head = _tstate->jit_executor_state.executor_list_head; + _PyExecutorObject *head = interp->executor_list_head; if (head == NULL) { - _tstate->jit_executor_state.executor_list_head = executor; + interp->executor_list_head = executor; links->previous = NULL; links->next = NULL; } @@ -1485,11 +1517,11 @@ link_executor(_PyExecutorObject *executor) links->previous = NULL; links->next = head; head->vm_data.links.previous = executor; - _tstate->jit_executor_state.executor_list_head = executor; + interp->executor_list_head = executor; } executor->vm_data.linked = true; /* executor_list_head must be first in list */ - assert(_tstate->jit_executor_state.executor_list_head->vm_data.links.previous == NULL); + assert(interp->executor_list_head->vm_data.links.previous == NULL); } static void @@ -1499,7 +1531,7 @@ unlink_executor(_PyExecutorObject *executor) return; } _PyExecutorLinkListNode *links = &executor->vm_data.links; - assert(executor->vm_data.valid == 0); + assert(executor->vm_data.valid); _PyExecutorObject *next = links->next; _PyExecutorObject *prev = links->previous; if (next != NULL) { @@ -1510,9 +1542,9 @@ unlink_executor(_PyExecutorObject *executor) } else { // prev == NULL implies that executor is the list head - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)executor->tstate; - assert(_tstate->jit_executor_state.executor_list_head == executor); - _tstate->jit_executor_state.executor_list_head = next; + PyInterpreterState *interp = PyInterpreterState_Get(); + assert(interp->executor_list_head == executor); + interp->executor_list_head = next; } executor->vm_data.linked = false; } @@ -1531,9 +1563,9 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s _PyExecutorObject * _PyExecutor_GetColdExecutor(void) { - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); - if (_tstate->jit_executor_state.cold_executor != NULL) { - return _tstate->jit_executor_state.cold_executor; + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (interp->cold_executor != NULL) { + return interp->cold_executor; } _PyExecutorObject *cold = allocate_executor(0, 1); if (cold == NULL) { @@ -1552,17 +1584,17 @@ _PyExecutor_GetColdExecutor(void) } #endif _Py_SetImmortal((PyObject *)cold); - _tstate->jit_executor_state.cold_executor = cold; + 
interp->cold_executor = cold; return cold; } _PyExecutorObject * _PyExecutor_GetColdDynamicExecutor(void) { - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET(); - if (_tstate->jit_executor_state.cold_dynamic_executor != NULL) { - assert(_tstate->jit_executor_state.cold_dynamic_executor->trace[0].opcode == _COLD_DYNAMIC_EXIT); - return _tstate->jit_executor_state.cold_dynamic_executor; + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (interp->cold_dynamic_executor != NULL) { + assert(interp->cold_dynamic_executor->trace[0].opcode == _COLD_DYNAMIC_EXIT); + return interp->cold_dynamic_executor; } _PyExecutorObject *cold = allocate_executor(0, 1); if (cold == NULL) { @@ -1581,7 +1613,7 @@ _PyExecutor_GetColdDynamicExecutor(void) } #endif _Py_SetImmortal((PyObject *)cold); - _tstate->jit_executor_state.cold_dynamic_executor = cold; + interp->cold_dynamic_executor = cold; return cold; } @@ -1591,12 +1623,14 @@ _PyExecutor_ClearExit(_PyExitData *exit) if (exit == NULL) { return; } + _PyExecutorObject *old = exit->executor; if (exit->is_dynamic) { exit->executor = _PyExecutor_GetColdDynamicExecutor(); } else { exit->executor = _PyExecutor_GetColdExecutor(); } + Py_DECREF(old); } /* Detaches the executor from the code object (if any) that @@ -1608,7 +1642,6 @@ _Py_ExecutorDetach(_PyExecutorObject *executor) if (code == NULL) { return; } - Py_BEGIN_CRITICAL_SECTION(code); _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index]; assert(instruction->op.code == ENTER_EXECUTOR); int index = instruction->op.arg; @@ -1617,22 +1650,37 @@ _Py_ExecutorDetach(_PyExecutorObject *executor) instruction->op.arg = executor->vm_data.oparg; executor->vm_data.code = NULL; code->co_executors->executors[index] = NULL; - Py_END_CRITICAL_SECTION(); + Py_DECREF(executor); } static int executor_clear(PyObject *op) { _PyExecutorObject *executor = _PyExecutorObject_CAST(op); - assert(executor->vm_data.valid == 0); + if (!executor->vm_data.valid) { + return 0; + } + assert(executor->vm_data.valid == 1); unlink_executor(executor); + executor->vm_data.valid = 0; + /* It is possible for an executor to form a reference + * cycle with itself, so decref'ing a side exit could + * free the executor unless we hold a strong reference to it + */ _PyExecutorObject *cold = _PyExecutor_GetColdExecutor(); + _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor(); + Py_INCREF(executor); for (uint32_t i = 0; i < executor->exit_count; i++) { executor->exits[i].temperature = initial_unreachable_backoff_counter(); - executor->exits[i].executor = cold; + _PyExecutorObject *e = executor->exits[i].executor; + executor->exits[i].executor = NULL; + if (e != cold && e != cold_dynamic) { + executor_clear((PyObject *)e); + } } _Py_ExecutorDetach(executor); + Py_DECREF(executor); return 0; } @@ -1643,110 +1691,91 @@ _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj) _Py_BloomFilter_Add(&executor->vm_data.bloom, obj); } -static void -jit_tracer_invalidate_dependency(PyThreadState *tstate, void *obj) -{ - _PyBloomFilter obj_filter; - _Py_BloomFilter_Init(&obj_filter); - _Py_BloomFilter_Add(&obj_filter, obj); - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; - if (bloom_filter_may_contain(&_tstate->jit_tracer_state.prev_state.dependencies, &obj_filter)) - { - FT_ATOMIC_STORE_UINT8(_tstate->jit_tracer_state.prev_state.dependencies_still_valid, 0); - }; -} - -void -invalidate_sub_executors(_PyThreadStateImpl *tstate, _PyExecutorObject *executor) -{ - if 
(!executor->vm_data.valid) { - return; - } - executor->vm_data.valid = 0; - for (uint32_t i = 0; i < executor->exit_count; i++) { - _PyExecutorObject *next = executor->exits[i].executor; - if (next != tstate->jit_executor_state.cold_dynamic_executor && next != tstate->jit_executor_state.cold_executor) { - invalidate_sub_executors(tstate, next); - } - } - // Let it be cleared up by cold executor invalidation later. - executor->vm_data.warm = 0; -} +static void jit_tracer_invalidate_dependency(PyThreadState *tstate, void *obj); /* Invalidate all executors that depend on `obj` * May cause other executors to be invalidated as well - * To avoid deadlocks due to stop the world, we just invalidate the executors but leave them to be freed - * on their own later. */ void -_Py_Executors_InvalidateDependencyLockHeld(PyInterpreterState *interp, void *obj, int is_invalidation) +_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) { _PyBloomFilter obj_filter; _Py_BloomFilter_Init(&obj_filter); _Py_BloomFilter_Add(&obj_filter, obj); /* Walk the list of executors */ + /* TO DO -- Use a tree to avoid traversing as many objects */ + PyObject *invalidate = PyList_New(0); + if (invalidate == NULL) { + goto error; + } + + // It doesn't matter if we don't invalidate all threads. + // If more thread are spawned, we force the jit not to compile anyways + // so the trace gets abandoned. + jit_tracer_invalidate_dependency(_PyThreadState_GET(), obj); + /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ - assert(PyMutex_IsLocked(&(interp->runtime)->interpreters.mutex)); - _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { - jit_tracer_invalidate_dependency(p, obj); - for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) { - if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) { - invalidate_sub_executors((_PyThreadStateImpl *)p, exec); - if (is_invalidation) { - OPT_STAT_INC(executors_invalidated); - } - } - _PyExecutorObject *next = exec->vm_data.links.next; - exec = next; + for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + assert(exec->vm_data.valid); + _PyExecutorObject *next = exec->vm_data.links.next; + if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter) && + PyList_Append(invalidate, (PyObject *)exec)) + { + goto error; } + exec = next; } + for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { + PyObject *exec = PyList_GET_ITEM(invalidate, i); + executor_clear(exec); + if (is_invalidation) { + OPT_STAT_INC(executors_invalidated); + } + } + Py_DECREF(invalidate); + return; +error: + PyErr_Clear(); + Py_XDECREF(invalidate); + // If we're truly out of memory, wiping out everything is a fine fallback: + _Py_Executors_InvalidateAll(interp, is_invalidation); } -void -_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) +static void +jit_tracer_invalidate_dependency(PyThreadState *tstate, void *obj) { - HEAD_LOCK(interp->runtime); - _Py_Executors_InvalidateDependencyLockHeld(interp, obj, is_invalidation); - HEAD_UNLOCK(interp->runtime); + _PyBloomFilter obj_filter; + _Py_BloomFilter_Init(&obj_filter); + _Py_BloomFilter_Add(&obj_filter, obj); + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; + if (bloom_filter_may_contain(&_tstate->jit_tracer_state.prev_state.dependencies, &obj_filter)) + { + _tstate->jit_tracer_state.prev_state.dependencies_still_valid = 
false;
+    }
 }
 
-
+/* Invalidate all executors */
 void
-_Py_Executors_InvalidateAllLockHeld(PyInterpreterState *interp, int is_invalidation)
+_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
 {
-    /* Clearing an executor can deallocate others, so we need to make a list of
-     * executors to invalidate first */
-    _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) {
-        FT_ATOMIC_STORE_UINT8(((_PyThreadStateImpl *)p)->jit_tracer_state.prev_state.dependencies_still_valid, 0);
-        for (_PyExecutorObject *exec = ((_PyThreadStateImpl *)p)->jit_executor_state.executor_list_head; exec != NULL;) {
-            assert(exec->tstate == p);
-            exec->vm_data.valid = 0;
-            // Let it be cleared up by cold executor invalidation later.
-            exec->vm_data.warm = 0;
-            if (is_invalidation) {
-                OPT_STAT_INC(executors_invalidated);
-            }
-            _PyExecutorObject *next = exec->vm_data.links.next;
-            exec = next;
+    while (interp->executor_list_head) {
+        _PyExecutorObject *executor = interp->executor_list_head;
+        assert(executor->vm_data.valid == 1 && executor->vm_data.linked == 1);
+        if (executor->vm_data.code) {
+            // Clear the entire code object so its co_executors array can be freed:
+            _PyCode_Clear_Executors(executor->vm_data.code);
+        }
+        else {
+            executor_clear((PyObject *)executor);
+        }
+        if (is_invalidation) {
+            OPT_STAT_INC(executors_invalidated);
         }
     }
 }
 
 void
-_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
-{
-    HEAD_LOCK(&_PyRuntime);
-    _Py_Executors_InvalidateAllLockHeld(interp, is_invalidation);
-    HEAD_UNLOCK(&_PyRuntime);
-}
-
-// Unlike _PyExecutor_InvalidateDependency, this is not for correctness but memory savings.
-// Thus there is no need to lock the runtime or traverse everything. We simply make a
-// best-effort attempt to clean things up.
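
The dependency machinery above rests on a bloom-filter subset test: every executor accumulates the hash bits of each object it depends on, and _Py_Executors_InvalidateDependency asks, per executor, whether all bits of the invalidated object are set in that filter. False positives merely over-invalidate; false negatives cannot happen, which is the property correctness needs. A self-contained sketch; the word count and the pointer-mixing hash are illustrative choices, not the constants of the real _PyBloomFilter:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define WORDS 8

    typedef struct { uint32_t bits[WORDS]; } Bloom;

    static void
    bloom_init(Bloom *b)
    {
        memset(b->bits, 0, sizeof(b->bits));
    }

    /* Set a handful of probe bits derived from the pointer value. */
    static void
    bloom_add(Bloom *b, void *obj)
    {
        uint64_t h = (uint64_t)(uintptr_t)obj * 0x9E3779B97F4A7C15ull;
        for (int i = 0; i < 4; i++) {
            unsigned bit = (unsigned)(h & 0xFF) % (WORDS * 32);
            b->bits[bit / 32] |= (uint32_t)1 << (bit % 32);
            h >>= 8;
        }
    }

    /* True iff every bit of `sub` is also set in `super`. */
    static bool
    bloom_may_contain(const Bloom *super, const Bloom *sub)
    {
        for (int i = 0; i < WORDS; i++) {
            if ((super->bits[i] & sub->bits[i]) != sub->bits[i]) {
                return false;
            }
        }
        return true;
    }
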
-PyObject * -_Py_Executors_CollectCold(PyThreadState *tstate) +_Py_Executors_InvalidateCold(PyInterpreterState *interp) { /* Walk the list of executors */ /* TO DO -- Use a tree to avoid traversing as many objects */ @@ -1754,17 +1783,15 @@ _Py_Executors_CollectCold(PyThreadState *tstate) if (invalidate == NULL) { goto error; } - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)tstate; + /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ - for (_PyExecutorObject *exec = _tstate->jit_executor_state.executor_list_head; exec != NULL;) { + for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + assert(exec->vm_data.valid); _PyExecutorObject *next = exec->vm_data.links.next; - if (!exec->vm_data.warm || !exec->vm_data.valid) { - exec->vm_data.valid = 0; - if (PyList_Append(invalidate, (PyObject *)exec) < 0) { - goto error; - } + if (!exec->vm_data.warm && PyList_Append(invalidate, (PyObject *)exec) < 0) { + goto error; } else { exec->vm_data.warm = false; @@ -1772,69 +1799,19 @@ _Py_Executors_CollectCold(PyThreadState *tstate) exec = next; } - return invalidate; -error: - PyErr_Clear(); - Py_XDECREF(invalidate); - return NULL; -} - -void -_Py_Executors_ClearColdList(PyObject *invalidate) -{ - if (invalidate == NULL) { - return; - } for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { PyObject *exec = PyList_GET_ITEM(invalidate, i); executor_clear(exec); } Py_DECREF(invalidate); -} - -void -_Py_Executors_InvalidateCold(PyThreadState *tstate) -{ - _Py_Executors_ClearColdList(_Py_Executors_CollectCold(tstate)); -} - -void -_Py_Executors_InvalidateColdGC(PyInterpreterState *interp) -{ - PyObject *invalidate = PyList_New(0); - if (invalidate == NULL) { - PyErr_Clear(); - return; - } - _PyEval_StopTheWorld(interp); - _Py_FOR_EACH_TSTATE_BEGIN(interp, p) - { - PyObject *more_invalidate = _Py_Executors_CollectCold(p); - if (more_invalidate == NULL) { - goto error; - } - int err = PyList_Extend(invalidate, more_invalidate); - Py_DECREF(more_invalidate); - if (err) { - goto error; - } - } - _Py_FOR_EACH_TSTATE_END(interp); - _PyEval_StartTheWorld(interp); - // We can only clear the list after unlocking the runtime. - // Otherwise, it may deadlock. - _Py_Executors_ClearColdList(invalidate); return; error: PyErr_Clear(); - Py_DECREF(invalidate); - // Invalidate all the executors if we run out of memory. - // This means the next run will remove all executors. 
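
_Py_Executors_InvalidateCold, rewritten above, is a two-sweep aging scheme rather than a precise liveness test: executing an executor sets vm_data.warm, and each sweep collects the executors that stayed cold since the previous sweep while stripping the flag from the rest. A sketch with simplified stand-in types; the real code additionally batches the cold executors into a PyList before calling executor_clear(), because clearing one executor can free others mid-walk:

    #include <stdbool.h>
    #include <stddef.h>

    typedef struct Exec {
        struct Exec *next;
        bool warm;             /* set whenever the executor actually runs */
    } Exec;

    /* Unlink cold executors from *head onto a to-free list, age the rest. */
    static Exec *
    collect_cold(Exec **head)
    {
        Exec *to_free = NULL;
        Exec **link = head;
        for (Exec *e = *link; e != NULL; e = *link) {
            if (!e->warm) {
                *link = e->next;   /* cold since the last sweep: collect */
                e->next = to_free;
                to_free = e;
            }
            else {
                e->warm = false;   /* must re-warm before the next sweep */
                link = &e->next;
            }
        }
        return to_free;
    }
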
-    _Py_Executors_InvalidateAllLockHeld(interp, 0);
-    HEAD_UNLOCK(interp->runtime);
+    Py_XDECREF(invalidate);
+    // If we're truly out of memory, wiping out everything is a fine fallback
+    _Py_Executors_InvalidateAll(interp, 0);
 }
 
-
 static void
 write_str(PyObject *str, FILE *out)
 {
@@ -1946,8 +1923,8 @@ _PyDumpExecutors(FILE *out)
 {
     fprintf(out, "digraph ideal {\n\n");
     fprintf(out, "    rankdir = \"LR\"\n\n");
-    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl*)_PyThreadState_GET();
-    for (_PyExecutorObject *exec = _tstate->jit_executor_state.executor_list_head; exec != NULL;) {
+    PyInterpreterState *interp = PyInterpreterState_Get();
+    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
         executor_to_gv(exec, out);
         exec = exec->vm_data.links.next;
     }
@@ -1971,4 +1948,4 @@ _PyExecutor_Free(struct _PyExecutorObject *self)
     Py_UNREACHABLE();
 }
 
-#endif /* _Py_TIER2 */
+#endif /* _Py_TIER2 */
\ No newline at end of file
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 24da1ecf8fc8fa..b3048c64d21f73 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1361,7 +1361,7 @@ init_interp_main(PyThreadState *tstate)
         }
         else
 #endif
         {
-            ((_PyThreadStateImpl *)(tstate))->jit_executor_state.jit = true;
+            interp->jit = 1;
         }
     }
 }
@@ -1718,7 +1718,8 @@ finalize_modules(PyThreadState *tstate)
     PyInterpreterState *interp = tstate->interp;
 
     // Invalidate all executors and turn off JIT:
-    ((_PyThreadStateImpl *)tstate)->jit_executor_state.jit = false;
+    interp->jit = 0;
+    interp->compiling = false;
 #ifdef _Py_TIER2
     _Py_Executors_InvalidateAll(interp, 0);
 #endif
diff --git a/Python/pystate.c b/Python/pystate.c
index ca19146321d092..150ddc357f54a9 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -566,6 +566,12 @@ init_interpreter(PyInterpreterState *interp,
         interp->monitoring_tool_versions[t] = 0;
     }
     interp->_code_object_generation = 0;
+    interp->jit = 0;
+    interp->compiling = false;
+    interp->executor_list_head = NULL;
+    interp->executor_deletion_list_head = NULL;
+    interp->executor_deletion_list_remaining_capacity = 0;
+    interp->executor_creation_counter = JIT_CLEANUP_THRESHOLD;
     if (interp != &runtime->_main_interpreter) {
         /* Fix the self-referential, statically initialized fields. */
         interp->dtoa = (struct _dtoa_state)_dtoa_state_INIT(interp);
@@ -792,46 +798,39 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
     Py_CLEAR(interp->after_forkers_child);
 #endif
 
+
+#ifdef _Py_TIER2
+    _Py_ClearExecutorDeletionList(interp);
+#endif
     _PyAST_Fini(interp);
     _PyAtExit_Fini(interp);
+
     // All Python types must be destroyed before the last GC collection. Python
     // types create a reference cycle to themselves in their
     // PyTypeObject.tp_mro member (the tuple contains the type).
 
     /* Last garbage collection on this interpreter */
     _PyGC_CollectNoFail(tstate);
-
-#ifdef _Py_TIER2
-    // This can only be done after the last GC of the interpreter, as we may
-    // have pending executors that point to cold executors that expect it
-    // not to have been freed.
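
The removed comment above states the ordering constraint that the new interpreter_clear code (next hunk) preserves: the cold executors are the fallback target of every cleared side exit, so they must outlive the interpreter's last GC pass, which can still run executors. A sketch of that safe ordering; Interp, Exec, last_gc_collect and exec_free are hypothetical stand-ins, not the real pystate.c entry points:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdlib.h>

    typedef struct Exec { bool valid, warm; } Exec;
    typedef struct {
        Exec *cold_executor;
        Exec *cold_dynamic_executor;
    } Interp;

    /* Placeholder for the final collection: it may still execute code,
     * and therefore still jump through the cold executors. */
    static void last_gc_collect(Interp *interp) { (void)interp; }

    static void exec_free(Exec *e) { free(e); }

    static void
    interp_teardown(Interp *interp)
    {
        /* 1. Run the last GC while the cold executors are still alive. */
        last_gc_collect(interp);

        /* 2. Only now is it safe to drop them. */
        Exec *cold = interp->cold_executor;
        if (cold != NULL) {
            interp->cold_executor = NULL;
            assert(cold->valid && cold->warm);
            exec_free(cold);
        }
        Exec *cold_dynamic = interp->cold_dynamic_executor;
        if (cold_dynamic != NULL) {
            interp->cold_dynamic_executor = NULL;
            assert(cold_dynamic->valid && cold_dynamic->warm);
            exec_free(cold_dynamic);
        }
    }
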
- _Py_FOR_EACH_TSTATE_BEGIN(interp, p) { - HEAD_UNLOCK(runtime); - struct _PyExecutorObject *cold = ((_PyThreadStateImpl *)(p))->jit_executor_state.cold_executor; - if (cold != NULL) { - ((_PyThreadStateImpl *)(p))->jit_executor_state.cold_executor = NULL; - assert(cold->vm_data.valid); - assert(cold->vm_data.warm); - _PyExecutor_Free(cold); - } - - struct _PyExecutorObject *cold_dynamic = ((_PyThreadStateImpl *)(p))->jit_executor_state.cold_dynamic_executor; - if (cold_dynamic != NULL) { - ((_PyThreadStateImpl *)(p))->jit_executor_state.cold_dynamic_executor = NULL; - assert(cold_dynamic->vm_data.valid); - assert(cold_dynamic->vm_data.warm); - _PyExecutor_Free(cold_dynamic); - } - HEAD_LOCK(runtime); - } - _Py_FOR_EACH_TSTATE_END(interp); -#endif - _PyGC_Fini(interp); // Finalize warnings after last gc so that any finalizers can // access warnings state _PyWarnings_Fini(interp); + struct _PyExecutorObject *cold = interp->cold_executor; + if (cold != NULL) { + interp->cold_executor = NULL; + assert(cold->vm_data.valid); + assert(cold->vm_data.warm); + _PyExecutor_Free(cold); + } + + struct _PyExecutorObject *cold_dynamic = interp->cold_dynamic_executor; + if (cold_dynamic != NULL) { + interp->cold_dynamic_executor = NULL; + assert(cold_dynamic->vm_data.valid); + assert(cold_dynamic->vm_data.warm); + _PyExecutor_Free(cold_dynamic); + } /* We don't clear sysdict and builtins until the end of this function. Because clearing other attributes can execute arbitrary Python code which requires sysdict and builtins. */ @@ -1504,13 +1503,6 @@ init_threadstate(_PyThreadStateImpl *_tstate, #ifdef _Py_TIER2 _tstate->jit_tracer_state.code_buffer = NULL; - _tstate->jit_executor_state.jit = false; - _tstate->jit_executor_state.executor_list_head = NULL; - _tstate->jit_executor_state.executor_deletion_list_head = NULL; - _tstate->jit_executor_state.executor_deletion_list_remaining_capacity = 0; - _tstate->jit_executor_state.executor_creation_counter = JIT_CLEANUP_THRESHOLD; - _tstate->jit_executor_state.cold_executor = NULL; - _tstate->jit_executor_state.cold_dynamic_executor = NULL; #endif tstate->delete_later = NULL; @@ -1540,12 +1532,8 @@ add_threadstate(PyInterpreterState *interp, PyThreadState *tstate, // There's more than one thread. In FT mode, // disable the JIT completely for now. if (next != NULL) { - _Py_Executors_InvalidateAllLockHeld(interp, 1); - } - PyThreadState *curr = interp->threads.head; - while (curr != NULL) { - _Py_atomic_store_char_relaxed(&((_PyThreadStateImpl*)curr)->jit_executor_state.jit, false); - curr = curr->next; + _Py_Executors_InvalidateAll(interp, 1); + FT_ATOMIC_STORE_UINT8(interp->jit, 0); } #endif } @@ -1756,13 +1744,6 @@ PyThreadState_Clear(PyThreadState *tstate) Py_CLEAR(tstate->context); -#ifdef _Py_TIER2 - FT_ATOMIC_STORE_CHAR_RELAXED(((_PyThreadStateImpl *)(tstate))->jit_executor_state.jit, false); - - // We can't clear the cold executors here until the interpreter - // clear process starts. -#endif - #ifdef Py_GIL_DISABLED // Each thread should clear own freelists in free-threading builds. struct _Py_freelists *freelists = _Py_freelists_GET(); @@ -1847,7 +1828,7 @@ tstate_delete_common(PyThreadState *tstate, int release_gil) // There's only one thread. Re-enable JIT. 
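
Taken together, add_threadstate and tstate_delete_common (continued below) implement the free-threaded policy: compile only while the interpreter is single-threaded. A compilable sketch of that toggle, using C11 stdatomic.h in place of CPython's FT_ATOMIC_* wrappers; Interp and Thread are simplified stand-ins:

    #include <stdatomic.h>
    #include <stddef.h>

    typedef struct Thread { struct Thread *prev, *next; } Thread;
    typedef struct {
        Thread *head;
        atomic_uchar jit;   /* 1 = compile new traces, 0 = tier 1 only */
    } Interp;

    static void
    on_thread_added(Interp *interp, Thread *t)
    {
        if (interp->head != NULL) {
            /* A second thread appeared: stop compiling (the real code
             * also invalidates all existing executors here). */
            atomic_store_explicit(&interp->jit, 0, memory_order_relaxed);
        }
        t->next = interp->head;
        t->prev = NULL;
        if (interp->head != NULL) {
            interp->head->prev = t;
        }
        interp->head = t;
    }

    static void
    on_thread_removed(Interp *interp, Thread *t)
    {
        if (t->prev != NULL) { t->prev->next = t->next; }
        else { interp->head = t->next; }
        if (t->next != NULL) { t->next->prev = t->prev; }
        /* Back down to one thread: safe to re-enable the JIT. */
        Thread *curr = interp->head;
        if (curr != NULL && curr->prev == NULL && curr->next == NULL) {
            atomic_store_explicit(&interp->jit, 1, memory_order_relaxed);
        }
    }
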
PyThreadState *curr = interp->threads.head; if (curr != NULL && curr->prev == NULL && curr->next == NULL) { - _Py_atomic_store_char_relaxed(&((_PyThreadStateImpl*)curr)->jit_executor_state.jit, true); + FT_ATOMIC_STORE_UINT8(interp->jit, 1); } #endif #endif diff --git a/Python/sysmodule.c b/Python/sysmodule.c index a70693bda2ad2a..9626370f5a8757 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -4114,7 +4114,7 @@ _jit_is_enabled_impl(PyObject *module) /*[clinic end generated code: output=55865f8de993fe42 input=0524151e857f4f3a]*/ { (void)module; - return ((_PyThreadStateImpl*)_PyThreadState_GET())->jit_executor_state.jit; + return _PyInterpreterState_GET()->jit; } /*[clinic input] From b5d6571268296ee32d5a2a2553a613752f01cc3e Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Mon, 17 Nov 2025 12:45:39 +0000 Subject: [PATCH 37/40] reduce diff further --- Include/internal/pycore_optimizer.h | 2 +- Include/internal/pycore_tstate.h | 2 +- Lib/test/test_capi/test_opt.py | 1 - Python/bytecodes.c | 2 +- Python/gc_free_threading.c | 1 - Python/instrumentation.c | 7 +------ Python/sysmodule.c | 2 +- 7 files changed, 5 insertions(+), 12 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index ad038b090c4024..c0f9f545b30799 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -375,4 +375,4 @@ void _PyJit_FinalizeTracing(PyThreadState *tstate); #ifdef __cplusplus } #endif -#endif /* !Py_INTERNAL_OPTIMIZER_H */ \ No newline at end of file +#endif /* !Py_INTERNAL_OPTIMIZER_H */ diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 042f4d77fb19c1..50048801b2e4ee 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -35,7 +35,7 @@ typedef struct _PyJitTracerInitialState { } _PyJitTracerInitialState; typedef struct _PyJitTracerPreviousState { - uint8_t dependencies_still_valid; + bool dependencies_still_valid; bool instr_is_super; int code_max_size; int code_curr_size; diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 7bdcb739e2771d..f336ae73bf1462 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -128,7 +128,6 @@ def f(): self.assertTrue(exe.is_valid()) sys._clear_internal_caches() self.assertFalse(exe.is_valid()) - gc.collect() exe = get_first_executor(f) self.assertIsNone(exe) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index ef3195086e95f6..8f391049c8dc1b 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3018,10 +3018,10 @@ dummy_func( goto stop_tracing; } PyCodeObject *code = _PyFrame_GetCode(frame); - _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; assert(executor->vm_data.index == INSTR_OFFSET() - 1); assert(executor->vm_data.code == code); + assert(executor->vm_data.valid); assert(tstate->current_executor == NULL); /* If the eval breaker is set then stay in tier 1. 
* This avoids any potentially infinite loops diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 5e9d14b8c4acd1..b183062eff7952 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -10,7 +10,6 @@ #include "pycore_interp.h" // PyInterpreterState.gc #include "pycore_interpframe.h" // _PyFrame_GetLocalsArray() #include "pycore_object_alloc.h" // _PyObject_MallocWithType() -#include "pycore_optimizer.h" #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_tstate.h" // _PyThreadStateImpl #include "pycore_tuple.h" // _PyTuple_MaybeUntrack() diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 26aac602260313..7c9f9381e41c54 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1928,9 +1928,7 @@ _Py_Instrument(PyCodeObject *code, PyInterpreterState *interp) { int res; LOCK_CODE(code); - HEAD_LOCK(interp->runtime); res = instrument_lock_held(code, interp); - HEAD_UNLOCK(interp->runtime); UNLOCK_CODE(); return res; } @@ -2064,10 +2062,7 @@ _PyMonitoring_SetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEvent } set_local_events(local, tool_id, events); - HEAD_LOCK(interp->runtime); - int res = force_instrument_lock_held(code, interp); - HEAD_UNLOCK(interp->runtime); - return res; + return force_instrument_lock_held(code, interp); } int diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 9626370f5a8757..1d9da9b0f0610b 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2365,7 +2365,7 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend) { #ifdef PY_HAVE_PERF_TRAMPOLINE #ifdef _Py_JIT - if (((_PyThreadStateImpl *)_PyThreadState_GET())->jit_executor_state.jit) { + if (FT_ATOMIC_LOAD_UINT8(_PyInterpreterState_GET()->jit)) { PyErr_SetString(PyExc_ValueError, "Cannot activate the perf trampoline if the JIT is active"); return NULL; } From a93c26c87d3a485b79e019dde82c879aed506792 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Mon, 17 Nov 2025 12:49:46 +0000 Subject: [PATCH 38/40] Reduce diff to near minimal --- Python/executor_cases.c.h | 38 ++++++++++---------- Python/generated_cases.c.h | 1 + Tools/cases_generator/generators_common.py | 26 +------------- Tools/cases_generator/optimizer_generator.py | 23 ++++++++++++ Tools/cases_generator/tier2_generator.py | 29 ++------------- 5 files changed, 46 insertions(+), 71 deletions(-) diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index b2d413ca6d64db..7dd38f03b3f7db 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -1780,12 +1780,12 @@ JUMP_TO_JUMP_TARGET(); } Py_ssize_t index = ((PyLongObject*)sub)->long_value.ob_digit[0]; - if (!1 && (list)) { + if (!LOCK_OBJECT(list)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } if (index >= PyList_GET_SIZE(list)) { - (void)(list); + UNLOCK_OBJECT(list); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -1796,7 +1796,7 @@ FT_ATOMIC_STORE_PTR_RELEASE(_PyList_ITEMS(list)[index], PyStackRef_AsPyObjectSteal(value)); assert(old_value != NULL); - (void)(list); + UNLOCK_OBJECT(list); PyStackRef_CLOSE_SPECIALIZED(sub_st, _PyLong_ExactDealloc); stack_pointer += -3; assert(WITHIN_STACK_BOUNDS()); @@ -2299,12 +2299,12 @@ values = &stack_pointer[-1]; PyObject *seq_o = PyStackRef_AsPyObjectBorrow(seq); assert(PyList_CheckExact(seq_o)); - if (!1 && (seq_o)) { + if (!LOCK_OBJECT(seq_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } if (PyList_GET_SIZE(seq_o) != oparg) { - (void)(seq_o); + UNLOCK_OBJECT(seq_o); if 
(true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -2315,7 +2315,7 @@ for (int i = oparg; --i >= 0; ) { *values++ = PyStackRef_FromPyObjectNew(items[i]); } - (void)(seq_o); + UNLOCK_OBJECT(seq_o); stack_pointer += -1 + oparg; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -3363,13 +3363,13 @@ uint32_t type_version = (uint32_t)CURRENT_OPERAND0(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); assert(type_version != 0); - if (!1 && (owner_o)) { + if (!LOCK_OBJECT(owner_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } PyTypeObject *tp = Py_TYPE(owner_o); if (FT_ATOMIC_LOAD_UINT_RELAXED(tp->tp_version_tag) != type_version) { - (void)(owner_o); + UNLOCK_OBJECT(owner_o); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3644,7 +3644,7 @@ assert(Py_TYPE(owner_o)->tp_flags & Py_TPFLAGS_INLINE_VALUES); if (_PyObject_GetManagedDict(owner_o) || !FT_ATOMIC_LOAD_UINT8(_PyObject_InlineValues(owner_o)->valid)) { - (void)(owner_o); + UNLOCK_OBJECT(owner_o); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3670,7 +3670,7 @@ Py_ssize_t index = value_ptr - values->values; _PyDictValues_AddToInsertionOrder(values, index); } - (void)(owner_o); + UNLOCK_OBJECT(owner_o); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -3694,7 +3694,7 @@ UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - if (!1 && (dict)) { + if (!LOCK_OBJECT(dict)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } @@ -3702,7 +3702,7 @@ PyObject *name = GETITEM(FRAME_CO_NAMES, oparg); if (hint >= (size_t)dict->ma_keys->dk_nentries || !DK_IS_UNICODE(dict->ma_keys)) { - (void)(dict); + UNLOCK_OBJECT(dict); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3710,7 +3710,7 @@ } PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dict->ma_keys) + hint; if (ep->me_key != name) { - (void)(dict); + UNLOCK_OBJECT(dict); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3718,7 +3718,7 @@ } PyObject *old_value = ep->me_value; if (old_value == NULL) { - (void)(dict); + UNLOCK_OBJECT(dict); if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); @@ -3728,7 +3728,7 @@ _PyDict_NotifyEvent(tstate->interp, PyDict_EVENT_MODIFIED, dict, name, PyStackRef_AsPyObjectBorrow(value)); stack_pointer = _PyFrame_GetStackPointer(frame); FT_ATOMIC_STORE_PTR_RELEASE(ep->me_value, PyStackRef_AsPyObjectSteal(value)); - (void)(dict); + UNLOCK_OBJECT(dict); STAT_INC(STORE_ATTR, hit); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); @@ -3746,7 +3746,7 @@ value = stack_pointer[-2]; uint16_t index = (uint16_t)CURRENT_OPERAND0(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); - if (!1 && (owner_o)) { + if (!LOCK_OBJECT(owner_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } @@ -3754,7 +3754,7 @@ STAT_INC(STORE_ATTR, hit); PyObject *old_value = *(PyObject **)addr; FT_ATOMIC_STORE_PTR_RELEASE(*(PyObject **)addr, PyStackRef_AsPyObjectSteal(value)); - (void)(owner_o); + UNLOCK_OBJECT(owner_o); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -6035,13 +6035,13 @@ callable = stack_pointer[-3]; assert(oparg == 1); PyObject *self_o = PyStackRef_AsPyObjectBorrow(self); - if (!1 && (self_o)) { + if (!LOCK_OBJECT(self_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } STAT_INC(CALL, hit); int err = _PyList_AppendTakeRef((PyListObject *)self_o, PyStackRef_AsPyObjectSteal(arg)); - (void)(self_o); + 
UNLOCK_OBJECT(self_o); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 9b54c5a8df73e2..c94bfa955c3a86 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -5479,6 +5479,7 @@ _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; assert(executor->vm_data.index == INSTR_OFFSET() - 1); assert(executor->vm_data.code == code); + assert(executor->vm_data.valid); assert(tstate->current_executor == NULL); if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { opcode = executor->vm_data.opcode; diff --git a/Tools/cases_generator/generators_common.py b/Tools/cases_generator/generators_common.py index 4bcf54df037a16..0b5f764ec52b45 100644 --- a/Tools/cases_generator/generators_common.py +++ b/Tools/cases_generator/generators_common.py @@ -135,30 +135,6 @@ def __init__(self, out: CWriter, labels: dict[str, Label], cannot_escape: bool = self.cannot_escape = cannot_escape self.jump_prefix = jump_prefix - def emit_to_with_replacement( - self, - out: CWriter, - tkn_iter: TokenIterator, - end: str, - uop: CodeSection, - storage: Storage, - inst: Instruction | None - ) -> Token: - parens = 0 - for tkn in tkn_iter: - if tkn.kind == end and parens == 0: - return tkn - if tkn.kind == "LPAREN": - parens += 1 - if tkn.kind == "RPAREN": - parens -= 1 - if tkn.text in self._replacers: - self._replacers[tkn.text](tkn, tkn_iter, uop, storage, inst) - else: - out.emit(tkn) - raise analysis_error(f"Expecting {end}. Reached end of file", tkn) - - def dispatch( self, tkn: Token, @@ -186,7 +162,7 @@ def deopt_if( lparen = next(tkn_iter) assert lparen.kind == "LPAREN" first_tkn = tkn_iter.peek() - self.emit_to_with_replacement(self.out, tkn_iter, "RPAREN", uop, storage, inst) + emit_to(self.out, tkn_iter, "RPAREN") self.emit(") {\n") next(tkn_iter) # Semi colon assert inst is not None diff --git a/Tools/cases_generator/optimizer_generator.py b/Tools/cases_generator/optimizer_generator.py index d29b5406aad1ec..41df073cf6df23 100644 --- a/Tools/cases_generator/optimizer_generator.py +++ b/Tools/cases_generator/optimizer_generator.py @@ -271,6 +271,29 @@ def __init__(self, out: CWriter, labels: dict[str, Label], original_uop: Uop, st self._replacers = {**self._replacers, **overrides} self.cannot_escape = True + def emit_to_with_replacement( + self, + out: CWriter, + tkn_iter: TokenIterator, + end: str, + uop: CodeSection, + storage: Storage, + inst: Instruction | None + ) -> Token: + parens = 0 + for tkn in tkn_iter: + if tkn.kind == end and parens == 0: + return tkn + if tkn.kind == "LPAREN": + parens += 1 + if tkn.kind == "RPAREN": + parens -= 1 + if tkn.text in self._replacers: + self._replacers[tkn.text](tkn, tkn_iter, uop, storage, inst) + else: + out.emit(tkn) + raise analysis_error(f"Expecting {end}. 
Reached end of file", tkn) + def emit_stackref_override( self, tkn: Token, diff --git a/Tools/cases_generator/tier2_generator.py b/Tools/cases_generator/tier2_generator.py index 950fad708a38d3..ac3e6b94afe49e 100644 --- a/Tools/cases_generator/tier2_generator.py +++ b/Tools/cases_generator/tier2_generator.py @@ -64,8 +64,6 @@ def __init__(self, out: CWriter, labels: dict[str, Label]): super().__init__(out, labels) self._replacers["oparg"] = self.oparg self._replacers["IP_OFFSET_OF"] = self.ip_offset_of - self._replacers["LOCK_OBJECT"] = self.lock_object - self._replacers["UNLOCK_OBJECT"] = self.unlock_object def goto_error(self, offset: int, storage: Storage) -> str: # To do: Add jump targets for popping values. @@ -86,7 +84,7 @@ def deopt_if( self.emit(lparen) assert lparen.kind == "LPAREN" first_tkn = tkn_iter.peek() - self.emit_to_with_replacement(self.out, tkn_iter, "RPAREN", uop, storage, inst) + emit_to(self.out, tkn_iter, "RPAREN") next(tkn_iter) # Semi colon self.emit(") {\n") self.emit("UOP_STAT_INC(uopcode, miss);\n") @@ -106,7 +104,7 @@ def exit_if( lparen = next(tkn_iter) self.emit(lparen) first_tkn = tkn_iter.peek() - self.emit_to_with_replacement(self.out, tkn_iter, "RPAREN", uop, storage, inst) + emit_to(self.out, tkn_iter, "RPAREN") next(tkn_iter) # Semi colon self.emit(") {\n") self.emit("UOP_STAT_INC(uopcode, miss);\n") @@ -156,29 +154,6 @@ def ip_offset_of( next(tkn_iter) return True - def lock_object( - self, - tkn: Token, - tkn_iter: TokenIterator, - uop: CodeSection, - storage: Storage, - inst: Instruction | None, - ) -> bool: - self.emit("1 && ") - return True - - def unlock_object( - self, - tkn: Token, - tkn_iter: TokenIterator, - uop: CodeSection, - storage: Storage, - inst: Instruction | None, - ) -> bool: - self.out.start_line() - self.emit("(void)") - return True - def write_uop(uop: Uop, emitter: Emitter, stack: Stack, offset_strs: dict[str, tuple[str, str]]) -> Stack: locals: dict[str, Local] = {} try: From 97d5f2bf5585db085a7fc90c8f8a0781a64493c4 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Mon, 17 Nov 2025 12:50:42 +0000 Subject: [PATCH 39/40] typo fix --- Python/optimizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 8f29c850fa5ae5..b32d6fa9e30a96 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1710,7 +1710,7 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is } // It doesn't matter if we don't invalidate all threads. - // If more thread are spawned, we force the jit not to compile anyways + // If more threads are spawned, we force the jit not to compile anyways // so the trace gets abandoned. 
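
The comment whose typo is fixed above is worth unpacking: a thread that is recording a trace keeps its own bloom filter of everything the trace depends on, and invalidating any matching object simply clears a flag, so the half-built trace is abandoned instead of compiled. Since only the current thread can be tracing (the JIT is disabled once multiple threads exist), notifying just _PyThreadState_GET() suffices. A minimal sketch; Bloom repeats the simplified filter from the earlier sketch, and TracerState stands in for the _PyJitTracerState fields:

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct { uint32_t bits[8]; } Bloom;

    typedef struct {
        Bloom trace_deps;              /* filled while recording a trace */
        bool dependencies_still_valid; /* cleared if a dependency dies */
    } TracerState;

    static bool
    bloom_may_contain(const Bloom *super, const Bloom *sub)
    {
        for (int i = 0; i < 8; i++) {
            if ((super->bits[i] & sub->bits[i]) != sub->bits[i]) {
                return false;
            }
        }
        return true;
    }

    /* Called with the bit pattern of each invalidated object. */
    static void
    tracer_invalidate_dependency(TracerState *ts, const Bloom *obj_filter)
    {
        if (bloom_may_contain(&ts->trace_deps, obj_filter)) {
            ts->dependencies_still_valid = false;  /* abandon this trace */
        }
    }
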
jit_tracer_invalidate_dependency(_PyThreadState_GET(), obj); From c381903fdf9cdb475572024efa7e2dca19c8091d Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Mon, 17 Nov 2025 14:53:56 +0000 Subject: [PATCH 40/40] Address review --- Objects/codeobject.c | 2 -- Objects/funcobject.c | 5 +++-- Python/optimizer.c | 36 +++++++++++++++++++++++------------- Python/optimizer_analysis.c | 4 ++-- Python/pystate.c | 14 ++++++-------- 5 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index db9d891f9217c0..b61d86d110fb1b 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2434,9 +2434,7 @@ code_dealloc(PyObject *self) #ifdef _Py_TIER2 _Py_Executors_InvalidateDependency(tstate->interp, self, 1); if (co->co_executors != NULL) { - Py_BEGIN_CRITICAL_SECTION(co); clear_executors(co); - Py_END_CRITICAL_SECTION(); } #endif diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 45b8315d294056..e11db261bc23f3 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -411,8 +411,9 @@ uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) { // This function does not need locking/atomics as it can only be - // called from the optimizer, which is currently disabled - // when there are multiple threads. + // called from the specializing interpreter or optimizer. + // The specializing interpreter holds a strong reference to the function. + // The optimizer is currently disabled when there are multiple threads. return func->func_version; } diff --git a/Python/optimizer.c b/Python/optimizer.c index b32d6fa9e30a96..f79c0d5c893cca 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -241,8 +241,9 @@ get_oparg(PyObject *self, PyObject *Py_UNUSED(ignored)) ///////////////////// Experimental UOp Optimizer ///////////////////// -static int executor_clear(PyObject *executor); -static void unlink_executor(_PyExecutorObject *executor); +static int executor_clear(PyInterpreterState *interp, PyObject *executor); +static int executor_clear_implicit_interp(PyObject *executor); +static void unlink_executor(PyInterpreterState *interp, _PyExecutorObject *executor); void @@ -308,7 +309,7 @@ uop_dealloc(PyObject *op) { _PyExecutorObject *self = _PyExecutorObject_CAST(op); _PyObject_GC_UNTRACK(self); assert(self->vm_data.code == NULL); - unlink_executor(self); + unlink_executor(_PyInterpreterState_GET(), self); // Once unlinked it becomes impossible to invalidate an executor, so do it here. self->vm_data.valid = 0; add_to_pending_deletion_list(self); @@ -464,7 +465,7 @@ PyTypeObject _PyUOpExecutor_Type = { .tp_as_sequence = &uop_as_sequence, .tp_methods = uop_executor_methods, .tp_traverse = executor_traverse, - .tp_clear = executor_clear, + .tp_clear = executor_clear_implicit_interp, .tp_is_gc = executor_is_gc, }; @@ -1525,7 +1526,7 @@ link_executor(_PyExecutorObject *executor) } static void -unlink_executor(_PyExecutorObject *executor) +unlink_executor(PyInterpreterState *interp, _PyExecutorObject *executor) { if (!executor->vm_data.linked) { return; @@ -1542,7 +1543,6 @@ unlink_executor(_PyExecutorObject *executor) } else { // prev == NULL implies that executor is the list head - PyInterpreterState *interp = PyInterpreterState_Get(); assert(interp->executor_list_head == executor); interp->executor_list_head = next; } @@ -1653,15 +1653,19 @@ _Py_ExecutorDetach(_PyExecutorObject *executor) Py_DECREF(executor); } +// Note: we must use the interp state supplied and pass it to +// unlink_executor. 
This function might be called when a new +// interpreter is being created/removed. Thus the interpreter +// state is inconsistent with where the executor actually belongs. static int -executor_clear(PyObject *op) +executor_clear(PyInterpreterState *interp, PyObject *op) { _PyExecutorObject *executor = _PyExecutorObject_CAST(op); if (!executor->vm_data.valid) { return 0; } assert(executor->vm_data.valid == 1); - unlink_executor(executor); + unlink_executor(interp, executor); executor->vm_data.valid = 0; /* It is possible for an executor to form a reference @@ -1675,8 +1679,8 @@ executor_clear(PyObject *op) executor->exits[i].temperature = initial_unreachable_backoff_counter(); _PyExecutorObject *e = executor->exits[i].executor; executor->exits[i].executor = NULL; - if (e != cold && e != cold_dynamic) { - executor_clear((PyObject *)e); + if (e != cold && e != cold_dynamic && e->vm_data.code == NULL) { + executor_clear(interp, (PyObject *)e); } } _Py_ExecutorDetach(executor); @@ -1684,6 +1688,12 @@ executor_clear(PyObject *op) return 0; } +static int +executor_clear_implicit_interp(PyObject *op) +{ + return executor_clear(_PyInterpreterState_GET(), op); +} + void _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj) { @@ -1728,7 +1738,7 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is } for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { PyObject *exec = PyList_GET_ITEM(invalidate, i); - executor_clear(exec); + executor_clear(interp, exec); if (is_invalidation) { OPT_STAT_INC(executors_invalidated); } @@ -1766,7 +1776,7 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) _PyCode_Clear_Executors(executor->vm_data.code); } else { - executor_clear((PyObject *)executor); + executor_clear(interp, (PyObject *)executor); } if (is_invalidation) { OPT_STAT_INC(executors_invalidated); @@ -1801,7 +1811,7 @@ _Py_Executors_InvalidateCold(PyInterpreterState *interp) } for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { PyObject *exec = PyList_GET_ITEM(invalidate, i); - executor_clear(exec); + executor_clear(interp, exec); } Py_DECREF(invalidate); return; diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index e08deef4746b49..e60e927aba6f21 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -226,14 +226,14 @@ eliminate_pop_guard(_PyUOpInstruction *this_instr, bool exit) static JitOptRef lookup_attr(JitOptContext *ctx, _PyUOpInstruction *this_instr, - PyTypeObject *type, PyObject *name, uint16_t immortal, + PyTypeObject *type, PyObject *name, uint16_t deferred_refcount, uint16_t mortal) { // The cached value may be dead, so we need to do the lookup again... :( if (type && PyType_Check(type)) { PyObject *lookup = _PyType_Lookup(type, name); if (lookup) { - int opcode = _Py_IsImmortal(lookup) || _PyObject_HasDeferredRefcount(lookup) ? immortal : mortal; + int opcode = _Py_IsImmortal(lookup) || _PyObject_HasDeferredRefcount(lookup) ? deferred_refcount : mortal; REPLACE_OP(this_instr, opcode, 0, (uintptr_t)lookup); return sym_new_const(ctx, lookup); } diff --git a/Python/pystate.c b/Python/pystate.c index 150ddc357f54a9..425e39a293bfc0 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1522,20 +1522,18 @@ add_threadstate(PyInterpreterState *interp, PyThreadState *tstate, { assert(interp->threads.head != tstate); if (next != NULL) { +#if defined(_Py_TIER2) && defined(Py_GIL_DISABLED) + FT_ATOMIC_STORE_UINT8(interp->jit, 0); + // There's more than one thread. 
In FT mode, + // disable the JIT completely for now. + _Py_Executors_InvalidateAll(interp, 1); +#endif assert(next->prev == NULL || next->prev == tstate); next->prev = tstate; } tstate->next = next; assert(tstate->prev == NULL); interp->threads.head = tstate; -#if defined(_Py_TIER2) && defined(Py_GIL_DISABLED) - // There's more than one thread. In FT mode, - // disable the JIT completely for now. - if (next != NULL) { - _Py_Executors_InvalidateAll(interp, 1); - FT_ATOMIC_STORE_UINT8(interp->jit, 0); - } -#endif } static PyThreadState *