Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-112529: Make the GC scheduling thread-safe #114880

Merged
merged 5 commits into from
Feb 16, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions Include/internal/pycore_gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,13 @@ struct _gc_runtime_state {
Py_ssize_t long_lived_pending;
};

#ifdef Py_GIL_DISABLED
// Per-thread GC state, embedded in _PyThreadStateImpl (free-threaded build
// only).
struct _gc_thread_state {
    /* Thread-local allocation count.  Buffers this thread's net count of
       GC-tracked allocations (incremented) and deallocations (decremented)
       so the shared generations[0].count only needs an atomic update when
       the buffered value reaches a flush threshold. */
    Py_ssize_t alloc_count;
};
#endif


extern void _PyGC_InitState(struct _gc_runtime_state *);

Expand Down
1 change: 1 addition & 0 deletions Include/internal/pycore_tstate.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ typedef struct _PyThreadStateImpl {
PyThreadState base;

#ifdef Py_GIL_DISABLED
struct _gc_thread_state gc;
struct _mimalloc_thread_state mimalloc;
struct _Py_freelist_state freelist_state;
#endif
Expand Down
3 changes: 2 additions & 1 deletion Lib/test/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
_testcapi = None

from test import support
from test.support import threading_helper
from test.support import threading_helper, Py_GIL_DISABLED
from test.support.script_helper import assert_python_ok


Expand Down Expand Up @@ -295,6 +295,7 @@ def gen():
assert_python_ok("-c", code)

@support.cpython_only
@unittest.skipIf(Py_GIL_DISABLED, "test requires precise GC scheduling")
def test_sneaky_frame_object(self):

def trace(frame, event, arg):
Expand Down
10 changes: 10 additions & 0 deletions Modules/gcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,16 @@
/*[clinic end generated code: output=354012e67b16398f input=a392794a08251751]*/
{
GCState *gcstate = get_gc_state();

#ifdef Py_GIL_DISABLED
_PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
struct _gc_thread_state *gc = &tstate->gc;

// Flush the local allocation count to the global count
_Py_atomic_add_int(&gcstate->generations[0].count, gc->alloc_count);

Check warning on line 210 in Modules/gcmodule.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / build and test (x64)

'function': conversion from 'Py_ssize_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\pythoncore.vcxproj]

Check warning on line 210 in Modules/gcmodule.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / build (arm64)

'function': conversion from 'Py_ssize_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\pythoncore.vcxproj]
gc->alloc_count = 0;
#endif

return Py_BuildValue("(iii)",
gcstate->generations[0].count,
gcstate->generations[1].count,
Expand Down
2 changes: 2 additions & 0 deletions Objects/typeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1739,6 +1739,8 @@ _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems)
if (presize) {
((PyObject **)alloc)[0] = NULL;
((PyObject **)alloc)[1] = NULL;
}
if (PyType_IS_GC(type)) {
_PyObject_GC_Link(obj);
}
ericsnowcurrently marked this conversation as resolved.
Show resolved Hide resolved
memset(obj, '\0', size);
Expand Down
63 changes: 48 additions & 15 deletions Python/gc_free_threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ typedef struct _gc_runtime_state GCState;
# define GC_DEBUG
#endif

// Each thread buffers the count of allocated objects in a thread-local
// variable up to +/- this amount to reduce the overhead of updating
// the global count.
#define LOCAL_ALLOC_COUNT_THRESHOLD 512

// Automatically choose the generation that needs collecting.
#define GENERATION_AUTO (-1)

Expand Down Expand Up @@ -923,6 +928,41 @@ gc_should_collect(GCState *gcstate)
gcstate->generations[1].threshold == 0);
}

static void
record_allocation(PyThreadState *tstate)
{
    // Batch allocations in a thread-local counter so that the shared
    // generation-0 count only needs an atomic update once per
    // LOCAL_ALLOC_COUNT_THRESHOLD allocations, instead of on every one.
    struct _gc_thread_state *gc_ts = &((_PyThreadStateImpl *)tstate)->gc;

    if (++gc_ts->alloc_count < LOCAL_ALLOC_COUNT_THRESHOLD) {
        return;
    }

    // Flush the buffered count into the shared generation-0 counter.
    // TODO: Use Py_ssize_t for the generation count.
    GCState *gcstate = &tstate->interp->gc;
    _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc_ts->alloc_count);
    gc_ts->alloc_count = 0;

    // Schedule a collection if the generation-0 threshold has been crossed
    // and no collection is already in progress.
    if (gc_should_collect(gcstate)
        && !_Py_atomic_load_int_relaxed(&gcstate->collecting))
    {
        _Py_ScheduleGC(tstate->interp);
    }
}

static void
record_deallocation(PyThreadState *tstate)
{
    // Mirror of record_allocation(): buffer deallocations thread-locally and
    // only push the (negative) total to the shared generation-0 count once
    // the threshold is reached.  Unlike allocation, deallocation never
    // schedules a collection.
    struct _gc_thread_state *gc_ts = &((_PyThreadStateImpl *)tstate)->gc;

    gc_ts->alloc_count -= 1;
    if (gc_ts->alloc_count > -LOCAL_ALLOC_COUNT_THRESHOLD) {
        return;
    }

    GCState *gcstate = &tstate->interp->gc;
    _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc_ts->alloc_count);
    gc_ts->alloc_count = 0;
}

static void
gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
{
Expand All @@ -942,6 +982,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
}
}

// Record the number of live GC objects
interp->gc.long_lived_total = state->long_lived_total;

// Clear weakrefs and enqueue callbacks (but do not call them).
clear_weakrefs(state);
_PyEval_StartTheWorld(interp);
Expand Down Expand Up @@ -1048,7 +1091,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)

m = state.collected;
n = state.uncollectable;
gcstate->long_lived_total = state.long_lived_total;

if (gcstate->debug & _PyGC_DEBUG_STATS) {
double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
Expand Down Expand Up @@ -1488,15 +1530,7 @@ _Py_ScheduleGC(PyInterpreterState *interp)
void
_PyObject_GC_Link(PyObject *op)
{
    // Account for a newly linked GC object.  All counting and collection
    // scheduling is delegated to the current thread's buffered allocation
    // counter; no per-object state is touched here.
    record_allocation(_PyThreadState_GET());
}

void
Expand All @@ -1522,7 +1556,7 @@ gc_alloc(PyTypeObject *tp, size_t basicsize, size_t presize)
((PyObject **)mem)[1] = NULL;
}
PyObject *op = (PyObject *)(mem + presize);
_PyObject_GC_Link(op);
record_allocation(tstate);
return op;
}

Expand Down Expand Up @@ -1604,10 +1638,9 @@ PyObject_GC_Del(void *op)
PyErr_SetRaisedException(exc);
#endif
}
GCState *gcstate = get_gc_state();
if (gcstate->generations[0].count > 0) {
gcstate->generations[0].count--;
}

record_deallocation(_PyThreadState_GET());

PyObject_Free(((char *)op)-presize);
}

Expand Down