Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-36694: Do not memoize temporary objects in the C implementation of pickle. #13036

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 6 additions & 2 deletions Lib/test/pickletester.py
Expand Up @@ -572,6 +572,8 @@ def create_dynamic_class(name, bases):

# xrange(5) pickled from 2.x with protocol 2
DATA_XRANGE = b'\x80\x02c__builtin__\nxrange\nq\x00K\x00K\x05K\x01\x87q\x01Rq\x02.'
# optimized pickle produced by the C implementation in 3.8
DATA_XRANGE_X = b'\x80\x02c__builtin__\nxrange\nq\x00K\x00K\x05K\x01\x87Rq\x01.'

# a SimpleCookie() object pickled from 2.x with protocol 2
DATA_COOKIE = (b'\x80\x02cCookie\nSimpleCookie\nq\x00)\x81q\x01U\x03key'
Expand All @@ -583,6 +585,8 @@ def create_dynamic_class(name, bases):

# set([3]) pickled from 2.x with protocol 2
DATA_SET2 = b'\x80\x02c__builtin__\nset\nq\x00]q\x01K\x03a\x85q\x02Rq\x03.'
# optimized pickle produced by the C implementation in 3.8
DATA_SET2_X = b'\x80\x02c__builtin__\nset\nq\x00]q\x01K\x03a\x85Rq\x02.'

python2_exceptions_without_args = (
ArithmeticError,
Expand Down Expand Up @@ -2050,9 +2054,9 @@ def test_pickle_to_2x(self):
# NOTE: this test is a bit too strong since we can produce different
# bytecode that 2.x will still understand.
dumped = self.dumps(range(5), 2)
self.assertEqual(dumped, DATA_XRANGE)
self.assertIn(dumped, (DATA_XRANGE, DATA_XRANGE_X))
dumped = self.dumps(set([3]), 2)
self.assertEqual(dumped, DATA_SET2)
self.assertIn(dumped, (DATA_SET2, DATA_SET2_X))

def test_large_pickles(self):
# Test the correctness of internal buffering routines when handling
Expand Down
@@ -0,0 +1,3 @@
The C implementation of :mod:`pickle` no longer memoizes temporary objects.
This produces more optimal pickle data and reduces memory consumption on
pickling and unpickling.
29 changes: 15 additions & 14 deletions Modules/_pickle.c
Expand Up @@ -1601,15 +1601,15 @@ memo_get(PicklerObject *self, PyObject *key)
/* Store an object in the memo, assign it a new unique ID based on the number
of objects currently stored in the memo and generate a PUT opcode. */
static int
memo_put(PicklerObject *self, PyObject *obj)
memo_put(PicklerObject *self, PyObject *obj, int opt)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand how you decide whether opt should be 0 or 1. What is the heuristic?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on this. This is critical behavior for cloudpickle :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an interesting question. The rule is opt=0 for objects which save its content after saving itself. They are non-empty lists, sets, dicts and general objects with non-trivial elements 2-4 of the tuple returned by __reduce__(). They should be memoized to allow detecting reference loops.

{
char pdata[30];
Py_ssize_t len;
Py_ssize_t idx;

const char memoize_op = MEMOIZE;

if (self->fast)
if (self->fast || (opt && Py_REFCNT(obj) == 1))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Requiring Py_REFCNT(obj) to be 1 pretty strong right? Does this only affect objects created using the C API, i.e never bounded to python-level variables?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It mostly affects temporary objects. For example, if __reduce__ returns constructor, ((x, y),) then the tuple (x, y) will not be memoized. This is a case of namedtuples.

Another example: if you have a list of unique numbers, strings, tuples, etc, then items of the list will not be memoized as the only reference to the item is from the list.

return 0;

idx = PyMemoTable_Size(self->memo);
Expand Down Expand Up @@ -2289,7 +2289,7 @@ save_bytes(PicklerObject *self, PyObject *obj)
return -1;
}

if (memo_put(self, obj) < 0)
if (memo_put(self, obj, 1) < 0)
return -1;

return 0;
Expand Down Expand Up @@ -2457,7 +2457,7 @@ save_unicode(PicklerObject *self, PyObject *obj)
if (_Pickler_Write(self, "\n", 1) < 0)
return -1;
}
if (memo_put(self, obj) < 0)
if (memo_put(self, obj, 1) < 0)
return -1;

return 0;
Expand Down Expand Up @@ -2583,7 +2583,7 @@ save_tuple(PicklerObject *self, PyObject *obj)
}

memoize:
if (memo_put(self, obj) < 0)
if (memo_put(self, obj, 1) < 0)
return -1;

return 0;
Expand Down Expand Up @@ -2784,7 +2784,7 @@ save_list(PicklerObject *self, PyObject *obj)
if ((len = PyList_Size(obj)) < 0)
goto error;

if (memo_put(self, obj) < 0)
if (memo_put(self, obj, len == 0) < 0)
goto error;

if (len != 0) {
Expand Down Expand Up @@ -3041,10 +3041,11 @@ save_dict(PicklerObject *self, PyObject *obj)
if (_Pickler_Write(self, header, len) < 0)
goto error;

if (memo_put(self, obj) < 0)
len = PyDict_GET_SIZE(obj);
if (memo_put(self, obj, len == 0) < 0)
goto error;

if (PyDict_GET_SIZE(obj)) {
if (len != 0) {
/* Save the dict items. */
if (PyDict_CheckExact(obj) && self->proto > 0) {
/* We can take certain shortcuts if we know this is a dict and
Expand Down Expand Up @@ -3119,10 +3120,10 @@ save_set(PicklerObject *self, PyObject *obj)
if (_Pickler_Write(self, &empty_set_op, 1) < 0)
return -1;

if (memo_put(self, obj) < 0)
set_size = PySet_GET_SIZE(obj);
if (memo_put(self, obj, set_size == 0) < 0)
return -1;

set_size = PySet_GET_SIZE(obj);
if (set_size == 0)
return 0; /* nothing to do */

Expand Down Expand Up @@ -3224,7 +3225,7 @@ save_frozenset(PicklerObject *self, PyObject *obj)

if (_Pickler_Write(self, &frozenset_op, 1) < 0)
return -1;
if (memo_put(self, obj) < 0)
if (memo_put(self, obj, 1) < 0)
return -1;

return 0;
Expand Down Expand Up @@ -3533,7 +3534,7 @@ save_global(PicklerObject *self, PyObject *obj, PyObject *name)
goto error;
}
/* Memoize the object. */
if (memo_put(self, obj) < 0)
if (memo_put(self, obj, 1) < 0)
goto error;
}

Expand Down Expand Up @@ -3922,7 +3923,7 @@ save_reduce(PicklerObject *self, PyObject *args, PyObject *obj)

return 0;
}
else if (memo_put(self, obj) < 0)
else if (memo_put(self, obj, 1) < 0)
return -1;
}

Expand Down Expand Up @@ -4113,6 +4114,7 @@ save(PicklerObject *self, PyObject *obj, int pers_save)
}
}
}
Py_DECREF(reduce_func);

if (reduce_value == NULL)
goto error;
Expand All @@ -4138,7 +4140,6 @@ save(PicklerObject *self, PyObject *obj, int pers_save)
done:

Py_LeaveRecursiveCall();
Py_XDECREF(reduce_func);
Py_XDECREF(reduce_value);

return status;
Expand Down