Skip to content

Commit

Permalink
bpo-32503: Avoid creating too small frames in pickles. (#5127)
Browse files Browse the repository at this point in the history
  • Loading branch information
serhiy-storchaka committed Jan 20, 2018
1 parent bd5c7d2 commit 1211c9a
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 51 deletions.
12 changes: 7 additions & 5 deletions Lib/pickle.py
Expand Up @@ -183,6 +183,7 @@ def __init__(self, value):

class _Framer:

_FRAME_SIZE_MIN = 4
_FRAME_SIZE_TARGET = 64 * 1024

def __init__(self, file_write):
Expand All @@ -203,11 +204,12 @@ def commit_frame(self, force=False):
if f.tell() >= self._FRAME_SIZE_TARGET or force:
data = f.getbuffer()
write = self.file_write
# Issue a single call to the write method of the underlying
# file object for the frame opcode with the size of the
# frame. The concatenation is expected to be less expensive
# than issuing an additional call to write.
write(FRAME + pack("<Q", len(data)))
if len(data) >= self._FRAME_SIZE_MIN:
# Issue a single call to the write method of the underlying
# file object for the frame opcode with the size of the
# frame. The concatenation is expected to be less expensive
# than issuing an additional call to write.
write(FRAME + pack("<Q", len(data)))

# Issue a separate call to write to append the frame
# contents without concatenation to the above to avoid a
Expand Down
82 changes: 45 additions & 37 deletions Lib/test/pickletester.py
Expand Up @@ -2037,6 +2037,7 @@ def test_setitems_on_non_dicts(self):

# Exercise framing (proto >= 4) for significant workloads

FRAME_SIZE_MIN = 4
FRAME_SIZE_TARGET = 64 * 1024

def check_frame_opcodes(self, pickled):
Expand All @@ -2047,36 +2048,43 @@ def check_frame_opcodes(self, pickled):
framed by default and are therefore considered a frame by themselves in
the following consistency check.
"""
last_arg = last_pos = last_frame_opcode_size = None
frameless_opcode_sizes = {
'BINBYTES': 5,
'BINUNICODE': 5,
'BINBYTES8': 9,
'BINUNICODE8': 9,
}
frame_end = frameless_start = None
frameless_opcodes = {'BINBYTES', 'BINUNICODE', 'BINBYTES8', 'BINUNICODE8'}
for op, arg, pos in pickletools.genops(pickled):
if op.name in frameless_opcode_sizes:
if len(arg) > self.FRAME_SIZE_TARGET:
frame_opcode_size = frameless_opcode_sizes[op.name]
arg = len(arg)
else:
continue
elif op.name == 'FRAME':
frame_opcode_size = 9
else:
continue

if last_pos is not None:
# The previous frame's size should be equal to the number
# of bytes up to the current frame.
frame_size = pos - last_pos - last_frame_opcode_size
self.assertEqual(frame_size, last_arg)
last_arg, last_pos = arg, pos
last_frame_opcode_size = frame_opcode_size
# The last frame's size should be equal to the number of bytes up
# to the pickle's end.
frame_size = len(pickled) - last_pos - last_frame_opcode_size
self.assertEqual(frame_size, last_arg)
if frame_end is not None:
self.assertLessEqual(pos, frame_end)
if pos == frame_end:
frame_end = None

if frame_end is not None: # framed
self.assertNotEqual(op.name, 'FRAME')
if op.name in frameless_opcodes:
# Only short bytes and str objects should be written
# in a frame
self.assertLessEqual(len(arg), self.FRAME_SIZE_TARGET)

else: # not framed
if (op.name == 'FRAME' or
(op.name in frameless_opcodes and
len(arg) > self.FRAME_SIZE_TARGET)):
# Frame or large bytes or str object
if frameless_start is not None:
# Only short data should be written outside of a frame
self.assertLess(pos - frameless_start,
self.FRAME_SIZE_MIN)
frameless_start = None
elif frameless_start is None and op.name != 'PROTO':
frameless_start = pos

if op.name == 'FRAME':
self.assertGreaterEqual(arg, self.FRAME_SIZE_MIN)
frame_end = pos + 9 + arg

pos = len(pickled)
if frame_end is not None:
self.assertEqual(frame_end, pos)
elif frameless_start is not None:
self.assertLess(pos - frameless_start, self.FRAME_SIZE_MIN)

def test_framing_many_objects(self):
obj = list(range(10**5))
Expand All @@ -2095,7 +2103,8 @@ def test_framing_many_objects(self):

def test_framing_large_objects(self):
N = 1024 * 1024
obj = [b'x' * N, b'y' * N, 'z' * N]
small_items = [[i] for i in range(10)]
obj = [b'x' * N, *small_items, b'y' * N, 'z' * N]
for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
for fast in [False, True]:
with self.subTest(proto=proto, fast=fast):
Expand All @@ -2119,12 +2128,9 @@ def test_framing_large_objects(self):
# Perform full equality check if the lengths match.
self.assertEqual(obj, unpickled)
n_frames = count_opcode(pickle.FRAME, pickled)
if not fast:
# One frame per memoize for each large object.
self.assertGreaterEqual(n_frames, len(obj))
else:
# One frame at the beginning and one at the end.
self.assertGreaterEqual(n_frames, 2)
# A single frame for small objects between
# first two large objects.
self.assertEqual(n_frames, 1)
self.check_frame_opcodes(pickled)

def test_optional_frames(self):
Expand Down Expand Up @@ -2152,7 +2158,9 @@ def remove_frames(pickled, keep_frame=None):

frame_size = self.FRAME_SIZE_TARGET
num_frames = 20
obj = [bytes([i]) * frame_size for i in range(num_frames)]
# Large byte objects (dict values) intermitted with small objects
# (dict keys)
obj = {i: bytes([i]) * frame_size for i in range(num_frames)}

for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
pickled = self.dumps(obj, proto)
Expand Down
@@ -0,0 +1 @@
Pickling with protocol 4 no longer creates too small frames.
18 changes: 9 additions & 9 deletions Modules/_pickle.c
Expand Up @@ -119,8 +119,8 @@ enum {
/* Prefetch size when unpickling (disabled on unpeekable streams) */
PREFETCH = 8192 * 16,

FRAME_SIZE_MIN = 4,
FRAME_SIZE_TARGET = 64 * 1024,

FRAME_HEADER_SIZE = 9
};

Expand Down Expand Up @@ -949,13 +949,6 @@ _write_size64(char *out, size_t value)
}
}

static void
_Pickler_WriteFrameHeader(PicklerObject *self, char *qdata, size_t frame_len)
{
qdata[0] = FRAME;
_write_size64(qdata + 1, frame_len);
}

static int
_Pickler_CommitFrame(PicklerObject *self)
{
Expand All @@ -966,7 +959,14 @@ _Pickler_CommitFrame(PicklerObject *self)
return 0;
frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE;
qdata = PyBytes_AS_STRING(self->output_buffer) + self->frame_start;
_Pickler_WriteFrameHeader(self, qdata, frame_len);
if (frame_len >= FRAME_SIZE_MIN) {
qdata[0] = FRAME;
_write_size64(qdata + 1, frame_len);
}
else {
memmove(qdata, qdata + FRAME_HEADER_SIZE, frame_len);
self->output_len -= FRAME_HEADER_SIZE;
}
self->frame_start = -1;
return 0;
}
Expand Down

0 comments on commit 1211c9a

Please sign in to comment.