From 1211c9a9897a174b7261ca258cabf289815a40d8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 20 Jan 2018 16:42:44 +0200 Subject: [PATCH] bpo-32503: Avoid creating too small frames in pickles. (#5127) --- Lib/pickle.py | 12 +-- Lib/test/pickletester.py | 82 ++++++++++--------- .../2018-01-07-09-22-26.bpo-32503.ViMxpD.rst | 1 + Modules/_pickle.c | 18 ++-- 4 files changed, 62 insertions(+), 51 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2018-01-07-09-22-26.bpo-32503.ViMxpD.rst diff --git a/Lib/pickle.py b/Lib/pickle.py index 301e8cf55893dd..e6d003787bad89 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -183,6 +183,7 @@ def __init__(self, value): class _Framer: + _FRAME_SIZE_MIN = 4 _FRAME_SIZE_TARGET = 64 * 1024 def __init__(self, file_write): @@ -203,11 +204,12 @@ def commit_frame(self, force=False): if f.tell() >= self._FRAME_SIZE_TARGET or force: data = f.getbuffer() write = self.file_write - # Issue a single call to the write method of the underlying - # file object for the frame opcode with the size of the - # frame. The concatenation is expected to be less expensive - # than issuing an additional call to write. - write(FRAME + pack("= self._FRAME_SIZE_MIN: + # Issue a single call to the write method of the underlying + # file object for the frame opcode with the size of the + # frame. The concatenation is expected to be less expensive + # than issuing an additional call to write. + write(FRAME + pack("= 4) for significant workloads + FRAME_SIZE_MIN = 4 FRAME_SIZE_TARGET = 64 * 1024 def check_frame_opcodes(self, pickled): @@ -2047,36 +2048,43 @@ def check_frame_opcodes(self, pickled): framed by default and are therefore considered a frame by themselves in the following consistency check. """ - last_arg = last_pos = last_frame_opcode_size = None - frameless_opcode_sizes = { - 'BINBYTES': 5, - 'BINUNICODE': 5, - 'BINBYTES8': 9, - 'BINUNICODE8': 9, - } + frame_end = frameless_start = None + frameless_opcodes = {'BINBYTES', 'BINUNICODE', 'BINBYTES8', 'BINUNICODE8'} for op, arg, pos in pickletools.genops(pickled): - if op.name in frameless_opcode_sizes: - if len(arg) > self.FRAME_SIZE_TARGET: - frame_opcode_size = frameless_opcode_sizes[op.name] - arg = len(arg) - else: - continue - elif op.name == 'FRAME': - frame_opcode_size = 9 - else: - continue - - if last_pos is not None: - # The previous frame's size should be equal to the number - # of bytes up to the current frame. - frame_size = pos - last_pos - last_frame_opcode_size - self.assertEqual(frame_size, last_arg) - last_arg, last_pos = arg, pos - last_frame_opcode_size = frame_opcode_size - # The last frame's size should be equal to the number of bytes up - # to the pickle's end. - frame_size = len(pickled) - last_pos - last_frame_opcode_size - self.assertEqual(frame_size, last_arg) + if frame_end is not None: + self.assertLessEqual(pos, frame_end) + if pos == frame_end: + frame_end = None + + if frame_end is not None: # framed + self.assertNotEqual(op.name, 'FRAME') + if op.name in frameless_opcodes: + # Only short bytes and str objects should be written + # in a frame + self.assertLessEqual(len(arg), self.FRAME_SIZE_TARGET) + + else: # not framed + if (op.name == 'FRAME' or + (op.name in frameless_opcodes and + len(arg) > self.FRAME_SIZE_TARGET)): + # Frame or large bytes or str object + if frameless_start is not None: + # Only short data should be written outside of a frame + self.assertLess(pos - frameless_start, + self.FRAME_SIZE_MIN) + frameless_start = None + elif frameless_start is None and op.name != 'PROTO': + frameless_start = pos + + if op.name == 'FRAME': + self.assertGreaterEqual(arg, self.FRAME_SIZE_MIN) + frame_end = pos + 9 + arg + + pos = len(pickled) + if frame_end is not None: + self.assertEqual(frame_end, pos) + elif frameless_start is not None: + self.assertLess(pos - frameless_start, self.FRAME_SIZE_MIN) def test_framing_many_objects(self): obj = list(range(10**5)) @@ -2095,7 +2103,8 @@ def test_framing_many_objects(self): def test_framing_large_objects(self): N = 1024 * 1024 - obj = [b'x' * N, b'y' * N, 'z' * N] + small_items = [[i] for i in range(10)] + obj = [b'x' * N, *small_items, b'y' * N, 'z' * N] for proto in range(4, pickle.HIGHEST_PROTOCOL + 1): for fast in [False, True]: with self.subTest(proto=proto, fast=fast): @@ -2119,12 +2128,9 @@ def test_framing_large_objects(self): # Perform full equality check if the lengths match. self.assertEqual(obj, unpickled) n_frames = count_opcode(pickle.FRAME, pickled) - if not fast: - # One frame per memoize for each large object. - self.assertGreaterEqual(n_frames, len(obj)) - else: - # One frame at the beginning and one at the end. - self.assertGreaterEqual(n_frames, 2) + # A single frame for small objects between + # first two large objects. + self.assertEqual(n_frames, 1) self.check_frame_opcodes(pickled) def test_optional_frames(self): @@ -2152,7 +2158,9 @@ def remove_frames(pickled, keep_frame=None): frame_size = self.FRAME_SIZE_TARGET num_frames = 20 - obj = [bytes([i]) * frame_size for i in range(num_frames)] + # Large byte objects (dict values) intermitted with small objects + # (dict keys) + obj = {i: bytes([i]) * frame_size for i in range(num_frames)} for proto in range(4, pickle.HIGHEST_PROTOCOL + 1): pickled = self.dumps(obj, proto) diff --git a/Misc/NEWS.d/next/Library/2018-01-07-09-22-26.bpo-32503.ViMxpD.rst b/Misc/NEWS.d/next/Library/2018-01-07-09-22-26.bpo-32503.ViMxpD.rst new file mode 100644 index 00000000000000..609c59308e6c72 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-01-07-09-22-26.bpo-32503.ViMxpD.rst @@ -0,0 +1 @@ +Pickling with protocol 4 no longer creates too small frames. diff --git a/Modules/_pickle.c b/Modules/_pickle.c index f8cbea12e80958..f06a87d04f29c7 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -119,8 +119,8 @@ enum { /* Prefetch size when unpickling (disabled on unpeekable streams) */ PREFETCH = 8192 * 16, + FRAME_SIZE_MIN = 4, FRAME_SIZE_TARGET = 64 * 1024, - FRAME_HEADER_SIZE = 9 }; @@ -949,13 +949,6 @@ _write_size64(char *out, size_t value) } } -static void -_Pickler_WriteFrameHeader(PicklerObject *self, char *qdata, size_t frame_len) -{ - qdata[0] = FRAME; - _write_size64(qdata + 1, frame_len); -} - static int _Pickler_CommitFrame(PicklerObject *self) { @@ -966,7 +959,14 @@ _Pickler_CommitFrame(PicklerObject *self) return 0; frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE; qdata = PyBytes_AS_STRING(self->output_buffer) + self->frame_start; - _Pickler_WriteFrameHeader(self, qdata, frame_len); + if (frame_len >= FRAME_SIZE_MIN) { + qdata[0] = FRAME; + _write_size64(qdata + 1, frame_len); + } + else { + memmove(qdata, qdata + FRAME_HEADER_SIZE, frame_len); + self->output_len -= FRAME_HEADER_SIZE; + } self->frame_start = -1; return 0; }