From 94b53eb9186cd06ecd111c81ed90dbf038d20e0d Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 17 Apr 2026 15:54:03 -0400 Subject: [PATCH 1/9] pickle: add exact-container fast paths to pure-Python _batch_appends / _batch_setitems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The C accelerator has batch_list_exact (Modules/_pickle.c:3179) and batch_dict_exact (Modules/_pickle.c:3455) that skip the generic batched() + enumerate() overhead for exact list / dict instances under binary protocols. The pure-Python _batch_appends and _batch_setitems didn't have equivalents. This adds: _batch_appends_exact(obj) — slice-snapshot per batch, no itertools.batched generator, no enumerate() object. _batch_setitems_exact(obj) — for dicts <= BATCHSIZE, a single direct iteration of obj.items(); for larger dicts, a materialized list with sliced batches. Activated only when type(obj) is list / dict and self.bin is true. Other types and proto 0 still use the generic paths unchanged. Pure-Python dump numbers (best-of-9 median per bench): list_of_ints_10k dump -3.1% load -2.6% list_of_strs_1k dump -1.7% load -3.2% dict_str_int_5k dump -0.2% load -3.5% (>BATCHSIZE) deep_list dump -17.2% load -0.3% nested_list_of_dicts dump -22.1% load -1.1% Load deltas are unrelated micro-noise; the load path was not touched. Correctness: test_pickle 1060/1060 pass test_pickletools 202/202 pass test_copy 83/83 pass test_copyreg 6/6 pass test_importlib 1217/1217 pass dill 0.4.1 29/30 pass (pre-existing 3.15a8 incompat) cloudpickle 3.1.2 236/236 + 22 skipped + 2 xfailed (identical) The test_evil_pickler_mutating_collection case motivated using a per-batch slice snapshot for lists (so concurrent mutation doesn't raise IndexError) and relying on dict's built-in size-change check for dicts. 
--- Lib/pickle.py | 111 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 3e7cf25cb05337..89cc897e2bf890 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -1028,12 +1028,55 @@ def save_list(self, obj): self.write(MARK + LIST) self.memoize(obj) - self._batch_appends(obj, obj) + if self.bin and type(obj) is list: + # Fast path for exact lists under binary protocols; mirrors the + # C accelerator's batch_list_exact (Modules/_pickle.c). Avoids + # the per-batch tuple allocation from batched() and the + # enumerate() overhead used by the generic _batch_appends path. + self._batch_appends_exact(obj) + else: + self._batch_appends(obj, obj) dispatch[list] = save_list _BATCHSIZE = 1000 + def _batch_appends_exact(self, obj): + # Fast path for type(obj) is list, binary protocols. Snapshots a + # slice per batch so concurrent mutation (e.g. via persistent_id) + # does not break indexing; matches the tolerance of the generic + # _batch_appends path that goes through batched(). 
+ save = self.save + write = self.write + batch_size = self._BATCHSIZE + idx = 0 + while True: + n = len(obj) + if idx >= n: + return + remaining = n - idx + if remaining == 1: + try: + save(obj[idx]) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {idx}') + raise + write(APPEND) + return + batch = remaining if remaining < batch_size else batch_size + snapshot = obj[idx:idx + batch] + write(MARK) + i = idx + for x in snapshot: + try: + save(x) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {i}') + raise + i += 1 + write(APPENDS) + idx = i + def _batch_appends(self, items, obj): # Helper to batch up APPENDS sequences save = self.save @@ -1077,10 +1120,74 @@ def save_dict(self, obj): self.write(MARK + DICT) self.memoize(obj) - self._batch_setitems(obj.items(), obj) + if self.bin and type(obj) is dict: + self._batch_setitems_exact(obj) + else: + self._batch_setitems(obj.items(), obj) dispatch[dict] = save_dict + def _batch_setitems_exact(self, obj): + # Fast path for type(obj) is dict, binary protocols. dict's own + # iterator raises RuntimeError on size change, so no snapshotting + # is needed. + save = self.save + write = self.write + batch_size = self._BATCHSIZE + items = obj.items() + n = len(items) + if n == 0: + return + if n == 1: + for k, v in items: + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEM) + return + if n <= batch_size: + # Single batch: iterate items() directly, no batching machinery. + write(MARK) + for k, v in items: + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEMS) + return + # Large dict: materialize items once, batch via slicing. The full + # items list is allocated only when n > batch_size. 
+ all_items = list(items) + total = 0 + while total < n: + remaining = n - total + if remaining == 1: + k, v = all_items[total] + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEM) + return + this_batch = remaining if remaining < batch_size else batch_size + write(MARK) + for k, v in all_items[total:total + this_batch]: + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEMS) + total += this_batch + def _batch_setitems(self, items, obj): # Helper to batch up SETITEMS sequences; proto >= 1 only save = self.save From bb9d72199059850b3d5ef84f1a2e67865031b67c Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 17 Apr 2026 18:00:25 -0400 Subject: [PATCH 2/9] pickle: inline frame-check and short-circuit common atomic types in save() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups to the exact-container fast paths (94b53eb): D) Inline framer.commit_frame() at the top of save(). The hot check (current_frame is None, or tell() < _FRAME_SIZE_TARGET) runs on every save() call; skipping the Python method dispatch when no commit is needed removes a measurable per-call tax on long runs. E) Short-circuit the dispatch-table dict.get for the common atomic types (str, int, NoneType, bool, float) by matching type(obj) with direct `is` checks before falling through to `self.dispatch.get(t)`. Placed after the memo and reducer_override checks so semantics for repeated strings, subclasses, and custom reducers are unchanged. 
Pure-Python dump numbers (best-of-9 median per bench, vs `main` with no pickle patches): list_of_ints_10k dump -11.7% load -3.9% list_of_strs_1k dump -8.9% load -3.4% dict_str_int_5k dump -9.9% load -4.4% deep_list dump -24.6% load -2.9% nested_list_of_dicts dump -28.5% load -2.8% (Cumulative with the exact-container fast paths from 94b53eb.) The load deltas are unrelated noise; the load path was not touched. Correctness: test_pickle 1060/1060 pass test_pickletools 202/202 pass test_copy 83/83 pass test_copyreg 6/6 pass test_importlib 1217/1217 pass dill 0.4.1 29/30 pass (pre-existing 3.15a8 incompat) cloudpickle 3.1.2 236/236 + 22 skipped + 2 xfailed (identical) Investigations that did NOT ship: B) Hoisting persistent_id / reducer_override hook checks to __init__ — precomputed bool + __dict__ probe is strictly more work than the original self.persistent_id(obj) call, which hits the type-attribute cache. Rejected twice on clean measurements. C) Atomic-tuple memoize skip — semantically safe but changes byte- exact pickle output, breaking test_pickle_to_2x's fixture assertion. Deferred; would require updating DATA_SET2 / DATA_XRANGE. Co-Authored-By: Claude Opus 4.7 (1M context) --- Lib/pickle.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 89cc897e2bf890..6856a81d5e5343 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -560,7 +560,15 @@ def get(self, i): return GET + repr(i).encode("ascii") + b'\n' def save(self, obj, save_persistent_id=True): - self.framer.commit_frame() + # Inlined commit_frame() hot check. The frame is either None + # (proto < 4) or a BytesIO that only needs committing once it + # exceeds _FRAME_SIZE_TARGET. Skip the Python-level method + # dispatch for the no-op case (the overwhelming majority of + # saves on small/medium payloads). 
+ framer = self.framer + cf = framer.current_frame + if cf is not None and cf.tell() >= _Framer._FRAME_SIZE_TARGET: + framer.commit_frame() # Check for persistent id (defined by a subclass) if save_persistent_id: @@ -581,8 +589,27 @@ def save(self, obj, save_persistent_id=True): rv = reduce(obj) if rv is NotImplemented: - # Check the type dispatch table + # Fast-path common types before the general dispatch table + # lookup. Saves one dict.get per save() call on payloads + # dominated by these types. The memo check already ran, so + # repeated strings / bytes / tuples still dedup via that path. t = type(obj) + if t is str: + self.save_str(obj) + return + if t is int: + self.save_long(obj) + return + if obj is None: + self.write(NONE) + return + if t is bool: + self.save_bool(obj) + return + if t is float: + self.save_float(obj) + return + # Check the type dispatch table f = self.dispatch.get(t) if f is not None: f(self, obj) # Call unbound method with explicit self From 69eed0405f8f6ec09dfcb51ee9833ae891803ea8 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 17 Apr 2026 18:17:18 -0400 Subject: [PATCH 3/9] pickle: add Misc/pickle-perf-diary.md + raw bench data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documents the pure-Python _Pickler fast-path investigation that produced commits 94b53eb and bb9d721. Mirrors the structure of Misc/marshal-perf-diary.md. Covers five experiments (A-E) — what shipped (Exp 4 exact-container fast paths; D inlined commit_frame; E atomic-type is-short-circuit), what was rejected (B hook hoisting — Python's type-attribute cache beats manual short-circuits), and what was deferred (C atomic-tuple memoize skip — blocked by byte-exact pickle-output test fixture). Includes the kernel-compile-contamination lesson and the methodology that caught it on the clean rerun. Adds Misc/pickle-perf-data/ with the six canonical JSON artifacts plus the bench / profile scripts used throughout. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Misc/pickle-perf-data/README.md | 65 +++ Misc/pickle-perf-data/pickle-pure-BD.json | 157 ++++++ Misc/pickle-perf-data/pickle-pure-DE-v2.json | 157 ++++++ Misc/pickle-perf-data/pickle-pure-DE.json | 157 ++++++ .../pickle-pure-Donly-verify.json | 157 ++++++ .../pickle-pure-baseline.json | 157 ++++++ Misc/pickle-perf-data/pickle-pure-exp4d.json | 157 ++++++ Misc/pickle-perf-data/pickle_pure_bench.py | 68 +++ Misc/pickle-perf-data/pickle_save_profile.py | 32 ++ Misc/pickle-perf-diary.md | 446 ++++++++++++++++++ 10 files changed, 1553 insertions(+) create mode 100644 Misc/pickle-perf-data/README.md create mode 100644 Misc/pickle-perf-data/pickle-pure-BD.json create mode 100644 Misc/pickle-perf-data/pickle-pure-DE-v2.json create mode 100644 Misc/pickle-perf-data/pickle-pure-DE.json create mode 100644 Misc/pickle-perf-data/pickle-pure-Donly-verify.json create mode 100644 Misc/pickle-perf-data/pickle-pure-baseline.json create mode 100644 Misc/pickle-perf-data/pickle-pure-exp4d.json create mode 100644 Misc/pickle-perf-data/pickle_pure_bench.py create mode 100644 Misc/pickle-perf-data/pickle_save_profile.py create mode 100644 Misc/pickle-perf-diary.md diff --git a/Misc/pickle-perf-data/README.md b/Misc/pickle-perf-data/README.md new file mode 100644 index 00000000000000..1cd1eb9c930f5e --- /dev/null +++ b/Misc/pickle-perf-data/README.md @@ -0,0 +1,65 @@ +# Pickle Perf Raw Data + +Raw artifacts backing `Misc/pickle-perf-diary.md`. Regeneratable; +checked in so reviewers can re-verify numbers without rerunning the +methodology. + +## Harness + +`pickle_pure_bench.py` — the pure-Python `pickle._Pickler` / +`_Unpickler` benchmark used throughout. Five workloads (list-of-ints, +list-of-strs, flat str-keyed dict, deep list-of-lists, nested +list-of-dicts). Each reports a best-of-9 median for dump and load at +protocol 5. 
+ +`pickle_save_profile.py` — `cProfile`-based breakdown used once to +identify which internal calls dominate `save()` (informed the priority +ordering of ideas D, E over B). + +Run each with `taskset -c 0 ./python