Skip to content
Open
172 changes: 162 additions & 10 deletions Lib/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ def __init__(self, value):

_tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3]

# Precomputed BININT1 opcode + payload for n in 0..255. Avoids the
# struct.pack("<B", n) on every small non-negative int save.
_BININT1_BYTES = tuple(BININT1 + bytes([_i]) for _i in range(256))

# Protocol 3 (Python 3.x)

BINBYTES = b'B' # push bytes; counted binary string argument
Expand Down Expand Up @@ -532,12 +536,27 @@ def memoize(self, obj):
# growable) array, indexed by memo key.
if self.fast:
return
assert id(obj) not in self.memo
idx = len(self.memo)
self.write(self.put(idx))
self.memo[id(obj)] = idx, obj
memo = self.memo
assert id(obj) not in memo
idx = len(memo)
# Inlined self.put(idx). For proto >= 4 (the common case for any
# recent user), MEMOIZE is a one-byte constant; avoid the method
# dispatch + the redundant self.write indirection.
proto = self.proto
if proto >= 4:
self.write(MEMOIZE)
elif self.bin:
if idx < 256:
self.write(BINPUT + pack("<B", idx))
else:
self.write(LONG_BINPUT + pack("<I", idx))
else:
self.write(PUT + repr(idx).encode("ascii") + b'\n')
memo[id(obj)] = idx, obj

# Return a PUT (BINPUT, LONG_BINPUT) opcode string, with argument i.
# Retained for backward compatibility with subclasses that override
# this method; memoize() now inlines the common paths directly.
def put(self, idx):
if self.proto >= 4:
return MEMOIZE
Expand All @@ -560,7 +579,15 @@ def get(self, i):
return GET + repr(i).encode("ascii") + b'\n'

def save(self, obj, save_persistent_id=True):
self.framer.commit_frame()
# Inlined commit_frame() hot check. The frame is either None
# (proto < 4) or a BytesIO that only needs committing once it
# exceeds _FRAME_SIZE_TARGET. Skip the Python-level method
# dispatch for the no-op case (the overwhelming majority of
# saves on small/medium payloads).
framer = self.framer
cf = framer.current_frame
if cf is not None and cf.tell() >= _Framer._FRAME_SIZE_TARGET:
framer.commit_frame()

# Check for persistent id (defined by a subclass)
if save_persistent_id:
Expand All @@ -569,7 +596,44 @@ def save(self, obj, save_persistent_id=True):
self.save_pers(pid)
return

# Check the memo
# Fast paths matching the order of Modules/_pickle.c::save().
# Each of these returns without going through reducer_override,
# which the C reference implementation also skips for these
# types.
t = type(obj)
# str: memoized, so check memo inline before falling into save_str.
if t is str:
x = self.memo.get(id(obj))
if x is not None:
self.write(self.get(x[0]))
return
self.save_str(obj)
return
# int / None / bool / float: not memoized; skip memo.get entirely.
# Placed before bytes so int-heavy workloads don't pay an extra
# branch miss before hitting their fast path.
if t is int:
self.save_long(obj)
return
if obj is None:
self.write(NONE)
return
if t is bool:
self.save_bool(obj)
return
if t is float:
self.save_float(obj)
return
# bytes: memoized; same inline memo pattern as str.
if t is bytes:
x = self.memo.get(id(obj))
if x is not None:
self.write(self.get(x[0]))
return
self.save_bytes(obj)
return

# Check the memo (non-atomic, non-str types)
x = self.memo.get(id(obj))
if x is not None:
self.write(self.get(x[0]))
Expand All @@ -582,7 +646,6 @@ def save(self, obj, save_persistent_id=True):

if rv is NotImplemented:
# Check the type dispatch table
t = type(obj)
f = self.dispatch.get(t)
if f is not None:
f(self, obj) # Call unbound method with explicit self
Expand Down Expand Up @@ -827,7 +890,7 @@ def save_long(self, obj):
# First one- and two-byte unsigned ints:
if obj >= 0:
if obj <= 0xff:
self.write(BININT1 + pack("<B", obj))
self.write(_BININT1_BYTES[obj])
return
if obj <= 0xffff:
self.write(BININT2 + pack("<H", obj))
Expand Down Expand Up @@ -1028,12 +1091,55 @@ def save_list(self, obj):
self.write(MARK + LIST)

self.memoize(obj)
self._batch_appends(obj, obj)
if self.bin and type(obj) is list:
# Fast path for exact lists under binary protocols; mirrors the
# C accelerator's batch_list_exact (Modules/_pickle.c). Avoids
# the per-batch tuple allocation from batched() and the
# enumerate() overhead used by the generic _batch_appends path.
self._batch_appends_exact(obj)
else:
self._batch_appends(obj, obj)

dispatch[list] = save_list

# Cap on the number of items emitted per APPENDS/SETITEMS batch.  Read
# by the generic _batch_appends/_batch_setitems helpers and by the
# exact-container fast paths (_batch_appends_exact /
# _batch_setitems_exact) below.
_BATCHSIZE = 1000

def _batch_appends_exact(self, obj):
# Fast path for type(obj) is list, binary protocols. Snapshots a
# slice per batch so concurrent mutation (e.g. via persistent_id)
# does not break indexing; matches the tolerance of the generic
# _batch_appends path that goes through batched().
save = self.save
write = self.write
batch_size = self._BATCHSIZE
idx = 0
while True:
n = len(obj)
if idx >= n:
return
remaining = n - idx
if remaining == 1:
try:
save(obj[idx])
except BaseException as exc:
exc.add_note(f'when serializing {_T(obj)} item {idx}')
raise
write(APPEND)
return
batch = remaining if remaining < batch_size else batch_size
snapshot = obj[idx:idx + batch]
write(MARK)
i = idx
for x in snapshot:
try:
save(x)
except BaseException as exc:
exc.add_note(f'when serializing {_T(obj)} item {i}')
raise
i += 1
write(APPENDS)
idx = i

def _batch_appends(self, items, obj):
# Helper to batch up APPENDS sequences
save = self.save
Expand Down Expand Up @@ -1077,10 +1183,56 @@ def save_dict(self, obj):
self.write(MARK + DICT)

self.memoize(obj)
self._batch_setitems(obj.items(), obj)
if self.bin and type(obj) is dict:
self._batch_setitems_exact(obj)
else:
self._batch_setitems(obj.items(), obj)

dispatch[dict] = save_dict

def _batch_setitems_exact(self, obj):
# Fast path for type(obj) is dict, binary protocols. dict's own
# iterator raises RuntimeError on size change, so no snapshotting
# is needed.
save = self.save
write = self.write
batch_size = self._BATCHSIZE
items = obj.items()
n = len(items)
if n == 0:
return
if n == 1:
for k, v in items:
save(k)
try:
save(v)
except BaseException as exc:
exc.add_note(f'when serializing {_T(obj)} item {k!r}')
raise
write(SETITEM)
return
if n <= batch_size:
# Single batch: iterate items() directly, no batching machinery.
# dict_items iteration itself raises RuntimeError on size change,
# so mutation during save() (e.g. from persistent_id hooks) is
# detected.
write(MARK)
for k, v in items:
save(k)
try:
save(v)
except BaseException as exc:
exc.add_note(f'when serializing {_T(obj)} item {k!r}')
raise
write(SETITEMS)
return
# Large dict: delegate to the generic path, which uses batched()
# over the live items iterator and preserves dict mutation-during-
# save detection. The per-batch tuple allocation is amortised over
# BATCHSIZE items here, so the exact-dict fast-path advantage is
# concentrated on the n <= batch_size case above.
self._batch_setitems(items, obj)

def _batch_setitems(self, items, obj):
# Helper to batch up SETITEMS sequences; proto >= 1 only
save = self.save
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Speed up :class:`pickle._Pickler` (the pure-Python pickler fallback,
also the base class of :class:`dill.Pickler`) on the ``dump`` path by
inlining the frame-boundary check, reordering
:meth:`~pickle._Pickler.save` to dispatch atomic types (``str``,
``bytes``, ``int``, ``None``, ``bool``, ``float``) ahead of the memo
lookup to match the C reference implementation in
:mod:`!Modules/_pickle.c`, adding exact-container fast paths for
:class:`list` and :class:`dict` under binary protocols, inlining
``MEMOIZE`` for protocol 4+, and precomputing the ``BININT1`` opcode
byte sequences for integers in ``0..255``. Pure-Python
:meth:`pickle._Pickler.dump` is now 20–49% faster on representative
workloads; :func:`dill.dumps` (which inherits from
:class:`pickle._Pickler`) is 19–37% faster on the same shapes. One
user-visible semantic change: atomic types no longer invoke
:meth:`~pickle._Pickler.reducer_override`, aligning pure-Python
behaviour with the long-standing C dispatch order.
90 changes: 90 additions & 0 deletions Misc/pickle-perf-data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Pickle Perf Raw Data

Raw artifacts backing `Misc/pickle-perf-diary.md`. Regenerable by
rerunning the harness; checked in so reviewers can re-verify numbers
without rerunning the methodology.

## Harness

`pickle_pure_bench.py` — the pure-Python `pickle._Pickler` /
`_Unpickler` benchmark used throughout. Five workloads (list-of-ints,
list-of-strs, flat str-keyed dict, deep list-of-lists, nested
list-of-dicts). Each reports a best-of-9 median for dump and load at
protocol 5.

`pickle_pure_bench_bytes.py` — bytes-heavy workload (short bytes,
medium bytes, bytearrays, bytes-keyed dict). Introduced in round 2 to
evaluate F6 (bytes in the save() fast path).

`pickle_save_profile.py` — `cProfile`-based breakdown used to identify
which internal calls dominate `save()` (informed the priority ordering
of ideas D, E over B; in round 2, drove the F1 / F2 / F4 ordering).

Run each with `taskset -c 0 ./python <script>` on a quiet machine.

## JSON files

### Round 1 (Exp 4 → E)

| File | Commit / state |
| --- | --- |
| `pickle-pure-baseline.json` | Clean `main` (2faceeec), no pickle patches |
| `pickle-pure-exp4d.json` | Exp 4 only (exact-container fast paths, `94b53eb`) |
| `pickle-pure-Donly-verify.json` | Exp 4 + D (inlined `commit_frame`) |
| `pickle-pure-BD.json` | Exp 4 + D + B attempt — **used to confirm B regression** |
| `pickle-pure-DE.json` | Exp 4 + D + E (int-only initial form) |
| `pickle-pure-DE-v2.json` | Exp 4 + D + E (str added — `bb9d721`) |

### Round 2 (F1 → F6)

| File | Commit / state |
| --- | --- |
| `pickle-post-fix.json` | Large-dict mutation fix, before F1 reorder |
| `pickle-F1v2.json` | F1 (save() reordered, atomic short-circuit before memo, `285fcae`) |
| `pickle-F3.json` | F3 (frame byte counter) — **rejected**, reverted |
| `pickle-F4.json` | F4 (BININT1 opcode cache, `7c6af84`) |
| `pickle-F2.json` | F2 (inlined MEMOIZE in memoize(), `2f1d38b`) |
| `pickle-F5.json` | F5 (ASCII save_str) — **rejected**, reverted |
| `pickle-F6v2.json` | F6 (bytes in fast path, `e917108`) — current tip |

### Bytes-specific bench (introduced in round 2)

| File | Content |
| --- | --- |
| `pickle-bytes-pre.json` | Before F6; baseline for bytes workloads |
| `pickle-bytes-F6v2.json` | After F6 |

## Interpretation guide

Each JSON has per-workload records with:

loads_number / dumps_number # inner loop counts
loads_runs / dumps_runs # 9 raw timings
loads_median / dumps_median # primary statistic
loads_min / dumps_min # outlier-robust secondary

Compare two files with:

./python -c "
import json
def load(p):
s = open(p).read()
return json.loads(s[s.find('{'):])
a = load('a.json'); b = load('b.json')
for k in sorted(a):
print(k, (b[k]['dumps_median'] - a[k]['dumps_median']) /
a[k]['dumps_median'] * 100, '% dump')"

(The benchmark prints a summary line before the JSON, hence the
`s.find('{')` trick.)

## What's missing

Earlier-iteration `pickle-pure-exp4.json` / `exp4b.json` / `exp4c.json`
/ `exp4e.json` variants were not copied — they represent abandoned
shapes of the Exp 4 implementation (index-based iteration with its
mutation bug, dict-path using `iter()+next()`) and are superseded by
`exp4d.json`. Still available in `/tmp/` on the authoring machine.

`pickle-pure-D.json` / `pickle-pure-exp4.json` from the kernel-compile
thermal contamination event are excluded — numbers unusable.
Loading
Loading