From 94b53eb9186cd06ecd111c81ed90dbf038d20e0d Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 17 Apr 2026 15:54:03 -0400 Subject: [PATCH 1/9] pickle: add exact-container fast paths to pure-Python _batch_appends / _batch_setitems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The C accelerator has batch_list_exact (Modules/_pickle.c:3179) and batch_dict_exact (Modules/_pickle.c:3455) that skip the generic batched() + enumerate() overhead for exact list / dict instances under binary protocols. The pure-Python _batch_appends and _batch_setitems didn't have equivalents. This adds: _batch_appends_exact(obj) — slice-snapshot per batch, no itertools.batched generator, no enumerate() object. _batch_setitems_exact(obj) — for dicts <= BATCHSIZE, a single direct iteration of obj.items(); for larger dicts, a materialized list with sliced batches. Activated only when type(obj) is list / dict and self.bin is true. Other types and proto 0 still use the generic paths unchanged. Pure-Python dump numbers (best-of-9 median per bench): list_of_ints_10k dump -3.1% load -2.6% list_of_strs_1k dump -1.7% load -3.2% dict_str_int_5k dump -0.2% load -3.5% (>BATCHSIZE) deep_list dump -17.2% load -0.3% nested_list_of_dicts dump -22.1% load -1.1% Load deltas are unrelated micro-noise; the load path was not touched. Correctness: test_pickle 1060/1060 pass test_pickletools 202/202 pass test_copy 83/83 pass test_copyreg 6/6 pass test_importlib 1217/1217 pass dill 0.4.1 29/30 pass (pre-existing 3.15a8 incompat) cloudpickle 3.1.2 236/236 + 22 skipped + 2 xfailed (identical) The test_evil_pickler_mutating_collection case motivated using a per-batch slice snapshot for lists (so concurrent mutation doesn't raise IndexError) and relying on dict's built-in size-change check for dicts. 
--- Lib/pickle.py | 111 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 3e7cf25cb05337..89cc897e2bf890 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -1028,12 +1028,55 @@ def save_list(self, obj): self.write(MARK + LIST) self.memoize(obj) - self._batch_appends(obj, obj) + if self.bin and type(obj) is list: + # Fast path for exact lists under binary protocols; mirrors the + # C accelerator's batch_list_exact (Modules/_pickle.c). Avoids + # the per-batch tuple allocation from batched() and the + # enumerate() overhead used by the generic _batch_appends path. + self._batch_appends_exact(obj) + else: + self._batch_appends(obj, obj) dispatch[list] = save_list _BATCHSIZE = 1000 + def _batch_appends_exact(self, obj): + # Fast path for type(obj) is list, binary protocols. Snapshots a + # slice per batch so concurrent mutation (e.g. via persistent_id) + # does not break indexing; matches the tolerance of the generic + # _batch_appends path that goes through batched(). 
+ save = self.save + write = self.write + batch_size = self._BATCHSIZE + idx = 0 + while True: + n = len(obj) + if idx >= n: + return + remaining = n - idx + if remaining == 1: + try: + save(obj[idx]) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {idx}') + raise + write(APPEND) + return + batch = remaining if remaining < batch_size else batch_size + snapshot = obj[idx:idx + batch] + write(MARK) + i = idx + for x in snapshot: + try: + save(x) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {i}') + raise + i += 1 + write(APPENDS) + idx = i + def _batch_appends(self, items, obj): # Helper to batch up APPENDS sequences save = self.save @@ -1077,10 +1120,74 @@ def save_dict(self, obj): self.write(MARK + DICT) self.memoize(obj) - self._batch_setitems(obj.items(), obj) + if self.bin and type(obj) is dict: + self._batch_setitems_exact(obj) + else: + self._batch_setitems(obj.items(), obj) dispatch[dict] = save_dict + def _batch_setitems_exact(self, obj): + # Fast path for type(obj) is dict, binary protocols. dict's own + # iterator raises RuntimeError on size change, so no snapshotting + # is needed. + save = self.save + write = self.write + batch_size = self._BATCHSIZE + items = obj.items() + n = len(items) + if n == 0: + return + if n == 1: + for k, v in items: + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEM) + return + if n <= batch_size: + # Single batch: iterate items() directly, no batching machinery. + write(MARK) + for k, v in items: + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEMS) + return + # Large dict: materialize items once, batch via slicing. The full + # items list is allocated only when n > batch_size. 
+ all_items = list(items) + total = 0 + while total < n: + remaining = n - total + if remaining == 1: + k, v = all_items[total] + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEM) + return + this_batch = remaining if remaining < batch_size else batch_size + write(MARK) + for k, v in all_items[total:total + this_batch]: + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEMS) + total += this_batch + def _batch_setitems(self, items, obj): # Helper to batch up SETITEMS sequences; proto >= 1 only save = self.save From bb9d72199059850b3d5ef84f1a2e67865031b67c Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 17 Apr 2026 18:00:25 -0400 Subject: [PATCH 2/9] pickle: inline frame-check and short-circuit common atomic types in save() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups to the exact-container fast paths (94b53eb): D) Inline framer.commit_frame() at the top of save(). The hot check (current_frame is None, or tell() < _FRAME_SIZE_TARGET) runs on every save() call; skipping the Python method dispatch when no commit is needed removes a measurable per-call tax on long runs. E) Short-circuit the dispatch-table dict.get for the common atomic types (str, int, NoneType, bool, float) by matching type(obj) with direct `is` checks before falling through to `self.dispatch.get(t)`. Placed after the memo and reducer_override checks so semantics for repeated strings, subclasses, and custom reducers are unchanged. 
Pure-Python dump numbers (best-of-9 median per bench, vs `main` with no pickle patches): list_of_ints_10k dump -11.7% load -3.9% list_of_strs_1k dump -8.9% load -3.4% dict_str_int_5k dump -9.9% load -4.4% deep_list dump -24.6% load -2.9% nested_list_of_dicts dump -28.5% load -2.8% (Cumulative with the exact-container fast paths from 94b53eb.) The load deltas are unrelated noise; the load path was not touched. Correctness: test_pickle 1060/1060 pass test_pickletools 202/202 pass test_copy 83/83 pass test_copyreg 6/6 pass test_importlib 1217/1217 pass dill 0.4.1 29/30 pass (pre-existing 3.15a8 incompat) cloudpickle 3.1.2 236/236 + 22 skipped + 2 xfailed (identical) Investigations that did NOT ship: B) Hoisting persistent_id / reducer_override hook checks to __init__ — precomputed bool + __dict__ probe is strictly more work than the original self.persistent_id(obj) call, which hits the type-attribute cache. Rejected twice on clean measurements. C) Atomic-tuple memoize skip — semantically safe but changes byte- exact pickle output, breaking test_pickle_to_2x's fixture assertion. Deferred; would require updating DATA_SET2 / DATA_XRANGE. Co-Authored-By: Claude Opus 4.7 (1M context) --- Lib/pickle.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 89cc897e2bf890..6856a81d5e5343 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -560,7 +560,15 @@ def get(self, i): return GET + repr(i).encode("ascii") + b'\n' def save(self, obj, save_persistent_id=True): - self.framer.commit_frame() + # Inlined commit_frame() hot check. The frame is either None + # (proto < 4) or a BytesIO that only needs committing once it + # exceeds _FRAME_SIZE_TARGET. Skip the Python-level method + # dispatch for the no-op case (the overwhelming majority of + # saves on small/medium payloads). 
+ framer = self.framer + cf = framer.current_frame + if cf is not None and cf.tell() >= _Framer._FRAME_SIZE_TARGET: + framer.commit_frame() # Check for persistent id (defined by a subclass) if save_persistent_id: @@ -581,8 +589,27 @@ def save(self, obj, save_persistent_id=True): rv = reduce(obj) if rv is NotImplemented: - # Check the type dispatch table + # Fast-path common types before the general dispatch table + # lookup. Saves one dict.get per save() call on payloads + # dominated by these types. The memo check already ran, so + # repeated strings / bytes / tuples still dedup via that path. t = type(obj) + if t is str: + self.save_str(obj) + return + if t is int: + self.save_long(obj) + return + if obj is None: + self.write(NONE) + return + if t is bool: + self.save_bool(obj) + return + if t is float: + self.save_float(obj) + return + # Check the type dispatch table f = self.dispatch.get(t) if f is not None: f(self, obj) # Call unbound method with explicit self From 69eed0405f8f6ec09dfcb51ee9833ae891803ea8 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 17 Apr 2026 18:17:18 -0400 Subject: [PATCH 3/9] pickle: add Misc/pickle-perf-diary.md + raw bench data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documents the pure-Python _Pickler fast-path investigation that produced commits 94b53eb and bb9d721. Mirrors the structure of Misc/marshal-perf-diary.md. Covers five experiments (A-E) — what shipped (Exp 4 exact-container fast paths; D inlined commit_frame; E atomic-type is-short-circuit), what was rejected (B hook hoisting — Python's type-attribute cache beats manual short-circuits), and what was deferred (C atomic-tuple memoize skip — blocked by byte-exact pickle-output test fixture). Includes the kernel-compile-contamination lesson and the methodology that caught it on the clean rerun. Adds Misc/pickle-perf-data/ with the six canonical JSON artifacts plus the bench / profile scripts used throughout. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Misc/pickle-perf-data/README.md | 65 +++ Misc/pickle-perf-data/pickle-pure-BD.json | 157 ++++++ Misc/pickle-perf-data/pickle-pure-DE-v2.json | 157 ++++++ Misc/pickle-perf-data/pickle-pure-DE.json | 157 ++++++ .../pickle-pure-Donly-verify.json | 157 ++++++ .../pickle-pure-baseline.json | 157 ++++++ Misc/pickle-perf-data/pickle-pure-exp4d.json | 157 ++++++ Misc/pickle-perf-data/pickle_pure_bench.py | 68 +++ Misc/pickle-perf-data/pickle_save_profile.py | 32 ++ Misc/pickle-perf-diary.md | 446 ++++++++++++++++++ 10 files changed, 1553 insertions(+) create mode 100644 Misc/pickle-perf-data/README.md create mode 100644 Misc/pickle-perf-data/pickle-pure-BD.json create mode 100644 Misc/pickle-perf-data/pickle-pure-DE-v2.json create mode 100644 Misc/pickle-perf-data/pickle-pure-DE.json create mode 100644 Misc/pickle-perf-data/pickle-pure-Donly-verify.json create mode 100644 Misc/pickle-perf-data/pickle-pure-baseline.json create mode 100644 Misc/pickle-perf-data/pickle-pure-exp4d.json create mode 100644 Misc/pickle-perf-data/pickle_pure_bench.py create mode 100644 Misc/pickle-perf-data/pickle_save_profile.py create mode 100644 Misc/pickle-perf-diary.md diff --git a/Misc/pickle-perf-data/README.md b/Misc/pickle-perf-data/README.md new file mode 100644 index 00000000000000..1cd1eb9c930f5e --- /dev/null +++ b/Misc/pickle-perf-data/README.md @@ -0,0 +1,65 @@ +# Pickle Perf Raw Data + +Raw artifacts backing `Misc/pickle-perf-diary.md`. Regeneratable; +checked in so reviewers can re-verify numbers without rerunning the +methodology. + +## Harness + +`pickle_pure_bench.py` — the pure-Python `pickle._Pickler` / +`_Unpickler` benchmark used throughout. Five workloads (list-of-ints, +list-of-strs, flat str-keyed dict, deep list-of-lists, nested +list-of-dicts). Each reports a best-of-9 median for dump and load at +protocol 5. 
+ +`pickle_save_profile.py` — `cProfile`-based breakdown used once to +identify which internal calls dominate `save()` (informed the priority +ordering of ideas D, E over B). + +Run each with `taskset -c 0 ./python