Skip to content
Open
172 changes: 162 additions & 10 deletions Lib/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ def __init__(self, value):

_tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3]

# Precomputed BININT1 opcode + payload for n in 0..255. Avoids the
# struct.pack("<B", n) on every small non-negative int save.
_BININT1_BYTES = tuple(BININT1 + bytes([_i]) for _i in range(256))

# Protocol 3 (Python 3.x)

BINBYTES = b'B' # push bytes; counted binary string argument
Expand Down Expand Up @@ -532,12 +536,27 @@ def memoize(self, obj):
# growable) array, indexed by memo key.
if self.fast:
return
assert id(obj) not in self.memo
idx = len(self.memo)
self.write(self.put(idx))
self.memo[id(obj)] = idx, obj
memo = self.memo
assert id(obj) not in memo
idx = len(memo)
# Inlined self.put(idx). For proto >= 4 (the common case for any
# recent user), MEMOIZE is a one-byte constant; avoid the method
# dispatch + the redundant self.write indirection.
proto = self.proto
if proto >= 4:
self.write(MEMOIZE)
elif self.bin:
if idx < 256:
self.write(BINPUT + pack("<B", idx))
else:
self.write(LONG_BINPUT + pack("<I", idx))
else:
self.write(PUT + repr(idx).encode("ascii") + b'\n')
memo[id(obj)] = idx, obj

# Return a PUT (BINPUT, LONG_BINPUT) opcode string, with argument i.
# Retained for backward compatibility with subclasses that override
# this method; memoize() now inlines the common paths directly.
def put(self, idx):
if self.proto >= 4:
return MEMOIZE
Expand All @@ -560,7 +579,15 @@ def get(self, i):
return GET + repr(i).encode("ascii") + b'\n'

def save(self, obj, save_persistent_id=True):
self.framer.commit_frame()
# Inlined commit_frame() hot check. The frame is either None
# (proto < 4) or a BytesIO that only needs committing once it
# exceeds _FRAME_SIZE_TARGET. Skip the Python-level method
# dispatch for the no-op case (the overwhelming majority of
# saves on small/medium payloads).
framer = self.framer
cf = framer.current_frame
if cf is not None and cf.tell() >= _Framer._FRAME_SIZE_TARGET:
framer.commit_frame()

# Check for persistent id (defined by a subclass)
if save_persistent_id:
Expand All @@ -569,7 +596,44 @@ def save(self, obj, save_persistent_id=True):
self.save_pers(pid)
return

# Check the memo
# Fast paths matching the order of Modules/_pickle.c::save().
# Each of these returns without going through reducer_override,
# which the C reference implementation also skips for these
# types.
t = type(obj)
# str: memoized, so check memo inline before falling into save_str.
if t is str:
x = self.memo.get(id(obj))
if x is not None:
self.write(self.get(x[0]))
return
self.save_str(obj)
return
# int / None / bool / float: not memoized; skip memo.get entirely.
# Placed before bytes so int-heavy workloads don't pay an extra
# branch miss before hitting their fast path.
if t is int:
self.save_long(obj)
return
if obj is None:
self.write(NONE)
return
if t is bool:
self.save_bool(obj)
return
if t is float:
self.save_float(obj)
return
# bytes: memoized; same inline memo pattern as str.
if t is bytes:
x = self.memo.get(id(obj))
if x is not None:
self.write(self.get(x[0]))
return
self.save_bytes(obj)
return

# Check the memo (non-atomic, non-str types)
x = self.memo.get(id(obj))
if x is not None:
self.write(self.get(x[0]))
Expand All @@ -582,7 +646,6 @@ def save(self, obj, save_persistent_id=True):

if rv is NotImplemented:
# Check the type dispatch table
t = type(obj)
f = self.dispatch.get(t)
if f is not None:
f(self, obj) # Call unbound method with explicit self
Expand Down Expand Up @@ -827,7 +890,7 @@ def save_long(self, obj):
# First one- and two-byte unsigned ints:
if obj >= 0:
if obj <= 0xff:
self.write(BININT1 + pack("<B", obj))
self.write(_BININT1_BYTES[obj])
return
if obj <= 0xffff:
self.write(BININT2 + pack("<H", obj))
Expand Down Expand Up @@ -1028,12 +1091,55 @@ def save_list(self, obj):
self.write(MARK + LIST)

self.memoize(obj)
self._batch_appends(obj, obj)
if self.bin and type(obj) is list:
# Fast path for exact lists under binary protocols; mirrors the
# C accelerator's batch_list_exact (Modules/_pickle.c). Avoids
# the per-batch tuple allocation from batched() and the
# enumerate() overhead used by the generic _batch_appends path.
self._batch_appends_exact(obj)
else:
self._batch_appends(obj, obj)

dispatch[list] = save_list

# Cap on the number of items emitted per APPENDS/SETITEMS batch.  Read
# by the generic _batch_appends/_batch_setitems helpers and by the
# exact-container fast paths (_batch_appends_exact /
# _batch_setitems_exact) below.
_BATCHSIZE = 1000

def _batch_appends_exact(self, obj):
# Fast path for type(obj) is list, binary protocols. Snapshots a
# slice per batch so concurrent mutation (e.g. via persistent_id)
# does not break indexing; matches the tolerance of the generic
# _batch_appends path that goes through batched().
save = self.save
write = self.write
batch_size = self._BATCHSIZE
idx = 0
while True:
n = len(obj)
if idx >= n:
return
remaining = n - idx
if remaining == 1:
try:
save(obj[idx])
except BaseException as exc:
exc.add_note(f'when serializing {_T(obj)} item {idx}')
raise
write(APPEND)
return
batch = remaining if remaining < batch_size else batch_size
snapshot = obj[idx:idx + batch]
write(MARK)
i = idx
for x in snapshot:
try:
save(x)
except BaseException as exc:
exc.add_note(f'when serializing {_T(obj)} item {i}')
raise
i += 1
write(APPENDS)
idx = i

def _batch_appends(self, items, obj):
# Helper to batch up APPENDS sequences
save = self.save
Expand Down Expand Up @@ -1077,10 +1183,56 @@ def save_dict(self, obj):
self.write(MARK + DICT)

self.memoize(obj)
self._batch_setitems(obj.items(), obj)
if self.bin and type(obj) is dict:
self._batch_setitems_exact(obj)
else:
self._batch_setitems(obj.items(), obj)

dispatch[dict] = save_dict

def _batch_setitems_exact(self, obj):
# Fast path for type(obj) is dict, binary protocols. dict's own
# iterator raises RuntimeError on size change, so no snapshotting
# is needed.
save = self.save
write = self.write
batch_size = self._BATCHSIZE
items = obj.items()
n = len(items)
if n == 0:
return
if n == 1:
for k, v in items:
save(k)
try:
save(v)
except BaseException as exc:
exc.add_note(f'when serializing {_T(obj)} item {k!r}')
raise
write(SETITEM)
return
if n <= batch_size:
# Single batch: iterate items() directly, no batching machinery.
# dict_items iteration itself raises RuntimeError on size change,
# so mutation during save() (e.g. from persistent_id hooks) is
# detected.
write(MARK)
for k, v in items:
save(k)
try:
save(v)
except BaseException as exc:
exc.add_note(f'when serializing {_T(obj)} item {k!r}')
raise
write(SETITEMS)
return
# Large dict: delegate to the generic path, which uses batched()
# over the live items iterator and preserves dict mutation-during-
# save detection. The per-batch tuple allocation is amortised over
# BATCHSIZE items here, so the exact-dict fast-path advantage is
# concentrated on the n <= batch_size case above.
self._batch_setitems(items, obj)

def _batch_setitems(self, items, obj):
# Helper to batch up SETITEMS sequences; proto >= 1 only
save = self.save
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Speed up :class:`pickle._Pickler` (the pure-Python pickler fallback,
also the base class of :class:`dill.Pickler`) on the ``dump`` path by
inlining the frame-boundary check, reordering
:meth:`~pickle._Pickler.save` to dispatch atomic types (``str``,
``bytes``, ``int``, ``None``, ``bool``, ``float``) ahead of the memo
lookup to match the C reference implementation in
:mod:`!Modules/_pickle.c`, adding exact-container fast paths for
:class:`list` and :class:`dict` under binary protocols, inlining
``MEMOIZE`` for protocol 4+, and precomputing the ``BININT1`` opcode
byte sequences for integers in ``0..255``. Pure-Python
:meth:`pickle._Pickler.dump` is now 20–49% faster on representative
workloads; :func:`dill.dumps` (which inherits from
:class:`pickle._Pickler`) is 19–37% faster on the same shapes. One
user-visible semantic change: atomic types no longer invoke
:meth:`~pickle._Pickler.reducer_override`, aligning pure-Python
behaviour with the long-standing C dispatch order.
90 changes: 90 additions & 0 deletions Misc/pickle-perf-data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Pickle Perf Raw Data

Raw artifacts backing `Misc/pickle-perf-diary.md`. Regenerable by
rerunning the harness; checked in so reviewers can re-verify numbers
without rerunning the methodology.

## Harness

`pickle_pure_bench.py` — the pure-Python `pickle._Pickler` /
`_Unpickler` benchmark used throughout. Five workloads (list-of-ints,
list-of-strs, flat str-keyed dict, deep list-of-lists, nested
list-of-dicts). Each reports a best-of-9 median for dump and load at
protocol 5.

`pickle_pure_bench_bytes.py` — bytes-heavy workload (short bytes,
medium bytes, bytearrays, bytes-keyed dict). Introduced in round 2 to
evaluate F6 (bytes in the save() fast path).

`pickle_save_profile.py` — `cProfile`-based breakdown used to identify
which internal calls dominate `save()` (informed the priority ordering
of ideas D, E over B; in round 2, drove the F1 / F2 / F4 ordering).

Run each with `taskset -c 0 ./python <script>` on a quiet machine.

## JSON files

### Round 1 (Exp 4 → E)

| File | Commit / state |
| --- | --- |
| `pickle-pure-baseline.json` | Clean `main` (2faceeec), no pickle patches |
| `pickle-pure-exp4d.json` | Exp 4 only (exact-container fast paths, `94b53eb`) |
| `pickle-pure-Donly-verify.json` | Exp 4 + D (inlined `commit_frame`) |
| `pickle-pure-BD.json` | Exp 4 + D + B attempt — **used to confirm B regression** |
| `pickle-pure-DE.json` | Exp 4 + D + E (int-only initial form) |
| `pickle-pure-DE-v2.json` | Exp 4 + D + E (str added — `bb9d721`) |

### Round 2 (F1 → F6)

| File | Commit / state |
| --- | --- |
| `pickle-post-fix.json` | Large-dict mutation fix, before F1 reorder |
| `pickle-F1v2.json` | F1 (save() reordered, atomic short-circuit before memo, `285fcae`) |
| `pickle-F3.json` | F3 (frame byte counter) — **rejected**, reverted |
| `pickle-F4.json` | F4 (BININT1 opcode cache, `7c6af84`) |
| `pickle-F2.json` | F2 (inlined MEMOIZE in memoize(), `2f1d38b`) |
| `pickle-F5.json` | F5 (ASCII save_str) — **rejected**, reverted |
| `pickle-F6v2.json` | F6 (bytes in fast path, `e917108`) — current tip |

### Bytes-specific bench (introduced in round 2)

| File | Content |
| --- | --- |
| `pickle-bytes-pre.json` | Before F6; baseline for bytes workloads |
| `pickle-bytes-F6v2.json` | After F6 |

## Interpretation guide

Each JSON has per-workload records with:

loads_number / dumps_number # inner loop counts
loads_runs / dumps_runs # 9 raw timings
loads_median / dumps_median # primary statistic
loads_min / dumps_min # outlier-robust secondary

Compare two files with:

./python -c "
import json
def load(p):
s = open(p).read()
return json.loads(s[s.find('{'):])
a = load('a.json'); b = load('b.json')
for k in sorted(a):
print(k, (b[k]['dumps_median'] - a[k]['dumps_median']) /
a[k]['dumps_median'] * 100, '% dump')"

(The benchmark prints a summary line before the JSON, hence the
`s.find('{')` trick.)

## What's missing

Earlier-iteration `pickle-pure-exp4.json` / `exp4b.json` / `exp4c.json`
/ `exp4e.json` variants were not copied — they represent abandoned
shapes of the Exp 4 implementation (index-based iteration with its
mutation bug, dict-path using `iter()+next()`) and are superseded by
`exp4d.json`. Still available in `/tmp/` on the authoring machine.

`pickle-pure-D.json` / `pickle-pure-exp4.json` from the kernel-compile
thermal contamination event are excluded — numbers unusable.
Loading
Loading