Skip to content

Commit

Permalink
- A new pickle protocol (protocol 3) is added with explicit support
Browse files Browse the repository at this point in the history
  for bytes.  This is the default protocol.  It intentionally cannot
  be unpickled by Python 2.x.

- When a pickle written by Python 2.x contains an (8-bit) str
  instance, this is now decoded to a (Unicode) str instance.  The
  encoding used to do this defaults to ASCII, but can be overridden
  via two new keyword arguments to the Unpickler class.  Previously
  this would create bytes instances, which is usually wrong: str
  instances are often used to pickle attribute names etc., and text is
  more common than binary data anyway.
  • Loading branch information
gvanrossum committed Mar 17, 2008
1 parent 953e4e5 commit f416981
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 55 deletions.
64 changes: 45 additions & 19 deletions Lib/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,22 @@
bytes_types = (bytes, bytearray, memoryview)

# These are purely informational; no code uses these.
format_version = "2.0" # File format version we write
format_version = "3.0" # File format version we write
compatible_formats = ["1.0", # Original protocol 0
"1.1", # Protocol 0 with INST added
"1.2", # Original protocol 1
"1.3", # Protocol 1 with BINFLOAT added
"2.0", # Protocol 2
"3.0", # Protocol 3
] # Old format versions we can read

# This is the highest protocol number we know how to read.
HIGHEST_PROTOCOL = 2
HIGHEST_PROTOCOL = 3

# The protocol we write by default. May be less than HIGHEST_PROTOCOL.
DEFAULT_PROTOCOL = 2
# We intentionally write a protocol that Python 2.x cannot read;
# there are too many issues with that.
DEFAULT_PROTOCOL = 3

# Why use struct.pack() for pickling but marshal.loads() for
# unpickling? struct.pack() is 40% faster than marshal.dumps(), but
Expand Down Expand Up @@ -161,6 +164,10 @@ def __init__(self, value):

_tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3]

# Protocol 3 (Python 3.x)

BINBYTES = b'B' # push bytes; counted binary string argument
SHORT_BINBYTES = b'C' # " " ; " " " " < 256 bytes

__all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$",x)])

Expand Down Expand Up @@ -494,20 +501,19 @@ def save_float(self, obj, pack=struct.pack):
self.write(FLOAT + repr(obj).encode("ascii") + b'\n')
dispatch[float] = save_float

def save_string(self, obj, pack=struct.pack):
if self.bin:
n = len(obj)
if n < 256:
self.write(SHORT_BINSTRING + bytes([n]) + bytes(obj))
else:
self.write(BINSTRING + pack("<i", n) + bytes(obj))
def save_bytes(self, obj, pack=struct.pack):
if self.proto < 3:
self.save_reduce(bytes, (list(obj),))
return
n = len(obj)
if n < 256:
self.write(SHORT_BINBYTES + bytes([n]) + bytes(obj))
else:
# Strip leading 'b' due to repr() of bytes() returning b'...'
self.write(STRING + repr(obj).lstrip("b").encode("ascii") + b'\n')
self.write(BINBYTES + pack("<i", n) + bytes(obj))
self.memoize(obj)
dispatch[bytes] = save_string
dispatch[bytes] = save_bytes

def save_unicode(self, obj, pack=struct.pack):
def save_str(self, obj, pack=struct.pack):
if self.bin:
encoded = obj.encode('utf-8')
n = len(encoded)
Expand All @@ -518,7 +524,7 @@ def save_unicode(self, obj, pack=struct.pack):
self.write(UNICODE + bytes(obj.encode('raw-unicode-escape')) +
b'\n')
self.memoize(obj)
dispatch[str] = save_unicode
dispatch[str] = save_str

def save_tuple(self, obj):
write = self.write
Expand Down Expand Up @@ -775,7 +781,7 @@ def whichmodule(func, funcname):

class Unpickler:

def __init__(self, file):
def __init__(self, file, *, encoding="ASCII", errors="strict"):
"""This takes a binary file for reading a pickle data stream.
The protocol version of the pickle is detected automatically, so no
Expand All @@ -787,10 +793,16 @@ def __init__(self, file):
Thus file-like object can be a binary file object opened for
reading, a BytesIO object, or any other custom object that
meets this interface.
Optional keyword arguments are encoding and errors, which are
used to decode 8-bit string instances pickled by Python 2.x.
These default to 'ASCII' and 'strict', respectively.
"""
self.readline = file.readline
self.read = file.read
self.memo = {}
self.encoding = encoding
self.errors = errors

def load(self):
"""Read a pickled object representation from the open file.
Expand Down Expand Up @@ -831,7 +843,7 @@ def marker(self):

def load_proto(self):
proto = ord(self.read(1))
if not 0 <= proto <= 2:
if not 0 <= proto <= HIGHEST_PROTOCOL:
raise ValueError("unsupported pickle protocol: %d" % proto)
dispatch[PROTO[0]] = load_proto

Expand Down Expand Up @@ -924,9 +936,16 @@ def load_string(self):

def load_binstring(self):
len = mloads(b'i' + self.read(4))
self.append(self.read(len))
data = self.read(len)
value = str(data, self.encoding, self.errors)
self.append(value)
dispatch[BINSTRING[0]] = load_binstring

def load_binbytes(self):
    """Handle the BINBYTES opcode.

    Reads a 4-byte length field, then that many bytes of payload, and
    hands the payload to self.append() unchanged (no text decoding).
    """
    # Prefixing the 4 raw bytes with b'i' lets mloads parse them as an int
    # (presumably marshal's 4-byte signed int format -- confirm against
    # where mloads is defined).  The result is kept in 'nbytes' rather
    # than 'len' so the builtin len() is not shadowed.
    nbytes = mloads(b'i' + self.read(4))
    self.append(self.read(nbytes))
dispatch[BINBYTES[0]] = load_binbytes

def load_unicode(self):
    """Handle the UNICODE opcode.

    Reads one line from the stream, drops its final byte (the line
    terminator), decodes the rest as 'raw-unicode-escape' and passes the
    resulting str to self.append().
    """
    raw_line = self.readline()
    text = str(raw_line[:-1], 'raw-unicode-escape')
    self.append(text)
dispatch[UNICODE[0]] = load_unicode
Expand All @@ -938,9 +957,16 @@ def load_binunicode(self):

def load_short_binstring(self):
len = ord(self.read(1))
self.append(bytes(self.read(len)))
data = bytes(self.read(len))
value = str(data, self.encoding, self.errors)
self.append(value)
dispatch[SHORT_BINSTRING[0]] = load_short_binstring

def load_short_binbytes(self):
    """Handle the SHORT_BINBYTES opcode.

    Reads a 1-byte unsigned length, then that many bytes of payload, and
    passes the payload to self.append() as a bytes instance (no text
    decoding is applied).
    """
    # Kept in 'nbytes' rather than 'len' so the builtin len() is not
    # shadowed inside this method.
    nbytes = ord(self.read(1))
    # bytes() normalises whatever buffer type read() returned.
    self.append(bytes(self.read(nbytes)))
dispatch[SHORT_BINBYTES[0]] = load_short_binbytes

def load_tuple(self):
k = self.marker()
self.stack[k:] = [tuple(self.stack[k+1:])]
Expand Down
109 changes: 86 additions & 23 deletions Lib/pickletools.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,14 +746,19 @@ def __repr__(self):
doc="A Python float object.")

pystring = StackObject(
name='string',
obtype=bytes,
doc="A Python (8-bit) string object.")

pybytes = StackObject(
name='bytes',
obtype=bytes,
doc="A Python bytes object.")

pyunicode = StackObject(
name='str',
obtype=str,
doc="A Python string object.")
doc="A Python (Unicode) string object.")

pynone = StackObject(
name="None",
Expand Down Expand Up @@ -868,7 +873,7 @@ def __init__(self, name, code, arg,
assert isinstance(x, StackObject)
self.stack_after = stack_after

assert isinstance(proto, int) and 0 <= proto <= 2
assert isinstance(proto, int) and 0 <= proto <= 3
self.proto = proto

assert isinstance(doc, str)
Expand Down Expand Up @@ -995,7 +1000,9 @@ def __init__(self, name, code, arg,
The argument is a repr-style string, with bracketing quote characters,
and perhaps embedded escapes. The argument extends until the next
newline character.
newline character. (Actually, they are decoded into a str instance
using the encoding given to the Unpickler constructor, or the default,
'ASCII'.)
"""),

I(name='BINSTRING',
Expand All @@ -1008,7 +1015,9 @@ def __init__(self, name, code, arg,
There are two arguments: the first is a 4-byte little-endian signed int
giving the number of bytes in the string, and the second is that many
bytes, which are taken literally as the string content.
bytes, which are taken literally as the string content. (Actually,
they are decoded into a str instance using the encoding given to the
Unpickler constructor, or the default, 'ASCII'.)
"""),

I(name='SHORT_BINSTRING',
Expand All @@ -1019,6 +1028,36 @@ def __init__(self, name, code, arg,
proto=1,
doc="""Push a Python string object.
There are two arguments: the first is a 1-byte unsigned int giving
the number of bytes in the string, and the second is that many bytes,
which are taken literally as the string content. (Actually, they
are decoded into a str instance using the encoding given to the
Unpickler constructor, or the default, 'ASCII'.)
"""),

# Bytes (protocol 3 only; older protocols don't support bytes at all)

I(name='BINBYTES',
code='B',
arg=string4,
stack_before=[],
stack_after=[pybytes],
proto=3,
doc="""Push a Python bytes object.
There are two arguments: the first is a 4-byte little-endian signed int
giving the number of bytes in the string, and the second is that many
bytes, which are taken literally as the bytes content.
"""),

I(name='SHORT_BINBYTES',
code='C',
arg=string1,
stack_before=[],
stack_after=[pybytes],
proto=1,
doc="""Push a Python bytes object.
There are two arguments: the first is a 1-byte unsigned int giving
the number of bytes in the string, and the second is that many bytes,
which are taken literally as the string content.
Expand Down Expand Up @@ -2006,9 +2045,9 @@ def __init__(self, value):

_dis_test = r"""
>>> import pickle
>>> x = [1, 2, (3, 4), {bytes(b'abc'): "def"}]
>>> pkl = pickle.dumps(x, 0)
>>> dis(pkl)
>>> x = [1, 2, (3, 4), {b'abc': "def"}]
>>> pkl0 = pickle.dumps(x, 0)
>>> dis(pkl0)
0: ( MARK
1: l LIST (MARK at 0)
2: p PUT 0
Expand All @@ -2025,19 +2064,32 @@ def __init__(self, value):
25: ( MARK
26: d DICT (MARK at 25)
27: p PUT 2
30: S STRING 'abc'
37: p PUT 3
40: V UNICODE 'def'
45: p PUT 4
48: s SETITEM
49: a APPEND
50: . STOP
30: c GLOBAL 'builtins bytes'
46: p PUT 3
49: ( MARK
50: ( MARK
51: l LIST (MARK at 50)
52: p PUT 4
55: L LONG 97
59: a APPEND
60: L LONG 98
64: a APPEND
65: L LONG 99
69: a APPEND
70: t TUPLE (MARK at 49)
71: p PUT 5
74: R REDUCE
75: V UNICODE 'def'
80: p PUT 6
83: s SETITEM
84: a APPEND
85: . STOP
highest protocol among opcodes = 0
Try again with a "binary" pickle.
>>> pkl = pickle.dumps(x, 1)
>>> dis(pkl)
>>> pkl1 = pickle.dumps(x, 1)
>>> dis(pkl1)
0: ] EMPTY_LIST
1: q BINPUT 0
3: ( MARK
Expand All @@ -2050,13 +2102,24 @@ def __init__(self, value):
14: q BINPUT 1
16: } EMPTY_DICT
17: q BINPUT 2
19: U SHORT_BINSTRING 'abc'
24: q BINPUT 3
26: X BINUNICODE 'def'
34: q BINPUT 4
36: s SETITEM
37: e APPENDS (MARK at 3)
38: . STOP
19: c GLOBAL 'builtins bytes'
35: q BINPUT 3
37: ( MARK
38: ] EMPTY_LIST
39: q BINPUT 4
41: ( MARK
42: K BININT1 97
44: K BININT1 98
46: K BININT1 99
48: e APPENDS (MARK at 41)
49: t TUPLE (MARK at 37)
50: q BINPUT 5
52: R REDUCE
53: X BINUNICODE 'def'
61: q BINPUT 6
63: s SETITEM
64: e APPENDS (MARK at 3)
65: . STOP
highest protocol among opcodes = 1
Exercise the INST/OBJ/BUILD family.
Expand Down
21 changes: 18 additions & 3 deletions Lib/test/pickletester.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,12 @@ def test_unicode(self):
u2 = self.loads(p)
self.assertEqual(u2, u)

def test_bytes(self):
    # Round-trip bytes objects through every protocol in `protocols`.
    # The samples are empty, 3 bytes and 300 bytes, i.e. lengths on both
    # sides of 256.
    for proto in protocols:
        for u in b'', b'xyz', b'xyz'*100:
            # Pass proto explicitly; without it every iteration would
            # only exercise the default protocol.
            p = self.dumps(u, proto)
            self.assertEqual(self.loads(p), u)

def test_ints(self):
import sys
for proto in protocols:
Expand Down Expand Up @@ -532,8 +538,8 @@ def test_long(self):

@run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_float_format(self):
# make sure that floats are formatted locale independent
self.assertEqual(self.dumps(1.2)[0:3], b'F1.')
# make sure that floats are formatted locale independent with proto 0
self.assertEqual(self.dumps(1.2, 0)[0:3], b'F1.')

def test_reduce(self):
pass
Expand Down Expand Up @@ -624,6 +630,12 @@ def test_short_tuples(self):
(2, 2): pickle.TUPLE2,
(2, 3): pickle.TUPLE3,
(2, 4): pickle.TUPLE,

(3, 0): pickle.EMPTY_TUPLE,
(3, 1): pickle.TUPLE1,
(3, 2): pickle.TUPLE2,
(3, 3): pickle.TUPLE3,
(3, 4): pickle.TUPLE,
}
a = ()
b = (1,)
Expand All @@ -643,14 +655,17 @@ def test_singletons(self):
expected_opcode = {(0, None): pickle.NONE,
(1, None): pickle.NONE,
(2, None): pickle.NONE,
(3, None): pickle.NONE,

(0, True): pickle.INT,
(1, True): pickle.INT,
(2, True): pickle.NEWTRUE,
(3, True): pickle.NEWTRUE,

(0, False): pickle.INT,
(1, False): pickle.INT,
(2, False): pickle.NEWFALSE,
(3, False): pickle.NEWFALSE,
}
for proto in protocols:
for x in None, False, True:
Expand Down Expand Up @@ -955,7 +970,7 @@ def test_load_closed_file(self):

def test_highest_protocol(self):
# Of course this needs to be changed when HIGHEST_PROTOCOL changes.
self.assertEqual(self.module.HIGHEST_PROTOCOL, 2)
self.assertEqual(self.module.HIGHEST_PROTOCOL, 3)

def test_callapi(self):
from io import BytesIO
Expand Down
Loading

0 comments on commit f416981

Please sign in to comment.