Skip to content

Commit

Permalink
[3.11] gh-94606: Fix error when message with Unicode surrogate not su…
Browse files Browse the repository at this point in the history
…rrogateescaped string (GH-94641) (GH-112972)

(cherry picked from commit 27a5fd8)

Co-authored-by: Sidney Markowitz <sidney@sidney.com>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
  • Loading branch information
3 people committed Dec 11, 2023
1 parent a37e147 commit 5aec2d2
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 16 deletions.
29 changes: 15 additions & 14 deletions Lib/email/message.py
Expand Up @@ -289,25 +289,26 @@ def get_payload(self, i=None, decode=False):
# cte might be a Header, so for now stringify it.
cte = str(self.get('content-transfer-encoding', '')).lower()
# payload may be bytes here.
if isinstance(payload, str):
if utils._has_surrogates(payload):
bpayload = payload.encode('ascii', 'surrogateescape')
if not decode:
if not decode:
if isinstance(payload, str) and utils._has_surrogates(payload):
try:
bpayload = payload.encode('ascii', 'surrogateescape')
try:
payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
except LookupError:
payload = bpayload.decode('ascii', 'replace')
elif decode:
try:
bpayload = payload.encode('ascii')
except UnicodeError:
# This won't happen for RFC compliant messages (messages
# containing only ASCII code points in the unicode input).
# If it does happen, turn the string into bytes in a way
# guaranteed not to fail.
bpayload = payload.encode('raw-unicode-escape')
if not decode:
except UnicodeEncodeError:
pass
return payload
if isinstance(payload, str):
try:
bpayload = payload.encode('ascii', 'surrogateescape')
except UnicodeEncodeError:
# This won't happen for RFC compliant messages (messages
# containing only ASCII code points in the unicode input).
# If it does happen, turn the string into bytes in a way
# guaranteed not to fail.
bpayload = payload.encode('raw-unicode-escape')
if cte == 'quoted-printable':
return quopri.decodestring(bpayload)
elif cte == 'base64':
Expand Down
4 changes: 2 additions & 2 deletions Lib/email/utils.py
Expand Up @@ -49,10 +49,10 @@
escapesre = re.compile(r'[\\"]')

def _has_surrogates(s):
"""Return True if s contains surrogate-escaped binary data."""
"""Return True if s may contain surrogate-escaped binary data."""
# This check is based on the fact that unless there are surrogates, utf8
# (Python's default encoding) can encode any string. This is the fastest
# way to check for surrogates, see issue 11454 for timings.
# way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings.
try:
s.encode()
return False
Expand Down
29 changes: 29 additions & 0 deletions Lib/test/test_email/test_message.py
Expand Up @@ -748,6 +748,35 @@ def test_iter_attachments_mutation(self):
self.assertEqual(len(list(m.iter_attachments())), 2)
self.assertEqual(m.get_payload(), orig)

# Case table for the gh-94606 payload checks: each key is a case name and each
# value is a (str message text, bytes) pair.
# NOTE(review): presumably the bytes are what get_payload(decode=True) should
# return for that text, and the pairs are fed to
# get_payload_surrogate_as_gh_94606(msg, expected) -- confirm against the
# test-parameterization helper, which is not visible in this hunk.
get_payload_surrogate_params = {

'good_surrogateescape': (
"String that can be encod\udcc3\udcabd with surrogateescape",
b'String that can be encod\xc3\xabd with surrogateescape'
),

'string_with_utf8': (
"String with utf-8 charactër",
b'String with utf-8 charact\xebr'
),

'surrogate_and_utf8': (
"String that cannot be ëncod\udcc3\udcabd with surrogateescape",
b'String that cannot be \xebncod\\udcc3\\udcabd with surrogateescape'
),

'out_of_range_surrogate': (
"String with \udfff cannot be encoded with surrogateescape",
b'String with \\udfff cannot be encoded with surrogateescape'
),
}

def get_payload_surrogate_as_gh_94606(self, msg, expected):
"""Regression check for gh-94606.

Build a message from the text *msg* with self._str_msg(), fetch its
payload with get_payload(decode=True), and assert that the result
equals *expected*.
"""
m = self._str_msg(msg)
payload = m.get_payload(decode=True)
self.assertEqual(expected, payload)


class TestEmailMessage(TestEmailMessageBase, TestEmailBase):
message = EmailMessage
Expand Down
@@ -0,0 +1,3 @@
Fix :exc:`UnicodeEncodeError` when :meth:`email.message.Message.get_payload`
reads a message with a Unicode surrogate character and the message content is
not well-formed for surrogateescape encoding. Patch by Sidney Markowitz.

0 comments on commit 5aec2d2

Please sign in to comment.