Skip to content

Commit

Permalink
bpo-43323: Fix UnicodeEncodeError in the email module (GH-32137)
Browse files (browse the repository at this point in the history)
The UnicodeEncodeError was raised if the charset name itself contained
characters not encodable in UTF-8 (in particular, \udcxx surrogate escapes
representing non-decodable bytes in the source).
(cherry picked from commit e91dee8)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
  • Loading branch information
miss-islington and serhiy-storchaka committed Apr 30, 2022
1 parent 869a894 commit 3d0a5f7
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 6 deletions.
10 changes: 5 additions & 5 deletions Lib/email/_encoded_words.py
Expand Up @@ -179,15 +179,15 @@ def decode(ew):
# Turn the CTE decoded bytes into unicode.
try:
string = bstring.decode(charset)
except UnicodeError:
except UnicodeDecodeError:
defects.append(errors.UndecodableBytesDefect("Encoded word "
"contains bytes not decodable using {} charset".format(charset)))
f"contains bytes not decodable using {charset!r} charset"))
string = bstring.decode(charset, 'surrogateescape')
except LookupError:
except (LookupError, UnicodeEncodeError):
string = bstring.decode('ascii', 'surrogateescape')
if charset.lower() != 'unknown-8bit':
defects.append(errors.CharsetError("Unknown charset {} "
"in encoded word; decoded as unknown bytes".format(charset)))
defects.append(errors.CharsetError(f"Unknown charset {charset!r} "
f"in encoded word; decoded as unknown bytes"))
return string, charset, lang, defects


Expand Down
2 changes: 1 addition & 1 deletion Lib/email/_header_value_parser.py
Expand Up @@ -781,7 +781,7 @@ def params(self):
else:
try:
value = value.decode(charset, 'surrogateescape')
except LookupError:
except (LookupError, UnicodeEncodeError):
# XXX: there should really be a custom defect for
# unknown character set to make it easy to find,
# because otherwise unknown charset is a silent
Expand Down
7 changes: 7 additions & 0 deletions Lib/test/test_email/test__encoded_words.py
Expand Up @@ -130,6 +130,13 @@ def test_unknown_charset(self):
# XXX Should this be a new Defect instead?
defects = [errors.CharsetError])

def test_invalid_character_in_charset(self):
self._test('=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=',
b'foo\xacbar'.decode('ascii', 'surrogateescape'),
charset = 'utf-8\udce2\udc80\udc9d',
# XXX Should this be a new Defect instead?
defects = [errors.CharsetError])

def test_q_nonascii(self):
self._test('=?utf-8?q?=C3=89ric?=',
'Éric',
Expand Down
9 changes: 9 additions & 0 deletions Lib/test/test_email/test_email.py
Expand Up @@ -5323,6 +5323,15 @@ def test_rfc2231_unknown_encoding(self):
Content-Transfer-Encoding: 8bit
Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
"""
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(), 'myfile.txt')

def test_rfc2231_bad_character_in_encoding(self):
m = """\
Content-Transfer-Encoding: 8bit
Content-Disposition: inline; filename*=utf-8\udce2\udc80\udc9d''myfile.txt
"""
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(), 'myfile.txt')
Expand Down
12 changes: 12 additions & 0 deletions Lib/test/test_email/test_headerregistry.py
Expand Up @@ -698,6 +698,18 @@ def content_type_as_value(self,
" charset*=unknown-8bit''utf-8%E2%80%9D\n",
),

'rfc2231_nonascii_in_charset_of_charset_parameter_value': (
"text/plain; charset*=utf-8”''utf-8%E2%80%9D",
'text/plain',
'text',
'plain',
{'charset': 'utf-8”'},
[],
'text/plain; charset="utf-8”"',
"Content-Type: text/plain;"
" charset*=utf-8''utf-8%E2%80%9D\n",
),

'rfc2231_encoded_then_unencoded_segments': (
('application/x-foo;'
'\tname*0*="us-ascii\'en-us\'My";'
Expand Down
@@ -0,0 +1,2 @@
Fix errors in the :mod:`email` module if the charset itself contains
undecodable/unencodable characters.

0 comments on commit 3d0a5f7

Please sign in to comment.