From 3d0a5f73f5436eac1c6c0b4b6210e3b3518dcc83 Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Sat, 30 Apr 2022 05:31:37 -0700 Subject: [PATCH] bpo-43323: Fix UnicodeEncodeError in the email module (GH-32137) It was raised if the charset itself contains characters not encodable in UTF-8 (in particular \udcxx characters representing non-decodable bytes in the source). (cherry picked from commit e91dee87edcf6dee5dd78053004d76e5f05456d4) Co-authored-by: Serhiy Storchaka --- Lib/email/_encoded_words.py | 10 +++++----- Lib/email/_header_value_parser.py | 2 +- Lib/test/test_email/test__encoded_words.py | 7 +++++++ Lib/test/test_email/test_email.py | 9 +++++++++ Lib/test/test_email/test_headerregistry.py | 12 ++++++++++++ .../Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst | 2 ++ 6 files changed, 36 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst diff --git a/Lib/email/_encoded_words.py b/Lib/email/_encoded_words.py index 295ae7eb21237c..6795a606de037e 100644 --- a/Lib/email/_encoded_words.py +++ b/Lib/email/_encoded_words.py @@ -179,15 +179,15 @@ def decode(ew): # Turn the CTE decoded bytes into unicode. try: string = bstring.decode(charset) - except UnicodeError: + except UnicodeDecodeError: defects.append(errors.UndecodableBytesDefect("Encoded word " - "contains bytes not decodable using {} charset".format(charset))) + f"contains bytes not decodable using {charset!r} charset")) string = bstring.decode(charset, 'surrogateescape') - except LookupError: + except (LookupError, UnicodeEncodeError): string = bstring.decode('ascii', 'surrogateescape') if charset.lower() != 'unknown-8bit': - defects.append(errors.CharsetError("Unknown charset {} " - "in encoded word; decoded as unknown bytes".format(charset))) + defects.append(errors.CharsetError(f"Unknown charset {charset!r} " + f"in encoded word; decoded as unknown bytes")) return string, charset, lang, defects diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 51d355fbb0abc5..8a8fb8bc42a954 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -781,7 +781,7 @@ def params(self): else: try: value = value.decode(charset, 'surrogateescape') - except LookupError: + except (LookupError, UnicodeEncodeError): # XXX: there should really be a custom defect for # unknown character set to make it easy to find, # because otherwise unknown charset is a silent diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py index 0b8b1de3359aa6..1713962f94caef 100644 --- a/Lib/test/test_email/test__encoded_words.py +++ b/Lib/test/test_email/test__encoded_words.py @@ -130,6 +130,13 @@ def test_unknown_charset(self): # XXX Should this be a new Defect instead? defects = [errors.CharsetError]) + def test_invalid_character_in_charset(self): + self._test('=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=', + b'foo\xacbar'.decode('ascii', 'surrogateescape'), + charset = 'utf-8\udce2\udc80\udc9d', + # XXX Should this be a new Defect instead? + defects = [errors.CharsetError]) + def test_q_nonascii(self): self._test('=?utf-8?q?=C3=89ric?=', 'Éric', diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 489cd05be4dd58..761ea90b78f153 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -5323,6 +5323,15 @@ def test_rfc2231_unknown_encoding(self): Content-Transfer-Encoding: 8bit Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt +""" + msg = email.message_from_string(m) + self.assertEqual(msg.get_filename(), 'myfile.txt') + + def test_rfc2231_bad_character_in_encoding(self): + m = """\ +Content-Transfer-Encoding: 8bit +Content-Disposition: inline; filename*=utf-8\udce2\udc80\udc9d''myfile.txt + """ msg = email.message_from_string(m) self.assertEqual(msg.get_filename(), 'myfile.txt') diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index 68bbc9561c4aff..9a512fdb9d7785 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -698,6 +698,18 @@ def content_type_as_value(self, " charset*=unknown-8bit''utf-8%E2%80%9D\n", ), + 'rfc2231_nonascii_in_charset_of_charset_parameter_value': ( + "text/plain; charset*=utf-8”''utf-8%E2%80%9D", + 'text/plain', + 'text', + 'plain', + {'charset': 'utf-8”'}, + [], + 'text/plain; charset="utf-8”"', + "Content-Type: text/plain;" + " charset*=utf-8''utf-8%E2%80%9D\n", + ), + 'rfc2231_encoded_then_unencoded_segments': ( ('application/x-foo;' '\tname*0*="us-ascii\'en-us\'My";' diff --git a/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst b/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst new file mode 100644 index 00000000000000..98d73101d3ee57 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst @@ -0,0 +1,2 @@ +Fix errors in the :mod:`email` module if the charset itself contains +undecodable/unencodable characters.