diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index d1b4c7df4f445f..6148801460cfb2 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2784,11 +2784,15 @@ def _refold_parse_tree(parse_tree, *, policy): # max_line_length 0/None means no limit, ie: infinitely long. maxlen = policy.max_line_length or sys.maxsize encoding = 'utf-8' if policy.utf8 else 'us-ascii' - lines = [''] - last_ew = None + lines = [''] # Folded lines to be output + leading_whitespace = '' # When we have whitespace between two encoded + # words, we may need to encode the whitespace + # at the beginning of the second word. + last_ew = None # Points to the last encoded character if there's an ew on + # the line last_charset = None wrap_as_ew_blocked = 0 - want_encoding = False + want_encoding = False # This is set to True if we need to encode this part end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked') parts = list(parse_tree) while parts: @@ -2812,10 +2816,12 @@ def _refold_parse_tree(parse_tree, *, policy): # 'charset' property on the policy. charset = 'utf-8' want_encoding = True + if part.token_type == 'mime-parameters': # Mime parameter folding (using RFC2231) is extra special. _fold_mime_parameters(part, lines, maxlen, encoding) continue + if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False @@ -2847,21 +2853,38 @@ def _refold_parse_tree(parse_tree, *, policy): last_charset == 'utf-8' and charset != 'us-ascii')): last_ew = None last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, - part.ew_combine_allowed, charset) + part.ew_combine_allowed, charset, leading_whitespace) + # This whitespace has been added to the lines in _fold_as_ew() + # so clear it now. + leading_whitespace = '' last_charset = charset want_encoding = False continue + if len(tstr) <= maxlen - len(lines[-1]): lines[-1] += tstr continue + # This part is too long to fit. The RFC wants us to break at # "major syntactic breaks", so unless we don't consider this # to be one, check if it will fit on the next line by itself. + leading_whitespace = '' if (part.syntactic_break and len(tstr) + 1 <= maxlen): newline = _steal_trailing_WSP_if_exists(lines) if newline or part.startswith_fws(): + # We're going to fold the data onto a new line here. Due to + # the way encoded strings handle continuation lines, we need to + # be prepared to encode any whitespace if the next line turns + # out to start with an encoded word. lines.append(newline + tstr) + + whitespace_accumulator = [] + for char in lines[-1]: + if char not in WSP: + break + whitespace_accumulator.append(char) + leading_whitespace = ''.join(whitespace_accumulator) last_ew = None continue if not hasattr(part, 'encode'): @@ -2885,9 +2908,10 @@ def _refold_parse_tree(parse_tree, *, policy): else: # We can't fold it onto the next line either... lines[-1] += tstr + return policy.linesep.join(lines) + policy.linesep -def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): +def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace): """Fold string to_encode into lines as encoded word, combining if allowed. Return the new value for last_ew, or None if ew_combine_allowed is False. @@ -2902,7 +2926,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): to_encode = str( get_unstructured(lines[-1][last_ew:] + to_encode)) lines[-1] = lines[-1][:last_ew] - if to_encode[0] in WSP: + elif to_encode[0] in WSP: # We're joining this to non-encoded text, so don't encode # the leading blank. leading_wsp = to_encode[0] @@ -2910,6 +2934,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): if (len(lines[-1]) == maxlen): lines.append(_steal_trailing_WSP_if_exists(lines)) lines[-1] += leading_wsp + trailing_wsp = '' if to_encode[-1] in WSP: # Likewise for the trailing space. @@ -2929,11 +2954,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): while to_encode: remaining_space = maxlen - len(lines[-1]) - text_space = remaining_space - chrome_len + text_space = remaining_space - chrome_len - len(leading_whitespace) if text_space <= 0: lines.append(' ') continue + # If we are at the start of a continuation line, prepend whitespace + # (we only want to do this when the line starts with an encoded word + # but if we're folding in this helper function, then we know that we + # are going to be writing out an encoded word.) + if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace: + encoded_word = _ew.encode(leading_whitespace, charset=encode_as) + lines[-1] += encoded_word + leading_whitespace = '' + to_encode_word = to_encode[:text_space] encoded_word = _ew.encode(to_encode_word, charset=encode_as) excess = len(encoded_word) - remaining_space diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index 3ebcb684d006d0..bfff1051262079 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -281,6 +281,41 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase): ioclass = io.BytesIO typ = lambda self, x: x.encode('ascii') + def test_defaults_handle_spaces_between_encoded_words_when_folded(self): + source = ("Уведомление о принятии в работу обращения для" + " подключения услуги") + expected = ('Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtSDQviDQv9GA0LjQvdGP0YLQuNC4?=\n' + ' =?utf-8?b?INCyINGA0LDQsdC+0YLRgyDQvtCx0YDQsNGJ0LXQvdC40Y8g0LTQu9GPINC/0L4=?=\n' + ' =?utf-8?b?0LTQutC70Y7Rh9C10L3QuNGPINGD0YHQu9GD0LPQuA==?=\n\n').encode('ascii') + msg = EmailMessage() + msg['Subject'] = source + s = io.BytesIO() + g = BytesGenerator(s) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_defaults_handle_spaces_at_start_of_subject(self): + source = " Уведомление" + expected = b"Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtQ==?=\n\n" + msg = EmailMessage() + msg['Subject'] = source + s = io.BytesIO() + g = BytesGenerator(s) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_defaults_handle_spaces_at_start_of_continuation_line(self): + source = " ф ффффффффффффффффффф ф ф" + expected = (b"Subject: " + b"=?utf-8?b?0YQg0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YQ=?=\n" + b" =?utf-8?b?INGEINGE?=\n\n") + msg = EmailMessage() + msg['Subject'] = source + s = io.BytesIO() + g = BytesGenerator(s) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + def test_cte_type_7bit_handles_unknown_8bit(self): source = ("Subject: Maintenant je vous présente mon " "collègue\n\n").encode('utf-8') diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index bb7ca8dfd8c52c..5a608a033c7e54 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -7,6 +7,7 @@ from test.test_email import TestEmailBase, parameterize from email import headerregistry from email.headerregistry import Address, Group +from email.header import decode_header from test.support import ALWAYS_EQ @@ -1648,7 +1649,7 @@ def test_address_display_names(self): 'Lôrem ipsum dôlôr sit amet, cônsectetuer adipiscing. ' 'Suspendisse pôtenti. Aliquam nibh. Suspendisse pôtenti.', '=?utf-8?q?L=C3=B4rem_ipsum_d=C3=B4l=C3=B4r_sit_amet=2C_c' - '=C3=B4nsectetuer?=\n =?utf-8?q?adipiscing=2E_Suspendisse' + '=C3=B4nsectetuer?=\n =?utf-8?q?_adipiscing=2E_Suspendisse' '_p=C3=B4tenti=2E_Aliquam_nibh=2E?=\n Suspendisse =?utf-8' '?q?p=C3=B4tenti=2E?=', ), diff --git a/Misc/NEWS.d/next/Library/2023-04-26-22-24-17.gh-issue-92081.V8xMot.rst b/Misc/NEWS.d/next/Library/2023-04-26-22-24-17.gh-issue-92081.V8xMot.rst new file mode 100644 index 00000000000000..0302e957b884cf --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-04-26-22-24-17.gh-issue-92081.V8xMot.rst @@ -0,0 +1 @@ +Fix missing spaces in email headers when the spaces are mixed with encoded 8-bit characters.