diff --git a/Doc/library/mailbox.rst b/Doc/library/mailbox.rst index e8a96f29ea185e..39884b035cd79d 100644 --- a/Doc/library/mailbox.rst +++ b/Doc/library/mailbox.rst @@ -562,7 +562,7 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF. ^^^^^^^^^^^^^^^^^^^^^^ -.. class:: mbox(path, factory=None, create=True) +.. class:: mbox(path, factory=None, create=True, from_matcher=None) A subclass of :class:`Mailbox` for mailboxes in mbox format. Parameter *factory* is a callable object that accepts a file-like message representation (which @@ -575,6 +575,22 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF. messages in an mbox mailbox are stored in a single file with the beginning of each message indicated by a line whose first five characters are "From ". + The parameter *from_matcher* can be used to override this default, by providing + a boolean function that takes the line as its sole parameter. + The default matcher is ``lambda line: line.startswith(b'From ')``. + A stricter matcher might be: + ``lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line)``. + + One alternate matcher is included: + - ``'full'``: this matches the syntax ``From sender asctime[ info]``. + The ``asctime`` field must match the standard syntax, i.e. the fixed-length (24-character) string: + ``(Mon|...|Sun) (Jan|...|Dec) [ |d]d hh:mm:ss yyyy``. + The day-of-month field can have a leading space instead of a leading ``0``. + The month and day-of-week fields are always in English. + A custom boolean function might be useful in some cases where the body text contains + un-quoted "From " lines. In such cases, it might help to check that the year (and month) + are the expected values for the mbox. Any other "From " lines are likely to be un-quoted body text. + Several variations of the mbox format exist to address perceived shortcomings in the original. In the interest of compatibility, :class:`!mbox` implements the original format, which is sometimes referred to as :dfn:`mboxo`. 
This means that diff --git a/Lib/graphlib.py b/Lib/graphlib.py index 7961c9c5cac2d6..82f33fb5cf312c 100644 --- a/Lib/graphlib.py +++ b/Lib/graphlib.py @@ -90,17 +90,13 @@ def prepare(self): still be used to obtain as many nodes as possible until cycles block more progress. After a call to this function, the graph cannot be modified and therefore no more nodes can be added using "add". - - Raise ValueError if nodes have already been passed out of the sorter. - """ - if self._npassedout > 0: - raise ValueError("cannot prepare() after starting sort") + if self._ready_nodes is not None: + raise ValueError("cannot prepare() more than once") - if self._ready_nodes is None: - self._ready_nodes = [ - i.node for i in self._node2info.values() if i.npredecessors == 0 - ] + self._ready_nodes = [ + i.node for i in self._node2info.values() if i.npredecessors == 0 + ] # ready_nodes is set before we look for cycles on purpose: # if the user wants to catch the CycleError, that's fine, # they can continue using the instance to grab as many diff --git a/Lib/mailbox.py b/Lib/mailbox.py index b00d9e8634c785..8825b6decafb84 100644 --- a/Lib/mailbox.py +++ b/Lib/mailbox.py @@ -895,15 +895,31 @@ def _install_message(self, message): class mbox(_mboxMMDF): """A classic mbox mailbox.""" + # This is the full syntax, i.e. From sender asctime[ moreinfo] + DAY_RE = b' (?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)' + MON_RE = b' (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)' + DTY_RE = b' [ 123]\\d \\d\\d:\\d\\d:\\d\\d \\d{4}' # day, time, year + FULL_RE = b'From \\S+' + DAY_RE + MON_RE + DTY_RE + b'( .+)?' + linesep + b'\\Z' + # we capture the optional moreinfo group so we can check for lines that end in the date + _mangle_from_ = True # All messages must end in a newline character, and # _post_message_hooks outputs an empty line between messages. 
_append_newline = True - def __init__(self, path, factory=None, create=True): + def __init__(self, path, factory=None, create=True, from_matcher=None): """Initialize an mbox mailbox.""" self._message_factory = mboxMessage + if from_matcher is None: + # default to original matcher + self._from_matcher = lambda line: line.startswith(b'From ') + elif from_matcher == 'full': # From sender date[ moreinfo] + import re + regex = re.compile(self.FULL_RE) # compile once + self._from_matcher = lambda line: re.match(regex, line) + else: # assume it is a boolean function with one parameter + self._from_matcher = from_matcher _mboxMMDF.__init__(self, path, factory, create) def _post_message_hook(self, f): @@ -918,7 +934,7 @@ def _generate_toc(self): while True: line_pos = self._file.tell() line = self._file.readline() - if line.startswith(b'From '): + if self._from_matcher(line): if len(stops) < len(starts): if last_was_empty: stops.append(line_pos - len(linesep)) diff --git a/Lib/test/test_email/data/mailbox_01.mbox b/Lib/test/test_email/data/mailbox_01.mbox new file mode 100644 index 00000000000000..a137e39e67ec60 --- /dev/null +++ b/Lib/test/test_email/data/mailbox_01.mbox @@ -0,0 +1,7 @@ +From MAILER-DAEMON Sun Aug 7 11:40:37 2022 extra info +From: foo +Subject: unquoted From in body; extra info on From line + +Hello + +From time to time diff --git a/Lib/test/test_email/data/mailbox_02.mbox b/Lib/test/test_email/data/mailbox_02.mbox new file mode 100644 index 00000000000000..eeda4cbed042b0 --- /dev/null +++ b/Lib/test/test_email/data/mailbox_02.mbox @@ -0,0 +1,7 @@ +From MAILER-DAEMON Sun Aug 7 11:40:37 20220 extra info +From: foo +Subject: unquoted From in body; invalid extra info on From line + +Hello + +From time to time diff --git a/Lib/test/test_graphlib.py b/Lib/test/test_graphlib.py index 66722e0b0498a6..5f38af4024c5b0 100644 --- a/Lib/test/test_graphlib.py +++ b/Lib/test/test_graphlib.py @@ -140,21 +140,9 @@ def test_calls_before_prepare(self): def 
test_prepare_multiple_times(self): ts = graphlib.TopologicalSorter() ts.prepare() - ts.prepare() - - def test_prepare_after_pass_out(self): - ts = graphlib.TopologicalSorter({'a': 'bc'}) - ts.prepare() - self.assertEqual(set(ts.get_ready()), {'b', 'c'}) - with self.assertRaisesRegex(ValueError, r"cannot prepare\(\) after starting sort"): + with self.assertRaisesRegex(ValueError, r"cannot prepare\(\) more than once"): ts.prepare() - def test_prepare_cycleerror_each_time(self): - ts = graphlib.TopologicalSorter({'a': 'b', 'b': 'a'}) - for attempt in range(1, 4): - with self.assertRaises(graphlib.CycleError, msg=f"{attempt=}"): - ts.prepare() - def test_invalid_nodes_in_done(self): ts = graphlib.TopologicalSorter() ts.add(1, 2, 3, 4) diff --git a/Lib/test/test_mailbox.py b/Lib/test/test_mailbox.py index 0169948e453438..12324ee0c7ae37 100644 --- a/Lib/test/test_mailbox.py +++ b/Lib/test/test_mailbox.py @@ -1310,6 +1310,73 @@ def test_message_separator(self): data = f.read() self.assertEndsWith(data, '0\n\n') + # Test reading an mbox file with un-prefixed From in body text + # currently generates 2 messages + def _test_read_mbox(self, matcher=0, count=2): + # create a basic mbox file + self._box.add('From: foo\n\nHello\n') + # Add an un-prefixed From to create a second entry + self._box._file.write(b'From time to time\n') + self._box.close() + # re-read it using the provided matcher + if matcher == 0: # not provided, so omit + self._box = mailbox.mbox(self._path, create=False) + else: + self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher) + # How many messages were found? 
+ self.assertEqual(len(self._box.keys()), count) + + def test_read_mbox_omitted(self): + self._test_read_mbox() + + def test_read_mbox_none(self): + self._test_read_mbox(None) + + def test_read_mbox_default(self): + self._test_read_mbox(lambda line: re.match(b'From ', line)) + + def test_read_mbox_full1(self): + self._test_read_mbox('full', count=1) + + def test_read_mbox_regex1(self): + import re + # stricter matching should only find one message + self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line), count=1) + + def test_read_mbox_regex2(self): + import re + # invalid, so don't find any messages + self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\r?\\n', line), count=0) + +class TestMboxFromFile(unittest.TestCase): + # test class without default setUp/tearDown which we don't want + + def setUp(self): + self._box = None + self._path = None + + def tearDown(self): + if self._box is not None: + self._box.close() + # Don't delete it! + + def checkmbox(self, name, matcher, count): + self._path = os.path.join(os.path.dirname(__file__), 'test_email', 'data', name) + self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher) + self.assertEqual(len(self._box.keys()), count) + + # default matcher finds two messages as there are 2 From lines + def test_read_mbox_None_01(self): + self.checkmbox('mailbox_01.mbox', None, 2) + + def test_read_mbox_None_02(self): + self.checkmbox('mailbox_02.mbox', None, 2) + + def test_read_mbox_full_01(self): + self.checkmbox('mailbox_01.mbox', 'full', 1) + + def test_read_mbox_full_02(self): + self.checkmbox('mailbox_02.mbox', 'full', 0) # From line has extra non-space chars after YYYY class TestMMDF(_TestMboxMMDF, unittest.TestCase): diff --git a/Misc/ACKS b/Misc/ACKS index 42068ec6aefbd2..102fe7f3eab38e 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1484,7 +1484,6 @@ Michael Pomraning Martin Pool Iustin Pop Claudiu Popa -Daniel Pope Nick Pope John Popplewell Matheus Vieira Portela diff 
--git a/Misc/NEWS.d/next/Library/2022-08-07-18-49-49.gh-issue-93376.G7XqQo.rst b/Misc/NEWS.d/next/Library/2022-08-07-18-49-49.gh-issue-93376.G7XqQo.rst new file mode 100644 index 00000000000000..3425d4fe1e1253 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-08-07-18-49-49.gh-issue-93376.G7XqQo.rst @@ -0,0 +1,3 @@ +Added the *from_matcher* parameter to the :class:`mailbox.mbox` parser. +This allows the user to override the default matcher (which looks only for a leading "From ") with a +more specific matcher that is less likely to match un-quoted "From " lines in body text.