Skip to content

Commit

Permalink
Merge pull request #2582 from jeanas/roundtrip-test
Browse files Browse the repository at this point in the history
tests: Check that golden tokens add up to the input
  • Loading branch information
Anteru committed Nov 14, 2023
2 parents 7536da3 + 2403507 commit f57a2dd
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 21 deletions.
33 changes: 20 additions & 13 deletions pygments/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,20 +199,9 @@ def analyse_text(text):
it's the same as if the return values was ``0.0``.
"""

def get_tokens(self, text, unfiltered=False):
"""
This method is the basic interface of a lexer. It is called by
the `highlight()` function. It must process the text and return an
iterable of ``(tokentype, value)`` pairs from `text`.
def _preprocess_lexer_input(self, text):
"""Apply preprocessing such as decoding the input, removing BOM and normalizing newlines."""

Normally, you don't need to override this method. The default
implementation processes the options recognized by all lexers
(`stripnl`, `stripall` and so on), and then yields all tokens
from `get_tokens_unprocessed()`, with the ``index`` dropped.
If `unfiltered` is set to `True`, the filtering mechanism is
bypassed even if filters are defined.
"""
if not isinstance(text, str):
if self.encoding == 'guess':
text, _ = guess_decode(text)
Expand Down Expand Up @@ -255,6 +244,24 @@ def get_tokens(self, text, unfiltered=False):
if self.ensurenl and not text.endswith('\n'):
text += '\n'

return text

def get_tokens(self, text, unfiltered=False):
"""
This method is the basic interface of a lexer. It is called by
the `highlight()` function. It must process the text and return an
iterable of ``(tokentype, value)`` pairs from `text`.
Normally, you don't need to override this method. The default
implementation processes the options recognized by all lexers
(`stripnl`, `stripall` and so on), and then yields all tokens
from `get_tokens_unprocessed()`, with the ``index`` dropped.
If `unfiltered` is set to `True`, the filtering mechanism is
bypassed even if filters are defined.
"""
text = self._preprocess_lexer_input(text)

def streamer():
for _, t, v in self.get_tokens_unprocessed(text):
yield t, v
Expand Down
28 changes: 20 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,15 @@ def _prettyprint_tokens(cls, tokens):

def runtest(self):
    """Lex this item's input and check it against the golden output.

    In normal runs, the pretty-printed token stream must match the stored
    golden file. With ``--update-goldens``, instead verify a roundtrip
    invariant: the concatenation of all token values must reproduce the
    (preprocessed) input text, so the new golden output is self-consistent.
    """
    lexer = pygments.lexers.get_lexer_by_name(self.lexer)
    # Materialize the token stream once; it is consumed twice below
    # (pretty-printing and, in update mode, the roundtrip check).
    tokens = list(lexer.get_tokens(self.input))
    self.actual = '\n'.join(self._prettyprint_tokens(tokens)).rstrip('\n') + '\n'
    if self.config.getoption('--update-goldens'):
        # Make sure the new golden output corresponds to the input:
        # joining every token value must give back the preprocessed text.
        output = ''.join(val for (tok, val) in tokens)
        preproc_input = lexer._preprocess_lexer_input(self.input)  # remove BOMs etc.
        assert output == preproc_input
    else:
        # Make sure the output is the expected golden output
        assert self.actual == self.expected

def _test_file_rel_path(self):
Expand All @@ -59,12 +65,18 @@ def _prunetraceback(self, excinfo):

def repr_failure(self, excinfo):
if isinstance(excinfo.value, AssertionError):
rel_path = self._test_file_rel_path()
message = (
'The tokens produced by the "{}" lexer differ from the '
'expected ones in the file "{}".\n'
'Run `pytest {} --update-goldens` to update it.'
).format(self.lexer, rel_path, Path(*rel_path.parts[:2]))
if self.config.getoption('--update-goldens'):
message = (
f'The tokens produced by the "{self.lexer}" lexer '
'do not add up to the input.'
)
else:
rel_path = self._test_file_rel_path()
message = (
'The tokens produced by the "{}" lexer differ from the '
'expected ones in the file "{}".\n'
'Run `tox -- {} --update-goldens` to update it.'
).format(self.lexer, rel_path, Path(*rel_path.parts[:2]))
diff = str(excinfo.value).split('\n', 1)[-1]
return message + '\n\n' + diff
else:
Expand Down

0 comments on commit f57a2dd

Please sign in to comment.