Address Georg's review
jeanas committed Nov 14, 2023
1 parent 3b1878e commit 2403507
Showing 2 changed files with 22 additions and 18 deletions.
33 changes: 19 additions & 14 deletions pygments/lexer.py
@@ -199,20 +199,9 @@ def analyse_text(text):
         it's the same as if the return values was ``0.0``.
         """
 
-    def get_tokens(self, text, unfiltered=False):
-        """
-        This method is the basic interface of a lexer. It is called by
-        the `highlight()` function. It must process the text and return an
-        iterable of ``(tokentype, value)`` pairs from `text`.
-
-        Normally, you don't need to override this method. The default
-        implementation processes the options recognized by all lexers
-        (`stripnl`, `stripall` and so on), and then yields all tokens
-        from `get_tokens_unprocessed()`, with the ``index`` dropped.
-
-        If `unfiltered` is set to `True`, the filtering mechanism is
-        bypassed even if filters are defined.
-        """
+    def _preprocess_lexer_input(self, text):
+        """Apply preprocessing such as decoding the input, removing BOM and normalizing newlines."""
+
         if not isinstance(text, str):
             if self.encoding == 'guess':
                 text, _ = guess_decode(text)
@@ -255,7 +244,23 @@ def get_tokens(self, text, unfiltered=False):
         if self.ensurenl and not text.endswith('\n'):
             text += '\n'
 
-        self._input_for_tests = text
+        return text
+
+    def get_tokens(self, text, unfiltered=False):
+        """
+        This method is the basic interface of a lexer. It is called by
+        the `highlight()` function. It must process the text and return an
+        iterable of ``(tokentype, value)`` pairs from `text`.
+
+        Normally, you don't need to override this method. The default
+        implementation processes the options recognized by all lexers
+        (`stripnl`, `stripall` and so on), and then yields all tokens
+        from `get_tokens_unprocessed()`, with the ``index`` dropped.
+
+        If `unfiltered` is set to `True`, the filtering mechanism is
+        bypassed even if filters are defined.
+        """
+        text = self._preprocess_lexer_input(text)
 
         def streamer():
             for _, t, v in self.get_tokens_unprocessed(text):
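For context, a minimal sketch of the resulting call flow (the sample input and the choice of `PythonLexer` are illustrative, not part of the commit): `get_tokens()` now obtains its cleaned-up input from `_preprocess_lexer_input()` before tokenizing.

    from pygments.lexers import PythonLexer

    lexer = PythonLexer()

    # Illustrative input: bytes with a UTF-8 BOM and a Windows newline.
    # get_tokens() now routes this through _preprocess_lexer_input(),
    # which decodes the bytes, strips the BOM and normalizes newlines
    # before get_tokens_unprocessed() runs.
    source = b'\xef\xbb\xbfprint("hi")\r\n'

    for tokentype, value in lexer.get_tokens(source):
        print(tokentype, repr(value))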
7 changes: 3 additions & 4 deletions tests/conftest.py
@@ -50,10 +50,9 @@ def runtest(self):
         self.actual = '\n'.join(self._prettyprint_tokens(tokens)).rstrip('\n') + '\n'
         if self.config.getoption('--update-goldens'):
             # Make sure the new golden output corresponds to the input.
-            # lexer._input_for_tests is self.input but with a newline possibly
-            # added due to the ensurenl option, BOM possibly removed, ...
             output = ''.join(val for (tok, val) in tokens)
-            assert output == lexer._input_for_tests
+            preproc_input = lexer._preprocess_lexer_input(self.input)  # remove BOMs etc.
+            assert output == preproc_input
         else:
             # Make sure the output is the expected golden output
             assert self.actual == self.expected
@@ -76,7 +75,7 @@ def repr_failure(self, excinfo):
             message = (
                 'The tokens produced by the "{}" lexer differ from the '
                 'expected ones in the file "{}".\n'
-                'Run `pytest {} --update-goldens` to update it.'
+                'Run `tox -- {} --update-goldens` to update it.'
             ).format(self.lexer, rel_path, Path(*rel_path.parts[:2]))
             diff = str(excinfo.value).split('\n', 1)[-1]
             return message + '\n\n' + diff
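To illustrate what the updated golden-file check verifies (a rough sketch, not the test harness itself): with `--update-goldens`, the concatenated token values must round-trip to the preprocessed input rather than the raw file contents, since options like `ensurenl` may alter the text before lexing.

    from pygments.lexers import PythonLexer

    lexer = PythonLexer()
    raw_input = 'print("hi")'  # note: no trailing newline

    output = ''.join(val for (tok, val) in lexer.get_tokens(raw_input))

    # ensurenl appends a final '\n', so comparing against raw_input
    # would fail; the preprocessed input is the right reference.
    assert output != raw_input
    assert output == lexer._preprocess_lexer_input(raw_input)

The failure message now also suggests `tox -- <path> --update-goldens` instead of invoking pytest directly, presumably so goldens are regenerated inside the project's standard test environment.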
