diff --git a/pygments/lexer.py b/pygments/lexer.py
index 93d90bfbe6..eb5403e798 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -199,20 +199,9 @@ def analyse_text(text):
         it's the same as if the return values was ``0.0``.
         """
 
-    def get_tokens(self, text, unfiltered=False):
-        """
-        This method is the basic interface of a lexer. It is called by
-        the `highlight()` function. It must process the text and return an
-        iterable of ``(tokentype, value)`` pairs from `text`.
+    def _preprocess_lexer_input(self, text):
+        """Apply preprocessing such as decoding the input, removing BOM and normalizing newlines."""
 
-        Normally, you don't need to override this method. The default
-        implementation processes the options recognized by all lexers
-        (`stripnl`, `stripall` and so on), and then yields all tokens
-        from `get_tokens_unprocessed()`, with the ``index`` dropped.
-
-        If `unfiltered` is set to `True`, the filtering mechanism is
-        bypassed even if filters are defined.
-        """
         if not isinstance(text, str):
             if self.encoding == 'guess':
                 text, _ = guess_decode(text)
@@ -255,6 +244,24 @@ def get_tokens(self, text, unfiltered=False):
         if self.ensurenl and not text.endswith('\n'):
             text += '\n'
 
+        return text
+
+    def get_tokens(self, text, unfiltered=False):
+        """
+        This method is the basic interface of a lexer. It is called by
+        the `highlight()` function. It must process the text and return an
+        iterable of ``(tokentype, value)`` pairs from `text`.
+
+        Normally, you don't need to override this method. The default
+        implementation processes the options recognized by all lexers
+        (`stripnl`, `stripall` and so on), and then yields all tokens
+        from `get_tokens_unprocessed()`, with the ``index`` dropped.
+
+        If `unfiltered` is set to `True`, the filtering mechanism is
+        bypassed even if filters are defined.
+        """
+        text = self._preprocess_lexer_input(text)
+
         def streamer():
             for _, t, v in self.get_tokens_unprocessed(text):
                 yield t, v
diff --git a/tests/conftest.py b/tests/conftest.py
index f41d9d49c6..6cd32c7f0f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,9 +46,15 @@ def _prettyprint_tokens(cls, tokens):
 
     def runtest(self):
         lexer = pygments.lexers.get_lexer_by_name(self.lexer)
-        tokens = lexer.get_tokens(self.input)
+        tokens = list(lexer.get_tokens(self.input))
         self.actual = '\n'.join(self._prettyprint_tokens(tokens)).rstrip('\n') + '\n'
-        if not self.config.getoption('--update-goldens'):
+        if self.config.getoption('--update-goldens'):
+            # Make sure the new golden output corresponds to the input.
+            output = ''.join(val for (tok, val) in tokens)
+            preproc_input = lexer._preprocess_lexer_input(self.input)  # remove BOMs etc.
+            assert output == preproc_input
+        else:
+            # Make sure the output is the expected golden output.
             assert self.actual == self.expected
 
     def _test_file_rel_path(self):
@@ -59,12 +65,18 @@ def _prunetraceback(self, excinfo):
 
     def repr_failure(self, excinfo):
         if isinstance(excinfo.value, AssertionError):
-            rel_path = self._test_file_rel_path()
-            message = (
-                'The tokens produced by the "{}" lexer differ from the '
-                'expected ones in the file "{}".\n'
-                'Run `pytest {} --update-goldens` to update it.'
-            ).format(self.lexer, rel_path, Path(*rel_path.parts[:2]))
+            if self.config.getoption('--update-goldens'):
+                message = (
+                    f'The tokens produced by the "{self.lexer}" lexer '
+                    'do not add up to the input.'
+                )
+            else:
+                rel_path = self._test_file_rel_path()
+                message = (
+                    'The tokens produced by the "{}" lexer differ from the '
+                    'expected ones in the file "{}".\n'
+                    'Run `tox -- {} --update-goldens` to update it.'
+                ).format(self.lexer, rel_path, Path(*rel_path.parts[:2]))
             diff = str(excinfo.value).split('\n', 1)[-1]
             return message + '\n\n' + diff
         else:
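
For context, a minimal sketch of the invariant the updated --update-goldens path now enforces: the values of the (tokentype, value) pairs yielded by get_tokens() must concatenate back to the preprocessed input. The snippet below assumes this patch is applied (it calls the newly extracted private _preprocess_lexer_input() helper); the sample source string is hypothetical.

    # Sketch only, not part of the patch: demonstrates the token round-trip
    # invariant that runtest() asserts when regenerating goldens.
    from pygments.lexers import get_lexer_by_name

    source = 'def greet():\r\n    return "hi"'  # hypothetical input with CRLF newlines

    lexer = get_lexer_by_name('python')
    tokens = list(lexer.get_tokens(source))

    # Joining the token values must reproduce the input as the lexer saw it,
    # i.e. after decoding, BOM stripping and newline normalization.
    assert ''.join(value for _, value in tokens) == lexer._preprocess_lexer_input(source)

Extracting the preprocessing into its own method is what makes this check possible: the test harness can compare the token stream against exactly the text the lexer tokenized, rather than the raw golden-file contents.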