Skip to content

Commit

Permalink
Re-work markdown fenced codeblock lexing
Browse files Browse the repository at this point in the history
This primarily makes lexing of nested fenced codeblocks work, but also adds support for tilde-delimited blocks and corrects the end offset for blocks.
  • Loading branch information
cjw296 committed Mar 18, 2024
1 parent 533c108 commit a38e407
Show file tree
Hide file tree
Showing 8 changed files with 382 additions and 50 deletions.
85 changes: 75 additions & 10 deletions sybil/parsers/markdown/lexers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,83 @@
import re
from typing import Optional, Dict
import textwrap
from typing import Optional, Dict, Pattern, Iterable, Match, List

from sybil import Document, Region, Lexeme
from sybil.parsers.abstract.lexers import BlockLexer

CODEBLOCK_START_TEMPLATE = r"^(?P<prefix>[ \t]*)```(?P<language>{language})$\n"
CODEBLOCK_END_TEMPLATE = r"(?<=\n){prefix}```(:?\n|\Z)"
FENCE = re.compile(r"^(?P<prefix>[ \t]*)(?P<fence>`{3,}|~{3,})", re.MULTILINE)


class FencedCodeBlockLexer(BlockLexer):
class RawFencedCodeBlockLexer:
    """
    Lexes Markdown fenced code blocks (backtick- or tilde-delimited),
    including nested fences, into :class:`~sybil.Region` objects.

    :param info_pattern:
        Pattern applied to the text immediately following the opening fence;
        its named groups become lexemes. A non-match rejects the block.
    :param mapping:
        Optional mapping used to rename and filter the lexemes of each region.
    """

    def __init__(
        self,
        info_pattern: Pattern[str] = re.compile(r'$\n', re.MULTILINE),
        mapping: Optional[Dict[str, str]] = None,
    ) -> None:
        self.info_pattern = info_pattern
        self.mapping = mapping

    @staticmethod
    def match_closes_existing(current: Match[str], existing: Match[str]) -> bool:
        # A fence closes an open one only when it uses the same character,
        # is at least as long, and sits at the same indentation.
        closer = current.group('fence')
        opener = existing.group('fence')
        return (
            closer[0] == opener[0]
            and len(closer) >= len(opener)
            and len(current.group('prefix')) == len(existing.group('prefix'))
        )

    def make_region(
        self, opening: Match[str], document: Document, closing: Optional[Match[str]]
    ) -> Optional[Region]:
        # An unclosed block runs to the end of the document.
        text = document.text
        if closing is None:
            content_end = region_end = len(text)
        else:
            content_end, region_end = closing.start(), closing.end()
        content = text[opening.end():content_end]
        info = self.info_pattern.match(content)
        if info is None:
            return None
        lexemes = info.groupdict()
        # Strip the opening fence's indentation prefix from every line.
        prefix_len = len(opening.group('prefix'))
        body_lines = content[info.end():].splitlines(keepends=True)
        stripped = ''.join(line[prefix_len:] for line in body_lines)
        lexemes['source'] = Lexeme(
            textwrap.dedent(stripped),
            offset=len(opening.group(0)) + info.end(),
            line_offset=0,
        )
        if self.mapping:
            lexemes = {dest: lexemes[src] for src, dest in self.mapping.items()}
        return Region(opening.start(), region_end, lexemes=lexemes)

    def __call__(self, document: Document) -> Iterable[Region]:
        open_blocks: List[Match[str]] = []
        search_from = 0
        while True:
            fence = FENCE.search(document.text, search_from)
            if fence is None:
                break
            search_from = fence.end()
            # Find the innermost open block this fence closes, if any.
            closed_at: Optional[int] = None
            for position, candidate in enumerate(open_blocks):
                if self.match_closes_existing(fence, candidate):
                    closed_at = position
                    break
            if closed_at is None:
                fence_opens_new_block = True
            else:
                fence_opens_new_block = False
                maybe_region = self.make_region(
                    open_blocks[closed_at], document, fence
                )
                if maybe_region is not None:
                    yield maybe_region
                # Closing an outer block discards anything nested inside it.
                del open_blocks[closed_at:]
            if fence_opens_new_block:
                open_blocks.append(fence)
        if open_blocks:
            # The outermost still-open block is closed by end of document.
            maybe_region = self.make_region(open_blocks[0], document, closing=None)
            if maybe_region is not None:
                yield maybe_region


class FencedCodeBlockLexer(RawFencedCodeBlockLexer):
"""
A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for Markdown fenced code blocks.
Expand All @@ -28,14 +98,9 @@ class FencedCodeBlockLexer(BlockLexer):

def __init__(self, language: str, mapping: Optional[Dict[str, str]] = None) -> None:
super().__init__(
start_pattern=re.compile(CODEBLOCK_START_TEMPLATE.format(language=language)),
end_pattern_template=CODEBLOCK_END_TEMPLATE,
info_pattern=re.compile(f'(?P<language>{language})$\n', re.MULTILINE),
mapping=mapping,
)
self.start_pattern = re.compile(
CODEBLOCK_START_TEMPLATE.format(language=language),
re.MULTILINE
)


DIRECTIVE_IN_HTML_COMMENT_START = (
Expand Down
19 changes: 9 additions & 10 deletions sybil/parsers/myst/lexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@

from sybil import Document, Region
from sybil.parsers.abstract.lexers import BlockLexer
from sybil.parsers.markdown.lexers import CODEBLOCK_END_TEMPLATE
from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
from sybil.parsers.rest.lexers import parse_options_and_source

DIRECTIVE_START_TEMPLATE = (
r"^(?P<prefix>[ \t]*)```\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n"
r'(?P<options>(?:\1[ \t]*:[\w-]*:[^\n]*\n)+)?'
r"(\1---\n(?P<yaml_options>(?:.+\n)*)\1---\n)?"
INFO_PATTERN = (
r'\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n'
r'(?P<options>(?:[ \t]*:[\w-]*:[^\n]*\n)+)?'
r"([ \t]*---\n(?P<yaml_options>(?:.+\n)*)[ \t]*---\n)?"
)


Expand All @@ -23,7 +23,7 @@ def parse_yaml_options(lexed: Region) -> None:
lexemes['options'].update(options)


class DirectiveLexer(BlockLexer):
class DirectiveLexer(RawFencedCodeBlockLexer):
"""
A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for MyST directives such as:
Expand Down Expand Up @@ -60,11 +60,10 @@ def __init__(
self, directive: str, arguments: str = '.*', mapping: Optional[Dict[str, str]] = None
) -> None:
super().__init__(
start_pattern=re.compile(
DIRECTIVE_START_TEMPLATE.format(directive=directive, arguments=arguments),
re.MULTILINE
info_pattern=re.compile(
INFO_PATTERN.format(directive=directive, arguments=arguments),
re.MULTILINE,
),
end_pattern_template=CODEBLOCK_END_TEMPLATE,
mapping=mapping,
)

Expand Down
55 changes: 55 additions & 0 deletions tests/samples/markdown-fenced-code-block.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
backticks:

```
<
>
```

tildes:
~~~
<
>
~~~

Fewer than three backticks is not enough:
``
foo
``


The closing code fence must use the same character as the opening fence:


```
aaa
~~~
```


The closing code fence must be at least as long as the opening fence:

````
aaa
```
``````

Nested:

~~~~
~~~
aaa
~~~
~~~~


Can't mix chars:

~`~
foo
~`~


This one gets closed by the end of document:
```
some stuff here
~~~
53 changes: 53 additions & 0 deletions tests/samples/myst-complicated-nesting.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# {py:mod}`bytewax.connectors.demo`

```{py:module} bytewax.connectors.demo
```

```{autodoc2-docstring} bytewax.connectors.demo
:parser: myst
:allowtitles:
```

## Data

````{py:data} X
:canonical: bytewax.connectors.demo.X
:type: typing.TypeVar
```{autodoc2-docstring} bytewax.connectors.demo.X
:parser: myst
```
````


## Classes

`````{py:class} RandomMetricSource(metric_name: str, interval: datetime.timedelta = timedelta(seconds=0.7), count: int = sys.maxsize, next_random: typing.Callable[[], float] = lambda: random.randrange(0, 10))
:canonical: bytewax.connectors.demo.RandomMetricSource
:Bases:
- {py:obj}`~bytewax.inputs.FixedPartitionedSource``[`{py:obj}`~typing.Tuple``[`{py:obj}`~str``, `{py:obj}`~float``], `{py:obj}`~bytewax.connectors.demo._RandomMetricState``]`
```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource
:parser: myst
```
```{rubric} Initialization
```
```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource.__init__
:parser: myst
```
````{py:method} list_parts() -> typing.List[str]
:canonical: bytewax.connectors.demo.RandomMetricSource.list_parts
````
````{py:method} build_part(now: datetime.datetime, for_part: str, resume_state: typing.Optional[bytewax.connectors.demo._RandomMetricState])
:canonical: bytewax.connectors.demo.RandomMetricSource.build_part
````
`````
13 changes: 13 additions & 0 deletions tests/samples/myst-directive-nested.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
````{note}
The warning block will be properly-parsed
```{warning}
Here's my warning
```
But the next block will be parsed as raw text
```{warning}
Here's my raw text warning that isn't parsed...
```
````
18 changes: 18 additions & 0 deletions tests/test_markdown_lexers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from testfixtures import compare

from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
from sybil.region import Region
from .helpers import region_details, check_lexed_regions


def test_fenced_code_block():
    """Lex the sample file and check every well-formed fenced block is found.

    Region start/end values are character offsets into
    ``tests/samples/markdown-fenced-code-block.md``.  Note the nested
    example apparently produces both an inner region (301, 312) and its
    enclosing outer region (296, 317), and the final, unterminated block
    is closed by the end of the document — TODO confirm offsets against
    the sample file if it changes.
    """
    lexer = RawFencedCodeBlockLexer()
    check_lexed_regions('markdown-fenced-code-block.md', lexer, expected = [
        Region(12, 24, lexemes={'source': '<\n >\n'}),
        Region(34, 46, lexemes={'source': '<\n >\n'}),
        Region(177, 192, lexemes={'source': 'aaa\n~~~\n'}),
        Region(266, 285, lexemes={'source': 'aaa\n```\n'}),
        Region(301, 312, lexemes={'source': 'aaa\n'}),
        Region(296, 317, lexemes={'source': '~~~\naaa\n~~~\n'}),
        Region(397, 421, lexemes={'source': 'some stuff here\n~~~\n'}),
    ])
8 changes: 6 additions & 2 deletions tests/test_myst_codeblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import pytest
from testfixtures import compare

from sybil import Example, Region
from sybil.evaluators.python import PythonEvaluator
from sybil import Example
from sybil.parsers.myst import PythonCodeBlockParser, CodeBlockParser
from .helpers import check_excinfo, parse

Expand Down Expand Up @@ -35,6 +34,11 @@ def test_basic():
assert '__builtins__' not in namespace


def test_complicated_nesting():
    # Regression check for deeply nested MyST directives: the sample file
    # contains no python code blocks, so parsing must succeed and yield
    # zero examples rather than raising.
    parse('myst-complicated-nesting.md', PythonCodeBlockParser(), expected=0)


def test_doctest_at_end_of_fenced_codeblock():
examples, namespace = parse('myst-codeblock-doctests-end-of-fenced-codeblocks.md',
PythonCodeBlockParser(), expected=2)
Expand Down

0 comments on commit a38e407

Please sign in to comment.