Add support for MyST nested directives #113

Merged
merged 3 commits into from Mar 18, 2024
2 changes: 2 additions & 0 deletions docs/api.rst
@@ -90,6 +90,8 @@ ReST Parsing and Lexing
Markdown Parsing and Lexing
---------------------------

.. autoclass:: sybil.parsers.markdown.lexers.RawFencedCodeBlockLexer

.. autoclass:: sybil.parsers.markdown.lexers.FencedCodeBlockLexer

.. autoclass:: sybil.parsers.markdown.lexers.DirectiveInHTMLCommentLexer
108 changes: 97 additions & 11 deletions sybil/parsers/markdown/lexers.py
@@ -1,15 +1,106 @@
import re
-from typing import Optional, Dict
+import textwrap
+from typing import Optional, Dict, Pattern, Iterable, Match, List

from sybil import Document, Region, Lexeme
from sybil.parsers.abstract.lexers import BlockLexer

-CODEBLOCK_START_TEMPLATE = r"^(?P<prefix>[ \t]*)```(?P<language>{language})$\n"
-CODEBLOCK_END_TEMPLATE = r"(?<=\n){prefix}```(:?\n|\Z)"
+FENCE = re.compile(r"^(?P<prefix>[ \t]*)(?P<fence>`{3,}|~{3,})", re.MULTILINE)


-class FencedCodeBlockLexer(BlockLexer):
+class RawFencedCodeBlockLexer:
    """
-    A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for Markdown fenced code blocks.
+    A :class:`~sybil.typing.Lexer` for Markdown fenced code blocks allowing flexible lexing
+    of the whole `info` line along with more complicated prefixes.

    The following lexemes are extracted:

    - ``source`` as a :class:`~sybil.Lexeme`.
    - any other named groups specified in ``info_pattern`` as :class:`strings <str>`.

    :param info_pattern:
        a :class:`re.Pattern` to match the `info` line and any required prefix that follows it.

    :param mapping:
        If provided, this is used to rename lexemes from the keys in the mapping to their
        values. Only mapped lexemes will be returned in any :class:`~sybil.Region` objects.

    """


    def __init__(
        self,
        info_pattern: Pattern[str] = re.compile(r'$\n', re.MULTILINE),
        mapping: Optional[Dict[str, str]] = None,
    ) -> None:
        self.info_pattern = info_pattern
        self.mapping = mapping

    @staticmethod
    def match_closes_existing(current: Match[str], existing: Match[str]) -> bool:
        current_fence = current.group('fence')
        existing_fence = existing.group('fence')
        same_type = current_fence[0] == existing_fence[0]
        okay_length = len(current_fence) >= len(existing_fence)
        same_prefix = len(current.group('prefix')) == len(existing.group('prefix'))
        return same_type and okay_length and same_prefix

    def make_region(
        self, opening: Match[str], document: Document, closing: Optional[Match[str]]
    ) -> Optional[Region]:
        if closing is None:
            content_end = region_end = len(document.text)
        else:
            content_end = closing.start()
            region_end = closing.end()
        content = document.text[opening.end(): content_end]
        info = self.info_pattern.match(content)
        if info is None:
            return None
        lexemes = info.groupdict()
        lines = content[info.end():].splitlines(keepends=True)
        stripped = ''.join(line[len(opening.group('prefix')):] for line in lines)
        lexemes['source'] = Lexeme(
            textwrap.dedent(stripped),
            offset=len(opening.group(0))+info.end(),
            line_offset=0,
        )
        if self.mapping:
            lexemes = {dest: lexemes[source] for source, dest in self.mapping.items()}
        return Region(opening.start(), region_end, lexemes=lexemes)

    def __call__(self, document: Document) -> Iterable[Region]:
        open_blocks: List[Match[str]] = []
        index = 0
        while True:
            match = FENCE.search(document.text, index)
            if match is None:
                break
            else:
                index = match.end()
            # does this fence close any open block?
            for i in range(len(open_blocks)):
                existing = open_blocks[i]
                if self.match_closes_existing(match, existing):
                    maybe_region = self.make_region(existing, document, match)
                    if maybe_region is not None:
                        yield maybe_region
                    open_blocks = open_blocks[:i]
                    break
            else:
                open_blocks.append(match)
        if open_blocks:
            maybe_region = self.make_region(open_blocks[0], document, closing=None)
            if maybe_region is not None:
                yield maybe_region


class FencedCodeBlockLexer(RawFencedCodeBlockLexer):
    """
    A :class:`~sybil.typing.Lexer` for Markdown fenced code blocks where a language is specified.
    :class:`RawFencedCodeBlockLexer` can be used if the whole `info` line, or a more complicated
    prefix, is required.

    The following lexemes are extracted:

@@ -28,14 +119,9 @@ class FencedCodeBlockLexer(BlockLexer):

    def __init__(self, language: str, mapping: Optional[Dict[str, str]] = None) -> None:
        super().__init__(
-            start_pattern=re.compile(CODEBLOCK_START_TEMPLATE.format(language=language)),
-            end_pattern_template=CODEBLOCK_END_TEMPLATE,
+            info_pattern=re.compile(f'(?P<language>{language})$\n', re.MULTILINE),
            mapping=mapping,
        )
-        self.start_pattern = re.compile(
-            CODEBLOCK_START_TEMPLATE.format(language=language),
-            re.MULTILINE
-        )


DIRECTIVE_IN_HTML_COMMENT_START = (
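As a quick sanity check on the new API, here is a minimal sketch of driving both lexers directly. The sample text and the `example.md` name are invented; the classes and arguments are the ones added in this diff:

````python
import re

from sybil import Document
from sybil.parsers.markdown.lexers import (
    FencedCodeBlockLexer,
    RawFencedCodeBlockLexer,
)

TEXT = '''\
```python
print("hi")
```
'''

document = Document(TEXT, 'example.md')

# Lex only ```python blocks; `mapping` renames lexemes, so the extracted
# source ends up under the 'code' key:
lexer = FencedCodeBlockLexer(language='python', mapping={'source': 'code'})
[region] = list(lexer(document))
print(repr(region.lexemes['code']))  # 'print("hi")\n'

# The raw lexer's default info_pattern (r'$\n') only matches fences with an
# empty info line, so pass a pattern that accepts any info line instead:
raw = RawFencedCodeBlockLexer(info_pattern=re.compile(r'.*$\n', re.MULTILINE))
for region in raw(document):
    print(repr(region.lexemes['source']))  # 'print("hi")\n'
````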
21 changes: 10 additions & 11 deletions sybil/parsers/myst/lexers.py
@@ -3,13 +3,13 @@

from sybil import Document, Region
from sybil.parsers.abstract.lexers import BlockLexer
-from sybil.parsers.markdown.lexers import CODEBLOCK_END_TEMPLATE
+from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
from sybil.parsers.rest.lexers import parse_options_and_source

-DIRECTIVE_START_TEMPLATE = (
-    r"^(?P<prefix>[ \t]*)```\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n"
-    r'(?P<options>(?:\1[ \t]*:[\w-]*:[^\n]*\n)+)?'
-    r"(\1---\n(?P<yaml_options>(?:.+\n)*)\1---\n)?"
+INFO_PATTERN = (
+    r'\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n'
+    r'(?P<options>(?:[ \t]*:[\w-]*:[^\n]*\n)+)?'
+    r"([ \t]*---\n(?P<yaml_options>(?:.+\n)*)[ \t]*---\n)?"
)


@@ -23,9 +23,9 @@ def parse_yaml_options(lexed: Region) -> None:
        lexemes['options'].update(options)


-class DirectiveLexer(BlockLexer):
+class DirectiveLexer(RawFencedCodeBlockLexer):
    """
-    A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for MyST directives such as:
+    A :class:`~sybil.typing.Lexer` for MyST directives such as:

    .. code-block:: markdown

@@ -60,11 +60,10 @@ def __init__(
        self, directive: str, arguments: str = '.*', mapping: Optional[Dict[str, str]] = None
    ) -> None:
        super().__init__(
-            start_pattern=re.compile(
-                DIRECTIVE_START_TEMPLATE.format(directive=directive, arguments=arguments),
-                re.MULTILINE
+            info_pattern=re.compile(
+                INFO_PATTERN.format(directive=directive, arguments=arguments),
+                re.MULTILINE,
            ),
-            end_pattern_template=CODEBLOCK_END_TEMPLATE,
            mapping=mapping,
        )

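And a sketch of the nested case this PR targets (sample text invented). Because `__call__` yields a region as soon as its closing fence is found, the inner directive surfaces before the outer one:

`````python
from sybil import Document
from sybil.parsers.myst.lexers import DirectiveLexer

TEXT = '''\
````{note}
Outer text.

```{warning}
Inner text.
```
````
'''

document = Document(TEXT, 'example.md')
lexer = DirectiveLexer(directive='note|warning')

# The inner ``` fence is shorter than the outer ```` fence, so per
# match_closes_existing it opens a nested block rather than closing the
# outer one; the warning closes first, so it is yielded first.
print([region.lexemes['directive'] for region in lexer(document)])
# ['warning', 'note']
`````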
2 changes: 1 addition & 1 deletion sybil/region.py
@@ -95,7 +95,7 @@ def __lt__(self, other: 'Region') -> bool:
        assert self.start == other.start # This is where this may happen, if not something weird
        return True

-    def adjust(self, lexed: Union['Region', 'Region'], lexeme: Lexeme) -> None:
+    def adjust(self, lexed: 'Region', lexeme: Lexeme) -> None:
        """
        Adjust the start and end of this region based on the provided :class:`Lexeme`
        and :class:`Region` that lexeme came from.
39 changes: 36 additions & 3 deletions tests/helpers.py
@@ -7,7 +7,7 @@
from tempfile import NamedTemporaryFile
from textwrap import dedent
from traceback import TracebackException
-from typing import Optional, Tuple, List, Sequence
+from typing import Optional, Tuple, List, Sequence, Union, Iterable
from unittest import TextTestRunner, main as unittest_main, SkipTest

import pytest
@@ -28,21 +28,54 @@
DOCS = HERE.parent / 'docs'
SAMPLE_PATH = HERE / 'samples'


def sample_path(name) -> str:
    return str(SAMPLE_PATH / name)


-def lex(name: str, lexer: Lexer) -> List[Region]:
+def regions_and_document(name: str, lexer: Lexer) -> Tuple[Document, List[Region]]:
    path = sample_path(name)
    document = Document(Path(path).read_text(), path)
-    return list(lexer(document))
+    return document, list(lexer(document))


def lex(name: str, lexer: Lexer) -> List[Region]:
    return regions_and_document(name, lexer)[1]


def region_details(
    document: Document, regions: Iterable[Region]
) -> List[Tuple[Tuple[str, Union[Region, str]], ...]]:
    # return a list of tuple of tuples to make failures easier to work through:
    return [(
        ('start', document.line_column(region.start)),
        ('end', document.line_column(region.end)),
        ('region', region)
    ) for region in regions]


def check_lexed_regions(name: str, lexer: Lexer, *, expected: List[Region]) -> None:
    document, actual = regions_and_document(name, lexer)
    compare(
        expected=region_details(document, expected),
        actual=region_details(document, actual),
    )


def lex_text(text: str, lexer: Lexer) -> List[Region]:
    document = Document(text, 'sample.txt')
    return list(lexer(document))


def check_lexed_text_regions(text: str, lexer: Lexer, *, expected: List[Region]) -> None:
    document = Document(text, 'sample.txt')
    actual = list(lexer(document))
    compare(
        expected=region_details(document, expected),
        actual=region_details(document, actual),
    )


def parse(name: str, *parsers: Parser, expected: int) -> Tuple[List[Example], dict]:
    document = Document.parse(sample_path(name), *parsers)
    examples = list(document)
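A hypothetical test built on these helpers, to show the intended usage; the sample text and expectations are illustrative, not taken from the PR:

```python
from sybil.parsers.markdown.lexers import FencedCodeBlockLexer

def test_single_python_block() -> None:
    regions = lex_text('```python\nprint("hi")\n```\n', FencedCodeBlockLexer('python'))
    assert len(regions) == 1
    # Lexeme subclasses str, so plain string comparison works here:
    assert regions[0].lexemes['language'] == 'python'
    assert regions[0].lexemes['source'] == 'print("hi")\n'
```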
55 changes: 55 additions & 0 deletions tests/samples/markdown-fenced-code-block.md
@@ -0,0 +1,55 @@
backticks:

```
<
>
```

tildes:
~~~
<
>
~~~

Fewer than three backticks is not enough:
``
foo
``


The closing code fence must use the same character as the opening fence:


```
aaa
~~~
```


The closing code fence must be at least as long as the opening fence:

````
aaa
```
``````

Nested:

~~~~
~~~
aaa
~~~
~~~~


Can't mix chars:

~`~
foo
~`~


This one gets closed by the end of document:
```
some stuff here
~~~
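The last case in this sample exercises the end-of-document flush in `__call__`: unclosed fences are still reported. A sketch of checking that, assuming the path is relative to the repo root:

```python
from pathlib import Path

from sybil import Document
from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer

path = 'tests/samples/markdown-fenced-code-block.md'
document = Document(Path(path).read_text(), path)
regions = list(RawFencedCodeBlockLexer()(document))
# The final region covers the unterminated ``` block and runs to the end
# of the document, because make_region is called with closing=None.
print(document.line_column(regions[-1].start))
```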
53 changes: 53 additions & 0 deletions tests/samples/myst-complicated-nesting.md
@@ -0,0 +1,53 @@
# {py:mod}`bytewax.connectors.demo`

```{py:module} bytewax.connectors.demo
```

```{autodoc2-docstring} bytewax.connectors.demo
:parser: myst
:allowtitles:
```

## Data

````{py:data} X
:canonical: bytewax.connectors.demo.X
:type: typing.TypeVar

```{autodoc2-docstring} bytewax.connectors.demo.X
:parser: myst
```

````


## Classes

`````{py:class} RandomMetricSource(metric_name: str, interval: datetime.timedelta = timedelta(seconds=0.7), count: int = sys.maxsize, next_random: typing.Callable[[], float] = lambda: random.randrange(0, 10))
:canonical: bytewax.connectors.demo.RandomMetricSource

:Bases:
- {py:obj}`~bytewax.inputs.FixedPartitionedSource``[`{py:obj}`~typing.Tuple``[`{py:obj}`~str``, `{py:obj}`~float``], `{py:obj}`~bytewax.connectors.demo._RandomMetricState``]`

```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource
:parser: myst
```

```{rubric} Initialization
```

```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource.__init__
:parser: myst
```

````{py:method} list_parts() -> typing.List[str]
:canonical: bytewax.connectors.demo.RandomMetricSource.list_parts

````

````{py:method} build_part(now: datetime.datetime, for_part: str, resume_state: typing.Optional[bytewax.connectors.demo._RandomMetricState])
:canonical: bytewax.connectors.demo.RandomMetricSource.build_part

````

`````
13 changes: 13 additions & 0 deletions tests/samples/myst-directive-nested.md
@@ -0,0 +1,13 @@
````{note}
The warning block will be properly-parsed

```{warning}
Here's my warning
```

But the next block will be parsed as raw text

```{warning}
Here's my raw text warning that isn't parsed...
```
````
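If I read the new `__call__` loop right, lexing this sample yields the two warnings as their closing fences are reached and then the enclosing note. A sketch, with the directive pattern and the expected output being my inference rather than anything asserted in this PR:

```python
from pathlib import Path

from sybil import Document
from sybil.parsers.myst.lexers import DirectiveLexer

path = 'tests/samples/myst-directive-nested.md'
document = Document(Path(path).read_text(), path)
regions = list(DirectiveLexer(directive='note|warning')(document))
print([region.lexemes['directive'] for region in regions])
# Expected (inferred): ['warning', 'warning', 'note']
```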