Skip to content

Commit

Permalink
Re-work markdown fenced codeblock lexing
Browse files Browse the repository at this point in the history
This primarily makes lexing of nested fenced codeblocks work, but also adds support for tilde-delimited blocks and corrects the end offset for blocks.
  • Loading branch information
cjw296 committed Mar 18, 2024
1 parent 533c108 commit a38e407
Show file tree
Hide file tree
Showing 8 changed files with 382 additions and 50 deletions.
85 changes: 75 additions & 10 deletions sybil/parsers/markdown/lexers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,83 @@
import re
from typing import Optional, Dict
import textwrap
from typing import Optional, Dict, Pattern, Iterable, Match, List

from sybil import Document, Region, Lexeme
from sybil.parsers.abstract.lexers import BlockLexer

CODEBLOCK_START_TEMPLATE = r"^(?P<prefix>[ \t]*)```(?P<language>{language})$\n"
CODEBLOCK_END_TEMPLATE = r"(?<=\n){prefix}```(:?\n|\Z)"
FENCE = re.compile(r"^(?P<prefix>[ \t]*)(?P<fence>`{3,}|~{3,})", re.MULTILINE)


class FencedCodeBlockLexer(BlockLexer):
class RawFencedCodeBlockLexer:
    """
    Lexes Markdown fenced code blocks (backtick- or tilde-delimited),
    including nested fences, into :class:`~sybil.Region` objects.

    :param info_pattern:
        Pattern applied to the text immediately following the opening fence;
        its named groups become lexemes. A non-match rejects the block.
    :param mapping:
        Optional mapping used to rename and filter the lexemes of each region.
    """

    def __init__(
        self,
        info_pattern: Pattern[str] = re.compile(r'$\n', re.MULTILINE),
        mapping: Optional[Dict[str, str]] = None,
    ) -> None:
        self.info_pattern = info_pattern
        self.mapping = mapping

    @staticmethod
    def match_closes_existing(current: Match[str], existing: Match[str]) -> bool:
        # A fence closes an open one only when it uses the same character,
        # is at least as long, and sits at the same indentation.
        closer = current.group('fence')
        opener = existing.group('fence')
        return (
            closer[0] == opener[0]
            and len(closer) >= len(opener)
            and len(current.group('prefix')) == len(existing.group('prefix'))
        )

    def make_region(
        self, opening: Match[str], document: Document, closing: Optional[Match[str]]
    ) -> Optional[Region]:
        # An unclosed block runs to the end of the document.
        text = document.text
        if closing is None:
            content_end = region_end = len(text)
        else:
            content_end, region_end = closing.start(), closing.end()
        content = text[opening.end():content_end]
        info = self.info_pattern.match(content)
        if info is None:
            return None
        lexemes = info.groupdict()
        # Strip the opening fence's indentation prefix from every line.
        prefix_len = len(opening.group('prefix'))
        body_lines = content[info.end():].splitlines(keepends=True)
        stripped = ''.join(line[prefix_len:] for line in body_lines)
        lexemes['source'] = Lexeme(
            textwrap.dedent(stripped),
            offset=len(opening.group(0)) + info.end(),
            line_offset=0,
        )
        if self.mapping:
            lexemes = {dest: lexemes[src] for src, dest in self.mapping.items()}
        return Region(opening.start(), region_end, lexemes=lexemes)

    def __call__(self, document: Document) -> Iterable[Region]:
        open_blocks: List[Match[str]] = []
        search_from = 0
        while True:
            fence = FENCE.search(document.text, search_from)
            if fence is None:
                break
            search_from = fence.end()
            # Find the innermost open block this fence closes, if any.
            closed_at: Optional[int] = None
            for position, candidate in enumerate(open_blocks):
                if self.match_closes_existing(fence, candidate):
                    closed_at = position
                    break
            if closed_at is None:
                fence_opens_new_block = True
            else:
                fence_opens_new_block = False
                maybe_region = self.make_region(
                    open_blocks[closed_at], document, fence
                )
                if maybe_region is not None:
                    yield maybe_region
                # Closing an outer block discards anything nested inside it.
                del open_blocks[closed_at:]
            if fence_opens_new_block:
                open_blocks.append(fence)
        if open_blocks:
            # The outermost still-open block is closed by end of document.
            maybe_region = self.make_region(open_blocks[0], document, closing=None)
            if maybe_region is not None:
                yield maybe_region


class FencedCodeBlockLexer(RawFencedCodeBlockLexer):
"""
A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for Markdown fenced code blocks.
Expand All @@ -28,14 +98,9 @@ class FencedCodeBlockLexer(BlockLexer):

def __init__(self, language: str, mapping: Optional[Dict[str, str]] = None) -> None:
super().__init__(
start_pattern=re.compile(CODEBLOCK_START_TEMPLATE.format(language=language)),
end_pattern_template=CODEBLOCK_END_TEMPLATE,
info_pattern=re.compile(f'(?P<language>{language})$\n', re.MULTILINE),
mapping=mapping,
)
self.start_pattern = re.compile(
CODEBLOCK_START_TEMPLATE.format(language=language),
re.MULTILINE
)


DIRECTIVE_IN_HTML_COMMENT_START = (
Expand Down
19 changes: 9 additions & 10 deletions sybil/parsers/myst/lexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@

from sybil import Document, Region
from sybil.parsers.abstract.lexers import BlockLexer
from sybil.parsers.markdown.lexers import CODEBLOCK_END_TEMPLATE
from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
from sybil.parsers.rest.lexers import parse_options_and_source

DIRECTIVE_START_TEMPLATE = (
r"^(?P<prefix>[ \t]*)```\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n"
r'(?P<options>(?:\1[ \t]*:[\w-]*:[^\n]*\n)+)?'
r"(\1---\n(?P<yaml_options>(?:.+\n)*)\1---\n)?"
INFO_PATTERN = (
r'\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n'
r'(?P<options>(?:[ \t]*:[\w-]*:[^\n]*\n)+)?'
r"([ \t]*---\n(?P<yaml_options>(?:.+\n)*)[ \t]*---\n)?"
)


Expand All @@ -23,7 +23,7 @@ def parse_yaml_options(lexed: Region) -> None:
lexemes['options'].update(options)


class DirectiveLexer(BlockLexer):
class DirectiveLexer(RawFencedCodeBlockLexer):
"""
A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for MyST directives such as:
Expand Down Expand Up @@ -60,11 +60,10 @@ def __init__(
self, directive: str, arguments: str = '.*', mapping: Optional[Dict[str, str]] = None
) -> None:
super().__init__(
start_pattern=re.compile(
DIRECTIVE_START_TEMPLATE.format(directive=directive, arguments=arguments),
re.MULTILINE
info_pattern=re.compile(
INFO_PATTERN.format(directive=directive, arguments=arguments),
re.MULTILINE,
),
end_pattern_template=CODEBLOCK_END_TEMPLATE,
mapping=mapping,
)

Expand Down
55 changes: 55 additions & 0 deletions tests/samples/markdown-fenced-code-block.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
backticks:

```
<
>
```

tildes:
~~~
<
>
~~~

Fewer than three backticks is not enough:
``
foo
``


The closing code fence must use the same character as the opening fence:


```
aaa
~~~
```


The closing code fence must be at least as long as the opening fence:

````
aaa
```
``````

Nested:

~~~~
~~~
aaa
~~~
~~~~


Can't mix chars:

~`~
foo
~`~


This one gets closed by the end of document:
```
some stuff here
~~~
53 changes: 53 additions & 0 deletions tests/samples/myst-complicated-nesting.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# {py:mod}`bytewax.connectors.demo`

```{py:module} bytewax.connectors.demo
```

```{autodoc2-docstring} bytewax.connectors.demo
:parser: myst
:allowtitles:
```

## Data

````{py:data} X
:canonical: bytewax.connectors.demo.X
:type: typing.TypeVar
```{autodoc2-docstring} bytewax.connectors.demo.X
:parser: myst
```
````


## Classes

`````{py:class} RandomMetricSource(metric_name: str, interval: datetime.timedelta = timedelta(seconds=0.7), count: int = sys.maxsize, next_random: typing.Callable[[], float] = lambda: random.randrange(0, 10))
:canonical: bytewax.connectors.demo.RandomMetricSource
:Bases:
- {py:obj}`~bytewax.inputs.FixedPartitionedSource``[`{py:obj}`~typing.Tuple``[`{py:obj}`~str``, `{py:obj}`~float``], `{py:obj}`~bytewax.connectors.demo._RandomMetricState``]`
```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource
:parser: myst
```
```{rubric} Initialization
```
```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource.__init__
:parser: myst
```
````{py:method} list_parts() -> typing.List[str]
:canonical: bytewax.connectors.demo.RandomMetricSource.list_parts
````
````{py:method} build_part(now: datetime.datetime, for_part: str, resume_state: typing.Optional[bytewax.connectors.demo._RandomMetricState])
:canonical: bytewax.connectors.demo.RandomMetricSource.build_part
````
`````
13 changes: 13 additions & 0 deletions tests/samples/myst-directive-nested.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
````{note}
The warning block will be properly-parsed
```{warning}
Here's my warning
```
But the next block will be parsed as raw text
```{warning}
Here's my raw text warning that isn't parsed...
```
````
18 changes: 18 additions & 0 deletions tests/test_markdown_lexers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from testfixtures import compare

from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
from sybil.region import Region
from .helpers import region_details, check_lexed_regions


def test_fenced_code_block():
    """Lex the sample file and check every well-formed fenced block is found.

    Region start/end values are character offsets into
    ``tests/samples/markdown-fenced-code-block.md``.  Note the nested
    example apparently produces both an inner region (301, 312) and its
    enclosing outer region (296, 317), and the final, unterminated block
    is closed by the end of the document — TODO confirm offsets against
    the sample file if it changes.
    """
    lexer = RawFencedCodeBlockLexer()
    check_lexed_regions('markdown-fenced-code-block.md', lexer, expected = [
        Region(12, 24, lexemes={'source': '<\n >\n'}),
        Region(34, 46, lexemes={'source': '<\n >\n'}),
        Region(177, 192, lexemes={'source': 'aaa\n~~~\n'}),
        Region(266, 285, lexemes={'source': 'aaa\n```\n'}),
        Region(301, 312, lexemes={'source': 'aaa\n'}),
        Region(296, 317, lexemes={'source': '~~~\naaa\n~~~\n'}),
        Region(397, 421, lexemes={'source': 'some stuff here\n~~~\n'}),
    ])
8 changes: 6 additions & 2 deletions tests/test_myst_codeblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import pytest
from testfixtures import compare

from sybil import Example, Region
from sybil.evaluators.python import PythonEvaluator
from sybil import Example
from sybil.parsers.myst import PythonCodeBlockParser, CodeBlockParser
from .helpers import check_excinfo, parse

Expand Down Expand Up @@ -35,6 +34,11 @@ def test_basic():
assert '__builtins__' not in namespace


def test_complicated_nesting():
    # Regression check for deeply nested MyST directives: the sample file
    # contains no python code blocks, so parsing must succeed and yield
    # zero examples rather than raising.
    parse('myst-complicated-nesting.md', PythonCodeBlockParser(), expected=0)


def test_doctest_at_end_of_fenced_codeblock():
examples, namespace = parse('myst-codeblock-doctests-end-of-fenced-codeblocks.md',
PythonCodeBlockParser(), expected=2)
Expand Down

0 comments on commit a38e407

Please sign in to comment.