diff --git a/sybil/parsers/markdown/lexers.py b/sybil/parsers/markdown/lexers.py
index e92bcbe..68a5d7c 100644
--- a/sybil/parsers/markdown/lexers.py
+++ b/sybil/parsers/markdown/lexers.py
@@ -1,13 +1,83 @@
 import re
-from typing import Optional, Dict
+import textwrap
+from typing import Optional, Dict, Pattern, Iterable, Match, List
 
+from sybil import Document, Region, Lexeme
 from sybil.parsers.abstract.lexers import BlockLexer
 
-CODEBLOCK_START_TEMPLATE = r"^(?P<prefix>[ \t]*)```(?P<language>{language})$\n"
-CODEBLOCK_END_TEMPLATE = r"(?<=\n){prefix}```(:?\n|\Z)"
+FENCE = re.compile(r"^(?P<prefix>[ \t]*)(?P<fence>`{3,}|~{3,})", re.MULTILINE)
 
 
-class FencedCodeBlockLexer(BlockLexer):
+class RawFencedCodeBlockLexer:
+
+    def __init__(
+            self,
+            info_pattern: Pattern[str] = re.compile(r'$\n', re.MULTILINE),
+            mapping: Optional[Dict[str, str]] = None,
+    ) -> None:
+        self.info_pattern = info_pattern
+        self.mapping = mapping
+
+    @staticmethod
+    def match_closes_existing(current: Match[str], existing: Match[str]) -> bool:
+        current_fence = current.group('fence')
+        existing_fence = existing.group('fence')
+        same_type = current_fence[0] == existing_fence[0]
+        okay_length = len(current_fence) >= len(existing_fence)
+        same_prefix = len(current.group('prefix')) == len(existing.group('prefix'))
+        return same_type and okay_length and same_prefix
+
+    def make_region(
+            self, opening: Match[str], document: Document, closing: Optional[Match[str]]
+    ) -> Optional[Region]:
+        if closing is None:
+            content_end = region_end = len(document.text)
+        else:
+            content_end = closing.start()
+            region_end = closing.end()
+        content = document.text[opening.end(): content_end]
+        info = self.info_pattern.match(content)
+        if info is None:
+            return None
+        lexemes = info.groupdict()
+        lines = content[info.end():].splitlines(keepends=True)
+        stripped = ''.join(line[len(opening.group('prefix')):] for line in lines)
+        lexemes['source'] = Lexeme(
+            textwrap.dedent(stripped),
+            offset=len(opening.group(0))+info.end(),
+            line_offset=0,
+        )
+        if self.mapping:
+            lexemes = {dest: lexemes[source] for source, dest in self.mapping.items()}
+        return Region(opening.start(), region_end, lexemes=lexemes)
+
+    def __call__(self, document: Document) -> Iterable[Region]:
+        open_blocks: List[Match[str]] = []
+        index = 0
+        while True:
+            match = FENCE.search(document.text, index)
+            if match is None:
+                break
+            else:
+                index = match.end()
+            # does this fence close any open block?
+            for i in range(len(open_blocks)):
+                existing = open_blocks[i]
+                if self.match_closes_existing(match, existing):
+                    maybe_region = self.make_region(existing, document, match)
+                    if maybe_region is not None:
+                        yield maybe_region
+                    open_blocks = open_blocks[:i]
+                    break
+            else:
+                open_blocks.append(match)
+        if open_blocks:
+            maybe_region = self.make_region(open_blocks[0], document, closing=None)
+            if maybe_region is not None:
+                yield maybe_region
+
+
+class FencedCodeBlockLexer(RawFencedCodeBlockLexer):
     """
     A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for Markdown fenced
     code blocks.
@@ -28,14 +98,9 @@ class FencedCodeBlockLexer(BlockLexer):
 
     def __init__(self, language: str, mapping: Optional[Dict[str, str]] = None) -> None:
         super().__init__(
-            start_pattern=re.compile(CODEBLOCK_START_TEMPLATE.format(language=language)),
-            end_pattern_template=CODEBLOCK_END_TEMPLATE,
+            info_pattern=re.compile(f'(?P<language>{language})$\n', re.MULTILINE),
             mapping=mapping,
         )
-        self.start_pattern = re.compile(
-            CODEBLOCK_START_TEMPLATE.format(language=language),
-            re.MULTILINE
-        )
 
 
 DIRECTIVE_IN_HTML_COMMENT_START = (
diff --git a/sybil/parsers/myst/lexers.py b/sybil/parsers/myst/lexers.py
index 5d41e83..17d2944 100644
--- a/sybil/parsers/myst/lexers.py
+++ b/sybil/parsers/myst/lexers.py
@@ -3,13 +3,13 @@
 
 from sybil import Document, Region
 from sybil.parsers.abstract.lexers import BlockLexer
-from sybil.parsers.markdown.lexers import CODEBLOCK_END_TEMPLATE
+from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
 from sybil.parsers.rest.lexers import parse_options_and_source
 
-DIRECTIVE_START_TEMPLATE = (
-    r"^(?P<prefix>[ \t]*)```\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n"
-    r'(?P<options>(?:\1[ \t]*:[\w-]*:[^\n]*\n)+)?'
-    r"(\1---\n(?P<yaml_options>(?:.+\n)*)\1---\n)?"
+INFO_PATTERN = (
+    r'\{{(?P<directive>{directive})}} ?(?P<arguments>{arguments})$\n'
+    r'(?P<options>(?:[ \t]*:[\w-]*:[^\n]*\n)+)?'
+    r"([ \t]*---\n(?P<yaml_options>(?:.+\n)*)[ \t]*---\n)?"
 )
 
 
@@ -23,7 +23,7 @@ def parse_yaml_options(lexed: Region) -> None:
     lexemes['options'].update(options)
 
 
-class DirectiveLexer(BlockLexer):
+class DirectiveLexer(RawFencedCodeBlockLexer):
     """
     A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for MyST directives
     such as:
@@ -60,11 +60,10 @@ def __init__(
         self, directive: str, arguments: str = '.*', mapping: Optional[Dict[str, str]] = None
     ) -> None:
         super().__init__(
-            start_pattern=re.compile(
-                DIRECTIVE_START_TEMPLATE.format(directive=directive, arguments=arguments),
-                re.MULTILINE
+            info_pattern=re.compile(
+                INFO_PATTERN.format(directive=directive, arguments=arguments),
+                re.MULTILINE,
             ),
-            end_pattern_template=CODEBLOCK_END_TEMPLATE,
             mapping=mapping,
         )
diff --git a/tests/samples/markdown-fenced-code-block.md b/tests/samples/markdown-fenced-code-block.md
new file mode 100644
index 0000000..c031e6a
--- /dev/null
+++ b/tests/samples/markdown-fenced-code-block.md
@@ -0,0 +1,55 @@
+backticks:
+
+```
+<
+ >
+```
+
+tildes:
+~~~
+<
+ >
+~~~
+
+Fewer than three backticks is not enough:
+``
+foo
+``
+
+
+The closing code fence must use the same character as the opening fence:
+
+
+```
+aaa
+~~~
+```
+
+
+The closing code fence must be at least as long as the opening fence:
+
+````
+aaa
+```
+``````
+
+Nested:
+
+~~~~
+~~~
+aaa
+~~~
+~~~~
+
+
+Can't mix chars:
+
+~`~
+foo
+~`~
+
+
+This one gets closed by the end of document:
+```
+some stuff here
+~~~
diff --git a/tests/samples/myst-complicated-nesting.md b/tests/samples/myst-complicated-nesting.md
new file mode 100644
index 0000000..38af080
--- /dev/null
+++ b/tests/samples/myst-complicated-nesting.md
@@ -0,0 +1,53 @@
+# {py:mod}`bytewax.connectors.demo`
+
+```{py:module} bytewax.connectors.demo
+```
+
+```{autodoc2-docstring} bytewax.connectors.demo
+:parser: myst
+:allowtitles:
+```
+
+## Data
+
+````{py:data} X
+:canonical: bytewax.connectors.demo.X
+:type: typing.TypeVar
+
+```{autodoc2-docstring} bytewax.connectors.demo.X
+:parser: myst
+```
+
+````
+
+
+## Classes
+
+`````{py:class} RandomMetricSource(metric_name: str, interval: datetime.timedelta = timedelta(seconds=0.7), count: int = sys.maxsize, next_random: typing.Callable[[], float] = lambda: random.randrange(0, 10))
+:canonical: bytewax.connectors.demo.RandomMetricSource
+
+:Bases:
+    - {py:obj}`~bytewax.inputs.FixedPartitionedSource``[`{py:obj}`~typing.Tuple``[`{py:obj}`~str``, `{py:obj}`~float``], `{py:obj}`~bytewax.connectors.demo._RandomMetricState``]`
+
+```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource
+:parser: myst
+```
+
+```{rubric} Initialization
+```
+
+```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource.__init__
+:parser: myst
+```
+
+````{py:method} list_parts() -> typing.List[str]
+:canonical: bytewax.connectors.demo.RandomMetricSource.list_parts
+
+````
+
+````{py:method} build_part(now: datetime.datetime, for_part: str, resume_state: typing.Optional[bytewax.connectors.demo._RandomMetricState])
+:canonical: bytewax.connectors.demo.RandomMetricSource.build_part
+
+````
+
+`````
diff --git a/tests/samples/myst-directive-nested.md b/tests/samples/myst-directive-nested.md
new file mode 100644
index 0000000..d2b0a73
--- /dev/null
+++ b/tests/samples/myst-directive-nested.md
@@ -0,0 +1,13 @@
+````{note}
+The warning block will be properly-parsed
+
+   ```{warning}
+   Here's my warning
+   ```
+
+But the next block will be parsed as raw text
+
+    ```{warning}
+    Here's my raw text warning that isn't parsed...
+    ```
+````
diff --git a/tests/test_markdown_lexers.py b/tests/test_markdown_lexers.py
new file mode 100644
index 0000000..9367ec8
--- /dev/null
+++ b/tests/test_markdown_lexers.py
@@ -0,0 +1,18 @@
+from testfixtures import compare
+
+from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer
+from sybil.region import Region
+from .helpers import region_details, check_lexed_regions
+
+
+def test_fenced_code_block():
+    lexer = RawFencedCodeBlockLexer()
+    check_lexed_regions('markdown-fenced-code-block.md', lexer, expected=[
+        Region(12, 24, lexemes={'source': '<\n >\n'}),
+        Region(34, 46, lexemes={'source': '<\n >\n'}),
+        Region(177, 192, lexemes={'source': 'aaa\n~~~\n'}),
+        Region(266, 285, lexemes={'source': 'aaa\n```\n'}),
+        Region(301, 312, lexemes={'source': 'aaa\n'}),
+        Region(296, 317, lexemes={'source': '~~~\naaa\n~~~\n'}),
+        Region(397, 421, lexemes={'source': 'some stuff here\n~~~\n'}),
+    ])
diff --git a/tests/test_myst_codeblock.py b/tests/test_myst_codeblock.py
index 98c411f..2d27fe1 100644
--- a/tests/test_myst_codeblock.py
+++ b/tests/test_myst_codeblock.py
@@ -3,8 +3,7 @@
 import pytest
 from testfixtures import compare
 
-from sybil import Example, Region
-from sybil.evaluators.python import PythonEvaluator
+from sybil import Example
 from sybil.parsers.myst import PythonCodeBlockParser, CodeBlockParser
 from .helpers import check_excinfo, parse
 
@@ -35,6 +34,11 @@ def test_basic():
     assert '__builtins__' not in namespace
 
 
+def test_complicated_nesting():
+    # This has no code blocks, but should still parse fine:
+    parse('myst-complicated-nesting.md', PythonCodeBlockParser(), expected=0)
+
+
 def test_doctest_at_end_of_fenced_codeblock():
     examples, namespace = parse('myst-codeblock-doctests-end-of-fenced-codeblocks.md',
                                 PythonCodeBlockParser(), expected=2)
diff --git a/tests/test_myst_lexers.py b/tests/test_myst_lexers.py
index 6cf4ab4..c02a7b0 100644
--- a/tests/test_myst_lexers.py
+++ b/tests/test_myst_lexers.py
@@ -2,52 +2,52 @@
 
 from testfixtures import compare
 
+from sybil.parsers.markdown.lexers import FencedCodeBlockLexer, DirectiveInHTMLCommentLexer
 from sybil.parsers.myst.lexers import (
     DirectiveLexer, DirectiveInPercentCommentLexer
 )
-from sybil.parsers.markdown.lexers import FencedCodeBlockLexer, DirectiveInHTMLCommentLexer
-from sybil.region import Region, Region
-from .helpers import lex, sample_path, lex_text
+from sybil.region import Region
+from .helpers import lex, sample_path, lex_text, check_lexed_regions, check_lexed_text_regions
 
 
 def test_fenced_code_block():
     lexer = FencedCodeBlockLexer('py?thon')
-    compare(lex('myst-lexers.md', lexer), expected=[
-        Region(36, 56, lexemes={'language': 'python', 'source': '>>> 1+1\n2\n'}),
-        Region(1137, 1168, lexemes={'language': 'pthon', 'source': 'assert 1 + 1 == 2\n'}),
+    check_lexed_regions('myst-lexers.md', lexer, expected=[
+        Region(36, 59, lexemes={'language': 'python', 'source': '>>> 1+1\n2\n'}),
+        Region(1137, 1173, lexemes={'language': 'pthon', 'source': 'assert 1 + 1 == 2\n'}),
     ])
 
 
 def test_fenced_code_block_with_mapping():
     lexer = FencedCodeBlockLexer('python', mapping={'source': 'body'})
-    compare(lex('myst-lexers.md', lexer), expected=[
-        Region(36, 56, lexemes={'body': '>>> 1+1\n2\n'})
+    check_lexed_regions('myst-lexers.md', lexer, expected=[
+        Region(36, 59, lexemes={'body': '>>> 1+1\n2\n'})
     ])
 
 
 def test_myst_directives():
     lexer = DirectiveLexer(directive='[^}]+')
-    compare(lex('myst-lexers.md', lexer), expected=[
-        Region(110, 145, lexemes={
+    check_lexed_regions('myst-lexers.md', lexer, expected=[
+        Region(110, 148, lexemes={
             'directive': 'code-block',
             'arguments': 'python',
             'options': {},
             'source': '>>> 1 + 1\n3\n',
         }),
-        Region(188, 273, lexemes={
+        Region(188, 276, lexemes={
             'directive': 'directivename',
             'arguments': 'arguments',
             'options': {'key1': 'val1', 'key2': 'val2'},
             'source': 'This is\ndirective content\n',
         }),
-        Region(330, 378, lexemes={
+        Region(330, 381, lexemes={
             'directive': 'eval-rst',
             'arguments': '',
             'options': {},
             'source': '.. doctest::\n\n   >>> 1 + 1\n   4\n',
         }),
-        Region(1398, 1474, lexemes={
+        Region(1398, 1479, lexemes={
             'directive': 'foo',
             'arguments': 'bar',
             'options': {'key1': 'val1'},
@@ -59,13 +59,13 @@ def test_myst_directives():
 def test_examples_from_parsing_tests():
     lexer = DirectiveLexer(directive='code-block', arguments='python')
     compare(lex('myst-codeblock.md', lexer), expected=[
-        Region(99, 151, lexemes={
+        Region(99, 154, lexemes={
             'directive': 'code-block',
             'arguments': 'python',
             'options': {},
             'source': "raise Exception('boom!')\n",
         }),
-        Region(701, 748, lexemes={
+        Region(701, 753, lexemes={
             'directive': 'code-block',
             'arguments': 'python',
             'options': {},
@@ -77,7 +77,7 @@ def test_myst_directives_with_mapping():
     lexer = DirectiveLexer(directive='directivename', arguments='.*', mapping={'arguments': 'foo'})
     compare(lex('myst-lexers.md', lexer), expected=[
-        Region(188, 273, lexemes={'foo': 'arguments', 'options': {}}),
+        Region(188, 276, lexemes={'foo': 'arguments', 'options': {}}),
     ])
 
 
@@ -188,7 +188,7 @@ def test_myst_html_comment_invisible_clear_directive():
 def test_lexing_directives():
     lexer = DirectiveLexer('[^}]+')
     compare(lex('myst-lexing-directives.md', lexer), expected=[
-        Region(55, 233, lexemes={
+        Region(55, 236, lexemes={
             'directive': 'note',
             'arguments': 'This is a note admonition.',
             'options': {},
             'source': ('This is the second line of the first paragraph.\n'
                        '\n'
                        '- The note contains all indented body elements\n'
                        '  following.\n'
                        '- It includes this bullet list.\n'),
         }),
-        Region(238, 317, lexemes={
+        Region(238, 320, lexemes={
             'directive': 'admonition',
             'arguments': 'And, by the way...',
             'options': {},
             'source': 'You can make up your own admonition too.\n',
         }),
-        Region(322, 383, lexemes={
+        Region(322, 386, lexemes={
             'directive': 'sample',
             'arguments': '',
             'options': {},
             'source': 'This directive has no arguments, just a body.\n',
         }),
-        Region(455, 478, lexemes={
+        Region(455, 481, lexemes={
             'directive': 'image',
             'arguments': 'picture.png',
             'options': {},
             'source': '',
         }),
-        Region(483, 592, lexemes={
+        Region(483, 595, lexemes={
             'directive': 'image',
             'arguments': 'picture.jpeg',
             'options': {
@@ -228,7 +228,7 @@ def test_lexing_directives():
             },
             'source': '',
         }),
-        Region(597, 1311, lexemes={
+        Region(597, 1314, lexemes={
             'directive': 'figure',
             'arguments': 'picture.png',
             'options': {
@@ -252,7 +252,7 @@ def test_lexing_directives():
                        '+-----------------------+-----------------------+\n'
                        '\n')
         }),
-        Region(1317, 1449, lexemes={
+        Region(1317, 1452, lexemes={
             'directive': 'topic',
             'arguments': 'Topic Title',
             'options': {},
             'source': ('Subsequent indented lines comprise\n'
                        'the body of the topic, and are\n'
                        'interpreted as body elements.\n')
         }),
-        Region(1506, 1589, lexemes={
+        Region(1506, 1592, lexemes={
             'directive': 'topic',
             'arguments': 'example.cfg',
             'options': {'class': 'read-file'},
             'source': '::\n\n  [A Section]\n  dir = frob\n'
         }),
-        Region(1612, 1801, lexemes={
+        Region(1612, 1804, lexemes={
             'directive': 'sidebar',
             'arguments': 'Optional Sidebar Title',
             'options': {'subtitle': 'Optional Sidebar Subtitle'},
             'source': ('Subsequent indented lines comprise\n'
                        'the body of the sidebar, and are\n'
                        'interpreted as body elements.\n')
         }),
-        Region(1807, 2003, lexemes={
+        Region(1807, 2006, lexemes={
             'directive': 'code-block',
             'arguments': 'python',
             'options': {'lineno-start': 10, 'emphasize-lines': '1, 3',
                         'caption': 'This is my\nmulti-line caption. It is *pretty nifty* ;-)\n'},
             'source': "a = 2\nprint('my 1st line')\nprint(f'my {a}nd line')\n",
         }),
-        Region(2008, 2210, lexemes={
+        Region(2008, 2213, lexemes={
             'directive': 'eval-rst',
             'arguments': '',
             'options': {},
@@ -304,10 +304,135 @@ def test_directive_no_trailing_newline():
     lexer = DirectiveLexer(directive='toctree')
     text = Path(sample_path('myst-directive-no-trailing-newline.md')).read_text().rstrip('\n')
     compare(lex_text(text, lexer), expected=[
-        Region(16, 64, lexemes={
+        Region(16, 67, lexemes={
             'directive': 'toctree',
             'arguments': '',
             'options': {'maxdepth': '1'},
             'source': 'flask\npyramid\ncustom\n',
         }),
     ])
+
+
+def test_directive_nested():
+    lexer = DirectiveLexer(directive='.+')
+    text = Path(sample_path('myst-directive-nested.md')).read_text().rstrip('\n')
+    check_lexed_text_regions(text, lexer, expected=[
+        Region(54, 97, lexemes={
+            'directive': 'warning',
+            'arguments': '',
+            'options': {},
+            'source': "Here's my warning\n",
+        }),
+        Region(146, 222, lexemes={
+            'directive': 'warning',
+            'arguments': '',
+            'options': {},
+            'source': "Here's my raw text warning that isn't parsed...\n",
+        }),
+        Region(0, 227, lexemes={
+            'directive': 'note',
+            'arguments': '',
+            'options': {},
+            'source': ('The warning block will be properly-parsed\n'
+                       '\n'
+                       '   ```{warning}\n'
+                       "   Here's my warning\n"
+                       '   ```\n'
+                       '\n'
+                       'But the next block will be parsed as raw text\n'
+                       '\n'
+                       '    ```{warning}\n'
+                       "    Here's my raw text warning that isn't parsed...\n"
+                       '    ```\n'),
+        }),
+    ])
+
+
+def test_complicated_nesting():
+    lexer = DirectiveLexer(directive='.+')
+    check_lexed_regions('myst-complicated-nesting.md', lexer, expected=[
+        Region(37, 79, lexemes={
+            'directive': 'py:module',
+            'arguments': 'bytewax.connectors.demo',
+            'options': {},
+            'source': "",
+        }),
+        Region(81, 160, lexemes={
+            'directive': 'autodoc2-docstring',
+            'arguments': 'bytewax.connectors.demo',
+            'options': {'parser': 'myst', 'allowtitles': ''},
+            'source': "",
+        }),
+        Region(248, 315, lexemes={
+            'directive': 'autodoc2-docstring',
+            'arguments': 'bytewax.connectors.demo.X',
+            'options': {'parser': 'myst'},
+            'source': "",
+        }),
+        Region(171, 321, lexemes={
+            'directive': 'py:data',
+            'arguments': 'X',
+            'options': {'canonical': 'bytewax.connectors.demo.X', 'type': 'typing.TypeVar'},
+            'source': "```{autodoc2-docstring} bytewax.connectors.demo.X\n"
+                      ":parser: myst\n"
+                      "```\n\n",
+        }),
+        Region(789, 873, lexemes={
+            'directive': 'autodoc2-docstring',
+            'arguments': 'bytewax.connectors.demo.RandomMetricSource',
+            'options': {'parser': 'myst'},
+            'source': "",
+        }),
+        Region(875, 905, lexemes={
+            'directive': 'rubric',
+            'arguments': 'Initialization',
+            'options': {},
+            'source': "",
+        }),
+        Region(907, 1000, lexemes={
+            'directive': 'autodoc2-docstring',
+            'arguments': 'bytewax.connectors.demo.RandomMetricSource.__init__',
+            'options': {'parser': 'myst'},
+            'source': "",
+        }),
+        Region(1002, 1122, lexemes={
+            'directive': 'py:method',
+            'arguments': 'list_parts() -> typing.List[str]',
+            'options': {'canonical': 'bytewax.connectors.demo.RandomMetricSource.list_parts'},
+            'source': "",
+        }),
+        Region(1124, 1336, lexemes={
+            'directive': 'py:method',
+            'arguments': 'build_part(now: datetime.datetime, for_part: str, resume_state: '
+                         'typing.Optional[bytewax.connectors.demo._RandomMetricState])',
+            'options': {'canonical': 'bytewax.connectors.demo.RandomMetricSource.build_part'},
+            'source': "",
+        }),
+        Region(336, 1343, lexemes={
+            'directive': 'py:class',
+            'arguments': 'RandomMetricSource(metric_name: str, interval: datetime.timedelta = '
+                         'timedelta(seconds=0.7), count: int = sys.maxsize, next_random: '
+                         'typing.Callable[[], float] = lambda: random.randrange(0, 10))',
+            'options': {'canonical': 'bytewax.connectors.demo.RandomMetricSource'},
+            'source': ":Bases:\n"
+                      "    - {py:obj}`~bytewax.inputs.FixedPartitionedSource``[`{py:obj}"
+                      "`~typing.Tuple``[`{py:obj}`~str``, `{py:obj}`~float``], "
+                      "`{py:obj}`~bytewax.connectors.demo._RandomMetricState``]`\n\n"
+                      "```{autodoc2-docstring} bytewax.connectors.demo.RandomMetricSource\n"
+                      ":parser: myst\n"
+                      "```\n\n"
+                      "```{rubric} Initialization\n"
+                      "```\n\n"
+                      "```{autodoc2-docstring} "
+                      "bytewax.connectors.demo.RandomMetricSource.__init__\n"
+                      ":parser: myst\n"
+                      "```\n\n"
+                      "````{py:method} list_parts() -> typing.List[str]\n"
+                      ":canonical: bytewax.connectors.demo.RandomMetricSource.list_parts\n\n"
+                      "````\n\n"
+                      "````{py:method} build_part(now: datetime.datetime, for_part: str, "
+                      "resume_state: typing.Optional[bytewax.connectors.demo._RandomMetricState])\n"
+                      ":canonical: bytewax.connectors.demo.RandomMetricSource.build_part\n\n"
+                      "````\n\n",
+        }),
+    ])
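
Reviewer note, not part of the patch: a minimal sketch of how the new
RawFencedCodeBlockLexer resolves nested fences, assuming only sybil's public
Document(text, path) constructor (the path value is an arbitrary label here).
The start/end pairs mirror Region(301, 312) and Region(296, 317) in
tests/test_markdown_lexers.py above.

    from sybil import Document
    from sybil.parsers.markdown.lexers import RawFencedCodeBlockLexer

    # ~~~~ opens a block; the shorter ~~~ cannot close it, so it opens a
    # nested block instead; the second ~~~ closes the inner block; the
    # final ~~~~ closes the outer one.
    text = "~~~~\n~~~\naaa\n~~~\n~~~~\n"

    lexer = RawFencedCodeBlockLexer()
    for region in lexer(Document(text, path='example.md')):
        print(region.start, region.end, repr(region.lexemes['source']))

    # The inner block is yielded first, then the enclosing one:
    # 5 16 'aaa\n'
    # 0 21 '~~~\naaa\n~~~\n'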