diff --git a/strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py b/strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py index 0a093945b..31bbaff7e 100644 --- a/strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py +++ b/strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py @@ -24,7 +24,7 @@ class GrammarTemplate(Template): node_name: /##CUSTOM_TAGS/ node_multiline_value: (_WS_INLINE? | (_WS_INLINE NODE_STRING_VALUE)) NEWLINE (NODE_STRING_VALUE NEWLINE)* -NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*(##CUSTOM_TAGS):\\s)(?!\\s*##NODE_FIELD_END_MARKER)[^\n\r]+/x +NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*(##CUSTOM_TAGS):\\s)(?!\\s*##NODE_FIELD_END_MARKER)[^\\n\\r]+/ _NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*(##CUSTOM_TAGS):\\s)|(##RESERVED_KEYWORDS)).+/ """) diff --git a/tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py b/tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py index cdef05776..e9ff05bf9 100644 --- a/tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py +++ b/tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py @@ -4,7 +4,7 @@ from typing import Any, List, Optional -from lark import Tree +from lark import Token, Tree from strictdoc.backend.sdoc_source_code.comment_parser.marker_lexer import ( MarkerLexer, @@ -489,6 +489,46 @@ def test_34_node_text_starting_below() -> None: ) +def test_35a_node_value_newline_lf() -> None: + """Verify that an LF line ending is lexed as its own NEWLINE token, separate from the value.""" + input_string = "FIELD: value1\nvalue2\n" + tree = MarkerLexer.parse(input_string, custom_tags={"FIELD"}) + + node_fields = list(tree.find_data("node_field")) + + assert_node_field( + node_fields[0], + "FIELD", + [ + Token("NODE_STRING_VALUE", "value1"), + Token("NEWLINE", "\n"), + Token("NODE_STRING_VALUE", "value2"), + Token("NEWLINE", "\n"), + ], + ) + + +def test_35b_node_value_newline_crlf() -> None: + 
"""Verify that a CR LF line ending is lexed as a single NEWLINE token, separate from the value.""" + input_string = "FIELD: value1\r\nvalue2\r\n" + tree = MarkerLexer.parse(input_string, custom_tags={"FIELD"}) + + node_fields = list(tree.find_data("node_field")) + + assert_node_field( + node_fields[0], + "FIELD", + [ + Token("NODE_STRING_VALUE", "value1"), + Token("NEWLINE", "\r\n"), + Token("NODE_STRING_VALUE", "value2"), + # NOTE: the trailing \r\n at the very end of the input is expected back as a plain \n. + # This implicit conversion is not nice, but doesn't hurt (yet); removing it requires better EOF handling in the lark grammar. + Token("NEWLINE", "\n"), + ], + ) + + def test_60_exclude_reserved_keywords() -> None: input_string = """ FIXME: This can likely replace _weak below with no problem.