diff --git a/ruff.toml b/ruff.toml
index a506ceb38..1af4df1b7 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -140,7 +140,6 @@ ignore = [
     "PLR5501",
 
     "UP035", # [*] Import from `collections.abc` instead: `Iterator`
-    "UP038", # [*] Use `X | Y` in `isinstance` call instead of `(X, Y)` (conflict with Pylint)
 ]
 
 # Avoid trying to fix flake8-bugbear (`B`) violations.
diff --git a/strictdoc/backend/sdoc/grammar_reader.py b/strictdoc/backend/sdoc/grammar_reader.py
index 1e26bfc97..5c8312f98 100644
--- a/strictdoc/backend/sdoc/grammar_reader.py
+++ b/strictdoc/backend/sdoc/grammar_reader.py
@@ -63,7 +63,7 @@ def read_from_file(
         if unpickled_content is not None:
             return unpickled_content
 
-        with open(file_path, encoding="utf8") as file:
+        with open(file_path, encoding="utf-8-sig") as file:
             grammar_content = file.read()
 
         try:
diff --git a/strictdoc/backend/sdoc/reader.py b/strictdoc/backend/sdoc/reader.py
index da43cc254..d12f11982 100644
--- a/strictdoc/backend/sdoc/reader.py
+++ b/strictdoc/backend/sdoc/reader.py
@@ -20,6 +20,7 @@
 from strictdoc.core.project_config import ProjectConfig
 from strictdoc.helpers.cast import assert_cast
 from strictdoc.helpers.exception import StrictDocException
+from strictdoc.helpers.string import strip_bom
 from strictdoc.helpers.textx import drop_textx_meta
 
 
@@ -36,6 +37,8 @@ def _read(
         file_path: Optional[str] = None,
         migrate_sections: bool = False,
     ) -> Tuple[SDocDocument, ParseContext]:
+        input_string = strip_bom(input_string)
+
         parse_context = ParseContext(
             path_to_sdoc_file=file_path, migrate_sections=migrate_sections
         )
@@ -100,7 +103,9 @@ def read_from_file(
         if unpickled_content:
             return assert_cast(unpickled_content, SDocDocument)
 
-        with open(file_path, encoding="utf8") as file:
+        # utf-8-sig is important here because it strips the UTF BOM markers
+        # from the beginning of source files created on Windows.
+        with open(file_path, encoding="utf-8-sig") as file:
             sdoc_content = file.read()
 
         sdoc, parse_context = self.read_with_parse_context(
diff --git a/strictdoc/backend/sdoc_source_code/reader.py b/strictdoc/backend/sdoc_source_code/reader.py
index 96baf3d81..fd2342333 100644
--- a/strictdoc/backend/sdoc_source_code/reader.py
+++ b/strictdoc/backend/sdoc_source_code/reader.py
@@ -292,7 +292,7 @@ def read(
         return source_file_traceability_info
 
     def read_from_file(self, file_path: str) -> SourceFileTraceabilityInfo:
-        with open(file_path, encoding="utf-8") as file:
+        with open(file_path, encoding="utf-8-sig") as file:
             sdoc_content = file.read()
             sdoc = self.read(sdoc_content, file_path=file_path)
             return sdoc
diff --git a/strictdoc/helpers/string.py b/strictdoc/helpers/string.py
index e1e5bc514..5b2187813 100644
--- a/strictdoc/helpers/string.py
+++ b/strictdoc/helpers/string.py
@@ -4,6 +4,8 @@
 REGEX_TRAILING_WHITESPACE_SINGLELINE = re.compile(r"\s{2,}")
 REGEX_TRAILING_WHITESPACE_MULTILINE = re.compile(r" +\n")
 
+UTF8_BOM = "\ufeff"
+
 
 # WIP: Check if this is used.
 def escape(string: str) -> str:
@@ -95,3 +97,8 @@ def tokenize(text: str) -> List[str]:
     pattern = r"[a-z0-9]+(?:[-_./][a-z0-9]+)*"
     tokens = re.findall(pattern, text)
     return tokens
+
+
+def strip_bom(s: str) -> str:
+    # U+FEFF is the BOM character when in str form
+    return s.lstrip(UTF8_BOM)
diff --git a/tests/unit/strictdoc/backend/sdoc/test_dsl_passthrough.py b/tests/unit/strictdoc/backend/sdoc/test_dsl_passthrough.py
index faf05fa6a..7ffa1b490 100644
--- a/tests/unit/strictdoc/backend/sdoc/test_dsl_passthrough.py
+++ b/tests/unit/strictdoc/backend/sdoc/test_dsl_passthrough.py
@@ -866,6 +866,21 @@ def test_089_document_config_use_mid(default_project_config):
     assert input_sdoc == output
 
 
+def test_090_byte_order_mark_symbol_does_not_cause_parsing_errors():
+    input_sdoc = (
+        "\ufeff"
+        """\
+[DOCUMENT]
+TITLE: Test Doc
+"""
+    )
+
+    reader = SDReader()
+
+    document = reader.read(input_sdoc)
+    assert isinstance(document, SDocDocument)
+
+
 def test__validation__30__composite_node_start_end_tags_do_not_match():
     input_sdoc = """\
 [DOCUMENT]