strictdoc-project · stanislaw · Sep 25, 2025 · Sep 25, 2025
diff --git a/ruff.toml b/ruff.toml
@@ -140,7 +140,6 @@ ignore = [
     "PLR5501",
 
     "UP035", # [*] Import from `collections.abc` instead: `Iterator`
-    "UP038", # [*] Use `X | Y` in `isinstance` call instead of `(X, Y)` (conflict with Pylint)
 ]
 
 # Avoid trying to fix flake8-bugbear (`B`) violations.

diff --git a/strictdoc/backend/sdoc/grammar_reader.py b/strictdoc/backend/sdoc/grammar_reader.py
@@ -63,7 +63,7 @@ def read_from_file(
         if unpickled_content is not None:
             return unpickled_content
 
-        with open(file_path, encoding="utf8") as file:
+        with open(file_path, encoding="utf-8-sig") as file:
             grammar_content = file.read()
 
         try:

diff --git a/strictdoc/backend/sdoc/reader.py b/strictdoc/backend/sdoc/reader.py
@@ -20,6 +20,7 @@
 from strictdoc.core.project_config import ProjectConfig
 from strictdoc.helpers.cast import assert_cast
 from strictdoc.helpers.exception import StrictDocException
+from strictdoc.helpers.string import strip_bom
 from strictdoc.helpers.textx import drop_textx_meta
 
 
@@ -36,6 +37,8 @@ def _read(
         file_path: Optional[str] = None,
         migrate_sections: bool = False,
     ) -> Tuple[SDocDocument, ParseContext]:
+        input_string = strip_bom(input_string)
+
         parse_context = ParseContext(
             path_to_sdoc_file=file_path, migrate_sections=migrate_sections
         )
@@ -100,7 +103,9 @@ def read_from_file(
         if unpickled_content:
             return assert_cast(unpickled_content, SDocDocument)
 
-        with open(file_path, encoding="utf8") as file:
+        # utf-8-sig is important here because it strips the UTF-8 BOM from
+        # the beginning of source files created on Windows.
+        with open(file_path, encoding="utf-8-sig") as file:
             sdoc_content = file.read()
 
         sdoc, parse_context = self.read_with_parse_context(

diff --git a/strictdoc/backend/sdoc_source_code/reader.py b/strictdoc/backend/sdoc_source_code/reader.py
@@ -292,7 +292,7 @@ def read(
         return source_file_traceability_info
 
     def read_from_file(self, file_path: str) -> SourceFileTraceabilityInfo:
-        with open(file_path, encoding="utf-8") as file:
+        with open(file_path, encoding="utf-8-sig") as file:
             sdoc_content = file.read()
             sdoc = self.read(sdoc_content, file_path=file_path)
             return sdoc
diff --git a/strictdoc/helpers/string.py b/strictdoc/helpers/string.py
@@ -4,6 +4,8 @@
 REGEX_TRAILING_WHITESPACE_SINGLELINE = re.compile(r"\s{2,}")
 REGEX_TRAILING_WHITESPACE_MULTILINE = re.compile(r" +\n")
 
+UTF8_BOM = "\ufeff"
+
 
 # WIP: Check if this is used.
 def escape(string: str) -> str:
@@ -95,3 +97,8 @@ def tokenize(text: str) -> List[str]:
     pattern = r"[a-z0-9]+(?:[-_./][a-z0-9]+)*"
     tokens = re.findall(pattern, text)
     return tokens
+
+
+def strip_bom(s: str) -> str:
+    # Once decoded to str, a BOM is U+FEFF; lstrip removes every leading one.
+    return s.lstrip(UTF8_BOM)
diff --git a/tests/unit/strictdoc/backend/sdoc/test_dsl_passthrough.py b/tests/unit/strictdoc/backend/sdoc/test_dsl_passthrough.py
@@ -866,6 +866,21 @@ def test_089_document_config_use_mid(default_project_config):
     assert input_sdoc == output
 
 
+def test_090_byte_order_mark_symbol_does_not_cause_parsing_errors():
+    input_sdoc = (
+        "\ufeff"
+        """\
+[DOCUMENT]
+TITLE: Test Doc
+"""
+    )
+    # Input begins with U+FEFF; reading it must still yield an SDocDocument.
+    reader = SDReader()
+
+    document = reader.read(input_sdoc)
+    assert isinstance(document, SDocDocument)
+
+
 def test__validation__30__composite_node_start_end_tags_do_not_match():
     input_sdoc = """\
 [DOCUMENT]