diff --git a/docs/strictdoc_11_developer_guide.sdoc b/docs/strictdoc_11_developer_guide.sdoc index 21b72f40b..70bc65345 100644 --- a/docs/strictdoc_11_developer_guide.sdoc +++ b/docs/strictdoc_11_developer_guide.sdoc @@ -227,13 +227,15 @@ STATEMENT: >>> .. code-block:: python - for a_, b_ in foo: - # use a_, b_ within the loop. + for foo_, bar_ in baz: + # use foo_, bar_ within the loop. - The function arguments with the default values shall be avoided. This convention improves the visibility of the function interfaces at the cost of increased verbosity which is the price that StrictDoc development is willing to pay, maintaining the software long-term. The all-explicit function parameters indication is especially useful when the large code refactorings are made. - StrictDoc has been making a gradual shift towards a stronger type system. Although type annotations haven't been added everywhere in the codebase, it is preferred to include them for all new code that is written. +- For opening files, use the helpers ``file_open_read_utf8`` and ``file_open_read_bytes``. These helpers perform normal file opening but also strip the UTF-8 BOM character, which is added by some Windows tools. + - If a contribution includes changes in StrictDoc's code, at least the integration-level tests should be added to the ``tests/integration``. 
If the contributed code needs a fine-grained control over the added behavior, adding diff --git a/strictdoc/backend/sdoc/grammar_reader.py b/strictdoc/backend/sdoc/grammar_reader.py index 5c8312f98..210bb07e8 100644 --- a/strictdoc/backend/sdoc/grammar_reader.py +++ b/strictdoc/backend/sdoc/grammar_reader.py @@ -14,6 +14,7 @@ from strictdoc.backend.sdoc.pickle_cache import PickleCache from strictdoc.core.project_config import ProjectConfig from strictdoc.helpers.cast import assert_optional_cast +from strictdoc.helpers.file_system import file_open_read_utf8 from strictdoc.helpers.textx import ( drop_textx_meta, preserve_source_location_data, @@ -63,7 +64,7 @@ def read_from_file( if unpickled_content is not None: return unpickled_content - with open(file_path, encoding="utf-8-sig") as file: + with file_open_read_utf8(file_path) as file: grammar_content = file.read() try: diff --git a/strictdoc/backend/sdoc/reader.py b/strictdoc/backend/sdoc/reader.py index d12f11982..c4e28bd5b 100644 --- a/strictdoc/backend/sdoc/reader.py +++ b/strictdoc/backend/sdoc/reader.py @@ -20,6 +20,7 @@ from strictdoc.core.project_config import ProjectConfig from strictdoc.helpers.cast import assert_cast from strictdoc.helpers.exception import StrictDocException +from strictdoc.helpers.file_system import file_open_read_utf8 from strictdoc.helpers.string import strip_bom from strictdoc.helpers.textx import drop_textx_meta @@ -103,9 +104,7 @@ def read_from_file( if unpickled_content: return assert_cast(unpickled_content, SDocDocument) - # utf-8-sig is important here because it strips the UTF BOM markers - # from the beginning of source files created on Windows. 
- with open(file_path, encoding="utf-8-sig") as file: + with file_open_read_utf8(file_path) as file: sdoc_content = file.read() sdoc, parse_context = self.read_with_parse_context( diff --git a/strictdoc/backend/sdoc_source_code/coverage_reports/gcov.py b/strictdoc/backend/sdoc_source_code/coverage_reports/gcov.py index 9748ab7a6..3d340cfb2 100644 --- a/strictdoc/backend/sdoc_source_code/coverage_reports/gcov.py +++ b/strictdoc/backend/sdoc_source_code/coverage_reports/gcov.py @@ -20,6 +20,7 @@ from strictdoc.backend.sdoc.models.section import SDocSection from strictdoc.core.file_tree import File from strictdoc.core.project_config import ProjectConfig +from strictdoc.helpers.file_system import file_open_read_utf8 @dataclass @@ -41,7 +42,7 @@ def read_from_file( doc_file: File, project_config: ProjectConfig, ) -> SDocDocument: - with open(doc_file.full_path, encoding="UTF-8") as file: + with file_open_read_utf8(doc_file.full_path) as file: content = file.read() return cls.read_from_string(content, doc_file, project_config) diff --git a/strictdoc/backend/sdoc_source_code/reader.py b/strictdoc/backend/sdoc_source_code/reader.py index fd2342333..0cbb89540 100644 --- a/strictdoc/backend/sdoc_source_code/reader.py +++ b/strictdoc/backend/sdoc_source_code/reader.py @@ -26,6 +26,7 @@ validate_marker_uids, ) from strictdoc.helpers.file_stats import SourceFileStats +from strictdoc.helpers.file_system import file_open_read_utf8 from strictdoc.helpers.textx import drop_textx_meta @@ -292,7 +293,7 @@ def read( return source_file_traceability_info def read_from_file(self, file_path: str) -> SourceFileTraceabilityInfo: - with open(file_path, encoding="utf-8-sig") as file: + with file_open_read_utf8(file_path) as file: sdoc_content = file.read() sdoc = self.read(sdoc_content, file_path=file_path) return sdoc diff --git a/strictdoc/backend/sdoc_source_code/reader_c.py b/strictdoc/backend/sdoc_source_code/reader_c.py index 8fab8deab..a46c32a1a 100644 --- 
a/strictdoc/backend/sdoc_source_code/reader_c.py +++ b/strictdoc/backend/sdoc_source_code/reader_c.py @@ -35,6 +35,7 @@ ) from strictdoc.helpers.cast import assert_cast from strictdoc.helpers.file_stats import SourceFileStats +from strictdoc.helpers.file_system import file_open_read_bytes class SourceFileTraceabilityReader_C: @@ -374,7 +375,7 @@ def read( return traceability_info def read_from_file(self, file_path: str) -> SourceFileTraceabilityInfo: - with open(file_path, "rb") as file: + with file_open_read_bytes(file_path) as file: sdoc_content = file.read() sdoc = self.read(sdoc_content, file_path=file_path) return sdoc diff --git a/strictdoc/backend/sdoc_source_code/reader_python.py b/strictdoc/backend/sdoc_source_code/reader_python.py index 2f8683f87..6063c48db 100644 --- a/strictdoc/backend/sdoc_source_code/reader_python.py +++ b/strictdoc/backend/sdoc_source_code/reader_python.py @@ -30,6 +30,7 @@ ) from strictdoc.backend.sdoc_source_code.tree_sitter_helpers import traverse_tree from strictdoc.helpers.file_stats import SourceFileStats +from strictdoc.helpers.file_system import file_open_read_bytes class SourceFileTraceabilityReader_Python: @@ -252,7 +253,7 @@ def read( return traceability_info def read_from_file(self, file_path: str) -> SourceFileTraceabilityInfo: - with open(file_path, "rb") as file: + with file_open_read_bytes(file_path) as file: sdoc_content = file.read() sdoc = self.read(sdoc_content, file_path=file_path) return sdoc diff --git a/strictdoc/backend/sdoc_source_code/reader_robot.py b/strictdoc/backend/sdoc_source_code/reader_robot.py index c3fee4e63..bee548272 100644 --- a/strictdoc/backend/sdoc_source_code/reader_robot.py +++ b/strictdoc/backend/sdoc_source_code/reader_robot.py @@ -37,6 +37,7 @@ source_file_traceability_info_processor, ) from strictdoc.helpers.file_stats import SourceFileStats +from strictdoc.helpers.file_system import file_open_read_utf8 class SdocRelationVisitor(ModelVisitor): # type: ignore[misc] @@ -184,7 +185,7 @@ 
def read( return traceability_info def read_from_file(self, file_path: str) -> SourceFileTraceabilityInfo: - with open(file_path) as file: + with file_open_read_utf8(file_path) as file: sdoc_content = file.read() sdoc = self.read(sdoc_content, file_path=file_path) return sdoc diff --git a/strictdoc/backend/sdoc_source_code/test_reports/junit_xml_reader.py b/strictdoc/backend/sdoc_source_code/test_reports/junit_xml_reader.py index ad380e557..b1741b9b0 100644 --- a/strictdoc/backend/sdoc_source_code/test_reports/junit_xml_reader.py +++ b/strictdoc/backend/sdoc_source_code/test_reports/junit_xml_reader.py @@ -22,6 +22,7 @@ from strictdoc.core.file_tree import File from strictdoc.core.project_config import ProjectConfig from strictdoc.helpers.cast import assert_cast, assert_optional_cast +from strictdoc.helpers.file_system import file_open_read_utf8 from strictdoc.helpers.paths import path_to_posix_path @@ -51,7 +52,7 @@ def read_from_file( doc_file: File, project_config: ProjectConfig, ) -> SDocDocument: - with open(doc_file.full_path, encoding="UTF-8") as file: + with file_open_read_utf8(doc_file.full_path) as file: content = file.read() return cls.read_from_string(content, doc_file, project_config) diff --git a/strictdoc/export/html/generators/source_file_view_generator.py b/strictdoc/export/html/generators/source_file_view_generator.py index e9c9852ef..2c94ad945 100644 --- a/strictdoc/export/html/generators/source_file_view_generator.py +++ b/strictdoc/export/html/generators/source_file_view_generator.py @@ -44,6 +44,7 @@ from strictdoc.export.html.renderers.link_renderer import LinkRenderer from strictdoc.export.html.renderers.markup_renderer import MarkupRenderer from strictdoc.helpers.cast import assert_cast +from strictdoc.helpers.file_system import file_open_read_utf8 from strictdoc.helpers.timing import measure_performance @@ -89,7 +90,7 @@ def export( traceability_index: TraceabilityIndex, html_templates: HTMLTemplates, ) -> Markup: - with 
open(source_file.full_path, encoding="utf-8") as opened_file: + with file_open_read_utf8(source_file.full_path) as opened_file: source_file_lines = opened_file.readlines() pygmented_source_file_lines: List[SourceLineEntry] = [] diff --git a/strictdoc/export/html/generators/view_objects/document_screen_view_object.py b/strictdoc/export/html/generators/view_objects/document_screen_view_object.py index ea857b9a0..beeed658f 100644 --- a/strictdoc/export/html/generators/view_objects/document_screen_view_object.py +++ b/strictdoc/export/html/generators/view_objects/document_screen_view_object.py @@ -28,6 +28,7 @@ from strictdoc.export.html.renderers.link_renderer import LinkRenderer from strictdoc.export.html.renderers.markup_renderer import MarkupRenderer from strictdoc.helpers.cast import assert_cast +from strictdoc.helpers.file_system import file_open_read_utf8 from strictdoc.helpers.git_client import GitClient from strictdoc.helpers.string import interpolate_at_pattern_lazy from strictdoc.server.helpers.turbo import render_turbo_stream @@ -72,7 +73,7 @@ def __init__( self.custom_html2pdf_template: Optional[Template] = None if project_config.html2pdf_template is not None: - with open(project_config.html2pdf_template) as f_: + with file_open_read_utf8(project_config.html2pdf_template) as f_: self.custom_html2pdf_template = Template(f_.read()) def has_included_document(self) -> bool: diff --git a/strictdoc/export/rst/rst_to_html_fragment_writer.py b/strictdoc/export/rst/rst_to_html_fragment_writer.py index c3d1ae845..dcb2eaeba 100644 --- a/strictdoc/export/rst/rst_to_html_fragment_writer.py +++ b/strictdoc/export/rst/rst_to_html_fragment_writer.py @@ -24,6 +24,7 @@ from strictdoc.export.rst.directives.wildcard_enhanced_image import ( WildcardEnhancedImage, ) +from strictdoc.helpers.file_system import file_open_read_bytes class RstToHtmlFragmentWriter: @@ -97,8 +98,8 @@ def write(self, rst_fragment: str, use_cache: bool = True) -> Markup: ) if use_cache and 
os.path.isdir(path_to_rst_fragment_bucket_dir): if os.path.isfile(path_to_cached_fragment): - with open( - path_to_cached_fragment, "rb" + with file_open_read_bytes( + path_to_cached_fragment ) as cached_fragment_file_: return Markup(cached_fragment_file_.read().decode("UTF-8")) else: diff --git a/strictdoc/export/spdx/spdx_generator.py b/strictdoc/export/spdx/spdx_generator.py index 722723288..94596cf49 100644 --- a/strictdoc/export/spdx/spdx_generator.py +++ b/strictdoc/export/spdx/spdx_generator.py @@ -59,6 +59,7 @@ from strictdoc.export.spdx.spdx_sdoc_container import SPDXSDocContainer from strictdoc.export.spdx.spdx_to_sdoc_converter import SPDXToSDocConverter from strictdoc.helpers.cast import assert_cast +from strictdoc.helpers.file_system import file_open_read_bytes from strictdoc.helpers.sha256 import get_sha256 RELATION_ID_HOW_TO = "SPDXRef-Relationship-How-to-form-ID?" @@ -254,7 +255,9 @@ def export_tree( for document_ in traceability_index.document_tree.document_list: assert document_.meta is not None - with open(document_.meta.input_doc_full_path, "rb") as input_file_: + with file_open_read_bytes( + document_.meta.input_doc_full_path + ) as input_file_: document_bytes = input_file_.read() # @@ -340,7 +343,9 @@ def export_tree( if node_link_path_ in lookup_file_name_to_spdx_file: continue - with open(node_link_path_, "rb") as node_link_file_: + with file_open_read_bytes( + node_link_path_ + ) as node_link_file_: file_bytes = node_link_file_.read() source_spdx_file = ( diff --git a/strictdoc/helpers/file_system.py b/strictdoc/helpers/file_system.py index 68662399a..c31135c99 100644 --- a/strictdoc/helpers/file_system.py +++ b/strictdoc/helpers/file_system.py @@ -1,9 +1,14 @@ +import codecs import os import platform import shutil import tempfile +from contextlib import contextmanager +from io import BufferedReader, TextIOWrapper from pathlib import Path -from typing import Optional +from typing import Iterator, Optional + +UTF8_BOM_BYTES = codecs.BOM_UTF8 
# b'\xef\xbb\xbf' def sync_dir(src_dir: str, dst_dir: str, message: Optional[str]) -> None: @@ -73,3 +78,21 @@ def get_portable_temp_dir() -> Path: return Path( "/tmp" if platform.system() == "Darwin" else tempfile.gettempdir() ) + + +@contextmanager +def file_open_read_utf8(file_path: str) -> Iterator[TextIOWrapper]: + # utf-8-sig is important here because it strips the UTF BOM markers + # from the beginning of source files created by some Windows tools. + with open(file_path, encoding="utf-8-sig") as file_: + yield file_ + + +@contextmanager +def file_open_read_bytes(file_path: str) -> Iterator[BufferedReader]: + with open(file_path, "rb") as raw_file: + start = raw_file.read(len(UTF8_BOM_BYTES)) + if start != UTF8_BOM_BYTES: + # No BOM -> rewind to beginning. + raw_file.seek(0) + yield raw_file diff --git a/strictdoc/helpers/md5.py b/strictdoc/helpers/md5.py index 6736412c5..455d4ea40 100644 --- a/strictdoc/helpers/md5.py +++ b/strictdoc/helpers/md5.py @@ -1,5 +1,7 @@ import hashlib +from strictdoc.helpers.file_system import file_open_read_bytes + def get_md5(obj: str) -> str: return hashlib.md5(obj.encode("utf-8")).hexdigest() @@ -7,7 +9,7 @@ def get_md5(obj: str) -> str: def get_file_md5(path: str, buf_size: int = 65536) -> str: m = hashlib.md5() - with open(path, "rb") as f: + with file_open_read_bytes(path) as f: b = f.read(buf_size) while len(b) > 0: m.update(b) diff --git a/tests/unit/strictdoc/helpers/test_file_open_bytes.py b/tests/unit/strictdoc/helpers/test_file_open_bytes.py new file mode 100644 index 000000000..fe879af50 --- /dev/null +++ b/tests/unit/strictdoc/helpers/test_file_open_bytes.py @@ -0,0 +1,33 @@ +import os +import tempfile +from io import BufferedReader + +import pytest + +from strictdoc.helpers.file_system import UTF8_BOM_BYTES, file_open_read_bytes + + +@pytest.mark.parametrize( + "initial_bytes,expected_bytes", + [ + (b"", b""), + (b"A", b"A"), + (b"AB", b"AB"), + (UTF8_BOM_BYTES + b"Hello", b"Hello"), + (b"Hello", b"Hello"), + ], +) 
+def test_file_open_read_bytes(initial_bytes, expected_bytes): + tmp_file = tempfile.NamedTemporaryFile(mode="wb", delete=False) + tmp_path = tmp_file.name + + try: + tmp_file.write(initial_bytes) + tmp_file.close() + + with file_open_read_bytes(tmp_path) as f: + assert isinstance(f, BufferedReader) + content = f.read() + assert content == expected_bytes + finally: + os.remove(tmp_path) diff --git a/tests/unit/strictdoc/helpers/test_file_open_utf8.py b/tests/unit/strictdoc/helpers/test_file_open_utf8.py new file mode 100644 index 000000000..649e72784 --- /dev/null +++ b/tests/unit/strictdoc/helpers/test_file_open_utf8.py @@ -0,0 +1,36 @@ +import os +import tempfile +from io import TextIOWrapper + +import pytest + +from strictdoc.helpers.file_system import UTF8_BOM_BYTES, file_open_read_utf8 + +UTF8_BOM_STR = UTF8_BOM_BYTES.decode("utf-8") + + +@pytest.mark.parametrize( + "initial_text,expected_text", + [ + ("", ""), + ("A", "A"), + (UTF8_BOM_STR + "Hello", "Hello"), + ("Hello", "Hello"), + ], +) +def test_file_open_read_utf8(initial_text, expected_text): + tmp_file = tempfile.NamedTemporaryFile( + mode="w", encoding="utf-8", delete=False + ) + tmp_path = tmp_file.name + + try: + tmp_file.write(initial_text) + tmp_file.close() + + with file_open_read_utf8(tmp_path) as f: + assert isinstance(f, TextIOWrapper) + content = f.read() + assert content == expected_text + finally: + os.remove(tmp_path)