From cb7a75827405ae940fa817352492778b350b3311 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 17 Nov 2025 14:15:21 -0500 Subject: [PATCH 1/2] Port https://github.com/sillsdev/machine/pull/353 Fix unused imports Fix import sorting Address reviewr comments Add parameter types Use isort --- machine/corpora/__init__.py | 26 +- .../file_paratext_project_file_handler.py | 27 ++ .../file_paratext_project_settings_parser.py | 23 +- .../file_paratext_project_text_updater.py | 5 +- ...xt_project_versification_error_detector.py | 11 + .../corpora/paratext_project_file_handler.py | 18 + .../paratext_project_settings_parser_base.py | 36 +- .../paratext_project_terms_parser_base.py | 26 +- .../paratext_project_text_updater_base.py | 23 +- ...xt_project_versification_error_detector.py | 42 +++ .../usfm_versification_error_detector.py | 172 ++++++++++ ...y => zip_paratext_project_file_handler.py} | 35 +- .../zip_paratext_project_settings_parser.py | 23 +- .../zip_paratext_project_terms_parser.py | 19 +- .../zip_paratext_project_text_updater.py | 16 +- ...paratext_project_versification_detector.py | 10 + ...atext_project_quote_convention_detector.py | 5 +- ...atext_project_quote_convention_detector.py | 24 +- ...atext_project_quote_convention_detector.py | 20 +- machine/scripture/verse_ref.py | 11 + .../test_paratext_project_terms_parser.py | 47 +-- tests/corpora/test_usfm_manual.py | 94 +----- .../test_usfm_verisifcation_error_detector.py | 319 ++++++++++++++++++ ...atext_project_quote_convention_detector.py | 40 +-- .../memory_paratext_project_file_handler.py | 56 +++ ...atext_project_quote_convention_detector.py | 15 +- .../memory_paratext_project_terms_parser.py | 17 +- ...xt_project_versification_error_detector.py | 10 + 28 files changed, 828 insertions(+), 342 deletions(-) create mode 100644 machine/corpora/file_paratext_project_file_handler.py create mode 100644 machine/corpora/file_paratext_project_versification_error_detector.py create mode 100644 
machine/corpora/paratext_project_file_handler.py create mode 100644 machine/corpora/paratext_project_versification_error_detector.py create mode 100644 machine/corpora/usfm_versification_error_detector.py rename machine/corpora/{zip_paratext_project_settings_parser_base.py => zip_paratext_project_file_handler.py} (59%) create mode 100644 machine/corpora/zip_paratext_project_versification_detector.py create mode 100644 tests/corpora/test_usfm_verisifcation_error_detector.py create mode 100644 tests/testutils/memory_paratext_project_file_handler.py create mode 100644 tests/testutils/memory_paratext_project_versification_error_detector.py diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 45eb628b..fba7c405 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -7,8 +7,10 @@ from .dbl_bundle_text_corpus import DblBundleTextCorpus from .dictionary_alignment_corpus import DictionaryAlignmentCorpus from .dictionary_text_corpus import DictionaryTextCorpus +from .file_paratext_project_file_handler import FileParatextProjectFileHandler from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser from .file_paratext_project_text_updater import FileParatextProjectTextUpdater +from .file_paratext_project_versification_error_detector import FileParatextProjectVersificationErrorDetector from .flatten import flatten from .memory_alignment_collection import MemoryAlignmentCollection from .memory_stream_container import MemoryStreamContainer @@ -18,10 +20,12 @@ from .parallel_text_row import ParallelTextRow from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus from .paratext_backup_text_corpus import ParatextBackupTextCorpus +from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .paratext_project_terms_parser_base import 
ParatextProjectTermsParserBase from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase +from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler from .scripture_element import ScriptureElement @@ -70,16 +74,22 @@ from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType from .usfm_update_block_handler import UsfmUpdateBlockHandler +from .usfm_versification_error_detector import ( + UsfmVersificationError, + UsfmVersificationErrorDetector, + UsfmVersificationErrorType, +) from .usx_file_alignment_collection import UsxFileAlignmentCollection from .usx_file_alignment_corpus import UsxFileAlignmentCorpus from .usx_file_text import UsxFileText from .usx_file_text_corpus import UsxFileTextCorpus from .usx_memory_text import UsxMemoryText from .usx_zip_text import UsxZipText +from .zip_paratext_project_file_handler import ZipParatextProjectFileHandler from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser -from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser from .zip_paratext_project_text_updater import ZipParatextProjectTextUpdater +from .zip_paratext_project_versification_detector import ZipParatextProjectVersificationErrorDetector __all__ = [ "AlignedWordPair", @@ -95,8 +105,10 @@ "EMPTY_SCRIPTURE_REF", "escape_spaces", "extract_scripture_corpus", + "FileParatextProjectFileHandler", "FileParatextProjectSettingsParser", "FileParatextProjectTextUpdater", + "FileParatextProjectVersificationErrorDetector", "flatten", "is_scripture", "lowercase", @@ -113,14 +125,16 @@ "ParallelTextRow", "ParatextBackupTermsCorpus", 
"ParatextBackupTextCorpus", + "ParatextProjectFileHandler", "ParatextProjectSettings", "ParatextProjectSettingsParserBase", "ParatextProjectTermsParserBase", "ParatextProjectTextUpdaterBase", + "ParatextProjectVersificationErrorDetector", "ParatextTextCorpus", + "parse_usfm", "PlaceMarkersAlignmentInfo", "PlaceMarkersUsfmUpdateBlockHandler", - "parse_usfm", "RtlReferenceOrder", "ScriptureElement", "ScriptureRef", @@ -139,8 +153,8 @@ "unescape_spaces", "UpdateUsfmMarkerBehavior", "UpdateUsfmParserHandler", - "UpdateUsfmTextBehavior", "UpdateUsfmRow", + "UpdateUsfmTextBehavior", "UsfmAttribute", "UsfmElementType", "UsfmFileText", @@ -164,14 +178,18 @@ "UsfmUpdateBlockElement", "UsfmUpdateBlockElementType", "UsfmUpdateBlockHandler", + "UsfmVersificationError", + "UsfmVersificationErrorDetector", + "UsfmVersificationErrorType", "UsxFileAlignmentCollection", "UsxFileAlignmentCorpus", "UsxFileText", "UsxFileTextCorpus", "UsxMemoryText", "UsxZipText", + "ZipParatextProjectFileHandler", "ZipParatextProjectSettingsParser", - "ZipParatextProjectSettingsParserBase", "ZipParatextProjectTermsParser", "ZipParatextProjectTextUpdater", + "ZipParatextProjectVersificationErrorDetector", ] diff --git a/machine/corpora/file_paratext_project_file_handler.py b/machine/corpora/file_paratext_project_file_handler.py new file mode 100644 index 00000000..8cdc3dd2 --- /dev/null +++ b/machine/corpora/file_paratext_project_file_handler.py @@ -0,0 +1,27 @@ +from pathlib import Path +from typing import BinaryIO, Optional + +from ..utils.typeshed import StrPath +from .paratext_project_file_handler import ParatextProjectFileHandler +from .usfm_stylesheet import UsfmStylesheet + + +class FileParatextProjectFileHandler(ParatextProjectFileHandler): + def __init__(self, project_dir: StrPath) -> None: + self._project_dir = Path(project_dir) + + def exists(self, file_name: str) -> bool: + return (self._project_dir / file_name).is_file() + + def open(self, file_name: str) -> BinaryIO: + return 
open(self._project_dir / file_name, "rb") + + def find(self, extension: str) -> Optional[Path]: + return next(self._project_dir.glob(f"*{extension}"), None) + + def create_stylesheet(self, file_name: str) -> UsfmStylesheet: + custom_stylesheet_filename = self._project_dir / "custom.sty" + return UsfmStylesheet( + file_name, + custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None, + ) diff --git a/machine/corpora/file_paratext_project_settings_parser.py b/machine/corpora/file_paratext_project_settings_parser.py index ddfec490..13e62f28 100644 --- a/machine/corpora/file_paratext_project_settings_parser.py +++ b/machine/corpora/file_paratext_project_settings_parser.py @@ -1,27 +1,8 @@ -from pathlib import Path -from typing import BinaryIO, Optional - from ..utils.typeshed import StrPath +from .file_paratext_project_file_handler import FileParatextProjectFileHandler from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase -from .usfm_stylesheet import UsfmStylesheet class FileParatextProjectSettingsParser(ParatextProjectSettingsParserBase): def __init__(self, project_dir: StrPath) -> None: - self._project_dir = Path(project_dir) - - def _create_stylesheet(self, file_name: StrPath) -> UsfmStylesheet: - custom_stylesheet_filename = self._project_dir / "custom.sty" - return UsfmStylesheet( - file_name, - custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None, - ) - - def _exists(self, file_name: StrPath) -> bool: - return (self._project_dir / file_name).is_file() - - def _find(self, extension: str) -> Optional[Path]: - return next(self._project_dir.glob(f"*{extension}"), None) - - def _open(self, file_name: StrPath) -> BinaryIO: - return open(self._project_dir / file_name, "rb") + super().__init__(FileParatextProjectFileHandler(project_dir)) diff --git a/machine/corpora/file_paratext_project_text_updater.py b/machine/corpora/file_paratext_project_text_updater.py index 757bf0ca..e09896aa 100644 --- 
a/machine/corpora/file_paratext_project_text_updater.py +++ b/machine/corpora/file_paratext_project_text_updater.py @@ -2,13 +2,16 @@ from typing import BinaryIO from ..utils.typeshed import StrPath +from .file_paratext_project_file_handler import FileParatextProjectFileHandler from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase class FileParatextProjectTextUpdater(ParatextProjectTextUpdaterBase): def __init__(self, project_dir: StrPath) -> None: - super().__init__(FileParatextProjectSettingsParser(project_dir)) + super().__init__( + FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse() + ) self._project_dir = project_dir diff --git a/machine/corpora/file_paratext_project_versification_error_detector.py b/machine/corpora/file_paratext_project_versification_error_detector.py new file mode 100644 index 00000000..4e2cdac3 --- /dev/null +++ b/machine/corpora/file_paratext_project_versification_error_detector.py @@ -0,0 +1,11 @@ +from ..utils.typeshed import StrPath +from .file_paratext_project_file_handler import FileParatextProjectFileHandler +from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser +from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector + + +class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector): + def __init__(self, project_dir: StrPath) -> None: + super().__init__( + FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse() + ) diff --git a/machine/corpora/paratext_project_file_handler.py b/machine/corpora/paratext_project_file_handler.py new file mode 100644 index 00000000..b0e27a04 --- /dev/null +++ b/machine/corpora/paratext_project_file_handler.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod +from typing import BinaryIO + +from 
.usfm_stylesheet import UsfmStylesheet + + +class ParatextProjectFileHandler(ABC): + @abstractmethod + def exists(self, file_name: str) -> bool: ... + + @abstractmethod + def open(self, file_name: str) -> BinaryIO: ... + + @abstractmethod + def find(self, extension: str) -> str: ... + + @abstractmethod + def create_stylesheet(self, file_name: str) -> UsfmStylesheet: ... diff --git a/machine/corpora/paratext_project_settings_parser_base.py b/machine/corpora/paratext_project_settings_parser_base.py index 28598748..61f2e66b 100644 --- a/machine/corpora/paratext_project_settings_parser_base.py +++ b/machine/corpora/paratext_project_settings_parser_base.py @@ -1,35 +1,24 @@ -from abc import ABC, abstractmethod -from typing import BinaryIO +from abc import ABC from xml.etree import ElementTree from ..scripture.verse_ref import Versification from ..utils.string_utils import parse_integer from .corpora_utils import get_encoding +from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings -from .usfm_stylesheet import UsfmStylesheet class ParatextProjectSettingsParserBase(ABC): - - @abstractmethod - def _exists(self, file_name: str) -> bool: ... - - @abstractmethod - def _find(self, extension: str) -> str: ... - - @abstractmethod - def _open(self, file_name: str) -> BinaryIO: ... - - @abstractmethod - def _create_stylesheet(self, file_name: str) -> UsfmStylesheet: ... 
+ def __init__(self, paratext_project_file_handler: ParatextProjectFileHandler): + self._paratext_project_file_handler = paratext_project_file_handler def parse(self) -> ParatextProjectSettings: settings_file_name = "Settings.xml" - if not self._exists(settings_file_name): - settings_file_name = self._find(".ssf") + if not self._paratext_project_file_handler.exists(settings_file_name): + settings_file_name = self._paratext_project_file_handler.find(".ssf") if not settings_file_name: raise ValueError("The project does not contain a settings file.") - with self._open(settings_file_name) as stream: + with self._paratext_project_file_handler.open(settings_file_name) as stream: settings_tree = ElementTree.parse(stream) name = settings_tree.getroot().findtext("Name", "") @@ -46,18 +35,21 @@ def parse(self) -> ParatextProjectSettings: versification_type = int(settings_tree.getroot().findtext("Versification", "4")) versification = Versification.get_builtin(versification_type) - if self._exists("custom.vrs"): + if self._paratext_project_file_handler.exists("custom.vrs"): guid = settings_tree.getroot().findtext("Guid", "") versification_name = f"{versification.name}-{guid}" versification = Versification.load( - self._open("custom.vrs"), + self._paratext_project_file_handler.open("custom.vrs"), versification, versification_name, ) stylesheet_file_name = settings_tree.getroot().findtext("StyleSheet", "usfm.sty") - if not self._exists(stylesheet_file_name) and stylesheet_file_name != "usfm_sb.sty": + if ( + not self._paratext_project_file_handler.exists(stylesheet_file_name) + and stylesheet_file_name != "usfm_sb.sty" + ): stylesheet_file_name = "usfm.sty" - stylesheet = self._create_stylesheet(stylesheet_file_name) + stylesheet = self._paratext_project_file_handler.create_stylesheet(stylesheet_file_name) prefix = "" form = "41MAT" diff --git a/machine/corpora/paratext_project_terms_parser_base.py b/machine/corpora/paratext_project_terms_parser_base.py index 678b6a9e..ff09d78b 
100644 --- a/machine/corpora/paratext_project_terms_parser_base.py +++ b/machine/corpora/paratext_project_terms_parser_base.py @@ -1,12 +1,13 @@ from __future__ import annotations import re -from abc import ABC, abstractmethod +from abc import ABC from collections import defaultdict from importlib.resources import open_binary -from typing import BinaryIO, Dict, List, Optional, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union from xml.etree import ElementTree +from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase @@ -24,7 +25,12 @@ class ParatextProjectTermsParserBase(ABC): - def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None: + def __init__( + self, + paratext_project_file_handler: ParatextProjectFileHandler, + settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase], + ) -> None: + self._paratext_project_file_handler = paratext_project_file_handler self._settings: ParatextProjectSettings if isinstance(settings, ParatextProjectSettingsParserBase): self._settings = settings.parse() @@ -34,8 +40,8 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -> List[Tuple[str, List[str]]]: biblical_terms_doc = None if self._settings.biblical_terms_list_type == "Project": - if self._exists(self._settings.biblical_terms_file_name): - with self._open(self._settings.biblical_terms_file_name) as stream: + if self._paratext_project_file_handler.exists(self._settings.biblical_terms_file_name): + with self._paratext_project_file_handler.open(self._settings.biblical_terms_file_name) as stream: biblical_terms_doc = ElementTree.parse(stream) term_id_to_category_dict = _get_category_per_id(biblical_terms_doc) 
elif self._settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES: @@ -60,8 +66,8 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) - terms_glosses_doc = ElementTree.parse(stream) term_renderings_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None - if self._exists("TermRenderings.xml"): - with self._open("TermRenderings.xml") as stream: + if self._paratext_project_file_handler.exists("TermRenderings.xml"): + with self._paratext_project_file_handler.open("TermRenderings.xml") as stream: term_renderings_doc = ElementTree.parse(stream) terms_renderings: Dict[str, List[str]] = defaultdict(list) @@ -94,12 +100,6 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) - return [] - @abstractmethod - def _exists(self, file_name: str) -> bool: ... - - @abstractmethod - def _open(self, file_name: str) -> BinaryIO: ... - def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category_dict: Dict[str, str]) -> bool: category = term_id_to_category_dict.get(id) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 14c57305..0e7bfdfd 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,7 +1,7 @@ -from abc import ABC, abstractmethod -from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union +from abc import ABC +from typing import Callable, Iterable, Optional, Sequence, Union -from ..utils.typeshed import StrPath +from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .update_usfm_parser_handler import ( @@ -15,7 +15,12 @@ class ParatextProjectTextUpdaterBase(ABC): - def __init__(self, settings: Union[ParatextProjectSettings, 
ParatextProjectSettingsParserBase]) -> None: + def __init__( + self, + paratext_project_file_handler: ParatextProjectFileHandler, + settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase], + ) -> None: + self._paratext_project_file_handler = paratext_project_file_handler if isinstance(settings, ParatextProjectSettingsParserBase): self._settings = settings.parse() else: @@ -37,9 +42,9 @@ def update_usfm( compare_segments: bool = False, ) -> Optional[str]: file_name: str = self._settings.get_book_file_name(book_id) - if not self._exists(file_name): + if not self._paratext_project_file_handler.exists(file_name): return None - with self._open(file_name) as sfm_file: + with self._paratext_project_file_handler.open(file_name) as sfm_file: usfm: str = sfm_file.read().decode(self._settings.encoding) handler = UpdateUsfmParserHandler( rows, @@ -64,9 +69,3 @@ def update_usfm( f". Error: '{e}'" ) raise RuntimeError(error_message) from e - - @abstractmethod - def _exists(self, file_name: StrPath) -> bool: ... - - @abstractmethod - def _open(self, file_name: StrPath) -> BinaryIO: ... 
diff --git a/machine/corpora/paratext_project_versification_error_detector.py b/machine/corpora/paratext_project_versification_error_detector.py new file mode 100644 index 00000000..56b32365 --- /dev/null +++ b/machine/corpora/paratext_project_versification_error_detector.py @@ -0,0 +1,42 @@ +from typing import List, Optional, Union + +from .paratext_project_file_handler import ParatextProjectFileHandler +from .paratext_project_settings import ParatextProjectSettings +from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase +from .usfm_parser import parse_usfm +from .usfm_versification_error_detector import UsfmVersificationError, UsfmVersificationErrorDetector + + +class ParatextProjectVersificationErrorDetector: + def __init__( + self, + paratext_project_file_handler: ParatextProjectFileHandler, + settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase], + ) -> None: + self._paratext_project_file_handler = paratext_project_file_handler + if isinstance(settings, ParatextProjectSettingsParserBase): + self._settings = settings.parse() + else: + self._settings = settings + + def get_usfm_versification_errors( + self, + handler: Optional[UsfmVersificationErrorDetector] = None, + ) -> List[UsfmVersificationError]: + handler = handler or UsfmVersificationErrorDetector(self._settings.versification) + for file_name in self._settings.get_all_scripture_book_file_names(): + if not self._paratext_project_file_handler.exists(file_name): + continue + + with self._paratext_project_file_handler.open(file_name) as sfm_file: + usfm: str = sfm_file.read().decode(self._settings.encoding) + try: + parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) + except Exception as e: + error_message = ( + f"An error occurred while parsing the usfm for '{file_name}'" + f"{f' in project {self._settings.name}' if self._settings.name else ''}" + f". 
Error: '{e}'" + ) + raise RuntimeError(error_message) from e + return handler.errors
diff --git a/machine/corpora/usfm_versification_error_detector.py b/machine/corpora/usfm_versification_error_detector.py
new file mode 100644
index 00000000..fc17bf5c
--- /dev/null
+++ b/machine/corpora/usfm_versification_error_detector.py
@@ -0,0 +1,198 @@
+from enum import Enum, auto
+from typing import List, Optional
+
+from machine.scripture import canon
+
+from ..scripture.verse_ref import ValidStatus, VerseRef, Versification
+from .usfm_parser_handler import UsfmParserHandler
+from .usfm_parser_state import UsfmParserState
+
+
+class UsfmVersificationErrorType(Enum):
+    """Categories of versification error that a UsfmVersificationError can carry."""
+
+    MISSING_CHAPTER = auto()
+    MISSING_VERSE = auto()
+    EXTRA_VERSE = auto()
+    INVALID_VERSE_RANGE = auto()
+    MISSING_VERSE_SEGMENT = auto()
+    EXTRA_VERSE_SEGMENT = auto()
+
+
+class UsfmVersificationError:
+    """Expected versus actual chapter/verse position, optionally with the verse reference read.
+
+    The error type is assigned only by check_error(); call it, and get True back, before
+    reading ``type`` or ``expected_verse_ref``.
+    """
+
+    def __init__(
+        self,
+        book_num: int,
+        expected_chapter: int,
+        expected_verse: int,
+        actual_chapter: int,
+        actual_verse: int,
+        verse_ref: Optional[VerseRef] = None,
+    ):
+        self._book_num = book_num
+        self._expected_chapter = expected_chapter
+        self._expected_verse = expected_verse
+        self._actual_chapter = actual_chapter
+        self._actual_verse = actual_verse
+        self._verse_ref = verse_ref
+        # Annotation only: the attribute does not exist until check_error() finds an error.
+        self._type: UsfmVersificationErrorType
+
+    @property
+    def type(self) -> UsfmVersificationErrorType:
+        return self._type
+
+    def check_error(self) -> bool:
+        """Classify the mismatch and return True if there is an error."""
+        if self._expected_chapter > self._actual_chapter and self._expected_verse != 0:
+            self._type = UsfmVersificationErrorType.MISSING_CHAPTER
+            return True
+        if self._expected_verse > self._actual_verse and self._expected_chapter == self._actual_chapter:
+            self._type = UsfmVersificationErrorType.MISSING_VERSE
+            return True
+        if self._verse_ref is not None:
+            if not self._verse_ref.segment() and self._verse_ref.has_segments_defined:
+                self._type = UsfmVersificationErrorType.MISSING_VERSE_SEGMENT
+                return True
+            if self._verse_ref.segment() and not self._verse_ref.has_segments_defined:
+                self._type = UsfmVersificationErrorType.EXTRA_VERSE_SEGMENT
+                return True
+            if not self._verse_ref.is_valid:
+                self._type = UsfmVersificationError.map(self._verse_ref.valid_status)
+                return True
+        return False
+
+    @staticmethod
+    def map(valid_status: ValidStatus) -> UsfmVersificationErrorType:
+        """Translate a ValidStatus into an error type; raises ValueError if none corresponds."""
+        if valid_status == ValidStatus.OUT_OF_RANGE:
+            return UsfmVersificationErrorType.EXTRA_VERSE
+        if valid_status == ValidStatus.VERSE_REPEATED or valid_status == ValidStatus.VERSE_OUT_OF_ORDER:
+            return UsfmVersificationErrorType.INVALID_VERSE_RANGE
+        raise ValueError(
+            f"{ValidStatus.__name__} {valid_status} does not map to any {UsfmVersificationErrorType.__name__}"
+        )
+
+    @property
+    def expected_verse_ref(self) -> str:
+        """The reference that was expected, as a string (empty when none can be given)."""
+        # Plain assignments are used instead of `name := call(...) is None`: an unparenthesized
+        # walrus binds the result of the whole comparison, so the name would hold a bool and
+        # str(name) would yield "True"/"False" rather than a verse reference.
+        # NOTE(review): the strings start with the numeric book_num; confirm that
+        # VerseRef.try_from_string accepts a book number there rather than a book id.
+        default_verse_ref = VerseRef.try_from_string(
+            f"{self._book_num} {self._expected_chapter}:{self._expected_verse}"
+        )
+        if default_verse_ref is None:
+            return ""
+        if self._type == UsfmVersificationErrorType.EXTRA_VERSE:
+            return ""
+        if self._type == UsfmVersificationErrorType.MISSING_VERSE_SEGMENT:
+            verse_ref_with_segment = VerseRef.try_from_string(
+                f"{self._book_num} {self._expected_chapter}:{self._expected_verse}a"
+            )
+            if verse_ref_with_segment is not None:
+                return str(verse_ref_with_segment)
+        if self._type == UsfmVersificationErrorType.INVALID_VERSE_RANGE and self._verse_ref is not None:
+            sorted_all_unique_verses = sorted(set(self._verse_ref.all_verses()))
+            first_verse = sorted_all_unique_verses[0]
+            last_verse = sorted_all_unique_verses[-1]
+            if first_verse == last_verse:
+                return str(first_verse)
+            corrected_verse_range_ref = VerseRef.try_from_string(
+                f"{self._book_num} {self._expected_chapter}:{first_verse}-{last_verse}"
+            )
+            if corrected_verse_range_ref is not None:
+                return str(corrected_verse_range_ref)
+        return str(default_verse_ref)
+
+    @property
+    def actual_verse_ref(self) -> str:
+        """The reference actually read, or one built from the actual chapter and verse."""
+        return (
+            str(self._verse_ref)
+            if self._verse_ref is not None
+            else str(VerseRef(self._book_num, self._actual_chapter, self._actual_verse))
+        )
+
+
+class UsfmVersificationErrorDetector(UsfmParserHandler):
+    """USFM parser handler that collects versification errors for the books it is fed."""
+
+    def __init__(self, versification: Versification):
+        self._versification = versification
+        self._current_book = 0
+        self._current_chapter = 0
+        self._current_verse = VerseRef()
+        self._errors: List[UsfmVersificationError] = []
+
+    @property
+    def errors(self) -> List[UsfmVersificationError]:
+        return self._errors.copy()
+
+    def end_usfm(self, state: UsfmParserState) -> None:
+        # Compare where the book stopped with the last chapter/verse of the versification.
+        if self._current_book > 0 and canon.is_canonical(self._current_book):
+            versification_error = UsfmVersificationError(
+                self._current_book,
+                self._versification.get_last_chapter(self._current_book),
+                self._versification.get_last_verse(
+                    self._current_book, self._versification.get_last_chapter(self._current_book)
+                ),
+                self._current_chapter,
+                list(self._current_verse.all_verses())[-1].verse_num,
+            )
+            if versification_error.check_error():
+                self._errors.append(versification_error)
+
+    def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
+        self._current_book = state.verse_ref.book_num
+        self._current_chapter = 0
+        self._current_verse = VerseRef()
+
+    def chapter(
+        self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
+    ) -> None:
+        # Before moving on, check that the chapter being closed reached its last verse.
+        if self._current_book > 0 and canon.is_canonical(self._current_book) and self._current_chapter > 0:
+            versification_error = UsfmVersificationError(
+                self._current_book,
+                self._current_chapter,
+                self._versification.get_last_verse(self._current_book, self._current_chapter),
+                self._current_chapter,
+                list(self._current_verse.all_verses())[-1].verse_num,
+            )
+            if versification_error.check_error():
+                self._errors.append(versification_error)
+        # Track the chapter just opened. Without this update _current_chapter stays 0, the
+        # guards in chapter() and verse() never pass, and end_usfm() always sees chapter 0.
+        # NOTE(review): assumes VerseRef exposes chapter_num alongside book_num/verse_num.
+        self._current_chapter = state.verse_ref.chapter_num
+        self._current_verse = VerseRef()
+
+    def verse(
+        self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
+    ) -> None:
+        # Track the verse just read so later chapter()/end_usfm() checks see the last verse.
+        # NOTE(review): stores the parser's object; copy it if the parser mutates it in place.
+        self._current_verse = state.verse_ref
+        if self._current_book > 0 and canon.is_canonical(self._current_book) and self._current_chapter > 0:
+            versification_error = UsfmVersificationError(
+                self._current_book,
+                self._current_chapter,
+                list(self._current_verse.all_verses())[-1].verse_num,
+                self._current_chapter,
+                list(self._current_verse.all_verses())[-1].verse_num,
+                # Expected and actual positions are equal here, so only the reference itself
+                # (segments, validity) can make check_error() report something.
+                self._current_verse,
+            )
+            if versification_error.check_error():
+                self._errors.append(versification_error)
diff --git a/machine/corpora/zip_paratext_project_settings_parser_base.py b/machine/corpora/zip_paratext_project_file_handler.py similarity index 59% rename from machine/corpora/zip_paratext_project_settings_parser_base.py rename to machine/corpora/zip_paratext_project_file_handler.py index 58ef8733..f97c46d5 100644 --- a/machine/corpora/zip_paratext_project_settings_parser_base.py +++ b/machine/corpora/zip_paratext_project_file_handler.py @@ -1,32 +1,51 @@ import os +from io import BytesIO from tempfile import mkstemp -from typing import Optional +from typing import BinaryIO, Optional, cast +from zipfile import ZipFile -from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase +from .paratext_project_file_handler import ParatextProjectFileHandler from .usfm_stylesheet import UsfmStylesheet -class ZipParatextProjectSettingsParserBase(ParatextProjectSettingsParserBase): - def _create_stylesheet(self, file_name: str) -> UsfmStylesheet: +class ZipParatextProjectFileHandler(ParatextProjectFileHandler): + def __init__(self, archive: ZipFile) -> None: + self._archive = archive + + def exists(self, file_name: str) -> bool: + return file_name in self._archive.namelist() + + def find(self, extension: str) -> Optional[str]: + for entry in self._archive.namelist(): + if entry.endswith(extension): + return entry + return None + + def open(self, file_name: str) -> Optional[BinaryIO]: + if file_name in self._archive.namelist(): + return BytesIO(self._archive.read(file_name)) + return None + + def create_stylesheet(self, file_name: str) -> UsfmStylesheet: stylesheet_temp_path: Optional[str] = None stylesheet_temp_fd: Optional[int] = None
custom_stylesheet_temp_path: Optional[str] = None custom_stylesheet_temp_fd: Optional[int] = None try: stylesheet_path: str = file_name - if self._exists(file_name): + if self.exists(file_name): stylesheet_temp_fd, stylesheet_temp_path = mkstemp() with ( - self._open(file_name) as source, + cast(BinaryIO, self.open(file_name)) as source, open(stylesheet_temp_fd, "wb", closefd=False) as stylesheet_temp_file, ): stylesheet_temp_file.write(source.read()) stylesheet_path = stylesheet_temp_path custom_stylesheet_path: Optional[str] = None - if self._exists("custom.sty"): + if self.exists("custom.sty"): custom_stylesheet_temp_fd, custom_stylesheet_temp_path = mkstemp() with ( - self._open("custom.sty") as source, + cast(BinaryIO, self.open("custom.sty")) as source, open(custom_stylesheet_temp_fd, "wb", closefd=False) as custom_stylesheet_temp_file, ): custom_stylesheet_temp_file.write(source.read()) diff --git a/machine/corpora/zip_paratext_project_settings_parser.py b/machine/corpora/zip_paratext_project_settings_parser.py index 4f237472..e9fb3080 100644 --- a/machine/corpora/zip_paratext_project_settings_parser.py +++ b/machine/corpora/zip_paratext_project_settings_parser.py @@ -1,24 +1,9 @@ -from io import BytesIO -from typing import BinaryIO, Optional from zipfile import ZipFile -from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase +from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase +from .zip_paratext_project_file_handler import ZipParatextProjectFileHandler -class ZipParatextProjectSettingsParser(ZipParatextProjectSettingsParserBase): +class ZipParatextProjectSettingsParser(ParatextProjectSettingsParserBase): def __init__(self, archive: ZipFile) -> None: - self._archive = archive - - def _exists(self, file_name: str) -> bool: - return file_name in self._archive.namelist() - - def _find(self, extension: str) -> Optional[str]: - for entry in self._archive.namelist(): - if 
entry.endswith(extension): - return entry - return None - - def _open(self, file_name: str) -> Optional[BinaryIO]: - if file_name in self._archive.namelist(): - return BytesIO(self._archive.read(file_name)) - return None + super().__init__(ZipParatextProjectFileHandler(archive)) diff --git a/machine/corpora/zip_paratext_project_terms_parser.py b/machine/corpora/zip_paratext_project_terms_parser.py index ebc208a0..06e54986 100644 --- a/machine/corpora/zip_paratext_project_terms_parser.py +++ b/machine/corpora/zip_paratext_project_terms_parser.py @@ -1,23 +1,14 @@ -from io import BytesIO -from typing import BinaryIO, Optional +from typing import Optional from zipfile import ZipFile -from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase +from .zip_paratext_project_file_handler import ZipParatextProjectFileHandler from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser class ZipParatextProjectTermsParser(ParatextProjectTermsParserBase): def __init__(self, archive: ZipFile, settings: Optional[ParatextProjectSettings] = None) -> None: - super().__init__(settings or ZipParatextProjectSettingsParser(archive).parse()) - - self._archive = archive - - def _exists(self, file_name: StrPath) -> bool: - return file_name in self._archive.namelist() - - def _open(self, file_name: StrPath) -> Optional[BinaryIO]: - if file_name in self._archive.namelist(): - return BytesIO(self._archive.read(str(file_name))) - return None + super().__init__( + ZipParatextProjectFileHandler(archive), settings or ZipParatextProjectSettingsParser(archive).parse() + ) diff --git a/machine/corpora/zip_paratext_project_text_updater.py b/machine/corpora/zip_paratext_project_text_updater.py index b4dbd8bd..ffbdf1de 100644 --- a/machine/corpora/zip_paratext_project_text_updater.py +++ b/machine/corpora/zip_paratext_project_text_updater.py @@ -1,22 +1,10 @@ 
-from io import BytesIO -from typing import BinaryIO, Optional from zipfile import ZipFile -from ..utils.typeshed import StrPath from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase +from .zip_paratext_project_file_handler import ZipParatextProjectFileHandler from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser class ZipParatextProjectTextUpdater(ParatextProjectTextUpdaterBase): def __init__(self, archive: ZipFile) -> None: - super().__init__(ZipParatextProjectSettingsParser(archive)) - - self._archive = archive - - def _exists(self, file_name: StrPath) -> bool: - return file_name in self._archive.namelist() - - def _open(self, file_name: StrPath) -> Optional[BinaryIO]: - if file_name in self._archive.namelist(): - return BytesIO(self._archive.read(str(file_name))) - return None + super().__init__(ZipParatextProjectFileHandler(archive), ZipParatextProjectSettingsParser(archive).parse()) diff --git a/machine/corpora/zip_paratext_project_versification_detector.py b/machine/corpora/zip_paratext_project_versification_detector.py new file mode 100644 index 00000000..cf4bf66e --- /dev/null +++ b/machine/corpora/zip_paratext_project_versification_detector.py @@ -0,0 +1,10 @@ +from zipfile import ZipFile + +from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector +from .zip_paratext_project_file_handler import ZipParatextProjectFileHandler +from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser + + +class ZipParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector): + def __init__(self, archive: ZipFile): + super().__init__(ZipParatextProjectFileHandler(archive), ZipParatextProjectSettingsParser(archive).parse()) diff --git a/machine/punctuation_analysis/file_paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/file_paratext_project_quote_convention_detector.py index 3b743c1c..e3437521 100644 --- 
a/machine/punctuation_analysis/file_paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/file_paratext_project_quote_convention_detector.py @@ -1,6 +1,7 @@ from pathlib import Path from typing import BinaryIO +from ..corpora.file_paratext_project_file_handler import FileParatextProjectFileHandler from ..corpora.file_paratext_project_settings_parser import FileParatextProjectSettingsParser from ..utils.typeshed import StrPath from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector @@ -8,7 +9,9 @@ class FileParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector): def __init__(self, project_dir: StrPath) -> None: - super().__init__(FileParatextProjectSettingsParser(project_dir)) + super().__init__( + FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse() + ) self._project_dir = project_dir diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py index 4b46d136..3e55752c 100644 --- a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py @@ -1,17 +1,21 @@ -from abc import ABC, abstractmethod -from typing import BinaryIO, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union +from ..corpora.paratext_project_file_handler import ParatextProjectFileHandler from ..corpora.paratext_project_settings import ParatextProjectSettings from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from ..corpora.usfm_parser import parse_usfm from ..scripture.canon import book_id_to_number, get_scripture_books -from ..utils.typeshed import StrPath from .quote_convention_analysis import QuoteConventionAnalysis from .quote_convention_detector import QuoteConventionDetector -class 
ParatextProjectQuoteConventionDetector(ABC): - def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None: +class ParatextProjectQuoteConventionDetector: + def __init__( + self, + paratext_project_file_handler: ParatextProjectFileHandler, + settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase], + ) -> None: + self._paratext_project_file_handler = paratext_project_file_handler if isinstance(settings, ParatextProjectSettingsParserBase): self._settings = settings.parse() else: @@ -27,12 +31,12 @@ def get_quote_convention_analysis( if include_chapters is not None and book_id_to_number(book_id) not in include_chapters: continue file_name: str = self._settings.get_book_file_name(book_id) - if not self._exists(file_name): + if not self._paratext_project_file_handler.exists(file_name): continue handler = QuoteConventionDetector() - with self._open(file_name) as sfm_file: + with self._paratext_project_file_handler.open(file_name) as sfm_file: usfm: str = sfm_file.read().decode(self._settings.encoding) try: parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) @@ -48,9 +52,3 @@ def get_quote_convention_analysis( book_quote_convention_analyses.append(quote_convention_analysis) return QuoteConventionAnalysis.combine_with_weighted_average(book_quote_convention_analyses) - - @abstractmethod - def _exists(self, file_name: StrPath) -> bool: ... - - @abstractmethod - def _open(self, file_name: StrPath) -> BinaryIO: ... 
diff --git a/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py index e51b2754..470862c1 100644 --- a/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py @@ -1,26 +1,10 @@ -from io import BytesIO -from typing import BinaryIO, Optional from zipfile import ZipFile +from ..corpora.zip_paratext_project_file_handler import ZipParatextProjectFileHandler from ..corpora.zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector): def __init__(self, archive: ZipFile) -> None: - self._archive = archive - super().__init__(ZipParatextProjectSettingsParser(archive)) - - def _exists(self, file_name: str) -> bool: - return file_name in self._archive.namelist() - - def _find(self, extension: str) -> Optional[str]: - for entry in self._archive.namelist(): - if entry.endswith(extension): - return entry - return None - - def _open(self, file_name: str) -> Optional[BinaryIO]: - if file_name in self._archive.namelist(): - return BytesIO(self._archive.read(file_name)) - return None + super().__init__(ZipParatextProjectFileHandler(archive), ZipParatextProjectSettingsParser(archive).parse()) diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index 2422163d..f35ea0fe 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -94,6 +94,13 @@ def from_string(cls, verse_str: str, versification: Optional[Versification] = No raise ValueError("The verse reference is invalid.") return VerseRef(b_cv[0], c_v[0], c_v[1], versification) + @classmethod + def try_from_string(cls, verse_str: str) -> Optional[VerseRef]: + try: + return 
cls.from_string(verse_str) + except ValueError: + return None + @classmethod def from_range(cls, start: VerseRef, end: VerseRef) -> VerseRef: if start.book_num != end.book_num or start.chapter_num != end.chapter_num: @@ -229,6 +236,10 @@ def is_default(self) -> bool: def is_excluded(self) -> bool: return self.versification.is_excluded(self.bbbcccvvv) + @property + def has_segments_defined(self): + return self.versification is not None and self.versification.verse_segments[self.bbbcccvvv] is not None + def get_segments(self, default_segments: Optional[Set[str]] = None) -> Optional[Set[str]]: if self.versification is None: return default_segments diff --git a/tests/corpora/test_paratext_project_terms_parser.py b/tests/corpora/test_paratext_project_terms_parser.py index 51be188e..2aaa0036 100644 --- a/tests/corpora/test_paratext_project_terms_parser.py +++ b/tests/corpora/test_paratext_project_terms_parser.py @@ -1,10 +1,10 @@ from typing import Dict, List, Optional, Tuple +from testutils.memory_paratext_project_file_handler import DefaultParatextProjectSettings from testutils.memory_paratext_project_terms_parser import MemoryParatextProjectTermsParser -from machine.corpora import ParatextProjectSettings, ParatextProjectTermsParserBase, UsfmStylesheet +from machine.corpora import ParatextProjectSettings, ParatextProjectTermsParserBase from machine.corpora.paratext_project_terms_parser_base import _get_glosses, _get_renderings, _strip_parens -from machine.scripture import ORIGINAL_VERSIFICATION, Versification def test_get_key_terms_from_terms_renderings() -> None: @@ -40,7 +40,7 @@ def test_get_key_terms_from_terms_renderings() -> None: def test_get_key_terms_from_terms_localizations_no_term_renderings() -> None: env = _TestEnvironment( - _DefaultParatextProjectSettings(biblical_terms_list_type="Major", biblical_terms_file_name="BiblicalTerms.xml"), + DefaultParatextProjectSettings(biblical_terms_list_type="Major", biblical_terms_file_name="BiblicalTerms.xml"), 
use_term_glosses=True, ) terms: List[Tuple[str, List[str]]] = env.get_glosses() @@ -52,7 +52,7 @@ def test_get_key_terms_from_terms_localizations_no_term_renderings() -> None: def test_get_key_terms_from_terms_localizations_no_term_renderings_do_not_use_term_glosses() -> None: env = _TestEnvironment( - _DefaultParatextProjectSettings(biblical_terms_list_type="Major", biblical_terms_file_name="BiblicalTerms.xml"), + DefaultParatextProjectSettings(biblical_terms_list_type="Major", biblical_terms_file_name="BiblicalTerms.xml"), use_term_glosses=False, ) terms: List[Tuple[str, List[str]]] = env.get_glosses() @@ -61,7 +61,7 @@ def test_get_key_terms_from_terms_localizations_no_term_renderings_do_not_use_te def test_get_key_terms_from_terms_localizations() -> None: env = _TestEnvironment( - _DefaultParatextProjectSettings( + DefaultParatextProjectSettings( biblical_terms_list_type="Major", biblical_terms_file_name="BiblicalTerms.xml", language_code="fr" ), use_term_glosses=True, @@ -75,7 +75,7 @@ def test_get_key_terms_from_terms_localizations() -> None: def test_get_key_terms_from_terms_localizations_term_renderings_exists_prefer_localization() -> None: env = _TestEnvironment( - _DefaultParatextProjectSettings(biblical_terms_list_type="Major", biblical_terms_file_name="BiblicalTerms.xml"), + DefaultParatextProjectSettings(biblical_terms_list_type="Major", biblical_terms_file_name="BiblicalTerms.xml"), files={ "TermRenderings.xml": r""" @@ -131,7 +131,7 @@ def __init__( ) -> None: self._use_term_glosses: bool = use_term_glosses self._parser: ParatextProjectTermsParserBase = MemoryParatextProjectTermsParser( - settings or _DefaultParatextProjectSettings(), files or {} + files or {}, settings or DefaultParatextProjectSettings() ) @property @@ -140,36 +140,3 @@ def parser(self) -> ParatextProjectTermsParserBase: def get_glosses(self) -> List[Tuple[str, List[str]]]: return self.parser.parse(["PN"], self._use_term_glosses) - - -class 
_DefaultParatextProjectSettings(ParatextProjectSettings): - def __init__( - self, - name: str = "Test", - full_name: str = "TestProject", - encoding: Optional[str] = None, - versification: Optional[Versification] = None, - stylesheet: Optional[UsfmStylesheet] = None, - file_name_prefix: str = "", - file_name_form: str = "41MAT", - file_name_suffix: str = "Test.SFM", - biblical_terms_list_type: str = "Project", - biblical_terms_project_name: str = "Test", - biblical_terms_file_name: str = "ProjectBiblicalTerms.xml", - language_code: str = "en", - ): - - super().__init__( - name, - full_name, - encoding if encoding is not None else "utf-8", - versification if versification is not None else ORIGINAL_VERSIFICATION, - stylesheet if stylesheet is not None else UsfmStylesheet("usfm.sty"), - file_name_prefix, - file_name_form, - file_name_suffix, - biblical_terms_list_type, - biblical_terms_project_name, - biblical_terms_file_name, - language_code, - ) diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py index 241ba195..6e7eaaba 100644 --- a/tests/corpora/test_usfm_manual.py +++ b/tests/corpora/test_usfm_manual.py @@ -1,12 +1,9 @@ -import json import zipfile -from dataclasses import dataclass from pathlib import Path from typing import List, Optional import pytest from testutils.corpora_test_helpers import ( - TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_SOURCE_PROJECT_ZIP_PATH, USFM_TARGET_PROJECT_PATH, @@ -21,10 +18,11 @@ StandardParallelTextCorpus, UpdateUsfmRow, UpdateUsfmTextBehavior, - ZipParatextProjectSettingsParser, - ZipParatextProjectTextUpdater, + ZipParatextProjectVersificationErrorDetector, ) from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector +from machine.corpora.zip_paratext_project_versification_detector import ZipParatextProjectVersificationErrorDetector +from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector @pytest.mark.skip(reason="This is for manual testing 
only. Remove this decorator to run the test.") @@ -55,84 +53,6 @@ def test_parse_parallel_corpus(): assert new_usfm is not None -@dataclass -class PretranslationDto: - text_id: str - refs: List[str] - translation: str - - def __post_init__(self): - if self.text_id is None: - raise ValueError("text_id is a required field") - if self.refs is None: - raise ValueError("refs is a required field") - if self.translation is None: - raise ValueError("translation is a required field") - - -PRETRANSLATION_PATH = TEST_DATA_PATH / "pretranslations.json" -PARATEXT_PROJECT_PATH = TEST_DATA_PATH / "project" - - -@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") -# In order to run this test on specific projects, place the Paratext projects or Paratext project zips in the -# tests/testutils/data/project/ folder. -def test_create_usfm_file(): - def get_usfm(project_path: Path): - project_archive = None - try: - project_archive = zipfile.ZipFile(project_path, "r") - parser = ZipParatextProjectSettingsParser(project_archive) - except IsADirectoryError: - parser = FileParatextProjectSettingsParser(project_path) - - settings = parser.parse() - - # Read text from pretranslations file - with open(PRETRANSLATION_PATH, "r") as pretranslation_stream: - pretranslations = [ - ( - UpdateUsfmRow( - refs=[ScriptureRef.parse(r, settings.versification).to_relaxed() for r in p["refs"] or []], - text=p.get("translation", ""), - ) - ) - for p in json.load(pretranslation_stream) - ] - - book_ids: List[str] = [] - if project_archive is None: - for sfm_file in Path(project_path).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"): - book_id = settings.get_book_id(sfm_file.name) - if book_id: - book_ids.append(book_id) - updater = FileParatextProjectTextUpdater(project_path) - else: - for entry in project_archive.infolist(): - if entry.filename.startswith(settings.file_name_prefix) and entry.filename.endswith( - settings.file_name_suffix - ): - 
book_id = settings.get_book_id(entry.filename) - if book_id: - book_ids.append(book_id) - updater = ZipParatextProjectTextUpdater(project_archive) - - for book_id in book_ids: - new_usfm = updater.update_usfm( - book_id, pretranslations, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING - ) - assert new_usfm is not None - - if not Path(PARATEXT_PROJECT_PATH / "Settings.xml").exists(): - for subdir in PARATEXT_PROJECT_PATH.iterdir(): - try: - get_usfm(subdir) - except Exception as e: - assert False, f"Failed to process {subdir}: {e}" - else: - get_usfm(PARATEXT_PROJECT_PATH) - - @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") def test_analyze_corpora_quote_conventions(): source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") @@ -145,3 +65,11 @@ def test_analyze_corpora_quote_conventions(): assert source_analysis.best_quote_convention is not None assert target_analysis.best_quote_convention is not None + + +@pytest.mark.skip(reason="This is for manual testing only. 
Remove this decorator to run the test.") +def test_validate_usfm_versification(): + archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") + versification_error_detector = ZipParatextProjectVersificationErrorDetector(archive) + errors = versification_error_detector.get_usfm_versification_errors() + assert len(errors) == 0 diff --git a/tests/corpora/test_usfm_verisifcation_error_detector.py b/tests/corpora/test_usfm_verisifcation_error_detector.py new file mode 100644 index 00000000..87e7af4d --- /dev/null +++ b/tests/corpora/test_usfm_verisifcation_error_detector.py @@ -0,0 +1,319 @@ +from io import StringIO +from typing import Dict, List, Optional + +from testutils.memory_paratext_project_file_handler import DefaultParatextProjectSettings +from testutils.memory_paratext_project_versification_error_detector import ( + MemoryParatextProjectVersificationErrorDetector, +) + +from machine.corpora import ParatextProjectSettings, UsfmVersificationError, UsfmVersificationErrorType +from machine.scripture import ORIGINAL_VERSIFICATION, Versification + + +def get_usfm_versification_errors_no_errors(): + env = _TestEnvironment( + files={ + "653JNTest.SFM": r"""\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + """ + } + ) + assert len(env.get_usfm_versification_errors()) == 0 + + +def get_usfm_versification_errors_missing_verses(): + env = _TestEnvironment( + files={ + "653JNTest.SFM": r"""\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + """ + } + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 1 + assert errors[0].type == UsfmVersificationErrorType.MISSING_VERSE + + +def get_usfm_versification_missing_chapter(): + env = _TestEnvironment( + files={ + "653JNTest.SFM": r"""\id 3JN + """ + } + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 1 + assert errors[0].type == 
UsfmVersificationErrorType.MISSING_CHAPTER + + +def get_usfm_versification_errors_extra_verse(): + env = _TestEnvironment( + files={ + "653JNTest.SFM": r"""\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + \v 16 + """ + } + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 1 + assert errors[0].type == UsfmVersificationErrorType.EXTRA_VERSE + + +def get_usfm_versification_errors_invalid_verse(): + env = _TestEnvironment( + files={ + "653JNTest.SFM": r"""\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 13-12 + \v 14 + \v 15 + """ + } + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 1 + assert errors[0].type == UsfmVersificationErrorType.INVALID_VERSE_RANGE + + +def get_usfm_versification_errors_extra_verse_segment(): + env = _TestEnvironment( + files={ + "653JNTest.SFM": r"""\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14a + \v 14b + \v 15 + """ + } + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 1 + assert errors[0].type == UsfmVersificationErrorType.EXTRA_VERSE_SEGMENT + + +def get_usfm_versification_errors_missing_verse_segments(): + env = _TestEnvironment( + files={ + "653JNTest.SFM": r"""\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + """ + } + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 1 + assert errors[0].type == UsfmVersificationErrorType.MISSING_VERSE_SEGMENT + + +def get_usfm_versification_errors_ignore_noncanonicals(): + env = _TestEnvironment( + files={ + "98XXETest.SFM": r"""\id XXE + \c 1 + \v 3-2 + """ + } + ) + assert len(env.get_usfm_versification_errors()) == 0 + + +def get_usfm_versification_errors_excluded_in_custom_vrs(): + env = _TestEnvironment( + files={ + 
"653JNTest.SFM": r"""\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + """, + }, + settings=DefaultParatextProjectSettings(versification=get_custom_versification(r"-3JN 1:13")), + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 1 + assert errors[0].type == UsfmVersificationErrorType.EXTRA_VERSE + + +def get_usfm_versification_errors_multiple_books(): + env = _TestEnvironment( + files={ + "642JNTest.SFM": r"""\id 2JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + """, + "653JNTest.SFM": r"""\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + """, + } + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 1 + assert errors[0].type == UsfmVersificationErrorType.MISSING_VERSE + + +def get_usfm_versification_errors_multiple_chapters(): + env = _TestEnvironment( + files={ + "642JNTest.SFM": r"""\id 2JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \c 2 + \v 1 + """ + } + ) + errors = env.get_usfm_versification_errors() + assert len(errors) == 2 + assert errors[0].type == UsfmVersificationErrorType.MISSING_VERSE + assert errors[0].type == UsfmVersificationErrorType.EXTRA_VERSE + + +class _TestEnvironment: + def __init__(self, settings: Optional[ParatextProjectSettings] = None, files: Optional[Dict[str, str]] = None): + self._settings = settings + self._files = files or {} + self.detector = MemoryParatextProjectVersificationErrorDetector(settings, self._files) + + def get_usfm_versification_errors(self) -> List[UsfmVersificationError]: + return self.detector.get_usfm_versification_errors() + + +def get_custom_versification( + custom_vrs_contents: str, base_versification: Optional[Versification] = None +) -> Versification: + stream = StringIO(custom_vrs_contents) + versification = 
base_versification or Versification("custom", "vers.txt", ORIGINAL_VERSIFICATION) + versification = Versification.parse(stream, "vers.txt", versification, "custom") + return versification diff --git a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py index 7977b6b7..aee1ff47 100644 --- a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py +++ b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py @@ -1,15 +1,16 @@ from typing import Dict, List, Optional +from testutils.memory_paratext_project_file_handler import DefaultParatextProjectSettings from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector -from machine.corpora import ParatextProjectSettings, UsfmStylesheet +from machine.corpora import ParatextProjectSettings from machine.punctuation_analysis import ( STANDARD_QUOTE_CONVENTIONS, ParatextProjectQuoteConventionDetector, QuoteConvention, QuoteConventionAnalysis, ) -from machine.scripture import ORIGINAL_VERSIFICATION, Versification, get_chapters +from machine.scripture import ORIGINAL_VERSIFICATION, get_chapters standard_english_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" @@ -122,7 +123,7 @@ def __init__( files: Optional[Dict[str, str]] = None, ) -> None: self._detector: ParatextProjectQuoteConventionDetector = MemoryParatextProjectQuoteConventionDetector( - settings or _DefaultParatextProjectSettings(), files or {} + settings or DefaultParatextProjectSettings(), files or {} ) @property @@ -150,36 +151,3 @@ def get_test_chapter(number: int, quote_convention: Optional[QuoteConvention]) - \q3 and more things someone else said.{right_quote} \m That is why he said {left_quote}things someone else said.{right_quote} \v 5 Then someone said, {left_quote}More things someone 
said.{right_quote}""" - - -class _DefaultParatextProjectSettings(ParatextProjectSettings): - def __init__( - self, - name: str = "Test", - full_name: str = "TestProject", - encoding: Optional[str] = None, - versification: Optional[Versification] = None, - stylesheet: Optional[UsfmStylesheet] = None, - file_name_prefix: str = "", - file_name_form: str = "41MAT", - file_name_suffix: str = "Test.SFM", - biblical_terms_list_type: str = "Project", - biblical_terms_project_name: str = "Test", - biblical_terms_file_name: str = "ProjectBiblicalTerms.xml", - language_code: str = "en", - ): - - super().__init__( - name, - full_name, - encoding if encoding is not None else "utf-8", - versification if versification is not None else ORIGINAL_VERSIFICATION, - stylesheet if stylesheet is not None else UsfmStylesheet("usfm.sty"), - file_name_prefix, - file_name_form, - file_name_suffix, - biblical_terms_list_type, - biblical_terms_project_name, - biblical_terms_file_name, - language_code, - ) diff --git a/tests/testutils/memory_paratext_project_file_handler.py b/tests/testutils/memory_paratext_project_file_handler.py new file mode 100644 index 00000000..a764cd69 --- /dev/null +++ b/tests/testutils/memory_paratext_project_file_handler.py @@ -0,0 +1,56 @@ +from io import BytesIO +from typing import BinaryIO, Dict, Optional + +from machine.corpora import ParatextProjectFileHandler, ParatextProjectSettings, UsfmStylesheet +from machine.scripture import ORIGINAL_VERSIFICATION, Versification + + +class MemoryParatextProjectFileHandler(ParatextProjectFileHandler): + def __init__(self, files: Dict[str, str]) -> None: + + self.files = files + + def exists(self, file_name: str) -> bool: + return file_name in self.files + + def open(self, file_name: str) -> BinaryIO: + return BytesIO(self.files[file_name].encode("utf-8")) + + def find(self, extension): + raise NotImplementedError + + def create_stylesheet(self, file_name): + raise NotImplementedError + + +class 
DefaultParatextProjectSettings(ParatextProjectSettings): + def __init__( + self, + name: str = "Test", + full_name: str = "TestProject", + encoding: Optional[str] = None, + versification: Optional[Versification] = None, + stylesheet: Optional[UsfmStylesheet] = None, + file_name_prefix: str = "", + file_name_form: str = "41MAT", + file_name_suffix: str = "Test.SFM", + biblical_terms_list_type: str = "Project", + biblical_terms_project_name: str = "Test", + biblical_terms_file_name: str = "ProjectBiblicalTerms.xml", + language_code: str = "en", + ): + + super().__init__( + name, + full_name, + encoding if encoding is not None else "utf-8", + versification if versification is not None else ORIGINAL_VERSIFICATION, + stylesheet if stylesheet is not None else UsfmStylesheet("usfm.sty"), + file_name_prefix, + file_name_form, + file_name_suffix, + biblical_terms_list_type, + biblical_terms_project_name, + biblical_terms_file_name, + language_code, + ) diff --git a/tests/testutils/memory_paratext_project_quote_convention_detector.py b/tests/testutils/memory_paratext_project_quote_convention_detector.py index 4a83f42f..f8976e78 100644 --- a/tests/testutils/memory_paratext_project_quote_convention_detector.py +++ b/tests/testutils/memory_paratext_project_quote_convention_detector.py @@ -1,18 +1,11 @@ -from io import BytesIO -from typing import BinaryIO, Dict +from typing import Dict from machine.corpora import ParatextProjectSettings from machine.punctuation_analysis import ParatextProjectQuoteConventionDetector +from .memory_paratext_project_file_handler import DefaultParatextProjectSettings, MemoryParatextProjectFileHandler + class MemoryParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector): def __init__(self, settings: ParatextProjectSettings, files: Dict[str, str]) -> None: - super().__init__(settings) - - self.files = files - - def _exists(self, file_name: str) -> bool: - return file_name in self.files - - def _open(self, file_name: str) -> 
BinaryIO: - return BytesIO(self.files[file_name].encode("utf-8")) + super().__init__(MemoryParatextProjectFileHandler(files), settings or DefaultParatextProjectSettings()) diff --git a/tests/testutils/memory_paratext_project_terms_parser.py b/tests/testutils/memory_paratext_project_terms_parser.py index e807d7b9..fd1dc31c 100644 --- a/tests/testutils/memory_paratext_project_terms_parser.py +++ b/tests/testutils/memory_paratext_project_terms_parser.py @@ -1,17 +1,10 @@ -from io import BytesIO -from typing import BinaryIO, Dict +from typing import Dict, Optional from machine.corpora import ParatextProjectSettings, ParatextProjectTermsParserBase +from .memory_paratext_project_file_handler import DefaultParatextProjectSettings, MemoryParatextProjectFileHandler -class MemoryParatextProjectTermsParser(ParatextProjectTermsParserBase): - def __init__(self, settings: ParatextProjectSettings, files: Dict[str, str]) -> None: - super().__init__(settings) - - self.files = files - def _exists(self, file_name: str) -> bool: - return file_name in self.files - - def _open(self, file_name: str) -> BinaryIO: - return BytesIO(self.files[file_name].encode("utf-8")) +class MemoryParatextProjectTermsParser(ParatextProjectTermsParserBase): + def __init__(self, files: Dict[str, str], settings: Optional[ParatextProjectSettings]) -> None: + super().__init__(MemoryParatextProjectFileHandler(files), settings or DefaultParatextProjectSettings()) diff --git a/tests/testutils/memory_paratext_project_versification_error_detector.py b/tests/testutils/memory_paratext_project_versification_error_detector.py new file mode 100644 index 00000000..62911113 --- /dev/null +++ b/tests/testutils/memory_paratext_project_versification_error_detector.py @@ -0,0 +1,10 @@ +from typing import Dict, Optional + +from machine.corpora import ParatextProjectSettings, ParatextProjectVersificationErrorDetector + +from .memory_paratext_project_file_handler import DefaultParatextProjectSettings, 
MemoryParatextProjectFileHandler + + +class MemoryParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector): + def __init__(self, settings: Optional[ParatextProjectSettings], files: Dict[str, str]) -> None: + super().__init__(MemoryParatextProjectFileHandler(files), settings or DefaultParatextProjectSettings()) From 5f1acf1c9001923ca80154fca7d8783d89c10fe3 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 19 Nov 2025 12:10:35 -0500 Subject: [PATCH 2/2] Fix imports --- tests/corpora/test_usfm_manual.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py index 6e7eaaba..cb214f85 100644 --- a/tests/corpora/test_usfm_manual.py +++ b/tests/corpora/test_usfm_manual.py @@ -21,8 +21,6 @@ ZipParatextProjectVersificationErrorDetector, ) from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector -from machine.corpora.zip_paratext_project_versification_detector import ZipParatextProjectVersificationErrorDetector -from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")