From ba0aa0a3d127f57ea0d75a966cab8c53f2df8306 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 4 Aug 2025 14:33:20 -0400 Subject: [PATCH 1/3] Add zip quote convention detector; tests; fix pytest error --- ...atext_project_quote_convention_detector.py | 42 ++++++++++ machine/corpora/paratext_project_settings.py | 8 +- ...atext_project_quote_convention_detector.py | 24 ++++++ machine/scripture/canon.py | 6 +- ...atext_project_quote_convention_detector.py | 82 +++++++++++++++++++ .../test_update_usfm_parser_handler.py | 24 +++--- ...atext_project_quote_convention_detector.py | 19 +++++ 7 files changed, 190 insertions(+), 15 deletions(-) create mode 100644 machine/corpora/paratext_project_quote_convention_detector.py create mode 100644 machine/corpora/zip_paratext_project_quote_convention_detector.py create mode 100644 tests/corpora/test_paratext_project_quote_convention_detector.py create mode 100644 tests/testutils/memory_paratext_project_quote_convention_detector.py diff --git a/machine/corpora/paratext_project_quote_convention_detector.py b/machine/corpora/paratext_project_quote_convention_detector.py new file mode 100644 index 00000000..10537100 --- /dev/null +++ b/machine/corpora/paratext_project_quote_convention_detector.py @@ -0,0 +1,42 @@ +from abc import ABC, abstractmethod +from typing import BinaryIO, Iterable, Optional, Sequence, Union + +from ..utils.typeshed import StrPath +from .paratext_project_settings import ParatextProjectSettings +from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase +from .punctuation_analysis.quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector +from .usfm_parser import parse_usfm + + +class ParatextProjectQuoteConventionDetector(ABC): + def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None: + if isinstance(settings, ParatextProjectSettingsParserBase): + self._settings = settings.parse() + else: + self._settings = settings + + def get_quote_convention_analysis( + self, handler: Optional[QuoteConventionDetector] = None + ) -> Optional[QuoteConventionAnalysis]: + handler = QuoteConventionDetector() if handler is None else handler + for file_name in self._settings.get_all_scripture_book_file_names(): + if not self._exists(file_name): + continue + with self._open(file_name) as sfm_file: + usfm: str = sfm_file.read().decode(self._settings.encoding) + try: + parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) + except Exception as e: + error_message = ( + f"An error occurred while parsing the usfm for '{file_name}'" + f"{f' in project {self._settings.name}' if self._settings.name else ''}" + f". Error: '{e}'" + ) + raise RuntimeError(error_message) from e + return handler.detect_quote_convention() + + @abstractmethod + def _exists(self, file_name: StrPath) -> bool: ... + + @abstractmethod + def _open(self, file_name: StrPath) -> BinaryIO: ... diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py index fa87c976..ad86b303 100644 --- a/machine/corpora/paratext_project_settings.py +++ b/machine/corpora/paratext_project_settings.py @@ -1,7 +1,7 @@ from dataclasses import dataclass -from typing import Optional +from typing import Iterable, Optional -from ..scripture.canon import book_id_to_number, book_number_to_id +from ..scripture.canon import book_id_to_number, book_number_to_id, get_scripture_books from ..scripture.verse_ref import Versification from .usfm_stylesheet import UsfmStylesheet @@ -53,6 +53,10 @@ def get_book_file_name(self, book_id: str) -> str: book_part = _get_book_file_name_digits(book_id) + book_id return self.file_name_prefix + book_part + self.file_name_suffix + def get_all_scripture_book_file_names(self) -> Iterable[str]: + for book_id in get_scripture_books(): + yield self.get_book_file_name(book_id) + def _get_book_file_name_digits(book_id: str) -> str: book_num = book_id_to_number(book_id) diff --git a/machine/corpora/zip_paratext_project_quote_convention_detector.py b/machine/corpora/zip_paratext_project_quote_convention_detector.py new file mode 100644 index 00000000..c0ccc90d --- /dev/null +++ b/machine/corpora/zip_paratext_project_quote_convention_detector.py @@ -0,0 +1,24 @@ +from io import BytesIO +from typing import BinaryIO, Optional +from zipfile import ZipFile + +from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector + + +class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector): + def __init__(self, archive: ZipFile) -> None: + self._archive = archive + + def _exists(self, file_name: str) -> bool: + return file_name in self._archive.namelist() + + def _find(self, extension: str) -> Optional[str]: + for entry in self._archive.namelist(): + if entry.endswith(extension): + return entry + return None + + def _open(self, file_name: str) -> Optional[BinaryIO]: + if file_name in self._archive.namelist(): + return BytesIO(self._archive.read(file_name)) + return None diff --git a/machine/scripture/canon.py b/machine/scripture/canon.py index f986c492..8bafaf19 100644 --- a/machine/scripture/canon.py +++ b/machine/scripture/canon.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Iterable, Union ALL_BOOK_IDS = [ "GEN", @@ -181,3 +181,7 @@ def is_canonical(book: Union[str, int]) -> bool: if isinstance(book, int): book = book_number_to_id(book) return is_book_id_valid(book) and book not in NON_CANONICAL_IDS + + +def get_scripture_books() -> Iterable[str]: + return list(map(lambda kvp: kvp[0], filter(lambda kvp: is_ot_nt(kvp[1]), BOOK_NUMBERS.items()))) diff --git a/tests/corpora/test_paratext_project_quote_convention_detector.py b/tests/corpora/test_paratext_project_quote_convention_detector.py new file mode 100644 index 00000000..80372b7e --- /dev/null +++ b/tests/corpora/test_paratext_project_quote_convention_detector.py @@ -0,0 +1,82 @@ +from typing import Dict, Optional + +from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector + +from machine.corpora import ParatextProjectSettings, UsfmStylesheet +from machine.corpora.paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector +from machine.corpora.punctuation_analysis.quote_convention_detector import QuoteConventionAnalysis +from machine.scripture import ORIGINAL_VERSIFICATION, Versification + + +def test_get_quote_convention() -> None: + env = _TestEnvironment( + files={ + "41MATTest.SFM": r"""\id MAT +\c 1 +\v 1 Someone said, “This is something I am saying! +\v 2 This is also something I am saying” (that is, “something I am speaking”). +\p +\v 3 Other text, and someone else said, +\q1 +\v 4 “Things +\q2 someone else said! +\q3 and more things someone else said.” +\m That is why he said “things someone else said.” +\v 5 Then someone said, “More things someone said.”""", + } + ) + analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention() + assert analysis is not None + assert analysis.best_quote_convention_score > 0.8 + assert analysis.best_quote_convention.name == "standard_english" + + +class _TestEnvironment: + def __init__( + self, + settings: Optional[ParatextProjectSettings] = None, + files: Optional[Dict[str, str]] = None, + ) -> None: + self._detector: ParatextProjectQuoteConventionDetector = MemoryParatextProjectQuoteConventionDetector( + settings or _DefaultParatextProjectSettings(), files or {} + ) + + @property + def detector(self) -> ParatextProjectQuoteConventionDetector: + return self._detector + + def get_quote_convention(self) -> Optional[QuoteConventionAnalysis]: + return self.detector.get_quote_convention_analysis() + + +class _DefaultParatextProjectSettings(ParatextProjectSettings): + def __init__( + self, + name: str = "Test", + full_name: str = "TestProject", + encoding: Optional[str] = None, + versification: Optional[Versification] = None, + stylesheet: Optional[UsfmStylesheet] = None, + file_name_prefix: str = "", + file_name_form: str = "41MAT", + file_name_suffix: str = "Test.SFM", + biblical_terms_list_type: str = "Project", + biblical_terms_project_name: str = "Test", + biblical_terms_file_name: str = "ProjectBiblicalTerms.xml", + language_code: str = "en", + ): + + super().__init__( + name, + full_name, + encoding if encoding is not None else "utf-8", + versification if versification is not None else ORIGINAL_VERSIFICATION, + stylesheet if stylesheet is not None else UsfmStylesheet("usfm.sty"), + file_name_prefix, + file_name_form, + file_name_suffix, + biblical_terms_list_type, + biblical_terms_project_name, + biblical_terms_file_name, + language_code, + ) diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 47cd6280..2c227376 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -843,7 +843,7 @@ def test_update_block_verse_preserve_paras() -> None: \v 1 verse 1 \p inner verse paragraph """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm( rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] ) @@ -872,7 +872,7 @@ def test_update_block_verse_strip_paras() -> None: \v 1 verse 1 \p inner verse paragraph """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, update_block_handlers=[update_block_handler] ) @@ -901,7 +901,7 @@ def test_update_block_verse_range() -> None: \v 1-3 verse 1 through 3 """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm( rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] ) @@ -928,7 +928,7 @@ def test_update_block_footnote_preserve_embeds() -> None: \v 1 verse\f \fr 1.1 \ft Some note \f* 1 """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm( rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] ) @@ -957,7 +957,7 @@ def test_update_block_footnote_strip_embeds() -> None: \v 1 verse\f \fr 1.1 \ft Some note \f* 1 """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm(rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP, update_block_handlers=[update_block_handler]) assert len(update_block_handler.blocks) == 1 @@ -985,7 +985,7 @@ def test_update_block_nonverse() -> None: \v 1 verse 1 """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm(rows, usfm, update_block_handlers=[update_block_handler]) assert len(update_block_handler.blocks) == 2 @@ -1010,7 +1010,7 @@ def test_update_block_verse_preserve_styles() -> None: \v 1 verse \bd 1\bd* """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm( rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] ) @@ -1041,7 +1041,7 @@ def test_update_block_verse_strip_styles() -> None: \v 1 verse \bd 1\bd* """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm(rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.STRIP, update_block_handlers=[update_block_handler]) assert len(update_block_handler.blocks) == 1 @@ -1074,7 +1074,7 @@ def test_update_block_verse_section_header() -> None: \v 2 Verse 2 """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm(rows, usfm, update_block_handlers=[update_block_handler]) assert len(update_block_handler.blocks) == 4 @@ -1114,7 +1114,7 @@ def test_update_block_verse_section_header_in_verse() -> None: \p end of verse """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm(rows, usfm, update_block_handlers=[update_block_handler]) assert len(update_block_handler.blocks) == 3 @@ -1148,7 +1148,7 @@ def test_update_block_nonverse_paragraph_end_of_verse() -> None: \s Section header """ - update_block_handler = TestUsfmUpdateBlockHandler() + update_block_handler = _TestUsfmUpdateBlockHandler() update_usfm(rows, usfm, update_block_handlers=[update_block_handler]) assert len(update_block_handler.blocks) == 3 @@ -1307,7 +1307,7 @@ def assert_update_block_equals( assert element.marked_for_removal == expected_marked_for_removal -class TestUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): +class _TestUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): def __init__(self): self.blocks: list[UsfmUpdateBlock] = [] diff --git a/tests/testutils/memory_paratext_project_quote_convention_detector.py b/tests/testutils/memory_paratext_project_quote_convention_detector.py new file mode 100644 index 00000000..ae624913 --- /dev/null +++ b/tests/testutils/memory_paratext_project_quote_convention_detector.py @@ -0,0 +1,19 @@ +from io import BytesIO +from typing import BinaryIO, Dict + +from machine.corpora import ParatextProjectSettings + +from machine.corpora.paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector + + +class MemoryParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector): + def __init__(self, settings: ParatextProjectSettings, files: Dict[str, str]) -> None: + super().__init__(settings) + + self.files = files + + def _exists(self, file_name: str) -> bool: + return file_name in self.files + + def _open(self, file_name: str) -> BinaryIO: + return BytesIO(self.files[file_name].encode("utf-8")) From af5ceadfbf76171eb7aae9fa2f997673d3c60373 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 4 Aug 2025 14:37:49 -0400 Subject: [PATCH 2/3] Fix unused imports --- machine/corpora/paratext_project_quote_convention_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/corpora/paratext_project_quote_convention_detector.py b/machine/corpora/paratext_project_quote_convention_detector.py index 10537100..0e145eb2 100644 --- a/machine/corpora/paratext_project_quote_convention_detector.py +++ b/machine/corpora/paratext_project_quote_convention_detector.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Iterable, Optional, Sequence, Union +from typing import BinaryIO, Optional, Union from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings From 1ba105ef6929390995a26afc5e91c6ecd1dc9da8 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 4 Aug 2025 14:43:14 -0400 Subject: [PATCH 3/3] Run isort --- .../memory_paratext_project_quote_convention_detector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testutils/memory_paratext_project_quote_convention_detector.py b/tests/testutils/memory_paratext_project_quote_convention_detector.py index ae624913..0f6226d3 100644 --- a/tests/testutils/memory_paratext_project_quote_convention_detector.py +++ b/tests/testutils/memory_paratext_project_quote_convention_detector.py @@ -2,7 +2,6 @@ from typing import BinaryIO, Dict from machine.corpora import ParatextProjectSettings - from machine.corpora.paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector