From 267a1ee5d1ab4222b2a71e23163306fada728002 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 2 Oct 2025 14:25:46 -0400 Subject: [PATCH 1/6] Port https://github.com/sillsdev/machine/pull/343, https://github.com/sillsdev/machine/pull/342, and https://github.com/sillsdev/machine/pull/341 --- .../paratext_project_terms_parser_base.py | 38 +++-- .../paratext_project_text_updater_base.py | 8 +- ...place_markers_usfm_update_block_handler.py | 13 +- machine/corpora/scripture_ref.py | 22 ++- machine/corpora/update_usfm_parser_handler.py | 133 ++++++++++++++---- machine/corpora/usfm_update_block_handler.py | 10 ++ machine/scripture/verse_ref.py | 18 +++ .../test_paratext_project_terms_parser.py | 15 +- ...place_markers_usfm_update_block_handler.py | 4 +- .../test_update_usfm_parser_handler.py | 124 ++++++++++++++-- 10 files changed, 319 insertions(+), 66 deletions(-) diff --git a/machine/corpora/paratext_project_terms_parser_base.py b/machine/corpora/paratext_project_terms_parser_base.py index 3245c953..678b6a9e 100644 --- a/machine/corpora/paratext_project_terms_parser_base.py +++ b/machine/corpora/paratext_project_terms_parser_base.py @@ -70,10 +70,14 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) - id = term.attrib["Id"] if _is_in_category(id, term_categories, term_id_to_category_dict): id_ = id.replace("\n", " ") - renderings = term.find("Renderings") - gloss = renderings.text if renderings is not None and renderings.text is not None else "" - glosses = _get_glosses(gloss) - terms_renderings[id_].extend(glosses) + renderings_element = term.find("Renderings") + rendering_text = ( + renderings_element.text + if renderings_element is not None and renderings_element.text is not None + else "" + ) + renderings = _get_renderings(rendering_text) + terms_renderings[id_].extend(renderings) terms_glosses: Dict[str, List[str]] = defaultdict(list) if terms_glosses_doc is not None and use_term_glosses: @@ -102,25 +106,33 @@ def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category return not term_categories or (category is not None and category in term_categories) +def _clean_term(term: str): + term = term.strip() + term = _strip_parens(term) + term = " ".join(term.split()) + return term + + def _get_glosses(gloss: str) -> List[str]: match = _CONTENT_IN_BRACKETS_REGEX.match(gloss) if match: - gloss = match.group(0) - gloss = gloss.replace("?", "") - gloss = gloss.replace("*", "") - gloss = gloss.replace("/", " ") - gloss = gloss.strip() - gloss = _strip_parens(gloss) + gloss = match.group(1) + gloss = _clean_term(gloss) gloss = _strip_parens(gloss, left="[", right="]") gloss = gloss.strip() for match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss): gloss = gloss.replace(match.group(0), "") - glosses = re.split(r"\|\|", gloss) - glosses = [re.split(r"[,;]", g) for g in glosses] - glosses = [item.strip() for sublist in glosses for item in sublist if item.strip()] + glosses = re.split(r"[,;/]", gloss) + glosses = list(set([gloss.strip() for gloss in glosses if gloss.strip()])) return glosses +def _get_renderings(rendering: str) -> List[str]: + renderings = re.split(r"\|\|", rendering.strip()) + renderings = [_clean_term(rendering).strip().replace("*", "") for rendering in renderings] + return [rendering for rendering in renderings if rendering] + + def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str: parens: int = 0 end: int = -1 diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index c29d1efa..81f2c5d1 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Iterable, Optional, Sequence, Union +from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings @@ -11,7 +11,7 @@ UpdateUsfmTextBehavior, ) from .usfm_parser import parse_usfm -from .usfm_update_block_handler import UsfmUpdateBlockHandler +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerException class ParatextProjectTextUpdaterBase(ABC): @@ -33,6 +33,8 @@ def update_usfm( preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, remarks: Optional[Iterable[str]] = None, + error_handler: Optional[Callable[[UsfmUpdateBlockHandlerException], bool]] = None, + compare_segments: bool = False, ) -> Optional[str]: file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): @@ -49,6 +51,8 @@ def update_usfm( preserve_paragraph_styles, update_block_handlers=update_block_handlers, remarks=remarks, + error_handler=error_handler, + compare_segments=compare_segments, ) try: parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index a48a92a6..78db9a06 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -7,7 +7,7 @@ from .usfm_token import UsfmToken, UsfmTokenType from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType -from .usfm_update_block_handler import UsfmUpdateBlockHandler +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerException PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info" @@ -118,7 +118,16 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: trg_tok_starts = [] prev_len = 0 for tok in trg_toks: - trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0)) + try: + index_of_trg_tok_in_sent = trg_sent.index( + tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0 + ) + except ValueError: + raise UsfmUpdateBlockHandlerException( + block, + f'No token "{tok}" found in text "{trg_sent}" at or beyond index {trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}. Is the versification correctly specified?', + ) + trg_tok_starts.append(index_of_trg_tok_in_sent) prev_len = len(tok) # Predict marker placements and get insertion order diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py index 5d63ed89..154b9e11 100644 --- a/machine/corpora/scripture_ref.py +++ b/machine/corpora/scripture_ref.py @@ -4,7 +4,7 @@ from typing import List, Optional from ..scripture.constants import ENGLISH_VERSIFICATION -from ..scripture.verse_ref import VerseRef, Versification +from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification from ..utils.comparable import Comparable from .scripture_element import ScriptureElement @@ -112,7 +112,7 @@ def compare_to(self, other: object, compare_segments: bool = True) -> int: def __eq__(self, other: object) -> bool: if not isinstance(other, ScriptureRef): return NotImplemented - return self.verse_ref == other.verse_ref and self.path == other.path + return self.compare_to(other, True) == 0 def __lt__(self, other: object) -> bool: if not isinstance(other, ScriptureRef): @@ -120,7 +120,8 @@ def __lt__(self, other: object) -> bool: return self.compare_to(other) < 0 def __hash__(self) -> int: - return hash((self.verse_ref, tuple(self.path))) + # Using to_relaxed() is necessary to maintain equality across relaxed refs, __eq__ properly handles relaxed ref comparison + return hash((self.verse_ref, tuple(self.to_relaxed().path))) def __repr__(self) -> str: result = str(self.verse_ref) @@ -129,4 +130,19 @@ def __repr__(self) -> str: return result +class IgnoreSegmentsScriptureRef(ScriptureRef): + def __eq__(self, other): + if not isinstance(other, ScriptureRef): + return NotImplemented + return self.compare_to(other, False) + + def __hash__(self): + return hash( + ( + IgnoreSegmentsVerseRef(self.verse_ref), + tuple(self.to_relaxed().path), + ) + ) + + EMPTY_SCRIPTURE_REF = ScriptureRef() diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 032fceb7..b715fee8 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -1,5 +1,7 @@ from enum import Enum, auto -from typing import Iterable, List, Optional, Sequence, Tuple, Union +from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union + +from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -10,7 +12,7 @@ from .usfm_tokenizer import UsfmTokenizer from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType -from .usfm_update_block_handler import UsfmUpdateBlockHandler +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerException class UpdateUsfmTextBehavior(Enum): @@ -24,6 +26,12 @@ class UpdateUsfmMarkerBehavior(Enum): STRIP = auto() +class _RowInfo: + def __init__(self, row_index: int): + self.row_index = row_index + self.is_consumed = False + + class UpdateUsfmRow: def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[dict[str, object]] = None): self.refs = refs @@ -43,9 +51,18 @@ def __init__( preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, remarks: Optional[Iterable[str]] = None, + error_handler: Optional[Callable[[UsfmUpdateBlockHandlerException], bool]] = None, + compare_segments: bool = False, ) -> None: super().__init__() self._rows = rows or [] + self._verse_rows: List[int] = [] + self._verse_row_index = 0 + self._verse_rows_map: Dict[VerseRef, List[_RowInfo]] = {} + if len(self._rows) > 0: + self._update_rows_versification: Versification = self._rows[0].refs[0].versification + else: + self._update_rows_versification = Versification.get_builtin("English") self._tokens: List[UsfmToken] = [] self._updated_text: List[UsfmToken] = [] self._update_block_stack: list[UsfmUpdateBlock] = [] @@ -65,6 +82,11 @@ def __init__( self._remarks = [] else: self._remarks = list(remarks) + if error_handler is None: + self._error_handler = lambda _: False + else: + self._error_handler = error_handler + self._compare_segments = compare_segments self._text_behavior = text_behavior self._paragraph_behavior = paragraph_behavior self._embed_behavior = embed_behavior @@ -82,6 +104,10 @@ def end_usfm(self, state: UsfmParserState) -> None: super().end_usfm(state) def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: + self._verse_rows_ref = state.verse_ref.copy() + self._update_verse_rows_map() + self._update_verse_rows() + self._collect_readonly_tokens(state) self._update_block_stack.append(UsfmUpdateBlock()) start_book_tokens: List[UsfmToken] = [] @@ -108,7 +134,7 @@ def start_para( ) -> None: if state.is_verse_text: # Only strip paragraph markers in a verse - if self._paragraph_behavior == UpdateUsfmMarkerBehavior.PRESERVE: + if self._paragraph_behavior == UpdateUsfmMarkerBehavior.PRESERVE and not self._duplicate_verse: self._collect_updatable_tokens(state) else: self._skip_updatable_tokens(state) @@ -148,6 +174,11 @@ def chapter( ) -> None: self._use_updated_text() + if self._verse_rows_ref != state.verse_ref: + self._verse_rows_ref = state.verse_ref.copy() + self._update_verse_rows_map() + self._update_verse_rows() + super().chapter(state, number, marker, alt_number, pub_number) self._collect_readonly_tokens(state) @@ -179,14 +210,23 @@ def verse( if last_paragraph is not None: last_paragraph.marked_for_removal = False - super().verse(state, number, marker, alt_number, pub_number) + if self._verse_rows_ref != state.verse_ref: + self._verse_rows_ref = state.verse_ref.copy() + self._update_verse_rows() - self._collect_readonly_tokens(state) + super().verse(state, number, marker, alt_number, pub_number) + if self._duplicate_verse: + self._skip_updatable_tokens(state) + else: + self._collect_readonly_tokens(state) def start_note(self, state: UsfmParserState, marker: str, caller: str, category: str) -> None: super().start_note(state, marker, caller, category) - self._collect_updatable_tokens(state) + if not self._duplicate_verse: + self._collect_updatable_tokens(state) + else: + self._skip_updatable_tokens(state) def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: if closed: @@ -219,15 +259,14 @@ def end_char( attributes: Sequence[UsfmAttribute], closed: bool, ) -> None: - if closed: - if self._current_text_type == ScriptureTextType.EMBED: - self._collect_updatable_tokens(state) + if self._current_text_type == ScriptureTextType.EMBED: + self._collect_updatable_tokens(state) + else: + self._replace_with_new_tokens(state) + if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP: + self._skip_updatable_tokens(state) else: - self._replace_with_new_tokens(state) - if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP: - self._skip_updatable_tokens(state) - else: - self._collect_updatable_tokens(state) + self._collect_updatable_tokens(state) super().end_char(state, marker, attributes, closed) @@ -242,7 +281,9 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> def text(self, state: UsfmParserState, text: str) -> None: super().text(state, text) - if self._replace_with_new_tokens(state): + if self._replace_with_new_tokens(state) or ( + self._duplicate_verse and self._current_text_type == ScriptureTextType.VERSE + ): self._skip_updatable_tokens(state) else: self._collect_updatable_tokens(state) @@ -292,11 +333,10 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: for remark in self._remarks: remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem")) remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark)) - if len(tokens) > 0 and tokens[0].marker == "id": - index = 1 - if len(tokens) > 1 and tokens[1].type == UsfmTokenType.TEXT: - index = 2 - while tokens[index].marker == "rem": + if len(tokens) > 0: + index = 0 + markers_to_skip = {"id", "ide", "rem"} + while tokens[index].marker in markers_to_skip: index += 1 if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT: index += 1 @@ -308,13 +348,15 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str] row_texts: List[str] = [] row_metadata = None source_index: int = 0 - while self._row_index < len(self._rows) and source_index < len(seg_scr_refs): + while self._verse_row_index < len(self._verse_rows) and source_index < len(seg_scr_refs): compare: int = 0 - row = self._rows[self._row_index] + row = self._rows[self._verse_rows[self._verse_row_index]] row_scr_refs, text, metadata = row.refs, row.text, row.metadata for row_scr_ref in row_scr_refs: while source_index < len(seg_scr_refs): - compare = row_scr_ref.compare_to(seg_scr_refs[source_index], compare_segments=False) + compare = row_scr_ref.compare_to( + seg_scr_refs[source_index], compare_segments=self._compare_segments + ) if compare > 0: # row is ahead of source, increment source source_index += 1 @@ -328,7 +370,7 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str] break if compare <= 0: # source is ahead of row, increment row - self._row_index += 1 + self._verse_row_index += 1 return row_texts, row_metadata def _collect_updatable_tokens(self, state: UsfmParserState) -> None: @@ -418,7 +460,13 @@ def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[Scr para_elems.append(update_block.pop()) for handler in self._update_block_handlers: - update_block = handler.process_block(update_block) + try: + update_block = handler.process_block(update_block) + except UsfmUpdateBlockHandlerException as e: + should_continue = self._error_handler(e) + if not should_continue: + raise + tokens = update_block.get_tokens() for elem in reversed(para_elems): tokens.extend(elem.get_tokens()) @@ -449,6 +497,41 @@ def _pop_new_tokens(self) -> None: def _is_in_preserved_paragraph(self, state: UsfmParserState) -> bool: return state.para_tag is not None and state.para_tag.marker in self._preserve_paragraph_styles + def _update_verse_rows_map(self) -> None: + self._verse_rows_map.clear() + while ( + self._row_index < len(self._rows) + and self._rows[self._row_index].refs[0].chapter_num == self._verse_rows_ref.chapter_num + ): + row = self._rows[self._row_index] + ri = _RowInfo(self._row_index) + for sr in row.refs: + vr = sr.verse_ref if self._compare_segments else IgnoreSegmentsVerseRef(sr.verse_ref) + if vr in self._verse_rows_map: + self._verse_rows_map[vr].append(ri) + else: + self._verse_rows_map[vr] = [ri] + self._row_index += 1 + + def _update_verse_rows(self) -> None: + vref = self._verse_rows_ref.copy() + # We are using a dictionary, which uses an equality comparer. As a result, we need to change the + # source verse ref to use the row versification. If we used a SortedList, it wouldn't be necessary, but it + # would be less efficient. + vref.change_versification(self._update_rows_versification) + + self._verse_rows.clear() + self._verse_row_index = 0 + + for vr in vref.all_verses(): + if not self._compare_segments: + vr = IgnoreSegmentsVerseRef(vr) + if rows := self._verse_rows_map.get(vr): + for row in rows: + if not row.is_consumed: + self._verse_rows.append(row.row_index) + row.is_consumed = True + def _is_nonverse_paragraph(state: UsfmParserState, element: UsfmUpdateBlockElement) -> bool: if element.type != UsfmUpdateBlockElementType.PARAGRAPH: diff --git a/machine/corpora/usfm_update_block_handler.py b/machine/corpora/usfm_update_block_handler.py index b06a31dd..eb936bc9 100644 --- a/machine/corpora/usfm_update_block_handler.py +++ b/machine/corpora/usfm_update_block_handler.py @@ -6,3 +6,13 @@ class UsfmUpdateBlockHandler(ABC): @abstractmethod def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ... + + +class UsfmUpdateBlockHandlerException(Exception): + def __init__(self, block: UsfmUpdateBlock, *args): + self._block = block + super().__init__(*args) + + @property + def block(self): + return self._block diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index df1c47ee..5f5b41e5 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -391,6 +391,11 @@ def exact_equals(self, other: object) -> bool: and self.versification == other.versification ) + def __eq__(self, other): + if not isinstance(other, VerseRef): + return NotImplemented + return self._compare_verses(other, True) == 0 + def __hash__(self) -> int: if self._verse is not None: return self.bbbcccvvv ^ hash(self._verse) @@ -574,6 +579,19 @@ def _get_verse_num(verse: Optional[str]) -> Tuple[bool, int]: return True, v_num +class IgnoreSegmentsVerseRef(VerseRef): + def __init__(self, verse_ref: VerseRef): + super().__init__(verse_ref.book, verse_ref.chapter, verse_ref.verse, verse_ref.versification) + + def __eq__(self, other): + if not isinstance(other, VerseRef): + return NotImplemented + return self._compare_verses(other, False) == 0 + + def __hash__(self) -> int: + return self.bbbcccvvv + + class VersificationType(IntEnum): UNKNOWN = 0 ORIGINAL = 1 diff --git a/tests/corpora/test_paratext_project_terms_parser.py b/tests/corpora/test_paratext_project_terms_parser.py index 3ba87c57..51be188e 100644 --- a/tests/corpora/test_paratext_project_terms_parser.py +++ b/tests/corpora/test_paratext_project_terms_parser.py @@ -3,7 +3,7 @@ from testutils.memory_paratext_project_terms_parser import MemoryParatextProjectTermsParser from machine.corpora import ParatextProjectSettings, ParatextProjectTermsParserBase, UsfmStylesheet -from machine.corpora.paratext_project_terms_parser_base import _get_glosses, _strip_parens +from machine.corpora.paratext_project_terms_parser_base import _get_glosses, _get_renderings, _strip_parens from machine.scripture import ORIGINAL_VERSIFICATION, Versification @@ -110,11 +110,16 @@ def test_strip_parens() -> None: def test_get_glosses() -> None: assert _get_glosses("") == [] - assert _get_glosses("*Abba* /") == ["Abba"] - assert _get_glosses("Abba|| ") == ["Abba"] - assert _get_glosses("Abba||Abbah?") == ["Abba", "Abbah"] assert _get_glosses("Abba (note)") == ["Abba"] - assert _get_glosses("Ahasuerus, Xerxes; Assuerus") == ["Ahasuerus", "Xerxes", "Assuerus"] + assert set(_get_glosses("Ahasuerus, Xerxes; Assuerus")) == set(["Assuerus", "Xerxes", "Ahasuerus"]) + + +def test_get_renderings() -> None: + assert _get_renderings("") == [] + assert _get_renderings("*Abba*") == ["Abba"] + assert _get_renderings("Abba|| ") == ["Abba"] + assert _get_renderings("Abba||Abbah") == ["Abba", "Abbah"] + assert _get_renderings("Abba (note)") == ["Abba"] class _TestEnvironment: diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index 910841bf..5f75c635 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -594,8 +594,8 @@ def test_verses_out_of_order() -> None: result = r"""\id MAT \c 1 \v 2 new verse 2 -\v 1 -\p +\v 1 new verse 1 +\p new paragraph 2 """ assess(target, result) diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 11bb4c03..a81920e8 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1214,38 +1214,129 @@ def test_header_reference_paragraphs() -> None: assert_usfm_equals(target, result) +def test_out_of_order_verses() -> None: + rows = [ + UpdateUsfmRow(scr_ref("MAT 1:1"), "new verse 1"), + UpdateUsfmRow(scr_ref("MAT 1:2"), "new verse 2"), + UpdateUsfmRow(scr_ref("MAT 1:3"), "new verse 3"), + UpdateUsfmRow(scr_ref("MAT 1:4"), "new verse 4"), + UpdateUsfmRow(scr_ref("MAT 1:5"), "new verse 5"), + UpdateUsfmRow(scr_ref("MAT 1:6a"), "new verse 6a"), + UpdateUsfmRow(scr_ref("MAT 1:6b"), "new verse 6b"), + UpdateUsfmRow(scr_ref("MAT 1:6b/1:s"), "new section"), + UpdateUsfmRow(scr_ref("MAT 1:7"), "new verse 7"), + UpdateUsfmRow(scr_ref("MAT 1:8"), "new verse 8"), + ] + usfm = r"""\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 verse 1 +\v 2 verse 2 +\v 3 verse 3 +\v 6b verse 6b +\s section +\v 7 verse 7 +\v 8 verse 8 +\v 4 verse 4 +\v 5 verse 5 +\v 6a verse 6a +""" + + target = update_usfm(rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, compare_segments=True) + result = r"""\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 new verse 1 +\v 2 new verse 2 +\v 3 new verse 3 +\v 6b new verse 6b +\s new section +\v 7 new verse 7 +\v 8 new verse 8 +\v 4 new verse 4 +\v 5 new verse 5 +\v 6a new verse 6a +""" + assert_usfm_equals(target, result) + + +def test_duplicate_verses() -> None: + rows = [ + UpdateUsfmRow(scr_ref("MAT 1:1"), "new verse 1"), + UpdateUsfmRow(scr_ref("MAT 1:2"), "new verse 2"), + UpdateUsfmRow(scr_ref("MAT 1:3"), "new verse 3"), + UpdateUsfmRow(scr_ref("MAT 1:4"), "new verse 4"), + ] + usfm = r"""\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 verse 1 +\v 2 verse 2 +\v 3 verse 3 +\v 3 another verse 3\f \fr 1.3 \ft Some duplicate verse three note \f* 1 +\p more verse three +\v 4 verse 4 +""" + + target = update_usfm(rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP) + result = r"""\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 new verse 1 +\v 2 new verse 2 +\v 3 new verse 3 +\v 4 new verse 4 +""" + assert_usfm_equals(target, result) + + def test_pass_remark(): rows = [ UpdateUsfmRow( scr_ref("MAT 1:1"), - str("Update 1"), + "Update 1", + ), + UpdateUsfmRow( + scr_ref("MAT 1:2"), + "Update 2", ), ] - usfm = r"""\id MAT + usfm = r"""\id MAT - Test +\ide UTF-8 \rem Existing remark \c 1 -\v 1 This is a verse +\v 1 Some text +\v 2 +\v 3 Other text """ - target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING, remarks=["An added remark"]) - result = r"""\id MAT + target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"]) + result = r"""\id MAT - Test +\ide UTF-8 \rem Existing remark -\rem An added remark +\rem New remark \c 1 -\v 1 Update 1 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text """ assert_usfm_equals(target, result) - target = update_usfm( - rows, target, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING, remarks=["Another added remark"] - ) - result = r"""\id MAT + target = update_usfm(rows, target, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark 2"]) + result = r"""\id MAT - Test +\ide UTF-8 \rem Existing remark -\rem An added remark -\rem Another added remark +\rem New remark +\rem New remark 2 \c 1 -\v 1 Update 1 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text """ assert_usfm_equals(target, result) @@ -1266,6 +1357,7 @@ def update_usfm( preserve_paragraph_styles: Optional[Iterable[str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, remarks: Optional[Iterable[str]] = None, + compare_segments: bool = False, ) -> Optional[str]: if source is None: updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH) @@ -1280,6 +1372,8 @@ def update_usfm( preserve_paragraph_styles, update_block_handlers, remarks, + lambda _: False, + compare_segments, ) else: source = source.strip().replace("\r\n", "\n") + "\r\n" @@ -1293,6 +1387,8 @@ def update_usfm( preserve_paragraph_styles, update_block_handlers, remarks, + lambda _: False, + compare_segments, ) parse_usfm(source, updater) return updater.get_usfm() From 89a93d56f1da7f1141062166eeeddde712c89d02 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 2 Oct 2025 14:32:10 -0400 Subject: [PATCH 2/6] Fix formatting --- machine/corpora/place_markers_usfm_update_block_handler.py | 4 +++- machine/corpora/scripture_ref.py | 3 ++- machine/corpora/update_usfm_parser_handler.py | 1 - 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index 78db9a06..855e197f 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -125,7 +125,9 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: except ValueError: raise UsfmUpdateBlockHandlerException( block, - f'No token "{tok}" found in text "{trg_sent}" at or beyond index {trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}. Is the versification correctly specified?', + f'No token "{tok}" found in text "{trg_sent}" at or beyond index' + f"{trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}." + "Is the versification correctly specified?", ) trg_tok_starts.append(index_of_trg_tok_in_sent) prev_len = len(tok) diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py index 154b9e11..ed479089 100644 --- a/machine/corpora/scripture_ref.py +++ b/machine/corpora/scripture_ref.py @@ -120,7 +120,8 @@ def __lt__(self, other: object) -> bool: return self.compare_to(other) < 0 def __hash__(self) -> int: - # Using to_relaxed() is necessary to maintain equality across relaxed refs, __eq__ properly handles relaxed ref comparison + # Using to_relaxed() is necessary to maintain equality across relaxed refs, + # __eq__ properly handles relaxed ref comparison return hash((self.verse_ref, tuple(self.to_relaxed().path))) def __repr__(self) -> str: diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index b715fee8..38a87c32 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -2,7 +2,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification - from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType from .usfm_parser_state import UsfmParserState From 9618d8f8ac2ea1532c2604fc3023f5dbfd5b2490 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 2 Oct 2025 14:49:07 -0400 Subject: [PATCH 3/6] Fix IgnoreSegmentsVerseRef --- machine/corpora/update_usfm_parser_handler.py | 1 + machine/scripture/verse_ref.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 38a87c32..6f8b873a 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -58,6 +58,7 @@ def __init__( self._verse_rows: List[int] = [] self._verse_row_index = 0 self._verse_rows_map: Dict[VerseRef, List[_RowInfo]] = {} + self._verse_rows_ref = VerseRef() if len(self._rows) > 0: self._update_rows_versification: Versification = self._rows[0].refs[0].versification else: diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index 5f5b41e5..2422163d 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -581,7 +581,7 @@ def _get_verse_num(verse: Optional[str]) -> Tuple[bool, int]: class IgnoreSegmentsVerseRef(VerseRef): def __init__(self, verse_ref: VerseRef): - super().__init__(verse_ref.book, verse_ref.chapter, verse_ref.verse, verse_ref.versification) + super().__init__(verse_ref.book_num, verse_ref.chapter_num, verse_ref.verse_num, verse_ref.versification) def __eq__(self, other): if not isinstance(other, VerseRef): From eca9fb54ba37163691ac660a9416032a7707d629 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 3 Oct 2025 14:48:13 -0400 Subject: [PATCH 4/6] Update exception name to UsfmUpdateBlockHandlerError --- machine/corpora/paratext_project_text_updater_base.py | 4 ++-- machine/corpora/place_markers_usfm_update_block_handler.py | 4 ++-- machine/corpora/update_usfm_parser_handler.py | 6 +++--- machine/corpora/usfm_update_block_handler.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 81f2c5d1..14c57305 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -11,7 +11,7 @@ UpdateUsfmTextBehavior, ) from .usfm_parser import parse_usfm -from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerException +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError class ParatextProjectTextUpdaterBase(ABC): @@ -33,7 +33,7 @@ def update_usfm( preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, remarks: Optional[Iterable[str]] = None, - error_handler: Optional[Callable[[UsfmUpdateBlockHandlerException], bool]] = None, + error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None, compare_segments: bool = False, ) -> Optional[str]: file_name: str = self._settings.get_book_file_name(book_id) diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index 855e197f..e2dd4a1b 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -7,7 +7,7 @@ from .usfm_token import UsfmToken, UsfmTokenType from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType -from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerException +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info" @@ -123,7 +123,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0 ) except ValueError: - raise UsfmUpdateBlockHandlerException( + raise UsfmUpdateBlockHandlerError( block, f'No token "{tok}" found in text "{trg_sent}" at or beyond index' f"{trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}." diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 6f8b873a..71617bc5 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -11,7 +11,7 @@ from .usfm_tokenizer import UsfmTokenizer from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType -from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerException +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError class UpdateUsfmTextBehavior(Enum): @@ -50,7 +50,7 @@ def __init__( preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, remarks: Optional[Iterable[str]] = None, - error_handler: Optional[Callable[[UsfmUpdateBlockHandlerException], bool]] = None, + error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None, compare_segments: bool = False, ) -> None: super().__init__() @@ -462,7 +462,7 @@ def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[Scr for handler in self._update_block_handlers: try: update_block = handler.process_block(update_block) - except UsfmUpdateBlockHandlerException as e: + except UsfmUpdateBlockHandlerError as e: should_continue = self._error_handler(e) if not should_continue: raise diff --git a/machine/corpora/usfm_update_block_handler.py b/machine/corpora/usfm_update_block_handler.py index eb936bc9..9df2e210 100644 --- a/machine/corpora/usfm_update_block_handler.py +++ b/machine/corpora/usfm_update_block_handler.py @@ -8,7 +8,7 @@ class UsfmUpdateBlockHandler(ABC): def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ... -class UsfmUpdateBlockHandlerException(Exception): +class UsfmUpdateBlockHandlerError(Exception): def __init__(self, block: UsfmUpdateBlock, *args): self._block = block super().__init__(*args) From 512a3d4c7d84da8d6af662c73a491d0697c95a7f Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 6 Oct 2025 13:19:28 -0400 Subject: [PATCH 5/6] Remove scripture ref 'comparer' --- machine/corpora/scripture_ref.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py index ed479089..408892ef 100644 --- a/machine/corpora/scripture_ref.py +++ b/machine/corpora/scripture_ref.py @@ -112,7 +112,7 @@ def compare_to(self, other: object, compare_segments: bool = True) -> int: def __eq__(self, other: object) -> bool: if not isinstance(other, ScriptureRef): return NotImplemented - return self.compare_to(other, True) == 0 + return self.verse_ref == other.verse_ref and self.path == other.path def __lt__(self, other: object) -> bool: if not isinstance(other, ScriptureRef): @@ -120,9 +120,7 @@ def __lt__(self, other: object) -> bool: return self.compare_to(other) < 0 def __hash__(self) -> int: - # Using to_relaxed() is necessary to maintain equality across relaxed refs, - # __eq__ properly handles relaxed ref comparison - return hash((self.verse_ref, tuple(self.to_relaxed().path))) + return hash((self.verse_ref, tuple(self.path))) def __repr__(self) -> str: result = str(self.verse_ref) @@ -131,19 +129,4 @@ def __repr__(self) -> str: return result -class IgnoreSegmentsScriptureRef(ScriptureRef): - def __eq__(self, other): - if not isinstance(other, ScriptureRef): - return NotImplemented - return self.compare_to(other, False) - - def __hash__(self): - return hash( - ( - IgnoreSegmentsVerseRef(self.verse_ref), - tuple(self.to_relaxed().path), - ) - ) - - EMPTY_SCRIPTURE_REF = ScriptureRef() From 2c3ec99371cfad302a47cab74c0ac7b278bae3a8 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 6 Oct 2025 13:25:38 -0400 Subject: [PATCH 6/6] Remove unused import --- machine/corpora/scripture_ref.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py index 408892ef..5d63ed89 100644 --- a/machine/corpora/scripture_ref.py +++ b/machine/corpora/scripture_ref.py @@ -4,7 +4,7 @@ from typing import List, Optional from ..scripture.constants import ENGLISH_VERSIFICATION -from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification +from ..scripture.verse_ref import VerseRef, Versification from ..utils.comparable import Comparable from .scripture_element import ScriptureElement