diff --git a/machine/corpora/paratext_project_terms_parser_base.py b/machine/corpora/paratext_project_terms_parser_base.py index 3245c95..678b6a9 100644 --- a/machine/corpora/paratext_project_terms_parser_base.py +++ b/machine/corpora/paratext_project_terms_parser_base.py @@ -70,10 +70,14 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) - id = term.attrib["Id"] if _is_in_category(id, term_categories, term_id_to_category_dict): id_ = id.replace("\n", " ") - renderings = term.find("Renderings") - gloss = renderings.text if renderings is not None and renderings.text is not None else "" - glosses = _get_glosses(gloss) - terms_renderings[id_].extend(glosses) + renderings_element = term.find("Renderings") + rendering_text = ( + renderings_element.text + if renderings_element is not None and renderings_element.text is not None + else "" + ) + renderings = _get_renderings(rendering_text) + terms_renderings[id_].extend(renderings) terms_glosses: Dict[str, List[str]] = defaultdict(list) if terms_glosses_doc is not None and use_term_glosses: @@ -102,25 +106,33 @@ def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category return not term_categories or (category is not None and category in term_categories) +def _clean_term(term: str): + term = term.strip() + term = _strip_parens(term) + term = " ".join(term.split()) + return term + + def _get_glosses(gloss: str) -> List[str]: match = _CONTENT_IN_BRACKETS_REGEX.match(gloss) if match: - gloss = match.group(0) - gloss = gloss.replace("?", "") - gloss = gloss.replace("*", "") - gloss = gloss.replace("/", " ") - gloss = gloss.strip() - gloss = _strip_parens(gloss) + gloss = match.group(1) + gloss = _clean_term(gloss) gloss = _strip_parens(gloss, left="[", right="]") gloss = gloss.strip() for match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss): gloss = gloss.replace(match.group(0), "") - glosses = re.split(r"\|\|", gloss) - glosses = [re.split(r"[,;]", g) for g in glosses] - 
glosses = [item.strip() for sublist in glosses for item in sublist if item.strip()] + glosses = re.split(r"[,;/]", gloss) + glosses = list(dict.fromkeys(g.strip() for g in glosses if g.strip())) return glosses +def _get_renderings(rendering: str) -> List[str]: + renderings = re.split(r"\|\|", rendering.strip()) + renderings = [_clean_term(r.replace("*", "")) for r in renderings] + return [r for r in renderings if r] + + def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str: parens: int = 0 end: int = -1 diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index c29d1ef..14c5730 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Iterable, Optional, Sequence, Union +from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings @@ -11,7 +11,7 @@ UpdateUsfmTextBehavior, ) from .usfm_parser import parse_usfm -from .usfm_update_block_handler import UsfmUpdateBlockHandler +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError class ParatextProjectTextUpdaterBase(ABC): @@ -33,6 +33,8 @@ def update_usfm( preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, remarks: Optional[Iterable[str]] = None, + error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None, + compare_segments: bool = False, ) -> Optional[str]: file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): @@ -49,6 +51,8 @@ def update_usfm( preserve_paragraph_styles, update_block_handlers=update_block_handlers, 
remarks=remarks, + error_handler=error_handler, + compare_segments=compare_segments, ) try: parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index a48a92a..e2dd4a1 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -7,7 +7,7 @@ from .usfm_token import UsfmToken, UsfmTokenType from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType -from .usfm_update_block_handler import UsfmUpdateBlockHandler +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info" @@ -118,7 +118,18 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: trg_tok_starts = [] prev_len = 0 for tok in trg_toks: - trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0)) + try: + index_of_trg_tok_in_sent = trg_sent.index( + tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0 + ) + except ValueError: + raise UsfmUpdateBlockHandlerError( + block, + f'No token "{tok}" found in text "{trg_sent}" at or beyond index ' + f"{trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}." 
+ " Is the versification correctly specified?", + ) + trg_tok_starts.append(index_of_trg_tok_in_sent) prev_len = len(tok) # Predict marker placements and get insertion order diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 032fceb..71617bc 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -1,6 +1,7 @@ from enum import Enum, auto -from typing import Iterable, List, Optional, Sequence, Tuple, Union +from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType from .usfm_parser_state import UsfmParserState @@ -10,7 +11,7 @@ from .usfm_tokenizer import UsfmTokenizer from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType -from .usfm_update_block_handler import UsfmUpdateBlockHandler +from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError class UpdateUsfmTextBehavior(Enum): @@ -24,6 +25,12 @@ class UpdateUsfmMarkerBehavior(Enum): STRIP = auto() +class _RowInfo: + def __init__(self, row_index: int): + self.row_index = row_index + self.is_consumed = False + + class UpdateUsfmRow: def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[dict[str, object]] = None): self.refs = refs @@ -43,9 +50,19 @@ def __init__( preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, remarks: Optional[Iterable[str]] = None, + error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None, + compare_segments: bool = False, ) -> None: super().__init__() self._rows = rows or [] + 
self._verse_rows: List[int] = [] + self._verse_row_index = 0 + self._verse_rows_map: Dict[VerseRef, List[_RowInfo]] = {} + self._verse_rows_ref = VerseRef() + if len(self._rows) > 0: + self._update_rows_versification: Versification = self._rows[0].refs[0].versification + else: + self._update_rows_versification = Versification.get_builtin("English") self._tokens: List[UsfmToken] = [] self._updated_text: List[UsfmToken] = [] self._update_block_stack: list[UsfmUpdateBlock] = [] @@ -65,6 +82,11 @@ def __init__( self._remarks = [] else: self._remarks = list(remarks) + if error_handler is None: + self._error_handler = lambda _: False + else: + self._error_handler = error_handler + self._compare_segments = compare_segments self._text_behavior = text_behavior self._paragraph_behavior = paragraph_behavior self._embed_behavior = embed_behavior @@ -82,6 +104,10 @@ def end_usfm(self, state: UsfmParserState) -> None: super().end_usfm(state) def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: + self._verse_rows_ref = state.verse_ref.copy() + self._update_verse_rows_map() + self._update_verse_rows() + self._collect_readonly_tokens(state) self._update_block_stack.append(UsfmUpdateBlock()) start_book_tokens: List[UsfmToken] = [] @@ -108,7 +134,7 @@ def start_para( ) -> None: if state.is_verse_text: # Only strip paragraph markers in a verse - if self._paragraph_behavior == UpdateUsfmMarkerBehavior.PRESERVE: + if self._paragraph_behavior == UpdateUsfmMarkerBehavior.PRESERVE and not self._duplicate_verse: self._collect_updatable_tokens(state) else: self._skip_updatable_tokens(state) @@ -148,6 +174,11 @@ def chapter( ) -> None: self._use_updated_text() + if self._verse_rows_ref != state.verse_ref: + self._verse_rows_ref = state.verse_ref.copy() + self._update_verse_rows_map() + self._update_verse_rows() + super().chapter(state, number, marker, alt_number, pub_number) self._collect_readonly_tokens(state) @@ -179,14 +210,23 @@ def verse( if last_paragraph is not 
None: last_paragraph.marked_for_removal = False - super().verse(state, number, marker, alt_number, pub_number) + if self._verse_rows_ref != state.verse_ref: + self._verse_rows_ref = state.verse_ref.copy() + self._update_verse_rows() - self._collect_readonly_tokens(state) + super().verse(state, number, marker, alt_number, pub_number) + if self._duplicate_verse: + self._skip_updatable_tokens(state) + else: + self._collect_readonly_tokens(state) def start_note(self, state: UsfmParserState, marker: str, caller: str, category: str) -> None: super().start_note(state, marker, caller, category) - self._collect_updatable_tokens(state) + if not self._duplicate_verse: + self._collect_updatable_tokens(state) + else: + self._skip_updatable_tokens(state) def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: if closed: @@ -219,15 +259,14 @@ def end_char( attributes: Sequence[UsfmAttribute], closed: bool, ) -> None: - if closed: - if self._current_text_type == ScriptureTextType.EMBED: - self._collect_updatable_tokens(state) + if self._current_text_type == ScriptureTextType.EMBED: + self._collect_updatable_tokens(state) + else: + self._replace_with_new_tokens(state) + if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP: + self._skip_updatable_tokens(state) else: - self._replace_with_new_tokens(state) - if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP: - self._skip_updatable_tokens(state) - else: - self._collect_updatable_tokens(state) + self._collect_updatable_tokens(state) super().end_char(state, marker, attributes, closed) @@ -242,7 +281,9 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> def text(self, state: UsfmParserState, text: str) -> None: super().text(state, text) - if self._replace_with_new_tokens(state): + if self._replace_with_new_tokens(state) or ( + self._duplicate_verse and self._current_text_type == ScriptureTextType.VERSE + ): self._skip_updatable_tokens(state) else: 
self._collect_updatable_tokens(state) @@ -292,11 +333,10 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: for remark in self._remarks: remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem")) remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark)) - if len(tokens) > 0 and tokens[0].marker == "id": - index = 1 - if len(tokens) > 1 and tokens[1].type == UsfmTokenType.TEXT: - index = 2 - while tokens[index].marker == "rem": + if len(tokens) > 0: + index = 0 + markers_to_skip = {"id", "ide", "rem"} + while index < len(tokens) and tokens[index].marker in markers_to_skip: index += 1 if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT: index += 1 @@ -308,13 +348,15 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str] row_texts: List[str] = [] row_metadata = None source_index: int = 0 - while self._row_index < len(self._rows) and source_index < len(seg_scr_refs): + while self._verse_row_index < len(self._verse_rows) and source_index < len(seg_scr_refs): compare: int = 0 - row = self._rows[self._row_index] + row = self._rows[self._verse_rows[self._verse_row_index]] row_scr_refs, text, metadata = row.refs, row.text, row.metadata for row_scr_ref in row_scr_refs: while source_index < len(seg_scr_refs): - compare = row_scr_ref.compare_to(seg_scr_refs[source_index], compare_segments=False) + compare = row_scr_ref.compare_to( + seg_scr_refs[source_index], compare_segments=self._compare_segments + ) if compare > 0: # row is ahead of source, increment source source_index += 1 @@ -328,7 +370,7 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str] break if compare <= 0: # source is ahead of row, increment row - self._row_index += 1 + self._verse_row_index += 1 return row_texts, row_metadata def _collect_updatable_tokens(self, state: UsfmParserState) -> None: @@ -418,7 +460,13 @@ def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[Scr 
para_elems.append(update_block.pop()) for handler in self._update_block_handlers: - update_block = handler.process_block(update_block) + try: + update_block = handler.process_block(update_block) + except UsfmUpdateBlockHandlerError as e: + should_continue = self._error_handler(e) + if not should_continue: + raise + tokens = update_block.get_tokens() for elem in reversed(para_elems): tokens.extend(elem.get_tokens()) @@ -449,6 +497,41 @@ def _pop_new_tokens(self) -> None: def _is_in_preserved_paragraph(self, state: UsfmParserState) -> bool: return state.para_tag is not None and state.para_tag.marker in self._preserve_paragraph_styles + def _update_verse_rows_map(self) -> None: + self._verse_rows_map.clear() + while ( + self._row_index < len(self._rows) + and self._rows[self._row_index].refs[0].chapter_num == self._verse_rows_ref.chapter_num + ): + row = self._rows[self._row_index] + ri = _RowInfo(self._row_index) + for sr in row.refs: + vr = sr.verse_ref if self._compare_segments else IgnoreSegmentsVerseRef(sr.verse_ref) + if vr in self._verse_rows_map: + self._verse_rows_map[vr].append(ri) + else: + self._verse_rows_map[vr] = [ri] + self._row_index += 1 + + def _update_verse_rows(self) -> None: + vref = self._verse_rows_ref.copy() + # We are using a dictionary, which uses an equality comparer. As a result, we need to change the + # source verse ref to use the row versification. If we used a SortedList, it wouldn't be necessary, but it + # would be less efficient. 
+ vref.change_versification(self._update_rows_versification) + + self._verse_rows.clear() + self._verse_row_index = 0 + + for vr in vref.all_verses(): + if not self._compare_segments: + vr = IgnoreSegmentsVerseRef(vr) + if rows := self._verse_rows_map.get(vr): + for row in rows: + if not row.is_consumed: + self._verse_rows.append(row.row_index) + row.is_consumed = True + def _is_nonverse_paragraph(state: UsfmParserState, element: UsfmUpdateBlockElement) -> bool: if element.type != UsfmUpdateBlockElementType.PARAGRAPH: diff --git a/machine/corpora/usfm_update_block_handler.py b/machine/corpora/usfm_update_block_handler.py index b06a31d..9df2e21 100644 --- a/machine/corpora/usfm_update_block_handler.py +++ b/machine/corpora/usfm_update_block_handler.py @@ -6,3 +6,13 @@ class UsfmUpdateBlockHandler(ABC): @abstractmethod def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ... + + +class UsfmUpdateBlockHandlerError(Exception): + def __init__(self, block: UsfmUpdateBlock, *args): + self._block = block + super().__init__(*args) + + @property + def block(self): + return self._block diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index df1c47e..2422163 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -391,6 +391,11 @@ def exact_equals(self, other: object) -> bool: and self.versification == other.versification ) + def __eq__(self, other): + if not isinstance(other, VerseRef): + return NotImplemented + return self._compare_verses(other, True) == 0 + def __hash__(self) -> int: if self._verse is not None: return self.bbbcccvvv ^ hash(self._verse) @@ -574,6 +579,19 @@ def _get_verse_num(verse: Optional[str]) -> Tuple[bool, int]: return True, v_num +class IgnoreSegmentsVerseRef(VerseRef): + def __init__(self, verse_ref: VerseRef): + super().__init__(verse_ref.book_num, verse_ref.chapter_num, verse_ref.verse_num, verse_ref.versification) + + def __eq__(self, other): + if not isinstance(other, 
VerseRef): + return NotImplemented + return self._compare_verses(other, False) == 0 + + def __hash__(self) -> int: + return self.bbbcccvvv + + class VersificationType(IntEnum): UNKNOWN = 0 ORIGINAL = 1 diff --git a/tests/corpora/test_paratext_project_terms_parser.py b/tests/corpora/test_paratext_project_terms_parser.py index 3ba87c5..51be188 100644 --- a/tests/corpora/test_paratext_project_terms_parser.py +++ b/tests/corpora/test_paratext_project_terms_parser.py @@ -3,7 +3,7 @@ from testutils.memory_paratext_project_terms_parser import MemoryParatextProjectTermsParser from machine.corpora import ParatextProjectSettings, ParatextProjectTermsParserBase, UsfmStylesheet -from machine.corpora.paratext_project_terms_parser_base import _get_glosses, _strip_parens +from machine.corpora.paratext_project_terms_parser_base import _get_glosses, _get_renderings, _strip_parens from machine.scripture import ORIGINAL_VERSIFICATION, Versification @@ -110,11 +110,16 @@ def test_strip_parens() -> None: def test_get_glosses() -> None: assert _get_glosses("") == [] - assert _get_glosses("*Abba* /") == ["Abba"] - assert _get_glosses("Abba|| ") == ["Abba"] - assert _get_glosses("Abba||Abbah?") == ["Abba", "Abbah"] assert _get_glosses("Abba (note)") == ["Abba"] - assert _get_glosses("Ahasuerus, Xerxes; Assuerus") == ["Ahasuerus", "Xerxes", "Assuerus"] + assert set(_get_glosses("Ahasuerus, Xerxes; Assuerus")) == set(["Assuerus", "Xerxes", "Ahasuerus"]) + + +def test_get_renderings() -> None: + assert _get_renderings("") == [] + assert _get_renderings("*Abba*") == ["Abba"] + assert _get_renderings("Abba|| ") == ["Abba"] + assert _get_renderings("Abba||Abbah") == ["Abba", "Abbah"] + assert _get_renderings("Abba (note)") == ["Abba"] class _TestEnvironment: diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index 910841b..5f75c63 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ 
b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -594,8 +594,8 @@ def test_verses_out_of_order() -> None: result = r"""\id MAT \c 1 \v 2 new verse 2 -\v 1 -\p +\v 1 new verse 1 +\p new paragraph 2 """ assess(target, result) diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 11bb4c0..a81920e 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1214,38 +1214,129 @@ def test_header_reference_paragraphs() -> None: assert_usfm_equals(target, result) +def test_out_of_order_verses() -> None: + rows = [ + UpdateUsfmRow(scr_ref("MAT 1:1"), "new verse 1"), + UpdateUsfmRow(scr_ref("MAT 1:2"), "new verse 2"), + UpdateUsfmRow(scr_ref("MAT 1:3"), "new verse 3"), + UpdateUsfmRow(scr_ref("MAT 1:4"), "new verse 4"), + UpdateUsfmRow(scr_ref("MAT 1:5"), "new verse 5"), + UpdateUsfmRow(scr_ref("MAT 1:6a"), "new verse 6a"), + UpdateUsfmRow(scr_ref("MAT 1:6b"), "new verse 6b"), + UpdateUsfmRow(scr_ref("MAT 1:6b/1:s"), "new section"), + UpdateUsfmRow(scr_ref("MAT 1:7"), "new verse 7"), + UpdateUsfmRow(scr_ref("MAT 1:8"), "new verse 8"), + ] + usfm = r"""\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 verse 1 +\v 2 verse 2 +\v 3 verse 3 +\v 6b verse 6b +\s section +\v 7 verse 7 +\v 8 verse 8 +\v 4 verse 4 +\v 5 verse 5 +\v 6a verse 6a +""" + + target = update_usfm(rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, compare_segments=True) + result = r"""\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 new verse 1 +\v 2 new verse 2 +\v 3 new verse 3 +\v 6b new verse 6b +\s new section +\v 7 new verse 7 +\v 8 new verse 8 +\v 4 new verse 4 +\v 5 new verse 5 +\v 6a new verse 6a +""" + assert_usfm_equals(target, result) + + +def test_duplicate_verses() -> None: + rows = [ + UpdateUsfmRow(scr_ref("MAT 1:1"), "new verse 1"), + UpdateUsfmRow(scr_ref("MAT 1:2"), "new verse 2"), + UpdateUsfmRow(scr_ref("MAT 1:3"), "new 
verse 3"), + UpdateUsfmRow(scr_ref("MAT 1:4"), "new verse 4"), + ] + usfm = r"""\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 verse 1 +\v 2 verse 2 +\v 3 verse 3 +\v 3 another verse 3\f \fr 1.3 \ft Some duplicate verse three note \f* 1 +\p more verse three +\v 4 verse 4 +""" + + target = update_usfm(rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP) + result = r"""\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 new verse 1 +\v 2 new verse 2 +\v 3 new verse 3 +\v 4 new verse 4 +""" + assert_usfm_equals(target, result) + + def test_pass_remark(): rows = [ UpdateUsfmRow( scr_ref("MAT 1:1"), - str("Update 1"), + "Update 1", + ), + UpdateUsfmRow( + scr_ref("MAT 1:2"), + "Update 2", ), ] - usfm = r"""\id MAT + usfm = r"""\id MAT - Test +\ide UTF-8 \rem Existing remark \c 1 -\v 1 This is a verse +\v 1 Some text +\v 2 +\v 3 Other text """ - target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING, remarks=["An added remark"]) - result = r"""\id MAT + target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"]) + result = r"""\id MAT - Test +\ide UTF-8 \rem Existing remark -\rem An added remark +\rem New remark \c 1 -\v 1 Update 1 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text """ assert_usfm_equals(target, result) - target = update_usfm( - rows, target, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING, remarks=["Another added remark"] - ) - result = r"""\id MAT + target = update_usfm(rows, target, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark 2"]) + result = r"""\id MAT - Test +\ide UTF-8 \rem Existing remark -\rem An added remark -\rem Another added remark +\rem New remark +\rem New remark 2 \c 1 -\v 1 Update 1 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text """ assert_usfm_equals(target, result) @@ -1266,6 +1357,7 @@ def update_usfm( preserve_paragraph_styles: Optional[Iterable[str]] = None, update_block_handlers: 
Optional[Iterable[UsfmUpdateBlockHandler]] = None, remarks: Optional[Iterable[str]] = None, + compare_segments: bool = False, ) -> Optional[str]: if source is None: updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH) @@ -1280,6 +1372,8 @@ def update_usfm( preserve_paragraph_styles, update_block_handlers, remarks, + lambda _: False, + compare_segments, ) else: source = source.strip().replace("\r\n", "\n") + "\r\n" @@ -1293,6 +1387,8 @@ def update_usfm( preserve_paragraph_styles, update_block_handlers, remarks, + lambda _: False, + compare_segments, ) parse_usfm(source, updater) return updater.get_usfm()