diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index cc0888d8..8e7c95b4 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -100,8 +100,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: adj_src_toks.append(src_tok_idx) trg_tok_starts = [] + prev_len = 0 for tok in trg_toks: - trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + 1 if len(trg_tok_starts) > 0 else 0)) + trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0)) + prev_len = len(tok) # Predict marker placements and get insertion order to_insert = [] diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index 569597c8..359c4e14 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -448,6 +448,36 @@ def test_no_text() -> None: assess(target, result) +def test_consecutive_substring() -> None: + rows = [(scr_ref("MAT 1:1"), "string ring")] + usfm = r"""\id MAT +\c 1 +\v 1 string +\p ring +""" + + align_info = [ + PlaceMarkersAlignmentInfo( + refs=["MAT 1:1"], + source_tokens=["string", "ring"], + translation_tokens=["string", "ring"], + alignment=to_word_alignment_matrix("0-0 1-1"), + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 string +\p ring +""" + assess(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs]