From cf993b4d6ae302ccc043ec8e25223988433fa71e Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Thu, 8 May 2025 23:45:24 -0400
Subject: [PATCH 1/6] Marker placement update block handler

---
 ...place_markers_usfm_update_block_handler.py | 259 ++++++++++
 ...place_markers_usfm_update_block_handler.py | 483 ++++++++++++++++++
 2 files changed, 742 insertions(+)
 create mode 100644 machine/corpora/place_markers_usfm_update_block_handler.py
 create mode 100644 tests/corpora/test_place_markers_usfm_update_block_handler.py

diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py
new file mode 100644
index 00000000..514a3022
--- /dev/null
+++ b/machine/corpora/place_markers_usfm_update_block_handler.py
@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+from typing import List, Sequence
+
+from ..jobs.eflomal_aligner import to_word_alignment_matrix
+from ..jobs.translation_file_service import PretranslationInfo
+from ..scripture.verse_ref import VerseRef
+from ..tokenization import LatinWordTokenizer
+from ..translation import WordAlignmentMatrix
+from .aligned_word_pair import AlignedWordPair
+from .usfm_stylesheet import UsfmStylesheet
+from .usfm_tag import UsfmTextType
+from .usfm_token import UsfmToken, UsfmTokenType
+from .usfm_update_block import UsfmUpdateBlock
+from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
+from .usfm_update_block_handler import UsfmUpdateBlockHandler
+
+TOKENIZER = LatinWordTokenizer()
+STYLESHEET = UsfmStylesheet("usfm.sty")
+
+
+class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
+
+    def __init__(self, pt_info: Sequence[PretranslationInfo]):
+        self._pt_info = {}
+        for info in pt_info:
+            if len(info["refs"]) == 1:
+                ref_str = info["refs"][0]
+            else:
+                ref_str_start = VerseRef.from_string(info["refs"][0])
+                ref_str_end = VerseRef.from_string(info["refs"][-1])
+                ref_str = str(VerseRef.from_range(ref_str_start, ref_str_end))
+            self._pt_info[ref_str] = info
+        # self._pt_info = {info["refs"][0]: info for info in pt_info}
+
+    def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
+        block_ref = str(
+            block.refs[0]
+            if len(block.refs) == 1
+            else VerseRef.from_range(block.refs[0].verse_ref, block.refs[-1].verse_ref)
+        )
+
+        # Nothing to do if there are no markers to place, no alignment to use, or if the block represents an embed
+        if (
+            len(block.elements) == 0
+            or block_ref not in self._pt_info.keys()
+            or len(self._pt_info[block_ref]["alignment"]) == 0
+            # TODO: is this too restrictive?
+            or block.elements[0].tokens[0].marker != "v"
+            or not any(
+                (
+                    element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
+                    and not element.marked_for_removal
+                )
+                for element in block.elements[1:]  # TODO: all of block
+            )
+        ):
+            return block
+
+        # Work on a copy in case the block needs to be returned unchanged
+        orig_elements = list(block.elements)
+
+        src_sent = ""
+        trg_sent = ""
+        to_place = []
+        src_marker_idxs = []
+        placed_elements = [orig_elements[0]]  # TODO: no elements to start
+        ignored_elements = []
+
+        # Section headers should be ignored but re-inserted in the same position relative to other paragraph markers
+        header_elements = []
+        para_markers_left = 0
+        for i, element in reversed(list(enumerate(orig_elements))):
+            if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal:
+                if STYLESHEET.get_tag(str(element.tokens[0].marker)).text_type == UsfmTextType.SECTION:
+                    # if i < len(orig_elements) - 1 and orig_elements[i + 1].type == UsfmUpdateBlockElementType.TEXT:
+                    #     header_elements.insert(0, (para_markers_left, [element, orig_elements.pop(i + 1)]))
+                    # else:
+                    header_elements.insert(0, (para_markers_left, element))
+                    orig_elements.pop(i)
+                else:
+                    para_markers_left += 1
+
+        # Paragraph markers at the end of the block should stay there
+        end_elements = []
+        for i, element in reversed(list(enumerate(orig_elements))):
+            if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal:
+                end_elements.insert(0, element)
+                orig_elements.pop(i)
+            elif element.type != UsfmUpdateBlockElementType.EMBED:
+                break
+
+        for element in orig_elements[1:]:  # TODO: all
+            if element.type == UsfmUpdateBlockElementType.TEXT:
+                if element.marked_for_removal:
+                    src_sent += element.tokens[0].to_usfm()
+                else:
+                    trg_sent += element.tokens[0].to_usfm()
+
+            if element.marked_for_removal or element.type == UsfmUpdateBlockElementType.EMBED:
+                ignored_elements.append(element)
+            elif element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]:
+                to_place.append(element)
+                src_marker_idxs.append(len(src_sent))
+
+        src_toks = self._pt_info[block_ref]["source_toks"]
+        trg_toks = self._pt_info[block_ref]["translation_toks"]
+
+        # Don't do anything if the source sentence or pretranslation has changed
+        if (
+            list(t for t in TOKENIZER.tokenize(src_sent)) != src_toks
+            or list(t for t in TOKENIZER.tokenize(trg_sent)) != trg_toks
+        ):
+            return block
+
+        src_tok_starts = []
+        for tok in src_toks:
+            src_tok_starts.append(src_sent.index(tok, src_tok_starts[-1] + 1 if len(src_tok_starts) > 0 else 0))
+        trg_tok_starts = []
+        for tok in trg_toks:
+            trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + 1 if len(trg_tok_starts) > 0 else 0))
+
+        # Get index of the text token immediately following each marker
+        # and predict the corresponding token on the target side
+        adj_src_toks = []
+        for idx in src_marker_idxs:
+            for i, start_idx in reversed(list(enumerate(src_tok_starts))):
+                if start_idx < idx:
+                    adj_src_toks.append(i + 1)
+                    break
+                if i == 0:
+                    adj_src_toks.append(i)
+
+        alignment = to_word_alignment_matrix(self._pt_info[block_ref]["alignment"])
+        adj_trg_toks = [
+            self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks) for adj_src_tok in adj_src_toks
+        ]
+
+        # Collect the markers to be inserted
+        to_insert = []
+        for element, adj_trg_tok in zip(to_place, adj_trg_toks):
+            trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent)
+
+            # Determine the order of the markers in the sentence to handle ambiguity for directly adjacent markers
+            insert_pos = 0
+            while insert_pos < len(to_insert) and to_insert[insert_pos][0] <= trg_str_idx:
+                insert_pos += 1
+            to_insert.insert(insert_pos, (trg_str_idx, element))
+
+        # Construct new text tokens to put between markers
+        # and reincorporate headers and empty end-of-verse paragraph markers
+        if len(to_insert) == 0 or to_insert[0][0] > 0:
+            placed_elements.append(
+                UsfmUpdateBlockElement(
+                    UsfmUpdateBlockElementType.TEXT,
+                    [
+                        UsfmToken(
+                            UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]] if len(to_insert) > 0 else trg_sent
+                        )
+                    ],
+                )
+            )
+        for j, (insert_idx, element) in enumerate(to_insert):
+            if element.type == UsfmUpdateBlockElementType.PARAGRAPH:
+                while len(header_elements) > 0 and header_elements[0][0] == para_markers_left:
+                    placed_elements.append(header_elements.pop(0)[1])
+                para_markers_left -= 1
+
+            placed_elements.append(element)
+            text_token = UsfmToken(
+                UsfmTokenType.TEXT,
+                text=(trg_sent[insert_idx : to_insert[j + 1][0]] if j + 1 < len(to_insert) else trg_sent[insert_idx:]),
+            )
+            placed_elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [text_token]))
+        for element in end_elements:
+            while len(header_elements) > 0 and header_elements[0][0] == para_markers_left:
+                placed_elements.append(header_elements.pop(0)[1])
+            para_markers_left -= 1
+            placed_elements.append(element)
+        while len(header_elements) > 0:
+            placed_elements.append(header_elements.pop(0)[1])
+
+        block._elements = placed_elements + ignored_elements
+        return block
+
+    def _predict_marker_location(
+        self,
+        alignment: WordAlignmentMatrix,
+        adj_src_tok: int,
+        src_toks: List[str],
+        trg_toks: List[str],
+    ) -> int:
+        # Gets the number of alignment pairs that "cross the line" between
+        # the src marker position and the potential trg marker position, (src_idx - .5) and (trg_idx - .5)
+        def num_align_crossings(src_idx: int, trg_idx: int) -> int:
+            crossings = 0
+            for i in range(alignment.row_count):
+                for j in range(alignment.column_count):
+                    if alignment[i, j] and ((i < src_idx and j >= trg_idx) or (i >= src_idx and j < trg_idx)):
+                        crossings += 1
+            return crossings
+
+        # If the token on either side of a potential target location is punctuation,
+        # use it as the basis for deciding the target marker location
+        trg_hyp = -1
+        punct_hyps = [-1, 0]
+        for punct_hyp in punct_hyps:
+            src_hyp = adj_src_tok + punct_hyp
+            if src_hyp < 0 or src_hyp >= len(src_toks):
+                continue
+            # Only accept aligned pairs where both the src and trg token are punctuation
+            hyp_tok = src_toks[src_hyp]
+            if len(hyp_tok) > 0 and not any(c.isalpha() for c in hyp_tok) and src_hyp < alignment.row_count:
+                aligned_trg_toks = list(alignment.get_row_aligned_indices(src_hyp))
+                # If aligning to a token that precedes that marker,
+                # the trg token predicted to be closest to the marker
+                # is the last token aligned to the src rather than the first
+                for trg_idx in reversed(aligned_trg_toks) if punct_hyp < 0 else aligned_trg_toks:
+                    trg_tok = trg_toks[trg_idx]
+                    if len(trg_tok) > 0 and not any(c.isalpha() for c in trg_tok):
+                        trg_hyp = trg_idx
+                        break
+            if trg_hyp != -1:
+                # Since the marker location is represented by the token after the marker,
+                # adjust the index when aligning to punctuation that precedes the token
+                return trg_hyp + (1 if punct_hyp == -1 else 0)
+
+        hyps = [0, 1, 2]
+        best_hyp = -1
+        best_num_crossings = 200**2  # mostly meaningless, a big number
+        checked = set()
+        for hyp in hyps:
+            src_hyp = adj_src_tok + hyp
+            if src_hyp in checked:
+                continue
+            trg_hyp = -1
+            while trg_hyp == -1 and src_hyp >= 0 and src_hyp < alignment.row_count:
+                checked.add(src_hyp)
+                aligned_trg_toks = list(alignment.get_row_aligned_indices(src_hyp))
+                if len(aligned_trg_toks) > 0:
+                    # If aligning with a source token that precedes the marker,
+                    # the target token predicted to be closest to the marker is the last aligned token rather than the first
+                    trg_hyp = aligned_trg_toks[-1 if hyp < 0 else 0]
+                else:  # continue the search outwards
+                    src_hyp += -1 if hyp < 0 else 1
+            if trg_hyp != -1:
+                # TODO: experiment w/ using adj_src_tok instead of src_hyp
+                # probably doesn't work well w/ word order switches, e.g. eng vs spa noun/adj
+                # one issue it does fix is markers getting sucked to punctuation
+                # (could be the source of some of the \w\w* issues)
+                num_crossings = num_align_crossings(adj_src_tok, trg_hyp)
+                if num_crossings < best_num_crossings:
+                    best_hyp = trg_hyp
+                    best_num_crossings = num_crossings
+                if num_crossings == 0:
+                    break
+
+        # If no alignments found, insert at the end of the sentence
+        return best_hyp if best_hyp != -1 else len(trg_toks)
diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py
new file mode 100644
index 00000000..ee1779c6
--- /dev/null
+++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py
@@ -0,0 +1,483 @@
+from typing import List, Optional, Sequence, Tuple
+
+from machine.corpora import (
+    ScriptureRef,
+    UpdateUsfmMarkerBehavior,
+    UpdateUsfmParserHandler,
+    UpdateUsfmTextBehavior,
+    parse_usfm,
+)
+from machine.corpora.place_markers_usfm_update_block_handler import PlaceMarkersUsfmUpdateBlockHandler
+from machine.corpora.usfm_update_block_handler import UsfmUpdateBlockHandler
+from machine.jobs.translation_file_service import PretranslationInfo
+from machine.tokenization import LatinWordTokenizer
+
+TOKENIZER = LatinWordTokenizer()
+
+
+def test_paragraph_markers():
+    source = "This is the first paragraph. This text is in English, and this test is for paragraph markers."
+    pretranslation = "Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo."
+    rows = [(scr_ref("MAT 1:1"), str(pretranslation))]
+    usfm = r"""\id MAT
+\c 1
+\v 1 This is the first paragraph.
+\p This text is in English,
+\p and this test is for paragraph markers.
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation=pretranslation,
+            source_toks=[t for t in TOKENIZER.tokenize(source)],
+            translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)],
+            alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 Este es el primer párrafo.
+\p Este texto está en inglés
+\p y esta prueba es para marcadores de párrafo.
+"""
+    assess(target, result)
+
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo.
+"""
+    assess(target, result)
+
+
+def test_list_paragraph_markers():
+    source = "This is a list: First list item Second list item Third list item"
+    pretranslation = (
+        "Esta es una lista: Primer elemento de la lista Segundo elemento de la lista Tercer elemento de la lista"
+    )
+    rows = [(scr_ref("MAT 1:1"), str(pretranslation))]
+    usfm = r"""\id MAT
+\c 1
+\v 1 This is a list:
+\li1 First list item
+\li1 Second list item
+\li1 Third list item
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation=pretranslation,
+            source_toks=[t for t in TOKENIZER.tokenize(source)],
+            translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)],
+            alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-9 7-6 8-10 9-14 10-11 11-15 12-19 13-16",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 Esta es una lista:
+\li1 Primer elemento de la lista
+\li1 Segundo elemento de la lista
+\li1 Tercer elemento de la lista
+"""
+    assess(target, result)
+
+
+def test_style_markers():
+    source = "This is the first sentence. This text is in English, and this test is for style markers."
+    pretranslation = "Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo."
+    rows = [(scr_ref("MAT 1:1"), str(pretranslation))]
+    usfm = r"""\id MAT
+\c 1
+\v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers.
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation=pretranslation,
+            source_toks=[t for t in TOKENIZER.tokenize(source)],
+            translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)],
+            alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 Esta es la \w primera \w*oración. Este texto está en \w inglés \w*y esta prueba es \w para \w*marcadores de estilo.
+"""
+    # TODO: the spacing before/after end markers is incorrect,
+    # but this is an issue with how the USFM is generated from the tokens
+    assess(target, result)
+
+    target = update_usfm(
+        rows,
+        usfm,
+        style_behavior=UpdateUsfmMarkerBehavior.STRIP,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo.
+"""
+    assess(target, result)
+
+
+def test_embeds():
+    rows = [
+        (scr_ref("MAT 1:1"), str("New verse 1")),
+        (scr_ref("MAT 1:2"), str("New verse 2")),
+        (scr_ref("MAT 1:3"), str("New verse 3")),
+        (scr_ref("MAT 1:4"), str("New verse 4")),
+        (scr_ref("MAT 1:4/1:f"), str("New embed text")),
+        (scr_ref("MAT 1:5"), str("New verse 5")),
+        (scr_ref("MAT 1:6"), str("New verse 6")),
+        (scr_ref("MAT 1:6/1:f"), str("New verse 6 embed text")),
+    ]
+    usfm = r"""\id MAT
+\c 1
+\v 1 \f \fr 1.1 \ft Some note \f*Start of sentence embed
+\v 2 Middle of sentence \f \fr 1.2 \ft Some other note \f*embed
+\v 3 End of sentence embed\f \fr 1.3 \ft A third note \f*
+\v 4 Updated embed\f \fr 1.4 \ft A fourth note \f*
+\v 5 Embed with style markers \f \fr 1.5 \ft A \+w stylish\+w* note \f*
+\v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f*
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation="New verse 1",
+            source_toks=["Start", "of", "sentence", "embed"],
+            translation_toks=["New", "verse", "1"],
+            alignment="",
+        ),
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:2"],
+            translation="New verse 2",
+            source_toks=["Middle", "of", "sentence", "embed"],
+            translation_toks=["New", "verse", "2"],
+            alignment="",
+        ),
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:3"],
+            translation="New verse 3",
+            source_toks=["End", "of", "sentence", "embed"],
+            translation_toks=["New", "verse", "3"],
+            alignment="",
+        ),
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:4"],
+            translation="New verse 4",
+            source_toks=["Updated", "embed"],
+            translation_toks=["New", "verse", "4"],
+            alignment="",
+        ),
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:4/1:f"],
+            translation="New embed text",
+            source_toks=["A", "fourth", "note"],
+            translation_toks=["New", "embed", "text"],
+            alignment="",
+        ),
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:5"],
+            translation="New verse 5",
+            source_toks=["Embed", "with", "style", "markers"],
+            translation_toks=["New", "verse", "5"],
+            alignment="",
+        ),
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:6"],
+            translation="New verse 6",
+            source_toks=["Updated", "embed", "with", "style", "markers"],
+            translation_toks=["New", "verse", "6"],
+            alignment="",
+        ),
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:6/1:f"],
+            translation="New verse 6 embed text",
+            source_toks=["Another", "stylish", "note"],
+            translation_toks=["New", "verse", "6", "embed", "text"],
+            alignment="",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    # NOTE: currently not updating embeds
+    result = r"""\id MAT
+\c 1
+\v 1 New verse 1 \f \fr 1.1 \ft Some note \f*
+\v 2 New verse 2 \f \fr 1.2 \ft Some other note \f*
+\v 3 New verse 3 \f \fr 1.3 \ft A third note \f*
+\v 4 New verse 4 \f \fr 1.4 \ft A fourth note \f*
+\v 5 New verse 5 \f \fr 1.5 \ft A \+w stylish\+w* note \f*
+\v 6 New verse 6 \f \fr 1.6 \ft Another \+w stylish\+w* note \f*
+"""
+    assess(target, result)
+
+    target = update_usfm(
+        rows,
+        usfm,
+        embed_behavior=UpdateUsfmMarkerBehavior.STRIP,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New verse 1
+\v 2 New verse 2
+\v 3 New verse 3
+\v 4 New verse 4
+\v 5 New verse 5
+\v 6 New verse 6
+"""
+    assess(target, result)
+
+
+def test_headers():
+    rows = [(scr_ref("MAT 1:1"), "X Y Z"), (scr_ref("MAT 1:2"), "X")]
+    usfm = r"""\id MAT
+\c 1
+\s1 Start of chapter header
+\v 1 A
+\p B
+\s1 Mid-verse header
+\p C
+\s1 End of verse header
+\p
+\p
+\s1 Header after all paragraphs
+\v 2 A
+\s1 Header followed by a reference
+\r (reference)
+\p
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation="X Y Z",
+            source_toks=["A", "B", "C"],
+            translation_toks=["X", "Y", "Z"],
+            alignment="0-0 1-1 2-2",
+        ),
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:2"],
+            translation="X",
+            source_toks=["A"],
+            translation_toks=["X"],
+            alignment="0-0",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\s1 Start of chapter header
+\v 1 X
+\p Y
+\s1 Mid-verse header
+\p Z
+\s1 End of verse header
+\p
+\p
+\s1 Header after all paragraphs
+\v 2 X
+\s1 Header followed by a reference
+\r (reference)
+\p
+"""
+    assess(target, result)
+
+
+def test_verse_ranges():
+    rows = [([ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], "New verse range text")]
+    usfm = r"""\id MAT
+\c 1
+\v 1-5 Verse range
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)],
+            translation="New verse range text",
+            source_toks=["Verse", "range"],
+            translation_toks=["New", "verse", "range", "text"],
+            alignment="0-1 1-2",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1-5 New verse range text
+"""
+    assess(target, result)
+
+
+def test_no_alignment():
+    rows = [(scr_ref("MAT 1:1"), str("New paragraph 1 New paragraph 2"))]
+    usfm = r"""\id MAT
+\c 1
+\v 1 Old paragraph 1
+\p Old paragraph 2
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation="New paragraph 1 New paragraph 2",
+            source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"],
+            translation_toks=["New", "paragraph", "1", "New", "paragraph", "2"],
+            alignment="",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New paragraph 1 New paragraph 2
+\p
+"""
+    assess(target, result)
+
+
+def test_changed_text():
+    rows = [(scr_ref("MAT 1:1"), str("New paragraph 1 New paragraph 2"))]
+    usfm = r"""\id MAT
+\c 1
+\v 1 Old paragraph 1
+\p Old paragraph 2
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation="Changed paragraph 1 Changed paragraph 2",
+            source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"],
+            translation_toks=["Changed", "paragraph", "1", "Changed", "paragraph", "2"],
+            alignment="0-0 1-1 2-2 3-3 4-4 5-5",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New paragraph 1 New paragraph 2
+\p
+"""
+    assess(target, result)
+
+
+def scr_ref(*refs: str) -> List[ScriptureRef]:
+    return [ScriptureRef.parse(ref) for ref in refs]
+
+
+def update_usfm(
+    rows: Sequence[Tuple[Sequence[ScriptureRef], str]],
+    source: str,
+    id_text: Optional[str] = None,
+    text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW,
+    paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
+    embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
+    style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
+    preserve_paragraph_styles: Optional[Sequence[str]] = None,
+    update_block_handlers: Optional[List[UsfmUpdateBlockHandler]] = None,
+) -> Optional[str]:
+    source = source.strip().replace("\r\n", "\n") + "\r\n"  # LF line breaks inside, one CRLF at the end
+    updater = UpdateUsfmParserHandler(
+        rows,
+        id_text,
+        text_behavior,
+        paragraph_behavior,
+        embed_behavior,
+        style_behavior,
+        preserve_paragraph_styles,
+        update_block_handlers,
+    )
+    parse_usfm(source, updater)
+    return updater.get_usfm()
+
+
+def assess(target: Optional[str], truth: str) -> None:
+    """Compare target with truth line by line, ignoring outer whitespace; a short or overlong target fails too."""
+    assert target is not None
+    for target_line, truth_line in zip(target.split("\n"), truth.split("\n")):
+        print(truth_line)
+        print(target_line)
+    assert [s.strip() for s in target.rstrip().split("\n")] == [s.strip() for s in truth.rstrip().split("\n")]

From d2b8a11fd1c27f2884ed255b6bc7925ac0162545 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Fri, 9 May 2025 20:03:50 -0400
Subject: [PATCH 2/6] Refactor marker placement handler, small bug fixes

---
 ...place_markers_usfm_update_block_handler.py | 175 +++++++-----------
 1 file changed, 63 insertions(+), 112 deletions(-)

diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py
index 514a3022..576aa42f 100644
--- a/machine/corpora/place_markers_usfm_update_block_handler.py
+++ b/machine/corpora/place_markers_usfm_update_block_handler.py
@@ -4,96 +4,85 @@
 
 from ..jobs.eflomal_aligner import to_word_alignment_matrix
 from ..jobs.translation_file_service import PretranslationInfo
-from ..scripture.verse_ref import VerseRef
 from ..tokenization import LatinWordTokenizer
 from ..translation import WordAlignmentMatrix
-from .aligned_word_pair import AlignedWordPair
-from .usfm_stylesheet import UsfmStylesheet
-from .usfm_tag import UsfmTextType
 from .usfm_token import UsfmToken, UsfmTokenType
 from .usfm_update_block import UsfmUpdateBlock
 from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
 from .usfm_update_block_handler import UsfmUpdateBlockHandler
 
 TOKENIZER = LatinWordTokenizer()
-STYLESHEET = UsfmStylesheet("usfm.sty")
 
 
 class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
 
     def __init__(self, pt_info: Sequence[PretranslationInfo]):
-        self._pt_info = {}
-        for info in pt_info:
-            if len(info["refs"]) == 1:
-                ref_str = info["refs"][0]
-            else:
-                ref_str_start = VerseRef.from_string(info["refs"][0])
-                ref_str_end = VerseRef.from_string(info["refs"][-1])
-                ref_str = str(VerseRef.from_range(ref_str_start, ref_str_end))
-            self._pt_info[ref_str] = info
-        # self._pt_info = {info["refs"][0]: info for info in pt_info}
+        self._pt_info = {info["refs"][0]: info for info in pt_info}
 
     def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
-        block_ref = str(
-            block.refs[0]
-            if len(block.refs) == 1
-            else VerseRef.from_range(block.refs[0].verse_ref, block.refs[-1].verse_ref)
-        )
+        ref = str(block.refs[0])
+        elements = list(block.elements)
 
-        # Nothing to do if there are no markers to place, no alignment to use, or if the block represents an embed
+        # Nothing to do if there are no markers to place or no alignment to use
         if (
-            len(block.elements) == 0
-            or block_ref not in self._pt_info.keys()
-            or len(self._pt_info[block_ref]["alignment"]) == 0
-            # TODO: is this too restrictive?
-            or block.elements[0].tokens[0].marker != "v"
+            len(elements) == 0
+            or ref not in self._pt_info.keys()
+            or len(self._pt_info[ref]["alignment"]) == 0
             or not any(
                 (
-                    element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
-                    and not element.marked_for_removal
+                    e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
+                    and not e.marked_for_removal
                 )
-                for element in block.elements[1:]  # TODO: all of block
+                for e in elements
             )
         ):
             return block
 
-        # Work on a copy in case the block needs to be returned unchanged
-        orig_elements = list(block.elements)
-
-        src_sent = ""
-        trg_sent = ""
-        to_place = []
-        src_marker_idxs = []
-        placed_elements = [orig_elements[0]]  # TODO: no elements to start
-        ignored_elements = []
-
+        # Paragraph markers at the end of the block should stay there
         # Section headers should be ignored but re-inserted in the same position relative to other paragraph markers
+        end_elements = []
+        eob_empty_paras = True
         header_elements = []
         para_markers_left = 0
-        for i, element in reversed(list(enumerate(orig_elements))):
+        for i, element in reversed(list(enumerate(elements))):
             if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal:
-                if STYLESHEET.get_tag(str(element.tokens[0].marker)).text_type == UsfmTextType.SECTION:
-                    # if i < len(orig_elements) - 1 and orig_elements[i + 1].type == UsfmUpdateBlockElementType.TEXT:
-                    #     header_elements.insert(0, (para_markers_left, [element, orig_elements.pop(i + 1)]))
-                    # else:
+                if len(element.tokens) > 1:
                     header_elements.insert(0, (para_markers_left, element))
-                    orig_elements.pop(i)
+                    elements.pop(i)
                 else:
                     para_markers_left += 1
 
-        # Paragraph markers at the end of the block should stay there
-        end_elements = []
-        for i, element in reversed(list(enumerate(orig_elements))):
-            if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal:
-                end_elements.insert(0, element)
-                orig_elements.pop(i)
+                    if eob_empty_paras:
+                        end_elements.insert(0, element)
+                        elements.pop(i)
             elif element.type != UsfmUpdateBlockElementType.EMBED:
-                break
+                eob_empty_paras = False
+
+        src_toks = self._pt_info[ref]["source_toks"]
+        trg_toks = self._pt_info[ref]["translation_toks"]
+        src_tok_idx = 0
 
-        for element in orig_elements[1:]:  # TODO: all
+        src_sent = ""
+        trg_sent = ""
+        to_place = []
+        adj_src_toks = []
+        placed_elements = [elements.pop(0)] if elements[0].type == UsfmUpdateBlockElementType.OTHER else []
+        ignored_elements = []
+        for element in elements:
             if element.type == UsfmUpdateBlockElementType.TEXT:
                 if element.marked_for_removal:
-                    src_sent += element.tokens[0].to_usfm()
+                    text = element.tokens[0].to_usfm()
+                    src_sent += text
+
+                    # Handle tokens split across text elements
+                    if len(text.strip()) > 0 and (
+                        src_toks[src_tok_idx] not in text or text.strip().index(src_toks[src_tok_idx]) > 0
+                    ):
+                        src_tok_idx += 1
+                    # Track seen tokens
+                    while src_tok_idx < len(src_toks) and src_toks[src_tok_idx] in text:
+                        text = text[text.index(src_toks[src_tok_idx]) + len(src_toks[src_tok_idx]) :]
+                        src_tok_idx += 1
                 else:
                     trg_sent += element.tokens[0].to_usfm()
 
@@ -101,63 +90,33 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
                 ignored_elements.append(element)
             elif element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]:
                 to_place.append(element)
-                src_marker_idxs.append(len(src_sent))
-
-        src_toks = self._pt_info[block_ref]["source_toks"]
-        trg_toks = self._pt_info[block_ref]["translation_toks"]
-
-        # Don't do anything if the source sentence or pretranslation has changed
-        if (
-            list(t for t in TOKENIZER.tokenize(src_sent)) != src_toks
-            or list(t for t in TOKENIZER.tokenize(trg_sent)) != trg_toks
-        ):
-            return block
+                adj_src_toks.append(src_tok_idx)
 
-        src_tok_starts = []
-        for tok in src_toks:
-            src_tok_starts.append(src_sent.index(tok, src_tok_starts[-1] + 1 if len(src_tok_starts) > 0 else 0))
         trg_tok_starts = []
         for tok in trg_toks:
             trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + 1 if len(trg_tok_starts) > 0 else 0))
 
-        # Get index of the text token immediately following each marker
-        # and predict the corresponding token on the target side
-        adj_src_toks = []
-        for idx in src_marker_idxs:
-            for i, start_idx in reversed(list(enumerate(src_tok_starts))):
-                if start_idx < idx:
-                    adj_src_toks.append(i + 1)
-                    break
-                if i == 0:
-                    adj_src_toks.append(i)
-
-        alignment = to_word_alignment_matrix(self._pt_info[block_ref]["alignment"])
-        adj_trg_toks = [
-            self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks) for adj_src_tok in adj_src_toks
-        ]
-
-        # Collect the markers to be inserted
+        # Predict marker placements and get insertion order
         to_insert = []
-        for element, adj_trg_tok in zip(to_place, adj_trg_toks):
+        alignment = to_word_alignment_matrix(self._pt_info[ref]["alignment"])
+        for element, adj_src_tok in zip(to_place, adj_src_toks):
+            adj_trg_tok = self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks)
             trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent)
 
-            # Determine the order of the markers in the sentence to handle ambiguity for directly adjacent markers
-            insert_pos = 0
-            while insert_pos < len(to_insert) and to_insert[insert_pos][0] <= trg_str_idx:
-                insert_pos += 1
-            to_insert.insert(insert_pos, (trg_str_idx, element))
+            to_insert.append((trg_str_idx, element))
+        to_insert.sort(key=lambda x: x[0])
+        to_insert += [(len(trg_sent), element) for element in end_elements]
 
         # Construct new text tokens to put between markers
         # and reincorporate headers and empty end-of-verse paragraph markers
-        if len(to_insert) == 0 or to_insert[0][0] > 0:
+        if len(to_insert) == 0:
+            placed_elements.append(
+                UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent)])
+            )
+        elif to_insert[0][0] > 0:
             placed_elements.append(
                 UsfmUpdateBlockElement(
-                    UsfmUpdateBlockElementType.TEXT,
-                    [
-                        UsfmToken(
-                            UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]] if len(to_insert) > 0 else trg_sent
-                        )
-                    ],
+                    UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]])]
                 )
             )
         for j, (insert_idx, element) in enumerate(to_insert):
@@ -167,16 +126,12 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
                 para_markers_left -= 1
 
             placed_elements.append(element)
-            text_token = UsfmToken(
-                UsfmTokenType.TEXT,
-                text=(trg_sent[insert_idx : to_insert[j + 1][0]] if j + 1 < len(to_insert) else trg_sent[insert_idx:]),
-            )
-            placed_elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [text_token]))
-        for element in end_elements:
-            while len(header_elements) > 0 and header_elements[0][0] == para_markers_left:
-                placed_elements.append(header_elements.pop(0)[1])
-            para_markers_left -= 1
-            placed_elements.append(element)
+            if insert_idx < len(trg_sent) and (j + 1 == len(to_insert) or insert_idx < to_insert[j + 1][0]):
+                if j + 1 < len(to_insert):
+                    text_token = UsfmToken(UsfmTokenType.TEXT, text=(trg_sent[insert_idx : to_insert[j + 1][0]]))
+                else:
+                    text_token = UsfmToken(UsfmTokenType.TEXT, text=(trg_sent[insert_idx:]))
+                placed_elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [text_token]))
         while len(header_elements) > 0:
             placed_elements.append(header_elements.pop(0)[1])
 
@@ -244,10 +199,6 @@ def num_align_crossings(src_idx: int, trg_idx: int) -> int:
                 else:  # continue the search outwards
                     src_hyp += -1 if hyp < 0 else 1
             if trg_hyp != -1:
-                # TODO: experiment w/ using adj_src_tok instead of src_hyp
-                # probably doesn't work well w/ word order switches, e.g. eng vs spa noun/adj
-                # one issue it does fix is markers getting sucked to punctuation
-                # (could be the source of some of the \w\w* issues)
                 num_crossings = num_align_crossings(adj_src_tok, trg_hyp)
                 if num_crossings < best_num_crossings:
                     best_hyp = trg_hyp

From bbf097a03f9793763eff3b73b212dff745d9f50d Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Fri, 9 May 2025 22:22:37 -0400
Subject: [PATCH 3/6] Extend and clean up tests, more code cleanup

---
 ...place_markers_usfm_update_block_handler.py |  24 +-
 ...place_markers_usfm_update_block_handler.py | 320 +++++++++---------
 2 files changed, 167 insertions(+), 177 deletions(-)

diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py
index 576aa42f..ee6f86cf 100644
--- a/machine/corpora/place_markers_usfm_update_block_handler.py
+++ b/machine/corpora/place_markers_usfm_update_block_handler.py
@@ -4,19 +4,16 @@
 
 from ..jobs.eflomal_aligner import to_word_alignment_matrix
 from ..jobs.translation_file_service import PretranslationInfo
-from ..tokenization import LatinWordTokenizer
 from ..translation import WordAlignmentMatrix
 from .usfm_token import UsfmToken, UsfmTokenType
 from .usfm_update_block import UsfmUpdateBlock
 from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
 from .usfm_update_block_handler import UsfmUpdateBlockHandler
 
-TOKENIZER = LatinWordTokenizer()
-
 
 class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
 
-    def __init__(self, pt_info: Sequence[PretranslationInfo]):
+    def __init__(self, pt_info: Sequence[PretranslationInfo]) -> None:
         self._pt_info = {info["refs"][0]: info for info in pt_info}
 
     def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
@@ -55,7 +52,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
                     if eob_empty_paras:
                         end_elements.insert(0, element)
                         elements.pop(i)
-            elif element.type != UsfmUpdateBlockElementType.EMBED:
+            elif not (
+                element.type == UsfmUpdateBlockElementType.EMBED
+                or (element.type == UsfmUpdateBlockElementType.TEXT and len(element.tokens[0].to_usfm().strip()) == 0)
+            ):
                 eob_empty_paras = False
 
         src_toks = self._pt_info[ref]["source_toks"]
@@ -74,15 +74,13 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
                     text = element.tokens[0].to_usfm()
                     src_sent += text
 
-                    # Handle tokens split across text elements
-                    if len(text.strip()) > 0 and (
-                        src_toks[src_tok_idx] not in text or text.strip().index(src_toks[src_tok_idx]) > 0
-                    ):
-                        src_tok_idx += 1
                     # Track seen tokens
                     while src_tok_idx < len(src_toks) and src_toks[src_tok_idx] in text:
                         text = text[text.index(src_toks[src_tok_idx]) + len(src_toks[src_tok_idx]) :]
                         src_tok_idx += 1
+                    # Handle tokens split across text elements
+                    if len(text.strip()) > 0:
+                        src_tok_idx += 1
                 else:
                     trg_sent += element.tokens[0].to_usfm()
 
@@ -109,11 +107,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
 
         # Construct new text tokens to put between markers
         # and reincorporate headers and empty end-of-verse paragraph markers
-        if len(to_insert) == 0:
-            placed_elements.append(
-                UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent)])
-            )
-        elif to_insert[0][0] > 0:
+        if to_insert[0][0] > 0:
             placed_elements.append(
                 UsfmUpdateBlockElement(
                     UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]])]
diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py
index ee1779c6..40f79f5a 100644
--- a/tests/corpora/test_place_markers_usfm_update_block_handler.py
+++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py
@@ -15,7 +15,7 @@
 TOKENIZER = LatinWordTokenizer()
 
 
-def test_paragraph_markers():
+def test_paragraph_markers() -> None:
     source = "This is the first paragraph. This text is in English, and this test is for paragraph markers."
     pretranslation = "Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo."
     rows = [(scr_ref("MAT 1:1"), str(pretranslation))]
@@ -51,61 +51,8 @@ def test_paragraph_markers():
 """
     assess(target, result)
 
-    target = update_usfm(
-        rows,
-        usfm,
-        paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
-    )
-    result = r"""\id MAT
-\c 1
-\v 1 Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo.
-"""
-    assess(target, result)
-
-
-def test_list_paragraph_markers():
-    source = "This is a list: First list item Second list item Third list item"
-    pretranslation = (
-        "Esta es una lista: Primer elemento de la lista Segundo elemento de la lista Tercer elemento de la lista"
-    )
-    rows = [(scr_ref("MAT 1:1"), str(pretranslation))]
-    usfm = r"""\id MAT
-\c 1
-\v 1 This is a list:
-\li1 First list item
-\li1 Second list item
-\li1 Third list item
-"""
-
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:1"],
-            translation=pretranslation,
-            source_toks=[t for t in TOKENIZER.tokenize(source)],
-            translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)],
-            alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-9 7-6 8-10 9-14 10-11 11-15 12-19 13-16",
-        ),
-    ]
-    target = update_usfm(
-        rows,
-        usfm,
-        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
-    )
-    result = r"""\id MAT
-\c 1
-\v 1 Esta es una lista:
-\li1 Primer elemento de la lista
-\li1 Segundo elemento de la lista
-\li1 Tercer elemento de la lista
-"""
-    assess(target, result)
 
-
-def test_style_markers():
+def test_style_markers() -> None:
     source = "This is the first sentence. This text is in English, and this test is for style markers."
     pretranslation = "Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo."
     rows = [(scr_ref("MAT 1:1"), str(pretranslation))]
@@ -135,7 +82,7 @@ def test_style_markers():
 \c 1
 \v 1 Esta es la \w primera \w*oración. Este texto está en \w inglés \w*y esta prueba es \w para \w*marcadores de estilo.
 """
-    # TODO: the spacing before/after end markers is incorrect,
+    # NOTE: the spacing before/after end markers is incorrect,
     # but this is an issue with how the is USFM is generated from the tokens
     assess(target, result)
 
@@ -152,16 +99,17 @@ def test_style_markers():
     assess(target, result)
 
 
-def test_embeds():
+# NOTE: Not currently updating embeds, will need to change test when we do
+def test_embeds() -> None:
     rows = [
-        (scr_ref("MAT 1:1"), str("New verse 1")),
-        (scr_ref("MAT 1:2"), str("New verse 2")),
-        (scr_ref("MAT 1:3"), str("New verse 3")),
-        (scr_ref("MAT 1:4"), str("New verse 4")),
-        (scr_ref("MAT 1:4/1:f"), str("New embed text")),
-        (scr_ref("MAT 1:5"), str("New verse 5")),
-        (scr_ref("MAT 1:6"), str("New verse 6")),
-        (scr_ref("MAT 1:6/1:f"), str("New verse 6 embed text")),
+        (scr_ref("MAT 1:1"), "New verse 1"),
+        (scr_ref("MAT 1:2"), "New verse 2"),
+        (scr_ref("MAT 1:3"), "New verse 3"),
+        (scr_ref("MAT 1:4"), "New verse 4"),
+        (scr_ref("MAT 1:4/1:f"), "New embed text"),
+        (scr_ref("MAT 1:5"), "New verse 5"),
+        (scr_ref("MAT 1:6"), "New verse 6"),
+        (scr_ref("MAT 1:6/1:f"), "New verse 6 embed text"),
     ]
     usfm = r"""\id MAT
 \c 1
@@ -173,87 +121,13 @@ def test_embeds():
 \v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f*
 """
 
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:1"],
-            translation="New verse 1",
-            source_toks=["Start", "of", "sentence", "embed"],
-            translation_toks=["New", "verse", "1"],
-            alignment="",
-        ),
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:2"],
-            translation="New verse 2",
-            source_toks=["Middle", "of", "sentence", "embed"],
-            translation_toks=["New", "verse", "2"],
-            alignment="",
-        ),
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:3"],
-            translation="New verse 3",
-            source_toks=["End", "of", "sentence", "embed"],
-            translation_toks=["New", "verse", "3"],
-            alignment="",
-        ),
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:4"],
-            translation="New verse 4",
-            source_toks=["Updated", "embed"],
-            translation_toks=["New", "verse", "4"],
-            alignment="",
-        ),
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:4/1:f"],
-            translation="New embed text",
-            source_toks=["A", "fourth", "note"],
-            translation_toks=["New", "embed", "text"],
-            alignment="",
-        ),
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:5"],
-            translation="New verse 5",
-            source_toks=["Embed", "with", "style", "markers"],
-            translation_toks=["New", "verse", "5"],
-            alignment="",
-        ),
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:6"],
-            translation="New verse 6",
-            source_toks=["Updated", "embed", "with", "style", "markers"],
-            translation_toks=["New", "verse", "6"],
-            alignment="",
-        ),
-        PretranslationInfo(
-            corpusId="",
-            textId="",
-            refs=["MAT 1:6/1:f"],
-            translation="New verse 6 embed text",
-            source_toks=["Another", "stylish", "note"],
-            translation_toks=["New", "verse", "6", "embed", "text"],
-            alignment="",
-        ),
-    ]
+    pt_info = []
     target = update_usfm(
         rows,
         usfm,
         embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
         update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
     )
-    # NOTE: currently not updating embeds
     result = r"""\id MAT
 \c 1
 \v 1 New verse 1 \f \fr 1.1 \ft Some note \f*
@@ -283,8 +157,50 @@ def test_embeds():
     assess(target, result)
 
 
-def test_headers():
-    rows = [(scr_ref("MAT 1:1"), "X Y Z"), (scr_ref("MAT 1:2"), "X")]
+def test_trailing_empty_paragraphs() -> None:  # Trailing empty paragraph markers must stay after the new text.
+    rows = [(scr_ref("MAT 1:1"), "New verse 1")]
+    usfm = r"""\id MAT
+\c 1
+\v 1 Verse 1
+\p
+\b
+\q1 \f embed \f*
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation="New verse 1",
+            source_toks=["Verse", "1"],  # two tokens; without the comma the literals concatenate to "Verse1"
+            translation_toks=["New", "verse", "1"],
+            alignment="0-1 1-2",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New verse 1
+\p
+\b
+\q1 \f embed \f*
+"""
+    assess(target, result)
+
+
+def test_headers() -> None:
+    rows = [
+        (scr_ref("MAT 1:1"), "X Y Z"),
+        (scr_ref("MAT 1:2"), "X"),
+        (scr_ref("MAT 1:3"), "Y"),
+        (scr_ref("MAT 1:3/1:s1"), "Updated header"),
+    ]
     usfm = r"""\id MAT
 \c 1
 \s1 Start of chapter header
@@ -300,6 +216,8 @@ def test_headers():
 \s1 Header followed by a reference
 \r (reference)
 \p
+\v 3 B
+\s1 Header to be updated
 """
 
     pt_info = [
@@ -343,15 +261,52 @@ def test_headers():
 \s1 Header followed by a reference
 \r (reference)
 \p
+\v 3 Y
+\s1 Updated header
 """
     assess(target, result)
 
 
-def test_verse_ranges():
-    rows = [([ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], "New verse range text")]
+def test_consecutive_markers() -> None:  # \p, \qt and \+w open back to back; the output keeps them before WORD.
+    rows = [(scr_ref("MAT 1:1"), "New verse 1 WORD")]
+    usfm = r"""\id MAT
+\c 1
+\v 1 Old verse 1
+\p \qt \+w word \+w* \qt*
+"""
+
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation="New verse 1 WORD",
+            source_toks=["Old", "verse", "1", "word"],
+            translation_toks=["New", "verse", "1", "WORD"],
+            alignment="0-0 1-1 2-2 3-3",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,  # the helper's default is STRIP, so keep styles explicitly
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New verse 1
+\p \qt \+w WORD \+w*\qt*
+"""
+    assess(target, result)
+
+
+def test_verse_ranges() -> None:
+    rows = [([ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], "New verse range text new paragraph 2")]
     usfm = r"""\id MAT
 \c 1
 \v 1-5 Verse range
+\p old paragraph 2
 """
 
     pt_info = [
@@ -359,10 +314,10 @@ def test_verse_ranges():
             corpusId="",
             textId="",
             refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)],
-            translation="New verse range text",
-            source_toks=["Verse", "range"],
-            translation_toks=["New", "verse", "range", "text"],
-            alignment="0-1 1-2",
+            translation="New verse range text new paragraph 2",
+            source_toks=["Verse", "range", "old", "paragraph", "2"],
+            translation_toks=["New", "verse", "range", "text", "new", "paragraph", "2"],
+            alignment="0-1 1-2 2-4 3-5 4-6",
         ),
     ]
     target = update_usfm(
@@ -374,18 +329,20 @@ def test_verse_ranges():
     result = r"""\id MAT
 \c 1
 \v 1-5 New verse range text
+\p new paragraph 2
 """
     assess(target, result)
 
 
-def test_no_alignment():
-    rows = [(scr_ref("MAT 1:1"), str("New paragraph 1 New paragraph 2"))]
+def test_no_update() -> None:
+    rows = [(scr_ref("MAT 1:1"), "New paragraph 1 New paragraph 2")]
     usfm = r"""\id MAT
 \c 1
 \v 1 Old paragraph 1
 \p Old paragraph 2
 """
 
+    # Strip paragraphs
     pt_info = [
         PretranslationInfo(
             corpusId="",
@@ -394,6 +351,30 @@ def test_no_alignment():
             translation="New paragraph 1 New paragraph 2",
             source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"],
             translation_toks=["New", "paragraph", "1", "New", "paragraph", "2"],
+            alignment="0-0 1-1 2-2 3-3 4-4 5-5",
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New paragraph 1 New paragraph 2
+"""
+    assess(target, result)
+
+    # No alignment
+    pt_info = [
+        PretranslationInfo(
+            corpusId="",
+            textId="",
+            refs=["MAT 1:1"],
+            translation="New paragraph 1 New paragraph 2",
+            source_toks=[],
+            translation_toks=[],
             alignment="",
         ),
     ]
@@ -410,13 +391,30 @@ def test_no_alignment():
 """
     assess(target, result)
 
-
-def test_changed_text():
-    rows = [(scr_ref("MAT 1:1"), str("New paragraph 1 New paragraph 2"))]
-    usfm = r"""\id MAT
+    # No text update
+    rows = []
+    pt_info = []
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+    )
+    result = r"""\id MAT
 \c 1
 \v 1 Old paragraph 1
 \p Old paragraph 2
+"""
+    assess(target, result)
+
+
+def test_split_tokens() -> None:
+    rows = [(scr_ref("MAT 1:1"), "words split words split words split")]
+    usfm = r"""\id MAT
+\c 1
+\v 1 words spl
+\p it words spl
+\p it words split
 """
 
     pt_info = [
@@ -424,9 +422,9 @@ def test_changed_text():
             corpusId="",
             textId="",
             refs=["MAT 1:1"],
-            translation="Changed paragraph 1 Changed paragraph 2",
-            source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"],
-            translation_toks=["Changed", "paragraph", "1", "Changed", "paragraph", "2"],
+            translation="words split words split words split",
+            source_toks=["words", "split", "words", "split", "words", "split"],
+            translation_toks=["words", "split", "words", "split", "words", "split"],
             alignment="0-0 1-1 2-2 3-3 4-4 5-5",
         ),
     ]
@@ -438,8 +436,9 @@ def test_changed_text():
     )
     result = r"""\id MAT
 \c 1
-\v 1 New paragraph 1 New paragraph 2
-\p
+\v 1 words split
+\p words split
+\p words split
 """
     assess(target, result)
 
@@ -476,8 +475,5 @@ def update_usfm(
 
 def assess(target: Optional[str], truth: str) -> None:
     assert target is not None
-    for target_line, truth_line in zip(target.split("\n"), truth.split("\n")):
-        print(truth_line)
-        print(target_line)
     for target_line, truth_line in zip(target.split("\n"), truth.split("\n")):
         assert target_line.strip() == truth_line.strip()

From 6d108178a17352f1798772df102e259fcb58859c Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Tue, 13 May 2025 18:40:19 -0400
Subject: [PATCH 4/6] Fix imports, use separate AlignmentInfo type

---
 machine/corpora/__init__.py                   |   3 +
 ...place_markers_usfm_update_block_handler.py |  39 +++++--
 ...place_markers_usfm_update_block_handler.py | 104 +++++++-----------
 3 files changed, 68 insertions(+), 78 deletions(-)

diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
index 523604c0..66c36ca3 100644
--- a/machine/corpora/__init__.py
+++ b/machine/corpora/__init__.py
@@ -23,6 +23,7 @@
 from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
 from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
 from .paratext_text_corpus import ParatextTextCorpus
+from .place_markers_usfm_update_block_handler import AlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
 from .scripture_element import ScriptureElement
 from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
 from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
@@ -79,6 +80,7 @@
     "AlignedWordPair",
     "AlignmentCollection",
     "AlignmentCorpus",
+    "AlignmentInfo",
     "AlignmentRow",
     "batch",
     "Corpus",
@@ -112,6 +114,7 @@
     "ParatextProjectTermsParserBase",
     "ParatextProjectTextUpdaterBase",
     "ParatextTextCorpus",
+    "PlaceMarkersUsfmUpdateBlockHandler",
     "parse_usfm",
     "RtlReferenceOrder",
     "ScriptureElement",
diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py
index ee6f86cf..f18dd617 100644
--- a/machine/corpora/place_markers_usfm_update_block_handler.py
+++ b/machine/corpora/place_markers_usfm_update_block_handler.py
@@ -1,20 +1,25 @@
 from __future__ import annotations
 
-from typing import List, Sequence
+from typing import List, Sequence, TypedDict
 
-from ..jobs.eflomal_aligner import to_word_alignment_matrix
-from ..jobs.translation_file_service import PretranslationInfo
-from ..translation import WordAlignmentMatrix
+from ..translation.word_alignment_matrix import AlignedWordPair, WordAlignmentMatrix
 from .usfm_token import UsfmToken, UsfmTokenType
 from .usfm_update_block import UsfmUpdateBlock
 from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
 from .usfm_update_block_handler import UsfmUpdateBlockHandler
 
 
+class AlignmentInfo(TypedDict):
+    refs: List[str]
+    source_toks: List[str]
+    translation_toks: List[str]
+    alignment: str
+
+
 class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
 
-    def __init__(self, pt_info: Sequence[PretranslationInfo]) -> None:
-        self._pt_info = {info["refs"][0]: info for info in pt_info}
+    def __init__(self, align_info: Sequence[AlignmentInfo]) -> None:
+        self._align_info = {info["refs"][0]: info for info in align_info}
 
     def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
         ref = str(block.refs[0])
@@ -23,8 +28,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
         # Nothing to do if there are no markers to place or no alignment to use
         if (
             len(elements) == 0
-            or ref not in self._pt_info.keys()
-            or len(self._pt_info[ref]["alignment"]) == 0
+            or ref not in self._align_info.keys()
+            or len(self._align_info[ref]["alignment"]) == 0
             or not any(
                 (
                     e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
@@ -58,8 +63,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
             ):
                 eob_empty_paras = False
 
-        src_toks = self._pt_info[ref]["source_toks"]
-        trg_toks = self._pt_info[ref]["translation_toks"]
+        src_toks = self._align_info[ref]["source_toks"]
+        trg_toks = self._align_info[ref]["translation_toks"]
         src_tok_idx = 0
 
         src_sent = ""
@@ -96,7 +101,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
 
         # Predict marker placements and get insertion order
         to_insert = []
-        alignment = to_word_alignment_matrix(self._pt_info[ref]["alignment"])
+        alignment = to_word_alignment_matrix(self._align_info[ref]["alignment"])
         for element, adj_src_tok in zip(to_place, adj_src_toks):
             adj_trg_tok = self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks)
             trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent)
@@ -202,3 +207,15 @@ def num_align_crossings(src_idx: int, trg_idx: int) -> int:
 
         # If no alignments found, insert at the end of the sentence
         return best_hyp if best_hyp != -1 else len(trg_toks)
+
+
+def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix:
+    word_pairs = AlignedWordPair.from_string(alignment_str)
+    row_count = 0
+    column_count = 0
+    for pair in word_pairs:
+        if pair.source_index + 1 > row_count:
+            row_count = pair.source_index + 1
+        if pair.target_index + 1 > column_count:
+            column_count = pair.target_index + 1
+    return WordAlignmentMatrix.from_word_pairs(row_count, column_count, word_pairs)
diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py
index 40f79f5a..ebe73547 100644
--- a/tests/corpora/test_place_markers_usfm_update_block_handler.py
+++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py
@@ -1,15 +1,15 @@
 from typing import List, Optional, Sequence, Tuple
 
 from machine.corpora import (
+    AlignmentInfo,
+    PlaceMarkersUsfmUpdateBlockHandler,
     ScriptureRef,
     UpdateUsfmMarkerBehavior,
     UpdateUsfmParserHandler,
     UpdateUsfmTextBehavior,
+    UsfmUpdateBlockHandler,
     parse_usfm,
 )
-from machine.corpora.place_markers_usfm_update_block_handler import PlaceMarkersUsfmUpdateBlockHandler
-from machine.corpora.usfm_update_block_handler import UsfmUpdateBlockHandler
-from machine.jobs.translation_file_service import PretranslationInfo
 from machine.tokenization import LatinWordTokenizer
 
 TOKENIZER = LatinWordTokenizer()
@@ -26,12 +26,9 @@ def test_paragraph_markers() -> None:
 \p and this test is for paragraph markers.
 """
 
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=["MAT 1:1"],
-            translation=pretranslation,
             source_toks=[t for t in TOKENIZER.tokenize(source)],
             translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)],
             alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19",
@@ -41,7 +38,7 @@ def test_paragraph_markers() -> None:
         rows,
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -61,12 +58,9 @@ def test_style_markers() -> None:
 \v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers.
 """
 
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=["MAT 1:1"],
-            translation=pretranslation,
             source_toks=[t for t in TOKENIZER.tokenize(source)],
             translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)],
             alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19",
@@ -76,7 +70,7 @@ def test_style_markers() -> None:
         rows,
         usfm,
         style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -90,7 +84,7 @@ def test_style_markers() -> None:
         rows,
         usfm,
         style_behavior=UpdateUsfmMarkerBehavior.STRIP,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -121,12 +115,12 @@ def test_embeds() -> None:
 \v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f*
 """
 
-    pt_info = []
+    align_info = []
     target = update_usfm(
         rows,
         usfm,
         embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -143,7 +137,7 @@ def test_embeds() -> None:
         rows,
         usfm,
         embed_behavior=UpdateUsfmMarkerBehavior.STRIP,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -167,12 +161,9 @@ def test_trailing_empty_paragraphs() -> None:
 \q1 \f embed \f*
 """
 
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=["MAT 1:1"],
-            translation="New verse 1",
             source_toks=["Verse" "1"],
             translation_toks=["New", "verse", "1"],
             alignment="0-1 1-2",
@@ -182,7 +173,7 @@ def test_trailing_empty_paragraphs() -> None:
         rows,
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -220,21 +211,15 @@ def test_headers() -> None:
 \s1 Header to be updated
 """
 
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=["MAT 1:1"],
-            translation="X Y Z",
             source_toks=["A", "B", "C"],
             translation_toks=["X", "Y", "Z"],
             alignment="0-0 1-1 2-2",
         ),
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+        AlignmentInfo(
             refs=["MAT 1:2"],
-            translation="X",
             source_toks=["A"],
             translation_toks=["X"],
             alignment="0-0",
@@ -244,7 +229,7 @@ def test_headers() -> None:
         rows,
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -275,12 +260,9 @@ def test_consecutive_markers() -> None:
 \p \qt \+w word \+w* \qt*
 """
 
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=["MAT 1:1"],
-            translation="New verse 1 WORD",
             source_toks=["Old", "verse", "1", "word"],
             translation_toks=["New", "verse", "1", "WORD"],
             alignment="0-0 1-1 2-2 3-3",
@@ -291,7 +273,7 @@ def test_consecutive_markers() -> None:
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
         style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -309,12 +291,9 @@ def test_verse_ranges() -> None:
 \p old paragraph 2
 """
 
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)],
-            translation="New verse range text new paragraph 2",
             source_toks=["Verse", "range", "old", "paragraph", "2"],
             translation_toks=["New", "verse", "range", "text", "new", "paragraph", "2"],
             alignment="0-1 1-2 2-4 3-5 4-6",
@@ -324,7 +303,7 @@ def test_verse_ranges() -> None:
         rows,
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -343,12 +322,9 @@ def test_no_update() -> None:
 """
 
     # Strip paragraphs
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=["MAT 1:1"],
-            translation="New paragraph 1 New paragraph 2",
             source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"],
             translation_toks=["New", "paragraph", "1", "New", "paragraph", "2"],
             alignment="0-0 1-1 2-2 3-3 4-4 5-5",
@@ -358,7 +334,7 @@ def test_no_update() -> None:
         rows,
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -367,12 +343,9 @@ def test_no_update() -> None:
     assess(target, result)
 
     # No alignment
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=["MAT 1:1"],
-            translation="New paragraph 1 New paragraph 2",
             source_toks=[],
             translation_toks=[],
             alignment="",
@@ -382,7 +355,7 @@ def test_no_update() -> None:
         rows,
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -393,12 +366,12 @@ def test_no_update() -> None:
 
     # No text update
     rows = []
-    pt_info = []
+    align_info = []
     target = update_usfm(
         rows,
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1
@@ -417,12 +390,9 @@ def test_split_tokens() -> None:
 \p it words split
 """
 
-    pt_info = [
-        PretranslationInfo(
-            corpusId="",
-            textId="",
+    align_info = [
+        AlignmentInfo(
             refs=["MAT 1:1"],
-            translation="words split words split words split",
             source_toks=["words", "split", "words", "split", "words", "split"],
             translation_toks=["words", "split", "words", "split", "words", "split"],
             alignment="0-0 1-1 2-2 3-3 4-4 5-5",
@@ -432,7 +402,7 @@ def test_split_tokens() -> None:
         rows,
         usfm,
         paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
-        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)],
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
     )
     result = r"""\id MAT
 \c 1

From d942cbdc724e5ce470ba7df4dbeaf9abf11ed048 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Thu, 15 May 2025 00:12:48 -0400
Subject: [PATCH 5/6] Adjust (PlaceMarkers)AlignmentInfo type

---
 machine/corpora/__init__.py                   |   4 +-
 ...place_markers_usfm_update_block_handler.py |  38 +++----
 ...place_markers_usfm_update_block_handler.py | 100 +++++++++++-------
 3 files changed, 75 insertions(+), 67 deletions(-)

diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
index 66c36ca3..314b35b2 100644
--- a/machine/corpora/__init__.py
+++ b/machine/corpora/__init__.py
@@ -23,7 +23,7 @@
 from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
 from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
 from .paratext_text_corpus import ParatextTextCorpus
-from .place_markers_usfm_update_block_handler import AlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
+from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
 from .scripture_element import ScriptureElement
 from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
 from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
@@ -80,7 +80,6 @@
     "AlignedWordPair",
     "AlignmentCollection",
     "AlignmentCorpus",
-    "AlignmentInfo",
     "AlignmentRow",
     "batch",
     "Corpus",
@@ -114,6 +113,7 @@
     "ParatextProjectTermsParserBase",
     "ParatextProjectTextUpdaterBase",
     "ParatextTextCorpus",
+    "PlaceMarkersAlignmentInfo",
     "PlaceMarkersUsfmUpdateBlockHandler",
     "parse_usfm",
     "RtlReferenceOrder",
diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py
index f18dd617..7e44e02b 100644
--- a/machine/corpora/place_markers_usfm_update_block_handler.py
+++ b/machine/corpora/place_markers_usfm_update_block_handler.py
@@ -1,24 +1,24 @@
 from __future__ import annotations
 
-from typing import List, Sequence, TypedDict
+from typing import Iterable, List, TypedDict
 
-from ..translation.word_alignment_matrix import AlignedWordPair, WordAlignmentMatrix
+from ..translation.word_alignment_matrix import WordAlignmentMatrix
 from .usfm_token import UsfmToken, UsfmTokenType
 from .usfm_update_block import UsfmUpdateBlock
 from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
 from .usfm_update_block_handler import UsfmUpdateBlockHandler
 
 
-class AlignmentInfo(TypedDict):
+class PlaceMarkersAlignmentInfo(TypedDict):
     refs: List[str]
-    source_toks: List[str]
-    translation_toks: List[str]
-    alignment: str
+    source_tokens: List[str]
+    translation_tokens: List[str]
+    alignment: WordAlignmentMatrix
 
 
 class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
 
-    def __init__(self, align_info: Sequence[AlignmentInfo]) -> None:
+    def __init__(self, align_info: Iterable[PlaceMarkersAlignmentInfo]) -> None:
         self._align_info = {info["refs"][0]: info for info in align_info}
 
     def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
@@ -29,7 +29,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
         if (
             len(elements) == 0
             or ref not in self._align_info.keys()
-            or len(self._align_info[ref]["alignment"]) == 0
+            or self._align_info[ref]["alignment"].row_count == 0
+            or self._align_info[ref]["alignment"].column_count == 0
             or not any(
                 (
                     e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
@@ -63,8 +64,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
             ):
                 eob_empty_paras = False
 
-        src_toks = self._align_info[ref]["source_toks"]
-        trg_toks = self._align_info[ref]["translation_toks"]
+        src_toks = self._align_info[ref]["source_tokens"]
+        trg_toks = self._align_info[ref]["translation_tokens"]
         src_tok_idx = 0
 
         src_sent = ""
@@ -101,9 +102,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
 
         # Predict marker placements and get insertion order
         to_insert = []
-        alignment = to_word_alignment_matrix(self._align_info[ref]["alignment"])
         for element, adj_src_tok in zip(to_place, adj_src_toks):
-            adj_trg_tok = self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks)
+            adj_trg_tok = self._predict_marker_location(
+                self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks
+            )
             trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent)
 
             to_insert.append((trg_str_idx, element))
@@ -207,15 +209,3 @@ def num_align_crossings(src_idx: int, trg_idx: int) -> int:
 
         # If no alignments found, insert at the end of the sentence
         return best_hyp if best_hyp != -1 else len(trg_toks)
-
-
-def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix:
-    word_pairs = AlignedWordPair.from_string(alignment_str)
-    row_count = 0
-    column_count = 0
-    for pair in word_pairs:
-        if pair.source_index + 1 > row_count:
-            row_count = pair.source_index + 1
-        if pair.target_index + 1 > column_count:
-            column_count = pair.target_index + 1
-    return WordAlignmentMatrix.from_word_pairs(row_count, column_count, word_pairs)
diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py
index ebe73547..9f41ce63 100644
--- a/tests/corpora/test_place_markers_usfm_update_block_handler.py
+++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py
@@ -1,7 +1,8 @@
 from typing import List, Optional, Sequence, Tuple
 
 from machine.corpora import (
-    AlignmentInfo,
+    AlignedWordPair,
+    PlaceMarkersAlignmentInfo,
     PlaceMarkersUsfmUpdateBlockHandler,
     ScriptureRef,
     UpdateUsfmMarkerBehavior,
@@ -11,6 +12,7 @@
     parse_usfm,
 )
 from machine.tokenization import LatinWordTokenizer
+from machine.translation import WordAlignmentMatrix
 
 TOKENIZER = LatinWordTokenizer()
 
@@ -27,11 +29,13 @@ def test_paragraph_markers() -> None:
 """
 
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:1"],
-            source_toks=[t for t in TOKENIZER.tokenize(source)],
-            translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)],
-            alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19",
+            source_tokens=[t for t in TOKENIZER.tokenize(source)],
+            translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)],
+            alignment=to_word_alignment_matrix(
+                "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19"
+            ),
         ),
     ]
     target = update_usfm(
@@ -59,11 +63,13 @@ def test_style_markers() -> None:
 """
 
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:1"],
-            source_toks=[t for t in TOKENIZER.tokenize(source)],
-            translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)],
-            alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19",
+            source_tokens=[t for t in TOKENIZER.tokenize(source)],
+            translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)],
+            alignment=to_word_alignment_matrix(
+                "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19"
+            ),
         ),
     ]
     target = update_usfm(
@@ -162,11 +168,11 @@ def test_trailing_empty_paragraphs() -> None:
 """
 
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:1"],
-            source_toks=["Verse" "1"],
-            translation_toks=["New", "verse", "1"],
-            alignment="0-1 1-2",
+            source_tokens=["Verse", "1"],
+            translation_tokens=["New", "verse", "1"],
+            alignment=to_word_alignment_matrix("0-1 1-2"),
         ),
     ]
     target = update_usfm(
@@ -212,17 +218,17 @@ def test_headers() -> None:
 """
 
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:1"],
-            source_toks=["A", "B", "C"],
-            translation_toks=["X", "Y", "Z"],
-            alignment="0-0 1-1 2-2",
+            source_tokens=["A", "B", "C"],
+            translation_tokens=["X", "Y", "Z"],
+            alignment=to_word_alignment_matrix("0-0 1-1 2-2"),
         ),
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:2"],
-            source_toks=["A"],
-            translation_toks=["X"],
-            alignment="0-0",
+            source_tokens=["A"],
+            translation_tokens=["X"],
+            alignment=to_word_alignment_matrix("0-0"),
         ),
     ]
     target = update_usfm(
@@ -261,11 +267,11 @@ def test_consecutive_markers() -> None:
 """
 
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:1"],
-            source_toks=["Old", "verse", "1", "word"],
-            translation_toks=["New", "verse", "1", "WORD"],
-            alignment="0-0 1-1 2-2 3-3",
+            source_tokens=["Old", "verse", "1", "word"],
+            translation_tokens=["New", "verse", "1", "WORD"],
+            alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3"),
         ),
     ]
     target = update_usfm(
@@ -292,11 +298,11 @@ def test_verse_ranges() -> None:
 """
 
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)],
-            source_toks=["Verse", "range", "old", "paragraph", "2"],
-            translation_toks=["New", "verse", "range", "text", "new", "paragraph", "2"],
-            alignment="0-1 1-2 2-4 3-5 4-6",
+            source_tokens=["Verse", "range", "old", "paragraph", "2"],
+            translation_tokens=["New", "verse", "range", "text", "new", "paragraph", "2"],
+            alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5 4-6"),
         ),
     ]
     target = update_usfm(
@@ -323,11 +329,11 @@ def test_no_update() -> None:
 
     # Strip paragraphs
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:1"],
-            source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"],
-            translation_toks=["New", "paragraph", "1", "New", "paragraph", "2"],
-            alignment="0-0 1-1 2-2 3-3 4-4 5-5",
+            source_tokens=["Old", "paragraph", "1", "Old", "paragraph", "2"],
+            translation_tokens=["New", "paragraph", "1", "New", "paragraph", "2"],
+            alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"),
         ),
     ]
     target = update_usfm(
@@ -344,11 +350,11 @@ def test_no_update() -> None:
 
     # No alignment
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:1"],
-            source_toks=[],
-            translation_toks=[],
-            alignment="",
+            source_tokens=[],
+            translation_tokens=[],
+            alignment=to_word_alignment_matrix(""),
         ),
     ]
     target = update_usfm(
@@ -391,11 +397,11 @@ def test_split_tokens() -> None:
 """
 
     align_info = [
-        AlignmentInfo(
+        PlaceMarkersAlignmentInfo(
             refs=["MAT 1:1"],
-            source_toks=["words", "split", "words", "split", "words", "split"],
-            translation_toks=["words", "split", "words", "split", "words", "split"],
-            alignment="0-0 1-1 2-2 3-3 4-4 5-5",
+            source_tokens=["words", "split", "words", "split", "words", "split"],
+            translation_tokens=["words", "split", "words", "split", "words", "split"],
+            alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"),
         ),
     ]
     target = update_usfm(
@@ -417,6 +423,18 @@ def scr_ref(*refs: str) -> List[ScriptureRef]:
     return [ScriptureRef.parse(ref) for ref in refs]
 
 
+def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix:
+    """Parse alignment_str into word pairs and build a matrix just large enough to hold them."""
+    pairs = AlignedWordPair.from_string(alignment_str)
+    # Each dimension ends up one more than the largest index seen on that side;
+    # both stay 0 when the string yields no pairs.
+    rows = columns = 0
+    for pair in pairs:
+        rows = max(rows, pair.source_index + 1)
+        columns = max(columns, pair.target_index + 1)
+    return WordAlignmentMatrix.from_word_pairs(rows, columns, pairs)
+
+
 def update_usfm(
     rows: Sequence[Tuple[Sequence[ScriptureRef], str]],
     source: str,

From 96e85e393aae6ffdc4e6363cf22321ed6affe7aa Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Thu, 15 May 2025 17:49:12 -0400
Subject: [PATCH 6/6] 'toks' --> 'tokens'

---
 machine/jobs/nmt_engine_build_job.py     |  4 ++--
 machine/jobs/translation_file_service.py |  8 ++++----
 tests/jobs/test_nmt_engine_build_job.py  | 12 ++++++------
 tests/jobs/test_smt_engine_build_job.py  |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py
index b7b2afbc..5bb120a8 100644
--- a/machine/jobs/nmt_engine_build_job.py
+++ b/machine/jobs/nmt_engine_build_job.py
@@ -157,8 +157,8 @@ def _align(
             check_canceled()
 
         for i in range(len(pretranslations)):
-            pretranslations[i]["source_toks"] = list(src_tokenized[i])
-            pretranslations[i]["translation_toks"] = list(trg_tokenized[i])
+            pretranslations[i]["source_tokens"] = list(src_tokenized[i])
+            pretranslations[i]["translation_tokens"] = list(trg_tokenized[i])
             pretranslations[i]["alignment"] = alignments[i]
 
         return pretranslations
diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py
index 54c4ae90..f29e1005 100644
--- a/machine/jobs/translation_file_service.py
+++ b/machine/jobs/translation_file_service.py
@@ -16,8 +16,8 @@ class PretranslationInfo(TypedDict):
     textId: str  # noqa: N815
     refs: List[str]
     translation: str
-    source_toks: List[str]
-    translation_toks: List[str]
+    source_tokens: List[str]
+    translation_tokens: List[str]
     alignment: str
 
 
@@ -65,8 +65,8 @@ def generator() -> Generator[PretranslationInfo, None, None]:
                         textId=pi["textId"],
                         refs=list(pi["refs"]),
                         translation=pi["translation"],
-                        source_toks=list(pi["source_toks"]),
-                        translation_toks=list(pi["translation_toks"]),
+                        source_tokens=list(pi["source_tokens"]),
+                        translation_tokens=list(pi["translation_tokens"]),
                         alignment=pi["alignment"],
                     )
 
diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py
index 227b909b..014fc743 100644
--- a/tests/jobs/test_nmt_engine_build_job.py
+++ b/tests/jobs/test_nmt_engine_build_job.py
@@ -38,7 +38,7 @@ def test_run(decoy: Decoy) -> None:
     assert len(pretranslations) == 1
     assert pretranslations[0]["translation"] == "Please, I have booked a room."
     if is_eflomal_available():
-        assert pretranslations[0]["source_toks"] == [
+        assert pretranslations[0]["source_tokens"] == [
             "Por",
             "favor",
             ",",
@@ -48,11 +48,11 @@ def test_run(decoy: Decoy) -> None:
             "habitación",
             ".",
         ]
-        assert pretranslations[0]["translation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."]
+        assert pretranslations[0]["translation_tokens"] == ["Please", ",", "I", "have", "booked", "a", "room", "."]
         assert len(pretranslations[0]["alignment"]) > 0
     else:
-        assert pretranslations[0]["source_toks"] == []
-        assert pretranslations[0]["translation_toks"] == []
+        assert pretranslations[0]["source_tokens"] == []
+        assert pretranslations[0]["translation_tokens"] == []
         assert len(pretranslations[0]["alignment"]) == 0
     decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1)
 
@@ -131,8 +131,8 @@ def __init__(self, decoy: Decoy) -> None:
                             textId="text1",
                             refs=["ref1"],
                             translation="Por favor, tengo reservada una habitación.",
-                            source_toks=[],
-                            translation_toks=[],
+                            source_tokens=[],
+                            translation_tokens=[],
                             alignment="",
                         )
                     ]
diff --git a/tests/jobs/test_smt_engine_build_job.py b/tests/jobs/test_smt_engine_build_job.py
index 16afcacf..eff4649f 100644
--- a/tests/jobs/test_smt_engine_build_job.py
+++ b/tests/jobs/test_smt_engine_build_job.py
@@ -137,8 +137,8 @@ def __init__(self, decoy: Decoy) -> None:
                             textId="text1",
                             refs=["ref1"],
                             translation="Por favor, tengo reservada una habitación.",
-                            source_toks=[],
-                            translation_toks=[],
+                            source_tokens=[],
+                            translation_tokens=[],
                             alignment="",
                         )
                     ]