From cf993b4d6ae302ccc043ec8e25223988433fa71e Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Thu, 8 May 2025 23:45:24 -0400 Subject: [PATCH 1/6] Marker placement update block handler --- ...place_markers_usfm_update_block_handler.py | 259 ++++++++++ ...place_markers_usfm_update_block_handler.py | 483 ++++++++++++++++++ 2 files changed, 742 insertions(+) create mode 100644 machine/corpora/place_markers_usfm_update_block_handler.py create mode 100644 tests/corpora/test_place_markers_usfm_update_block_handler.py diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py new file mode 100644 index 00000000..514a3022 --- /dev/null +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -0,0 +1,259 @@ +from __future__ import annotations + +from typing import List, Sequence + +from ..jobs.eflomal_aligner import to_word_alignment_matrix +from ..jobs.translation_file_service import PretranslationInfo +from ..scripture.verse_ref import VerseRef +from ..tokenization import LatinWordTokenizer +from ..translation import WordAlignmentMatrix +from .aligned_word_pair import AlignedWordPair +from .usfm_stylesheet import UsfmStylesheet +from .usfm_tag import UsfmTextType +from .usfm_token import UsfmToken, UsfmTokenType +from .usfm_update_block import UsfmUpdateBlock +from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType +from .usfm_update_block_handler import UsfmUpdateBlockHandler + +TOKENIZER = LatinWordTokenizer() +STYLESHEET = UsfmStylesheet("usfm.sty") + + +class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): + + def __init__(self, pt_info: Sequence[PretranslationInfo]): + self._pt_info = {} + for info in pt_info: + if len(info["refs"]) == 1: + ref_str = info["refs"][0] + else: + ref_str_start = VerseRef.from_string(info["refs"][0]) + ref_str_end = VerseRef.from_string(info["refs"][-1]) + ref_str = str(VerseRef.from_range(ref_str_start, 
ref_str_end)) + self._pt_info[ref_str] = info + # self._pt_info = {info["refs"][0]: info for info in pt_info} + + def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: + block_ref = str( + block.refs[0] + if len(block.refs) == 1 + else VerseRef.from_range(block.refs[0].verse_ref, block.refs[-1].verse_ref) + ) + + # Nothing to do if there are no markers to place, no alignment to use, or if the block represents an embed + if ( + len(block.elements) == 0 + or block_ref not in self._pt_info.keys() + or len(self._pt_info[block_ref]["alignment"]) == 0 + # TODO: is this too restrictive? + or block.elements[0].tokens[0].marker != "v" + or not any( + ( + element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE] + and not element.marked_for_removal + ) + for element in block.elements[1:] # TODO: all of block + ) + ): + return block + + # Work on a copy in case the block needs to be returned unchanged + orig_elements = list(block.elements) + + src_sent = "" + trg_sent = "" + to_place = [] + src_marker_idxs = [] + placed_elements = [orig_elements[0]] # TODO: no elements to start + ignored_elements = [] + + # Section headers should be ignored but re-inserted in the same position relative to other paragraph markers + header_elements = [] + para_markers_left = 0 + for i, element in reversed(list(enumerate(orig_elements))): + if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal: + if STYLESHEET.get_tag(str(element.tokens[0].marker)).text_type == UsfmTextType.SECTION: + # if i < len(orig_elements) - 1 and orig_elements[i + 1].type == UsfmUpdateBlockElementType.TEXT: + # header_elements.insert(0, (para_markers_left, [element, orig_elements.pop(i + 1)])) + # else: + header_elements.insert(0, (para_markers_left, element)) + orig_elements.pop(i) + else: + para_markers_left += 1 + + # Paragraph markers at the end of the block should stay there + end_elements = [] + for i, element in 
reversed(list(enumerate(orig_elements))): + if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal: + end_elements.insert(0, element) + orig_elements.pop(i) + elif element.type != UsfmUpdateBlockElementType.EMBED: + break + + for element in orig_elements[1:]: # TODO: all + if element.type == UsfmUpdateBlockElementType.TEXT: + if element.marked_for_removal: + src_sent += element.tokens[0].to_usfm() + else: + trg_sent += element.tokens[0].to_usfm() + + if element.marked_for_removal or element.type == UsfmUpdateBlockElementType.EMBED: + ignored_elements.append(element) + elif element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]: + to_place.append(element) + src_marker_idxs.append(len(src_sent)) + + src_toks = self._pt_info[block_ref]["source_toks"] + trg_toks = self._pt_info[block_ref]["translation_toks"] + + # Don't do anything if the source sentence or pretranslation has changed + if ( + list(t for t in TOKENIZER.tokenize(src_sent)) != src_toks + or list(t for t in TOKENIZER.tokenize(trg_sent)) != trg_toks + ): + return block + + src_tok_starts = [] + for tok in src_toks: + src_tok_starts.append(src_sent.index(tok, src_tok_starts[-1] + 1 if len(src_tok_starts) > 0 else 0)) + trg_tok_starts = [] + for tok in trg_toks: + trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + 1 if len(trg_tok_starts) > 0 else 0)) + + # Get index of the text token immediately following each marker + # and predict the corresponding token on the target side + adj_src_toks = [] + for idx in src_marker_idxs: + for i, start_idx in reversed(list(enumerate(src_tok_starts))): + if start_idx < idx: + adj_src_toks.append(i + 1) + break + if i == 0: + adj_src_toks.append(i) + + alignment = to_word_alignment_matrix(self._pt_info[block_ref]["alignment"]) + adj_trg_toks = [ + self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks) for adj_src_tok in adj_src_toks + ] + + # Collect the markers to be 
inserted + to_insert = [] + for element, adj_trg_tok in zip(to_place, adj_trg_toks): + trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent) + + # Determine the order of the markers in the sentence to handle ambiguity for directly adjacent markers + insert_pos = 0 + while insert_pos < len(to_insert) and to_insert[insert_pos][0] <= trg_str_idx: + insert_pos += 1 + to_insert.insert(insert_pos, (trg_str_idx, element)) + + # Construct new text tokens to put between markers + # and reincorporate headers and empty end-of-verse paragraph markers + if len(to_insert) == 0 or to_insert[0][0] > 0: + placed_elements.append( + UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + [ + UsfmToken( + UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]] if len(to_insert) > 0 else trg_sent + ) + ], + ) + ) + for j, (insert_idx, element) in enumerate(to_insert): + if element.type == UsfmUpdateBlockElementType.PARAGRAPH: + while len(header_elements) > 0 and header_elements[0][0] == para_markers_left: + placed_elements.append(header_elements.pop(0)[1]) + para_markers_left -= 1 + + placed_elements.append(element) + text_token = UsfmToken( + UsfmTokenType.TEXT, + text=(trg_sent[insert_idx : to_insert[j + 1][0]] if j + 1 < len(to_insert) else trg_sent[insert_idx:]), + ) + placed_elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [text_token])) + for element in end_elements: + while len(header_elements) > 0 and header_elements[0][0] == para_markers_left: + placed_elements.append(header_elements.pop(0)[1]) + para_markers_left -= 1 + placed_elements.append(element) + while len(header_elements) > 0: + placed_elements.append(header_elements.pop(0)[1]) + + block._elements = placed_elements + ignored_elements + return block + + def _predict_marker_location( + self, + alignment: WordAlignmentMatrix, + adj_src_tok: int, + src_toks: List[str], + trg_toks: List[str], + ) -> int: + # Gets the number of alignment pairs that "cross 
the line" between + # the src marker position and the potential trg marker position, (src_idx - .5) and (trg_idx - .5) + def num_align_crossings(src_idx: int, trg_idx: int) -> int: + crossings = 0 + for i in range(alignment.row_count): + for j in range(alignment.column_count): + if alignment[i, j] and ((i < src_idx and j >= trg_idx) or (i >= src_idx and j < trg_idx)): + crossings += 1 + return crossings + + # If the token on either side of a potential target location is punctuation, + # use it as the basis for deciding the target marker location + trg_hyp = -1 + punct_hyps = [-1, 0] + for punct_hyp in punct_hyps: + src_hyp = adj_src_tok + punct_hyp + if src_hyp < 0 or src_hyp >= len(src_toks): + continue + # Only accept aligned pairs where both the src and trg token are punctuation + hyp_tok = src_toks[src_hyp] + if len(hyp_tok) > 0 and not any(c.isalpha() for c in hyp_tok) and src_hyp < alignment.row_count: + aligned_trg_toks = list(alignment.get_row_aligned_indices(src_hyp)) + # If aligning to a token that precedes that marker, + # the trg token predicted to be closest to the marker + # is the last token aligned to the src rather than the first + for trg_idx in reversed(aligned_trg_toks) if punct_hyp < 0 else aligned_trg_toks: + trg_tok = trg_toks[trg_idx] + if len(trg_tok) > 0 and not any(c.isalpha() for c in trg_tok): + trg_hyp = trg_idx + break + if trg_hyp != -1: + # Since the marker location is represented by the token after the marker, + # adjust the index when aligning to punctuation that precedes the token + return trg_hyp + (1 if punct_hyp == -1 else 0) + + hyps = [0, 1, 2] + best_hyp = -1 + best_num_crossings = 200**2 # mostly meaningless, a big number + checked = set() + for hyp in hyps: + src_hyp = adj_src_tok + hyp + if src_hyp in checked: + continue + trg_hyp = -1 + while trg_hyp == -1 and src_hyp >= 0 and src_hyp < alignment.row_count: + checked.add(src_hyp) + aligned_trg_toks = list(alignment.get_row_aligned_indices(src_hyp)) + if 
len(aligned_trg_toks) > 0: + # If aligning with a source token that precedes the marker, + # the target token predicted to be closest to the marker is the last aligned token rather than the first + trg_hyp = aligned_trg_toks[-1 if hyp < 0 else 0] + else: # continue the search outwards + src_hyp += -1 if hyp < 0 else 1 + if trg_hyp != -1: + # TODO: experiment w/ using adj_src_tok instead of src_hyp + # probably doesn't work well w/ word order switches, e.g. eng vs spa noun/adj + # one issue it does fix is markers getting sucked to punctuation + # (could be the source of some of the \w\w* issues) + num_crossings = num_align_crossings(adj_src_tok, trg_hyp) + if num_crossings < best_num_crossings: + best_hyp = trg_hyp + best_num_crossings = num_crossings + if num_crossings == 0: + break + + # If no alignments found, insert at the end of the sentence + return best_hyp if best_hyp != -1 else len(trg_toks) diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py new file mode 100644 index 00000000..ee1779c6 --- /dev/null +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -0,0 +1,483 @@ +from typing import List, Optional, Sequence, Tuple + +from machine.corpora import ( + ScriptureRef, + UpdateUsfmMarkerBehavior, + UpdateUsfmParserHandler, + UpdateUsfmTextBehavior, + parse_usfm, +) +from machine.corpora.place_markers_usfm_update_block_handler import PlaceMarkersUsfmUpdateBlockHandler +from machine.corpora.usfm_update_block_handler import UsfmUpdateBlockHandler +from machine.jobs.translation_file_service import PretranslationInfo +from machine.tokenization import LatinWordTokenizer + +TOKENIZER = LatinWordTokenizer() + + +def test_paragraph_markers(): + source = "This is the first paragraph. This text is in English, and this test is for paragraph markers." + pretranslation = "Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo." 
+ rows = [(scr_ref("MAT 1:1"), str(pretranslation))] + usfm = r"""\id MAT +\c 1 +\v 1 This is the first paragraph. +\p This text is in English, +\p and this test is for paragraph markers. +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation=pretranslation, + source_toks=[t for t in TOKENIZER.tokenize(source)], + translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 Este es el primer párrafo. +\p Este texto está en inglés +\p y esta prueba es para marcadores de párrafo. +""" + assess(target, result) + + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo. 
+""" + assess(target, result) + + +def test_list_paragraph_markers(): + source = "This is a list: First list item Second list item Third list item" + pretranslation = ( + "Esta es una lista: Primer elemento de la lista Segundo elemento de la lista Tercer elemento de la lista" + ) + rows = [(scr_ref("MAT 1:1"), str(pretranslation))] + usfm = r"""\id MAT +\c 1 +\v 1 This is a list: +\li1 First list item +\li1 Second list item +\li1 Third list item +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation=pretranslation, + source_toks=[t for t in TOKENIZER.tokenize(source)], + translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-9 7-6 8-10 9-14 10-11 11-15 12-19 13-16", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 Esta es una lista: +\li1 Primer elemento de la lista +\li1 Segundo elemento de la lista +\li1 Tercer elemento de la lista +""" + assess(target, result) + + +def test_style_markers(): + source = "This is the first sentence. This text is in English, and this test is for style markers." + pretranslation = "Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo." + rows = [(scr_ref("MAT 1:1"), str(pretranslation))] + usfm = r"""\id MAT +\c 1 +\v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers. 
+""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation=pretranslation, + source_toks=[t for t in TOKENIZER.tokenize(source)], + translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19", + ), + ] + target = update_usfm( + rows, + usfm, + style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 Esta es la \w primera \w*oración. Este texto está en \w inglés \w*y esta prueba es \w para \w*marcadores de estilo. +""" + # TODO: the spacing before/after end markers is incorrect, + # but this is an issue with how the USFM is generated from the tokens + assess(target, result) + + target = update_usfm( + rows, + usfm, + style_behavior=UpdateUsfmMarkerBehavior.STRIP, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo. 
+""" + assess(target, result) + + +def test_embeds(): + rows = [ + (scr_ref("MAT 1:1"), str("New verse 1")), + (scr_ref("MAT 1:2"), str("New verse 2")), + (scr_ref("MAT 1:3"), str("New verse 3")), + (scr_ref("MAT 1:4"), str("New verse 4")), + (scr_ref("MAT 1:4/1:f"), str("New embed text")), + (scr_ref("MAT 1:5"), str("New verse 5")), + (scr_ref("MAT 1:6"), str("New verse 6")), + (scr_ref("MAT 1:6/1:f"), str("New verse 6 embed text")), + ] + usfm = r"""\id MAT +\c 1 +\v 1 \f \fr 1.1 \ft Some note \f*Start of sentence embed +\v 2 Middle of sentence \f \fr 1.2 \ft Some other note \f*embed +\v 3 End of sentence embed\f \fr 1.3 \ft A third note \f* +\v 4 Updated embed\f \fr 1.4 \ft A fourth note \f* +\v 5 Embed with style markers \f \fr 1.5 \ft A \+w stylish\+w* note \f* +\v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f* +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation="New verse 1", + source_toks=["Start", "of", "sentence", "embed"], + translation_toks=["New", "verse", "1"], + alignment="", + ), + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:2"], + translation="New verse 2", + source_toks=["Middle", "of", "sentence", "embed"], + translation_toks=["New", "verse", "2"], + alignment="", + ), + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:3"], + translation="New verse 3", + source_toks=["End", "of", "sentence", "embed"], + translation_toks=["New", "verse", "3"], + alignment="", + ), + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:4"], + translation="New verse 4", + source_toks=["Updated", "embed"], + translation_toks=["New", "verse", "4"], + alignment="", + ), + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:4/1:f"], + translation="New embed text", + source_toks=["A", "fourth", "note"], + translation_toks=["New", "embed", "text"], + alignment="", + ), + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 
1:5"], + translation="New verse 5", + source_toks=["Embed", "with", "style", "markers"], + translation_toks=["New", "verse", "5"], + alignment="", + ), + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:6"], + translation="New verse 6", + source_toks=["Updated", "embed", "with", "style", "markers"], + translation_toks=["New", "verse", "6"], + alignment="", + ), + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:6/1:f"], + translation="New verse 6 embed text", + source_toks=["Another", "stylish", "note"], + translation_toks=["New", "verse", "6", "embed", "text"], + alignment="", + ), + ] + target = update_usfm( + rows, + usfm, + embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + # NOTE: currently not updating embeds + result = r"""\id MAT +\c 1 +\v 1 New verse 1 \f \fr 1.1 \ft Some note \f* +\v 2 New verse 2 \f \fr 1.2 \ft Some other note \f* +\v 3 New verse 3 \f \fr 1.3 \ft A third note \f* +\v 4 New verse 4 \f \fr 1.4 \ft A fourth note \f* +\v 5 New verse 5 \f \fr 1.5 \ft A \+w stylish\+w* note \f* +\v 6 New verse 6 \f \fr 1.6 \ft Another \+w stylish\+w* note \f* +""" + assess(target, result) + + target = update_usfm( + rows, + usfm, + embed_behavior=UpdateUsfmMarkerBehavior.STRIP, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New verse 1 +\v 2 New verse 2 +\v 3 New verse 3 +\v 4 New verse 4 +\v 5 New verse 5 +\v 6 New verse 6 +""" + assess(target, result) + + +def test_headers(): + rows = [(scr_ref("MAT 1:1"), "X Y Z"), (scr_ref("MAT 1:2"), "X")] + usfm = r"""\id MAT +\c 1 +\s1 Start of chapter header +\v 1 A +\p B +\s1 Mid-verse header +\p C +\s1 End of verse header +\p +\p +\s1 Header after all paragraphs +\v 2 A +\s1 Header followed by a reference +\r (reference) +\p +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation="X Y Z", + 
source_toks=["A", "B", "C"], + translation_toks=["X", "Y", "Z"], + alignment="0-0 1-1 2-2", + ), + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:2"], + translation="X", + source_toks=["A"], + translation_toks=["X"], + alignment="0-0", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\s1 Start of chapter header +\v 1 X +\p Y +\s1 Mid-verse header +\p Z +\s1 End of verse header +\p +\p +\s1 Header after all paragraphs +\v 2 X +\s1 Header followed by a reference +\r (reference) +\p +""" + assess(target, result) + + +def test_verse_ranges(): + rows = [([ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], "New verse range text")] + usfm = r"""\id MAT +\c 1 +\v 1-5 Verse range +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)], + translation="New verse range text", + source_toks=["Verse", "range"], + translation_toks=["New", "verse", "range", "text"], + alignment="0-1 1-2", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1-5 New verse range text +""" + assess(target, result) + + +def test_no_alignment(): + rows = [(scr_ref("MAT 1:1"), str("New paragraph 1 New paragraph 2"))] + usfm = r"""\id MAT +\c 1 +\v 1 Old paragraph 1 +\p Old paragraph 2 +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation="New paragraph 1 New paragraph 2", + source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"], + translation_toks=["New", "paragraph", "1", "New", "paragraph", "2"], + alignment="", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + 
update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New paragraph 1 New paragraph 2 +\p +""" + assess(target, result) + + +def test_changed_text(): + rows = [(scr_ref("MAT 1:1"), str("New paragraph 1 New paragraph 2"))] + usfm = r"""\id MAT +\c 1 +\v 1 Old paragraph 1 +\p Old paragraph 2 +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation="Changed paragraph 1 Changed paragraph 2", + source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"], + translation_toks=["Changed", "paragraph", "1", "Changed", "paragraph", "2"], + alignment="0-0 1-1 2-2 3-3 4-4 5-5", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New paragraph 1 New paragraph 2 +\p +""" + assess(target, result) + + +def scr_ref(*refs: str) -> List[ScriptureRef]: + return [ScriptureRef.parse(ref) for ref in refs] + + +def update_usfm( + rows: Sequence[Tuple[Sequence[ScriptureRef], str]], + source: str, + id_text: Optional[str] = None, + text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, + paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, + embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, + style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, + preserve_paragraph_styles: Optional[Sequence[str]] = None, + update_block_handlers: Optional[list[UsfmUpdateBlockHandler]] = None, +) -> Optional[str]: + source = source.strip().replace("\r\n", "\n") + "\r\n" + updater = UpdateUsfmParserHandler( + rows, + id_text, + text_behavior, + paragraph_behavior, + embed_behavior, + style_behavior, + preserve_paragraph_styles, + update_block_handlers, + ) + parse_usfm(source, updater) + return updater.get_usfm() + + +def assess(target: Optional[str], 
truth: str) -> None: + assert target is not None + for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): + print(truth_line) + print(target_line) + for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): + assert target_line.strip() == truth_line.strip() From d2b8a11fd1c27f2884ed255b6bc7925ac0162545 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Fri, 9 May 2025 20:03:50 -0400 Subject: [PATCH 2/6] Refactor marker placement handler, small bug fixes --- ...place_markers_usfm_update_block_handler.py | 175 +++++++----------- 1 file changed, 63 insertions(+), 112 deletions(-) diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index 514a3022..576aa42f 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -4,96 +4,85 @@ from ..jobs.eflomal_aligner import to_word_alignment_matrix from ..jobs.translation_file_service import PretranslationInfo -from ..scripture.verse_ref import VerseRef from ..tokenization import LatinWordTokenizer from ..translation import WordAlignmentMatrix -from .aligned_word_pair import AlignedWordPair -from .usfm_stylesheet import UsfmStylesheet -from .usfm_tag import UsfmTextType from .usfm_token import UsfmToken, UsfmTokenType from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType from .usfm_update_block_handler import UsfmUpdateBlockHandler TOKENIZER = LatinWordTokenizer() -STYLESHEET = UsfmStylesheet("usfm.sty") class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): def __init__(self, pt_info: Sequence[PretranslationInfo]): - self._pt_info = {} - for info in pt_info: - if len(info["refs"]) == 1: - ref_str = info["refs"][0] - else: - ref_str_start = VerseRef.from_string(info["refs"][0]) - ref_str_end = VerseRef.from_string(info["refs"][-1]) - 
ref_str = str(VerseRef.from_range(ref_str_start, ref_str_end)) - self._pt_info[ref_str] = info - # self._pt_info = {info["refs"][0]: info for info in pt_info} + self._pt_info = {info["refs"][0]: info for info in pt_info} def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: - block_ref = str( - block.refs[0] - if len(block.refs) == 1 - else VerseRef.from_range(block.refs[0].verse_ref, block.refs[-1].verse_ref) - ) + ref = str(block.refs[0]) + elements = list(block.elements) - # Nothing to do if there are no markers to place, no alignment to use, or if the block represents an embed + # Nothing to do if there are no markers to place or no alignment to use if ( - len(block.elements) == 0 - or block_ref not in self._pt_info.keys() - or len(self._pt_info[block_ref]["alignment"]) == 0 - # TODO: is this too restrictive? - or block.elements[0].tokens[0].marker != "v" + len(elements) == 0 + or ref not in self._pt_info.keys() + or len(self._pt_info[ref]["alignment"]) == 0 or not any( ( - element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE] - and not element.marked_for_removal + e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE] + and not e.marked_for_removal ) - for element in block.elements[1:] # TODO: all of block + for e in elements ) ): return block - # Work on a copy in case the block needs to be returned unchanged - orig_elements = list(block.elements) - - src_sent = "" - trg_sent = "" - to_place = [] - src_marker_idxs = [] - placed_elements = [orig_elements[0]] # TODO: no elements to start - ignored_elements = [] - + # Paragraph markers at the end of the block should stay there # Section headers should be ignored but re-inserted in the same position relative to other paragraph markers + end_elements = [] + eob_empty_paras = True header_elements = [] para_markers_left = 0 - for i, element in reversed(list(enumerate(orig_elements))): + for i, element in reversed(list(enumerate(elements))): 
if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal: - if STYLESHEET.get_tag(str(element.tokens[0].marker)).text_type == UsfmTextType.SECTION: - # if i < len(orig_elements) - 1 and orig_elements[i + 1].type == UsfmUpdateBlockElementType.TEXT: - # header_elements.insert(0, (para_markers_left, [element, orig_elements.pop(i + 1)])) - # else: + if len(element.tokens) > 1: header_elements.insert(0, (para_markers_left, element)) - orig_elements.pop(i) + elements.pop(i) else: para_markers_left += 1 - # Paragraph markers at the end of the block should stay there - end_elements = [] - for i, element in reversed(list(enumerate(orig_elements))): - if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal: - end_elements.insert(0, element) - orig_elements.pop(i) + if eob_empty_paras: + end_elements.insert(0, element) + elements.pop(i) elif element.type != UsfmUpdateBlockElementType.EMBED: - break + eob_empty_paras = False + + src_toks = self._pt_info[ref]["source_toks"] + trg_toks = self._pt_info[ref]["translation_toks"] + src_tok_idx = 0 - for element in orig_elements[1:]: # TODO: all + src_sent = "" + trg_sent = "" + to_place = [] + adj_src_toks = [] + placed_elements = [elements.pop(0)] if elements[0].type == UsfmUpdateBlockElementType.OTHER else [] + ignored_elements = [] + for element in elements: if element.type == UsfmUpdateBlockElementType.TEXT: if element.marked_for_removal: - src_sent += element.tokens[0].to_usfm() + text = element.tokens[0].to_usfm() + src_sent += text + + # Handle tokens split across text elements + if len(text.strip()) > 0 and ( + src_toks[src_tok_idx] not in text or text.strip().index(src_toks[src_tok_idx]) > 0 + ): + src_tok_idx += 1 + # Track seen tokens + while src_tok_idx < len(src_toks) and src_toks[src_tok_idx] in text: + text = text[text.index(src_toks[src_tok_idx]) + len(src_toks[src_tok_idx]) :] + src_tok_idx += 1 else: trg_sent += element.tokens[0].to_usfm() @@ 
-101,63 +90,33 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ignored_elements.append(element) elif element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]: to_place.append(element) - src_marker_idxs.append(len(src_sent)) - - src_toks = self._pt_info[block_ref]["source_toks"] - trg_toks = self._pt_info[block_ref]["translation_toks"] - - # Don't do anything if the source sentence or pretranslation has changed - if ( - list(t for t in TOKENIZER.tokenize(src_sent)) != src_toks - or list(t for t in TOKENIZER.tokenize(trg_sent)) != trg_toks - ): - return block + adj_src_toks.append(src_tok_idx) - src_tok_starts = [] - for tok in src_toks: - src_tok_starts.append(src_sent.index(tok, src_tok_starts[-1] + 1 if len(src_tok_starts) > 0 else 0)) trg_tok_starts = [] for tok in trg_toks: trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + 1 if len(trg_tok_starts) > 0 else 0)) - # Get index of the text token immediately following each marker - # and predict the corresponding token on the target side - adj_src_toks = [] - for idx in src_marker_idxs: - for i, start_idx in reversed(list(enumerate(src_tok_starts))): - if start_idx < idx: - adj_src_toks.append(i + 1) - break - if i == 0: - adj_src_toks.append(i) - - alignment = to_word_alignment_matrix(self._pt_info[block_ref]["alignment"]) - adj_trg_toks = [ - self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks) for adj_src_tok in adj_src_toks - ] - - # Collect the markers to be inserted + # Predict marker placements and get insertion order to_insert = [] - for element, adj_trg_tok in zip(to_place, adj_trg_toks): + alignment = to_word_alignment_matrix(self._pt_info[ref]["alignment"]) + for element, adj_src_tok in zip(to_place, adj_src_toks): + adj_trg_tok = self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks) trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent) - # Determine the 
order of the markers in the sentence to handle ambiguity for directly adjacent markers - insert_pos = 0 - while insert_pos < len(to_insert) and to_insert[insert_pos][0] <= trg_str_idx: - insert_pos += 1 - to_insert.insert(insert_pos, (trg_str_idx, element)) + to_insert.append((trg_str_idx, element)) + to_insert.sort(key=lambda x: x[0]) + to_insert += [(len(trg_sent), element) for element in end_elements] # Construct new text tokens to put between markers # and reincorporate headers and empty end-of-verse paragraph markers - if len(to_insert) == 0 or to_insert[0][0] > 0: + if len(to_insert) == 0: + placed_elements.append( + UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent)]) + ) + elif to_insert[0][0] > 0: placed_elements.append( UsfmUpdateBlockElement( - UsfmUpdateBlockElementType.TEXT, - [ - UsfmToken( - UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]] if len(to_insert) > 0 else trg_sent - ) - ], + UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]])] ) ) for j, (insert_idx, element) in enumerate(to_insert): @@ -167,16 +126,12 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: para_markers_left -= 1 placed_elements.append(element) - text_token = UsfmToken( - UsfmTokenType.TEXT, - text=(trg_sent[insert_idx : to_insert[j + 1][0]] if j + 1 < len(to_insert) else trg_sent[insert_idx:]), - ) - placed_elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [text_token])) - for element in end_elements: - while len(header_elements) > 0 and header_elements[0][0] == para_markers_left: - placed_elements.append(header_elements.pop(0)[1]) - para_markers_left -= 1 - placed_elements.append(element) + if insert_idx < len(trg_sent) and (j + 1 == len(to_insert) or insert_idx < to_insert[j + 1][0]): + if j + 1 < len(to_insert): + text_token = UsfmToken(UsfmTokenType.TEXT, text=(trg_sent[insert_idx : to_insert[j + 1][0]])) + else: + text_token = 
UsfmToken(UsfmTokenType.TEXT, text=(trg_sent[insert_idx:])) + placed_elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [text_token])) while len(header_elements) > 0: placed_elements.append(header_elements.pop(0)[1]) @@ -244,10 +199,6 @@ def num_align_crossings(src_idx: int, trg_idx: int) -> int: else: # continue the search outwards src_hyp += -1 if hyp < 0 else 1 if trg_hyp != -1: - # TODO: experiment w/ using adj_src_tok instead of src_hyp - # probably doesn't work well w/ word order switches, e.g. eng vs spa noun/adj - # one issue it does fix is markers getting sucked to punctuation - # (could be the source of some of the \w\w* issues) num_crossings = num_align_crossings(adj_src_tok, trg_hyp) if num_crossings < best_num_crossings: best_hyp = trg_hyp From bbf097a03f9793763eff3b73b212dff745d9f50d Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Fri, 9 May 2025 22:22:37 -0400 Subject: [PATCH 3/6] Extend and clean up tests, more code cleanup --- ...place_markers_usfm_update_block_handler.py | 24 +- ...place_markers_usfm_update_block_handler.py | 320 +++++++++--------- 2 files changed, 167 insertions(+), 177 deletions(-) diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index 576aa42f..ee6f86cf 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -4,19 +4,16 @@ from ..jobs.eflomal_aligner import to_word_alignment_matrix from ..jobs.translation_file_service import PretranslationInfo -from ..tokenization import LatinWordTokenizer from ..translation import WordAlignmentMatrix from .usfm_token import UsfmToken, UsfmTokenType from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType from .usfm_update_block_handler import UsfmUpdateBlockHandler -TOKENIZER = LatinWordTokenizer() - class 
PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): - def __init__(self, pt_info: Sequence[PretranslationInfo]): + def __init__(self, pt_info: Sequence[PretranslationInfo]) -> None: self._pt_info = {info["refs"][0]: info for info in pt_info} def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: @@ -55,7 +52,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: if eob_empty_paras: end_elements.insert(0, element) elements.pop(i) - elif element.type != UsfmUpdateBlockElementType.EMBED: + elif not ( + element.type == UsfmUpdateBlockElementType.EMBED + or (element.type == UsfmUpdateBlockElementType.TEXT and len(element.tokens[0].to_usfm().strip()) == 0) + ): eob_empty_paras = False src_toks = self._pt_info[ref]["source_toks"] @@ -74,15 +74,13 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: text = element.tokens[0].to_usfm() src_sent += text - # Handle tokens split across text elements - if len(text.strip()) > 0 and ( - src_toks[src_tok_idx] not in text or text.strip().index(src_toks[src_tok_idx]) > 0 - ): - src_tok_idx += 1 # Track seen tokens while src_tok_idx < len(src_toks) and src_toks[src_tok_idx] in text: text = text[text.index(src_toks[src_tok_idx]) + len(src_toks[src_tok_idx]) :] src_tok_idx += 1 + # Handle tokens split across text elements + if len(text.strip()) > 0: + src_tok_idx += 1 else: trg_sent += element.tokens[0].to_usfm() @@ -109,11 +107,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: # Construct new text tokens to put between markers # and reincorporate headers and empty end-of-verse paragraph markers - if len(to_insert) == 0: - placed_elements.append( - UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent)]) - ) - elif to_insert[0][0] > 0: + if to_insert[0][0] > 0: placed_elements.append( UsfmUpdateBlockElement( UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]])] diff 
--git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index ee1779c6..40f79f5a 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -15,7 +15,7 @@ TOKENIZER = LatinWordTokenizer() -def test_paragraph_markers(): +def test_paragraph_markers() -> None: source = "This is the first paragraph. This text is in English, and this test is for paragraph markers." pretranslation = "Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo." rows = [(scr_ref("MAT 1:1"), str(pretranslation))] @@ -51,61 +51,8 @@ def test_paragraph_markers(): """ assess(target, result) - target = update_usfm( - rows, - usfm, - paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], - ) - result = r"""\id MAT -\c 1 -\v 1 Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo. 
-""" - assess(target, result) - - -def test_list_paragraph_markers(): - source = "This is a list: First list item Second list item Third list item" - pretranslation = ( - "Esta es una lista: Primer elemento de la lista Segundo elemento de la lista Tercer elemento de la lista" - ) - rows = [(scr_ref("MAT 1:1"), str(pretranslation))] - usfm = r"""\id MAT -\c 1 -\v 1 This is a list: -\li1 First list item -\li1 Second list item -\li1 Third list item -""" - - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", - refs=["MAT 1:1"], - translation=pretranslation, - source_toks=[t for t in TOKENIZER.tokenize(source)], - translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)], - alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-9 7-6 8-10 9-14 10-11 11-15 12-19 13-16", - ), - ] - target = update_usfm( - rows, - usfm, - paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], - ) - result = r"""\id MAT -\c 1 -\v 1 Esta es una lista: -\li1 Primer elemento de la lista -\li1 Segundo elemento de la lista -\li1 Tercer elemento de la lista -""" - assess(target, result) - -def test_style_markers(): +def test_style_markers() -> None: source = "This is the first sentence. This text is in English, and this test is for style markers." pretranslation = "Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo." rows = [(scr_ref("MAT 1:1"), str(pretranslation))] @@ -135,7 +82,7 @@ def test_style_markers(): \c 1 \v 1 Esta es la \w primera \w*oración. Este texto está en \w inglés \w*y esta prueba es \w para \w*marcadores de estilo. 
""" - # TODO: the spacing before/after end markers is incorrect, + # NOTE: the spacing before/after end markers is incorrect, # but this is an issue with how the is USFM is generated from the tokens assess(target, result) @@ -152,16 +99,17 @@ def test_style_markers(): assess(target, result) -def test_embeds(): +# NOTE: Not currently updating embeds, will need to change test when we do +def test_embeds() -> None: rows = [ - (scr_ref("MAT 1:1"), str("New verse 1")), - (scr_ref("MAT 1:2"), str("New verse 2")), - (scr_ref("MAT 1:3"), str("New verse 3")), - (scr_ref("MAT 1:4"), str("New verse 4")), - (scr_ref("MAT 1:4/1:f"), str("New embed text")), - (scr_ref("MAT 1:5"), str("New verse 5")), - (scr_ref("MAT 1:6"), str("New verse 6")), - (scr_ref("MAT 1:6/1:f"), str("New verse 6 embed text")), + (scr_ref("MAT 1:1"), "New verse 1"), + (scr_ref("MAT 1:2"), "New verse 2"), + (scr_ref("MAT 1:3"), "New verse 3"), + (scr_ref("MAT 1:4"), "New verse 4"), + (scr_ref("MAT 1:4/1:f"), "New embed text"), + (scr_ref("MAT 1:5"), "New verse 5"), + (scr_ref("MAT 1:6"), "New verse 6"), + (scr_ref("MAT 1:6/1:f"), "New verse 6 embed text"), ] usfm = r"""\id MAT \c 1 @@ -173,87 +121,13 @@ def test_embeds(): \v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f* """ - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", - refs=["MAT 1:1"], - translation="New verse 1", - source_toks=["Start", "of", "sentence", "embed"], - translation_toks=["New", "verse", "1"], - alignment="", - ), - PretranslationInfo( - corpusId="", - textId="", - refs=["MAT 1:2"], - translation="New verse 2", - source_toks=["Middle", "of", "sentence", "embed"], - translation_toks=["New", "verse", "2"], - alignment="", - ), - PretranslationInfo( - corpusId="", - textId="", - refs=["MAT 1:3"], - translation="New verse 3", - source_toks=["End", "of", "sentence", "embed"], - translation_toks=["New", "verse", "3"], - alignment="", - ), - PretranslationInfo( - corpusId="", - textId="", - 
refs=["MAT 1:4"], - translation="New verse 4", - source_toks=["Updated", "embed"], - translation_toks=["New", "verse", "4"], - alignment="", - ), - PretranslationInfo( - corpusId="", - textId="", - refs=["MAT 1:4/1:f"], - translation="New embed text", - source_toks=["A", "fourth", "note"], - translation_toks=["New", "embed", "text"], - alignment="", - ), - PretranslationInfo( - corpusId="", - textId="", - refs=["MAT 1:5"], - translation="New verse 5", - source_toks=["Embed", "with", "style", "markers"], - translation_toks=["New", "verse", "5"], - alignment="", - ), - PretranslationInfo( - corpusId="", - textId="", - refs=["MAT 1:6"], - translation="New verse 6", - source_toks=["Updated", "embed", "with", "style", "markers"], - translation_toks=["New", "verse", "6"], - alignment="", - ), - PretranslationInfo( - corpusId="", - textId="", - refs=["MAT 1:6/1:f"], - translation="New verse 6 embed text", - source_toks=["Another", "stylish", "note"], - translation_toks=["New", "verse", "6", "embed", "text"], - alignment="", - ), - ] + pt_info = [] target = update_usfm( rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], ) - # NOTE: currently not updating embeds result = r"""\id MAT \c 1 \v 1 New verse 1 \f \fr 1.1 \ft Some note \f* @@ -283,8 +157,50 @@ def test_embeds(): assess(target, result) -def test_headers(): - rows = [(scr_ref("MAT 1:1"), "X Y Z"), (scr_ref("MAT 1:2"), "X")] +def test_trailing_empty_paragraphs() -> None: + rows = [(scr_ref("MAT 1:1"), "New verse 1")] + usfm = r"""\id MAT +\c 1 +\v 1 Verse 1 +\p +\b +\q1 \f embed \f* +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation="New verse 1", + source_toks=["Verse" "1"], + translation_toks=["New", "verse", "1"], + alignment="0-1 1-2", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + 
update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New verse 1 +\p +\b +\q1 \f embed \f* +""" + assess(target, result) + + +def test_headers() -> None: + rows = [ + (scr_ref("MAT 1:1"), "X Y Z"), + (scr_ref("MAT 1:2"), "X"), + (scr_ref("MAT 1:3"), "Y"), + (scr_ref("MAT 1:3/1:s1"), "Updated header"), + ] usfm = r"""\id MAT \c 1 \s1 Start of chapter header @@ -300,6 +216,8 @@ def test_headers(): \s1 Header followed by a reference \r (reference) \p +\v 3 B +\s1 Header to be updated """ pt_info = [ @@ -343,15 +261,52 @@ def test_headers(): \s1 Header followed by a reference \r (reference) \p +\v 3 Y +\s1 Updated header """ assess(target, result) -def test_verse_ranges(): - rows = [([ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], "New verse range text")] +def test_consecutive_markers() -> None: + rows = [(scr_ref("MAT 1:1"), "New verse 1 WORD")] + usfm = r"""\id MAT +\c 1 +\v 1 Old verse 1 +\p \qt \+w word \+w* \qt* +""" + + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation="New verse 1 WORD", + source_toks=["Old", "verse", "1", "word"], + translation_toks=["New", "verse", "1", "WORD"], + alignment="0-0 1-1 2-2 3-3", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New verse 1 +\p \qt \+w WORD \+w*\qt* +""" + assess(target, result) + + +def test_verse_ranges() -> None: + rows = [([ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], "New verse range text new paragraph 2")] usfm = r"""\id MAT \c 1 \v 1-5 Verse range +\p old paragraph 2 """ pt_info = [ @@ -359,10 +314,10 @@ def test_verse_ranges(): corpusId="", textId="", refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)], - translation="New verse range text", - 
source_toks=["Verse", "range"], - translation_toks=["New", "verse", "range", "text"], - alignment="0-1 1-2", + translation="New verse range text new paragraph 2", + source_toks=["Verse", "range", "old", "paragraph", "2"], + translation_toks=["New", "verse", "range", "text", "new", "paragraph", "2"], + alignment="0-1 1-2 2-4 3-5 4-6", ), ] target = update_usfm( @@ -374,18 +329,20 @@ def test_verse_ranges(): result = r"""\id MAT \c 1 \v 1-5 New verse range text +\p new paragraph 2 """ assess(target, result) -def test_no_alignment(): - rows = [(scr_ref("MAT 1:1"), str("New paragraph 1 New paragraph 2"))] +def test_no_update() -> None: + rows = [(scr_ref("MAT 1:1"), "New paragraph 1 New paragraph 2")] usfm = r"""\id MAT \c 1 \v 1 Old paragraph 1 \p Old paragraph 2 """ + # Strip paragraphs pt_info = [ PretranslationInfo( corpusId="", @@ -394,6 +351,30 @@ def test_no_alignment(): translation="New paragraph 1 New paragraph 2", source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"], translation_toks=["New", "paragraph", "1", "New", "paragraph", "2"], + alignment="0-0 1-1 2-2 3-3 4-4 5-5", + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New paragraph 1 New paragraph 2 +""" + assess(target, result) + + # No alignment + pt_info = [ + PretranslationInfo( + corpusId="", + textId="", + refs=["MAT 1:1"], + translation="New paragraph 1 New paragraph 2", + source_toks=[], + translation_toks=[], alignment="", ), ] @@ -410,13 +391,30 @@ def test_no_alignment(): """ assess(target, result) - -def test_changed_text(): - rows = [(scr_ref("MAT 1:1"), str("New paragraph 1 New paragraph 2"))] - usfm = r"""\id MAT + # No text update + rows = [] + pt_info = [] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], 
+ ) + result = r"""\id MAT \c 1 \v 1 Old paragraph 1 \p Old paragraph 2 +""" + assess(target, result) + + +def test_split_tokens() -> None: + rows = [(scr_ref("MAT 1:1"), "words split words split words split")] + usfm = r"""\id MAT +\c 1 +\v 1 words spl +\p it words spl +\p it words split """ pt_info = [ @@ -424,9 +422,9 @@ def test_changed_text(): corpusId="", textId="", refs=["MAT 1:1"], - translation="Changed paragraph 1 Changed paragraph 2", - source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"], - translation_toks=["Changed", "paragraph", "1", "Changed", "paragraph", "2"], + translation="words split words split words split", + source_toks=["words", "split", "words", "split", "words", "split"], + translation_toks=["words", "split", "words", "split", "words", "split"], alignment="0-0 1-1 2-2 3-3 4-4 5-5", ), ] @@ -438,8 +436,9 @@ def test_changed_text(): ) result = r"""\id MAT \c 1 -\v 1 New paragraph 1 New paragraph 2 -\p +\v 1 words split +\p words split +\p words split """ assess(target, result) @@ -476,8 +475,5 @@ def update_usfm( def assess(target: Optional[str], truth: str) -> None: assert target is not None - for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): - print(truth_line) - print(target_line) for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): assert target_line.strip() == truth_line.strip() From 6d108178a17352f1798772df102e259fcb58859c Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Tue, 13 May 2025 18:40:19 -0400 Subject: [PATCH 4/6] Fix imports, use separate AlignmentInfo type --- machine/corpora/__init__.py | 3 + ...place_markers_usfm_update_block_handler.py | 39 +++++-- ...place_markers_usfm_update_block_handler.py | 104 +++++++----------- 3 files changed, 68 insertions(+), 78 deletions(-) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 523604c0..66c36ca3 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -23,6 +23,7 @@ from 
.paratext_project_terms_parser_base import ParatextProjectTermsParserBase from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase from .paratext_text_corpus import ParatextTextCorpus +from .place_markers_usfm_update_block_handler import AlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -79,6 +80,7 @@ "AlignedWordPair", "AlignmentCollection", "AlignmentCorpus", + "AlignmentInfo", "AlignmentRow", "batch", "Corpus", @@ -112,6 +114,7 @@ "ParatextProjectTermsParserBase", "ParatextProjectTextUpdaterBase", "ParatextTextCorpus", + "PlaceMarkersUsfmUpdateBlockHandler", "parse_usfm", "RtlReferenceOrder", "ScriptureElement", diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index ee6f86cf..f18dd617 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -1,20 +1,25 @@ from __future__ import annotations -from typing import List, Sequence +from typing import List, Sequence, TypedDict -from ..jobs.eflomal_aligner import to_word_alignment_matrix -from ..jobs.translation_file_service import PretranslationInfo -from ..translation import WordAlignmentMatrix +from ..translation.word_alignment_matrix import AlignedWordPair, WordAlignmentMatrix from .usfm_token import UsfmToken, UsfmTokenType from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType from .usfm_update_block_handler import UsfmUpdateBlockHandler +class AlignmentInfo(TypedDict): + refs: List[str] + source_toks: List[str] + translation_toks: List[str] + alignment: str + + class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): - def 
__init__(self, pt_info: Sequence[PretranslationInfo]) -> None: - self._pt_info = {info["refs"][0]: info for info in pt_info} + def __init__(self, align_info: Sequence[AlignmentInfo]) -> None: + self._align_info = {info["refs"][0]: info for info in align_info} def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ref = str(block.refs[0]) @@ -23,8 +28,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: # Nothing to do if there are no markers to place or no alignment to use if ( len(elements) == 0 - or ref not in self._pt_info.keys() - or len(self._pt_info[ref]["alignment"]) == 0 + or ref not in self._align_info.keys() + or len(self._align_info[ref]["alignment"]) == 0 or not any( ( e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE] @@ -58,8 +63,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ): eob_empty_paras = False - src_toks = self._pt_info[ref]["source_toks"] - trg_toks = self._pt_info[ref]["translation_toks"] + src_toks = self._align_info[ref]["source_toks"] + trg_toks = self._align_info[ref]["translation_toks"] src_tok_idx = 0 src_sent = "" @@ -96,7 +101,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: # Predict marker placements and get insertion order to_insert = [] - alignment = to_word_alignment_matrix(self._pt_info[ref]["alignment"]) + alignment = to_word_alignment_matrix(self._align_info[ref]["alignment"]) for element, adj_src_tok in zip(to_place, adj_src_toks): adj_trg_tok = self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks) trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent) @@ -202,3 +207,15 @@ def num_align_crossings(src_idx: int, trg_idx: int) -> int: # If no alignments found, insert at the end of the sentence return best_hyp if best_hyp != -1 else len(trg_toks) + + +def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix: + word_pairs = 
AlignedWordPair.from_string(alignment_str) + row_count = 0 + column_count = 0 + for pair in word_pairs: + if pair.source_index + 1 > row_count: + row_count = pair.source_index + 1 + if pair.target_index + 1 > column_count: + column_count = pair.target_index + 1 + return WordAlignmentMatrix.from_word_pairs(row_count, column_count, word_pairs) diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index 40f79f5a..ebe73547 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -1,15 +1,15 @@ from typing import List, Optional, Sequence, Tuple from machine.corpora import ( + AlignmentInfo, + PlaceMarkersUsfmUpdateBlockHandler, ScriptureRef, UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior, + UsfmUpdateBlockHandler, parse_usfm, ) -from machine.corpora.place_markers_usfm_update_block_handler import PlaceMarkersUsfmUpdateBlockHandler -from machine.corpora.usfm_update_block_handler import UsfmUpdateBlockHandler -from machine.jobs.translation_file_service import PretranslationInfo from machine.tokenization import LatinWordTokenizer TOKENIZER = LatinWordTokenizer() @@ -26,12 +26,9 @@ def test_paragraph_markers() -> None: \p and this test is for paragraph markers. 
""" - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=["MAT 1:1"], - translation=pretranslation, source_toks=[t for t in TOKENIZER.tokenize(source)], translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)], alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19", @@ -41,7 +38,7 @@ def test_paragraph_markers() -> None: rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -61,12 +58,9 @@ def test_style_markers() -> None: \v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers. """ - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=["MAT 1:1"], - translation=pretranslation, source_toks=[t for t in TOKENIZER.tokenize(source)], translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)], alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19", @@ -76,7 +70,7 @@ def test_style_markers() -> None: rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -90,7 +84,7 @@ def test_style_markers() -> None: rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.STRIP, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -121,12 +115,12 @@ def test_embeds() -> None: \v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f* """ - pt_info = [] + align_info = [] target = update_usfm( rows, usfm, 
embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -143,7 +137,7 @@ def test_embeds() -> None: rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -167,12 +161,9 @@ def test_trailing_empty_paragraphs() -> None: \q1 \f embed \f* """ - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=["MAT 1:1"], - translation="New verse 1", source_toks=["Verse" "1"], translation_toks=["New", "verse", "1"], alignment="0-1 1-2", @@ -182,7 +173,7 @@ def test_trailing_empty_paragraphs() -> None: rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -220,21 +211,15 @@ def test_headers() -> None: \s1 Header to be updated """ - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=["MAT 1:1"], - translation="X Y Z", source_toks=["A", "B", "C"], translation_toks=["X", "Y", "Z"], alignment="0-0 1-1 2-2", ), - PretranslationInfo( - corpusId="", - textId="", + AlignmentInfo( refs=["MAT 1:2"], - translation="X", source_toks=["A"], translation_toks=["X"], alignment="0-0", @@ -244,7 +229,7 @@ def test_headers() -> None: rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -275,12 +260,9 @@ def test_consecutive_markers() -> None: \p \qt \+w word \+w* \qt* """ - pt_info = [ - 
PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=["MAT 1:1"], - translation="New verse 1 WORD", source_toks=["Old", "verse", "1", "word"], translation_toks=["New", "verse", "1", "WORD"], alignment="0-0 1-1 2-2 3-3", @@ -291,7 +273,7 @@ def test_consecutive_markers() -> None: usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -309,12 +291,9 @@ def test_verse_ranges() -> None: \p old paragraph 2 """ - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)], - translation="New verse range text new paragraph 2", source_toks=["Verse", "range", "old", "paragraph", "2"], translation_toks=["New", "verse", "range", "text", "new", "paragraph", "2"], alignment="0-1 1-2 2-4 3-5 4-6", @@ -324,7 +303,7 @@ def test_verse_ranges() -> None: rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -343,12 +322,9 @@ def test_no_update() -> None: """ # Strip paragraphs - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=["MAT 1:1"], - translation="New paragraph 1 New paragraph 2", source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"], translation_toks=["New", "paragraph", "1", "New", "paragraph", "2"], alignment="0-0 1-1 2-2 3-3 4-4 5-5", @@ -358,7 +334,7 @@ def test_no_update() -> None: rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + 
update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -367,12 +343,9 @@ def test_no_update() -> None: assess(target, result) # No alignment - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=["MAT 1:1"], - translation="New paragraph 1 New paragraph 2", source_toks=[], translation_toks=[], alignment="", @@ -382,7 +355,7 @@ def test_no_update() -> None: rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -393,12 +366,12 @@ def test_no_update() -> None: # No text update rows = [] - pt_info = [] + align_info = [] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 @@ -417,12 +390,9 @@ def test_split_tokens() -> None: \p it words split """ - pt_info = [ - PretranslationInfo( - corpusId="", - textId="", + align_info = [ + AlignmentInfo( refs=["MAT 1:1"], - translation="words split words split words split", source_toks=["words", "split", "words", "split", "words", "split"], translation_toks=["words", "split", "words", "split", "words", "split"], alignment="0-0 1-1 2-2 3-3 4-4 5-5", @@ -432,7 +402,7 @@ def test_split_tokens() -> None: rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(pt_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], ) result = r"""\id MAT \c 1 From d942cbdc724e5ce470ba7df4dbeaf9abf11ed048 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Thu, 15 May 2025 00:12:48 -0400 Subject: [PATCH 5/6] Adjust (PlaceMarkers)AlignmentInfo type --- 
machine/corpora/__init__.py | 4 +- ...place_markers_usfm_update_block_handler.py | 38 +++---- ...place_markers_usfm_update_block_handler.py | 100 +++++++++++------- 3 files changed, 75 insertions(+), 67 deletions(-) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 66c36ca3..314b35b2 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -23,7 +23,7 @@ from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase from .paratext_text_corpus import ParatextTextCorpus -from .place_markers_usfm_update_block_handler import AlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler +from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -80,7 +80,6 @@ "AlignedWordPair", "AlignmentCollection", "AlignmentCorpus", - "AlignmentInfo", "AlignmentRow", "batch", "Corpus", @@ -114,6 +113,7 @@ "ParatextProjectTermsParserBase", "ParatextProjectTextUpdaterBase", "ParatextTextCorpus", + "PlaceMarkersAlignmentInfo", "PlaceMarkersUsfmUpdateBlockHandler", "parse_usfm", "RtlReferenceOrder", diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index f18dd617..7e44e02b 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -1,24 +1,24 @@ from __future__ import annotations -from typing import List, Sequence, TypedDict +from typing import Iterable, List, TypedDict -from ..translation.word_alignment_matrix import AlignedWordPair, WordAlignmentMatrix +from ..translation.word_alignment_matrix import WordAlignmentMatrix from .usfm_token 
import UsfmToken, UsfmTokenType from .usfm_update_block import UsfmUpdateBlock from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType from .usfm_update_block_handler import UsfmUpdateBlockHandler -class AlignmentInfo(TypedDict): +class PlaceMarkersAlignmentInfo(TypedDict): refs: List[str] - source_toks: List[str] - translation_toks: List[str] - alignment: str + source_tokens: List[str] + translation_tokens: List[str] + alignment: WordAlignmentMatrix class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): - def __init__(self, align_info: Sequence[AlignmentInfo]) -> None: + def __init__(self, align_info: Iterable[PlaceMarkersAlignmentInfo]) -> None: self._align_info = {info["refs"][0]: info for info in align_info} def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: @@ -29,7 +29,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: if ( len(elements) == 0 or ref not in self._align_info.keys() - or len(self._align_info[ref]["alignment"]) == 0 + or self._align_info[ref]["alignment"].row_count == 0 + or self._align_info[ref]["alignment"].column_count == 0 or not any( ( e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE] @@ -63,8 +64,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ): eob_empty_paras = False - src_toks = self._align_info[ref]["source_toks"] - trg_toks = self._align_info[ref]["translation_toks"] + src_toks = self._align_info[ref]["source_tokens"] + trg_toks = self._align_info[ref]["translation_tokens"] src_tok_idx = 0 src_sent = "" @@ -101,9 +102,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: # Predict marker placements and get insertion order to_insert = [] - alignment = to_word_alignment_matrix(self._align_info[ref]["alignment"]) for element, adj_src_tok in zip(to_place, adj_src_toks): - adj_trg_tok = self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks) + adj_trg_tok = 
self._predict_marker_location( + self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks + ) trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent) to_insert.append((trg_str_idx, element)) @@ -207,15 +209,3 @@ def num_align_crossings(src_idx: int, trg_idx: int) -> int: # If no alignments found, insert at the end of the sentence return best_hyp if best_hyp != -1 else len(trg_toks) - - -def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix: - word_pairs = AlignedWordPair.from_string(alignment_str) - row_count = 0 - column_count = 0 - for pair in word_pairs: - if pair.source_index + 1 > row_count: - row_count = pair.source_index + 1 - if pair.target_index + 1 > column_count: - column_count = pair.target_index + 1 - return WordAlignmentMatrix.from_word_pairs(row_count, column_count, word_pairs) diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index ebe73547..9f41ce63 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -1,7 +1,8 @@ from typing import List, Optional, Sequence, Tuple from machine.corpora import ( - AlignmentInfo, + AlignedWordPair, + PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler, ScriptureRef, UpdateUsfmMarkerBehavior, @@ -11,6 +12,7 @@ parse_usfm, ) from machine.tokenization import LatinWordTokenizer +from machine.translation import WordAlignmentMatrix TOKENIZER = LatinWordTokenizer() @@ -27,11 +29,13 @@ def test_paragraph_markers() -> None: """ align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:1"], - source_toks=[t for t in TOKENIZER.tokenize(source)], - translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)], - alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19", + source_tokens=[t for t in 
TOKENIZER.tokenize(source)], + translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment=to_word_alignment_matrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), ), ] target = update_usfm( @@ -59,11 +63,13 @@ def test_style_markers() -> None: """ align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:1"], - source_toks=[t for t in TOKENIZER.tokenize(source)], - translation_toks=[t for t in TOKENIZER.tokenize(pretranslation)], - alignment="0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19", + source_tokens=[t for t in TOKENIZER.tokenize(source)], + translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment=to_word_alignment_matrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), ), ] target = update_usfm( @@ -162,11 +168,11 @@ def test_trailing_empty_paragraphs() -> None: """ align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:1"], - source_toks=["Verse" "1"], - translation_toks=["New", "verse", "1"], - alignment="0-1 1-2", + source_tokens=["Verse" "1"], + translation_tokens=["New", "verse", "1"], + alignment=to_word_alignment_matrix("0-1 1-2"), ), ] target = update_usfm( @@ -212,17 +218,17 @@ def test_headers() -> None: """ align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:1"], - source_toks=["A", "B", "C"], - translation_toks=["X", "Y", "Z"], - alignment="0-0 1-1 2-2", + source_tokens=["A", "B", "C"], + translation_tokens=["X", "Y", "Z"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2"), ), - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:2"], - source_toks=["A"], - translation_toks=["X"], - alignment="0-0", + source_tokens=["A"], + translation_tokens=["X"], + alignment=to_word_alignment_matrix("0-0"), ), ] target = update_usfm( @@ -261,11 +267,11 @@ def test_consecutive_markers() -> None: """ 
align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:1"], - source_toks=["Old", "verse", "1", "word"], - translation_toks=["New", "verse", "1", "WORD"], - alignment="0-0 1-1 2-2 3-3", + source_tokens=["Old", "verse", "1", "word"], + translation_tokens=["New", "verse", "1", "WORD"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3"), ), ] target = update_usfm( @@ -292,11 +298,11 @@ def test_verse_ranges() -> None: """ align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)], - source_toks=["Verse", "range", "old", "paragraph", "2"], - translation_toks=["New", "verse", "range", "text", "new", "paragraph", "2"], - alignment="0-1 1-2 2-4 3-5 4-6", + source_tokens=["Verse", "range", "old", "paragraph", "2"], + translation_tokens=["New", "verse", "range", "text", "new", "paragraph", "2"], + alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5 4-6"), ), ] target = update_usfm( @@ -323,11 +329,11 @@ def test_no_update() -> None: # Strip paragraphs align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:1"], - source_toks=["Old", "paragraph", "1", "Old", "paragraph", "2"], - translation_toks=["New", "paragraph", "1", "New", "paragraph", "2"], - alignment="0-0 1-1 2-2 3-3 4-4 5-5", + source_tokens=["Old", "paragraph", "1", "Old", "paragraph", "2"], + translation_tokens=["New", "paragraph", "1", "New", "paragraph", "2"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"), ), ] target = update_usfm( @@ -344,11 +350,11 @@ def test_no_update() -> None: # No alignment align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:1"], - source_toks=[], - translation_toks=[], - alignment="", + source_tokens=[], + translation_tokens=[], + alignment=to_word_alignment_matrix(""), ), ] target = update_usfm( @@ -391,11 +397,11 @@ def test_split_tokens() -> None: """ align_info = [ - AlignmentInfo( + PlaceMarkersAlignmentInfo( refs=["MAT 1:1"], - 
source_toks=["words", "split", "words", "split", "words", "split"], - translation_toks=["words", "split", "words", "split", "words", "split"], - alignment="0-0 1-1 2-2 3-3 4-4 5-5", + source_tokens=["words", "split", "words", "split", "words", "split"], + translation_tokens=["words", "split", "words", "split", "words", "split"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"), ), ] target = update_usfm( @@ -417,6 +423,18 @@ def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] +def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix: + word_pairs = AlignedWordPair.from_string(alignment_str) + row_count = 0 + column_count = 0 + for pair in word_pairs: + if pair.source_index + 1 > row_count: + row_count = pair.source_index + 1 + if pair.target_index + 1 > column_count: + column_count = pair.target_index + 1 + return WordAlignmentMatrix.from_word_pairs(row_count, column_count, word_pairs) + + def update_usfm( rows: Sequence[Tuple[Sequence[ScriptureRef], str]], source: str, From 96e85e393aae6ffdc4e6363cf22321ed6affe7aa Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Thu, 15 May 2025 17:49:12 -0400 Subject: [PATCH 6/6] 'toks' --> 'tokens' --- machine/jobs/nmt_engine_build_job.py | 4 ++-- machine/jobs/translation_file_service.py | 8 ++++---- tests/jobs/test_nmt_engine_build_job.py | 12 ++++++------ tests/jobs/test_smt_engine_build_job.py | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py index b7b2afbc..5bb120a8 100644 --- a/machine/jobs/nmt_engine_build_job.py +++ b/machine/jobs/nmt_engine_build_job.py @@ -157,8 +157,8 @@ def _align( check_canceled() for i in range(len(pretranslations)): - pretranslations[i]["source_toks"] = list(src_tokenized[i]) - pretranslations[i]["translation_toks"] = list(trg_tokenized[i]) + pretranslations[i]["source_tokens"] = list(src_tokenized[i]) + 
pretranslations[i]["translation_tokens"] = list(trg_tokenized[i]) pretranslations[i]["alignment"] = alignments[i] return pretranslations diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py index 54c4ae90..f29e1005 100644 --- a/machine/jobs/translation_file_service.py +++ b/machine/jobs/translation_file_service.py @@ -16,8 +16,8 @@ class PretranslationInfo(TypedDict): textId: str # noqa: N815 refs: List[str] translation: str - source_toks: List[str] - translation_toks: List[str] + source_tokens: List[str] + translation_tokens: List[str] alignment: str @@ -65,8 +65,8 @@ def generator() -> Generator[PretranslationInfo, None, None]: textId=pi["textId"], refs=list(pi["refs"]), translation=pi["translation"], - source_toks=list(pi["source_toks"]), - translation_toks=list(pi["translation_toks"]), + source_tokens=list(pi["source_tokens"]), + translation_tokens=list(pi["translation_tokens"]), alignment=pi["alignment"], ) diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index 227b909b..014fc743 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -38,7 +38,7 @@ def test_run(decoy: Decoy) -> None: assert len(pretranslations) == 1 assert pretranslations[0]["translation"] == "Please, I have booked a room." 
if is_eflomal_available(): - assert pretranslations[0]["source_toks"] == [ + assert pretranslations[0]["source_tokens"] == [ "Por", "favor", ",", @@ -48,11 +48,11 @@ def test_run(decoy: Decoy) -> None: "habitación", ".", ] - assert pretranslations[0]["translation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."] + assert pretranslations[0]["translation_tokens"] == ["Please", ",", "I", "have", "booked", "a", "room", "."] assert len(pretranslations[0]["alignment"]) > 0 else: - assert pretranslations[0]["source_toks"] == [] - assert pretranslations[0]["translation_toks"] == [] + assert pretranslations[0]["source_tokens"] == [] + assert pretranslations[0]["translation_tokens"] == [] assert len(pretranslations[0]["alignment"]) == 0 decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1) @@ -131,8 +131,8 @@ def __init__(self, decoy: Decoy) -> None: textId="text1", refs=["ref1"], translation="Por favor, tengo reservada una habitación.", - source_toks=[], - translation_toks=[], + source_tokens=[], + translation_tokens=[], alignment="", ) ] diff --git a/tests/jobs/test_smt_engine_build_job.py b/tests/jobs/test_smt_engine_build_job.py index 16afcacf..eff4649f 100644 --- a/tests/jobs/test_smt_engine_build_job.py +++ b/tests/jobs/test_smt_engine_build_job.py @@ -137,8 +137,8 @@ def __init__(self, decoy: Decoy) -> None: textId="text1", refs=["ref1"], translation="Por favor, tengo reservada una habitación.", - source_toks=[], - translation_toks=[], + source_tokens=[], + translation_tokens=[], alignment="", ) ]