### USFM marker preservation
* Extract footnotes and put them at the end
* Extract each instance of a marker and record its index
* Tokenize source sentences and match each marker to surrounding tokens based on their original indices
* Train aligner on all training data + translation, align translation to source
* Reinsert marker instances

In [None]:
from pathlib import Path

# Define project values.
# Every empty string below is a placeholder to fill in before running the notebook.

# Language pair plus the source and target project names.
pair = ""
project = ""
trg_project = ""

# Optional suffixes on the source and target SFM file names.
file_suffix = ""
trg_file_suffix = ""

# Book to process. The "41" prefix is hard-coded for MAT — presumably the
# file-number prefix used in the project's SFM file names; confirm before
# changing `book`.
book = "MAT"
book_name = f"41{book}"

# Name of the aligner; it only appears in the alignment file name below.
aligner = "eflomal"

# Input SFM file, working directory for this pair/book, and output locations.
src_fpath = Path(f"test_S/Paratext/projects/{project}/{book_name}{file_suffix}.SFM")
pair_book_dir = Path(f"zzz_PN_KTs/{pair}/{book}")
align_path = pair_book_dir.joinpath(f"{book_name}_sym-align_{aligner}.txt")
out_fpath = pair_book_dir.joinpath(f"{book_name}{trg_file_suffix}_out.SFM")

In [5]:
from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText # UsfmTokenizer
# from machine.tokenization import LatinWordTokenizer

# Read the project settings from the directory that contains the source SFM file.
src_settings = FileParatextProjectSettingsParser(src_fpath.parent).parse()

# Author's notes on the (include_markers / include_all_text) combinations:
#   F/T gives notes their own rows
#   F/F gives just the main text
#   T/F gives one ref per verse and all markers are inline
#   T/T includes all intro and section titles (as does F/T), all other notes/markers inline
src_file_text = UsfmFileText(
    src_settings.stylesheet,
    src_settings.encoding,
    book,
    src_fpath,
    src_settings.versification,
    include_markers=True,
    include_all_text=True,
    project=src_settings.name,
)

# Collect the stripped text and the reference of every row, skipping rows
# whose innermost path element is named "rem".
sentences = []
vrefs = []
for sent in src_file_text:
    ref_path = sent.ref.path
    is_rem_row = len(ref_path) > 0 and ref_path[-1].name == "rem"
    if not is_rem_row:
        sentences.append(sent.text.strip())
        vrefs.append(sent.ref)

# for ref, sent in zip(vrefs, sentences):
#     print(ref, sent)

### Only deal with paragraph markers

In [8]:
from machine.corpora import UsfmTokenizer, UsfmTokenType, UsfmStyleType
# TODO: would it be easier to always use StyleType (for UsfmTags) vs TokenType (for UsfmTokens)?

# Parse sentences: tokenize each one, then separate its plain text from the
# markers that are removed from it.
tokenizer = UsfmTokenizer(src_settings.stylesheet)
sentence_toks = [tokenizer.tokenize(sent) for sent in sentences]

# Character markers whose whole span (marker, contents, end marker) is dropped.
to_delete = ["fig"]

# One (sentence index, offset into the marker-free text, marker USFM) per kept marker.
inline_markers = []
# Marker-free text of each sentence, parallel to sentence_toks.
text_only_sents = []
for i, toks in enumerate(sentence_toks):
    ignore_scope = None  # opening token of the span currently being skipped
    text_parts = []
    text_len = 0  # length of the text collected so far for this sentence
    for tok in toks:  # POSSIBLE TYPES: TEXT, PARAGRAPH, CHARACTER, NOTE, END
        if ignore_scope is not None:
            # Skip every token until the END token that closes the span: its
            # marker, minus the last character, must equal the opening marker.
            if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
                ignore_scope = None
        elif tok.type == UsfmTokenType.NOTE or (
            tok.type == UsfmTokenType.CHARACTER and tok.marker in to_delete
        ):
            # Notes and deleted character spans are removed entirely.
            ignore_scope = tok
        elif tok.type in (UsfmTokenType.PARAGRAPH, UsfmTokenType.CHARACTER, UsfmTokenType.END):
            # Remember where the marker sat relative to the text seen so far.
            inline_markers.append((i, text_len, tok.to_usfm()))
        elif tok.type == UsfmTokenType.TEXT:
            text_parts.append(tok.text)
            text_len += len(tok.text)
    # Join once per sentence instead of growing a string token by token.
    text_only_sents.append("".join(text_parts))

# print("sent_idx, orig_idx, marker")
# for marker in inline_markers:
#     print(marker)

In [None]:
from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText
from pathlib import Path
from typing import Tuple, List

# Translate sentences.
# All four lists start out empty; nothing in this cell fills them in.
trg_sents, src_tokens, trg_tokens, alignment_matrices = [], [], [], []

# TODO: verify sentences
# Prints True when joining a sentence's tokens and replacing "_" with " "
# reproduces the sentence exactly.
for sent, sent_toks in zip(trg_sents, trg_tokens):
    rebuilt_sent = "".join(sent_toks).replace("_", " ")
    print(rebuilt_sent == sent)

def get_toks_after_sequences(sequences: List[Tuple]) -> List[int]:
    """Match markers to their closest token idx (placeholder).

    Not implemented yet: ``sequences`` is ignored and a new empty list is
    returned on every call.
    """
    closest_tok_idxs: List[int] = []
    return closest_tok_idxs

# One token index per extracted marker (currently whatever the stub above returns).
toks_after_markers = get_toks_after_sequences(inline_markers)

# Print each reference next to its translated sentence.
for ref_and_sent in zip(vrefs, trg_sents):
    print(*ref_and_sent)

In [None]:
from collections import defaultdict
from silnlp.common.corpus import load_corpus

# Decide where to reinsert markers.
# NOTE(review): zip pairs the k-th marker with the k-th alignment matrix —
# confirm alignment_matrices holds one matrix per marker, not one per sentence.
trg_toks_after_markers = []
matrix: List[List[bool]]
for marker, idx, matrix in zip(inline_markers, toks_after_markers, alignment_matrices):
    aligned_row = matrix[idx]
    # Position of the first True in the row; list.index raises ValueError if
    # the row contains none.
    first_true = aligned_row.index(True)
    trg_toks_after_markers.append(first_true)

# Debug output: both index lists, then the number of resolved markers.
for debug_value in (toks_after_markers, trg_toks_after_markers, len(trg_toks_after_markers)):
    print(debug_value)

In [None]:
'''
TODO: right now, half the work for the insertion order is done when to_insert is filled out, and the other half is done when the markers are
being inserted (with reversed). Since there's already funky stuff going on in the order of the markers in to_insert (for disambiguation
for the same insertion idx), it would make more sense to just do all the ordering when to_insert is being filled out, i.e. the order of a list
in to_insert is the order they need to be inserted in.
On the other hand, the current way might be more human-readable.
'''

# Reinsert markers.
# to_insert[s] collects the (character offset, marker) pairs for sentence s.
to_insert = [[] for _ in vrefs]

# Collect the markers to be inserted
for (sent_idx, _, marker), next_trg_tok in zip(inline_markers, trg_toks_after_markers):
    # The marker goes at the start of the target token it was matched to.
    insert_idx = trg_word_tok_ranges[sent_idx][next_trg_tok].start
    pending = to_insert[sent_idx]

    # Handle ambiguity for directly adjacent markers: the new marker goes after
    # every leading entry whose offset is <= its own, i.e. before the first
    # entry with a larger offset, or at the end if there is none.
    insert_place = next(
        (pos for pos, (placed_idx, _placed_marker) in enumerate(pending) if placed_idx > insert_idx),
        len(pending),
    )

    pending.insert(insert_place, (insert_idx, marker))

'''create rows for each paragraph marker and insert character markers back into text'''
# Construct rows to update the USFM file with
# Each entry of `rows` is a ([ref], text) pair. A sentence contributes one row
# for its leading text (unless that text is merged into the next segment) plus
# one row per segment that starts at a non-character marker.
rows = []
for sent_idx, (ref, trg_sent) in enumerate(zip(vrefs, trg_sents)):
    # Segments of this sentence, kept in text order (new segments go in at index 0
    # because the markers are walked from last to first).
    row_texts = []
    # True when the marker handled in the previous iteration (the one further
    # right in the text) was a character-style marker, so the segment built now
    # must be glued onto row_texts[0] instead of becoming its own row.
    attach_to_prev = False # TODO: better name
    # True when that previous marker contained "*".
    prev_is_end = False # hacky
    # Walk the markers back to front; after each one, trg_sent is cut down to
    # the text before it, so trg_sent[insert_idx:] is always the text between
    # this marker and the next one.
    for insert_idx, marker in reversed(to_insert[sent_idx]):
        # Look the marker up in the stylesheet with spaces, backslashes, "+"
        # and "*" stripped from both ends.
        is_char_marker = src_settings.stylesheet.get_tag(marker.strip(" \\+*")).style_type == UsfmStyleType.CHARACTER
        # Segment text: the marker itself (character-style markers only), then a
        # space if the marker contains "*" and the text right after it starts
        # with a letter, then the text that follows the marker.
        row_text = (marker if is_char_marker else "") \
                    + (" " if "*" in marker and insert_idx < len(trg_sent) and trg_sent[insert_idx].isalpha() else "") \
                    + trg_sent[insert_idx:]

        if attach_to_prev:
            # don't want a space before end marker
            if prev_is_end and len(row_text) > 0 and row_text[-1] == " ": # hacky
                row_text = row_text[:-1]
            # append text segments instead of creating a new one since previous segment (really the next segment bc iterating backwards) is not its own paragraph
            row_texts[0] = row_text + row_texts[0]
        else:
            row_texts.insert(0, row_text)

        # only paragraph markers get their own rows, so all segments split by character markers and their end markers need to be rejoined
        attach_to_prev = is_char_marker
        prev_is_end = "*" in marker
        trg_sent = trg_sent[:insert_idx]

    # do the same as above with the text at the beginning of the verse
    # (trg_sent now holds only the text before the first marker)
    if attach_to_prev:
        if prev_is_end and len(trg_sent) > 0 and trg_sent[-1] == " ": # hacky
            trg_sent = trg_sent[:-1]
        row_texts[0] = trg_sent + row_texts[0]
    else:
        rows.append(([ref], trg_sent))

    for row_text in row_texts:
        rows.append(([ref], row_text))

# Debug output: number of rows, then every (refs, text) pair.
print(len(rows))
for ref, sent in rows:
    print(ref, sent)

In [None]:
from machine.corpora import UpdateUsfmParserHandler, parse_usfm, UsfmParserState, UpdateUsfmBehavior

class ParagraphUpdateUsfmParserHandler(UpdateUsfmParserHandler):
    """Update handler that can place a paragraph marker between trailing text tokens.

    Only `_collect_tokens` is defined here; it presumably overrides the
    base-class hook of the same name (confirm against UpdateUsfmParserHandler).
    """

    def _collect_tokens(self, state: UsfmParserState) -> None:
        """Move pending and parsed tokens into `self._tokens`.

        First flushes `self._new_tokens` into `self._tokens`, then copies
        tokens from `state.tokens` starting at `self._token_index`, up to and
        including index `state.index + state.special_token_count`.

        A paragraph token other than "rem" gets special treatment: if
        `self._tokens` ends in a run made up only of TEXT tokens and "rem"
        paragraph tokens, and the earliest uninterrupted stretch of that run
        holds at least two TEXT tokens, the paragraph token is inserted right
        after the first TEXT token of that stretch and collection stops for
        this call.
        """
        self._tokens.extend(self._new_tokens)
        self._new_tokens.clear()
        while self._token_index <= state.index + state.special_token_count:
            if state.tokens[self._token_index].type == UsfmTokenType.PARAGRAPH and state.tokens[self._token_index].marker != "rem":
                # Scan the already-collected tokens from the end backwards.
                # num_text: TEXT tokens seen since the last "rem" (or since the end).
                # rem_offset: number of tokens to the right of the current stretch
                #             (each "rem" token plus the TEXT tokens counted after it).
                num_text = 0
                rem_offset = 0
                for i in range(len(self._tokens) - 1, -1, -1):
                    if self._tokens[i].type == UsfmTokenType.TEXT:
                        num_text += 1
                    elif self._tokens[i].type == UsfmTokenType.PARAGRAPH and self._tokens[i].marker == "rem":
                        rem_offset += num_text + 1
                        num_text = 0
                    else:
                        # Any other token type ends the trailing run.
                        break
                if num_text >= 2:
                    # Negative index: leave the last (num_text - 1) TEXT tokens of the
                    # stretch and everything after them to the right of the marker.
                    self._tokens.insert(-(rem_offset + num_text - 1), state.tokens[self._token_index])
                    self._token_index += 1
                    # TODO: should this be `continue` instead? The original note was left
                    # unfinished. As written, any remaining tokens in range are not
                    # collected during this call.
                    break
            self._tokens.append(state.tokens[self._token_index])
            self._token_index += 1

# Update USFM and write out.
# preserve_whitespace=True doesn't change anything with markers on newlines but it does take care of the \vp\*vp somehow
# NOTE(review): the source is read as "utf-8-sig" while the output is written
# with src_settings.encoding — confirm that the two are meant to differ.
with src_fpath.open(encoding="utf-8-sig") as src_file:
    usfm = src_file.read()

# Re-parse the original USFM with the handler, which is given the new rows.
handler = ParagraphUpdateUsfmParserHandler(rows, behavior=UpdateUsfmBehavior.PREFER_NEW)
parse_usfm(
    usfm,
    handler,
    src_settings.stylesheet,
    src_settings.versification,
    preserve_whitespace=False,
)
usfm_out = handler.get_usfm(src_settings.stylesheet)

with out_fpath.open("w", encoding=src_settings.encoding) as out_file:
    out_file.write(usfm_out)