### KT sorting

In [None]:
from pathlib import Path
import silnlp.common.paratext
import json
from collections import defaultdict
from silnlp.common.corpus import load_corpus

# Locations of the generic (language-independent) Major key-term list files:
# one holding per-term metadata, the other the verse references per term.
metadata_path, vrefs_path = (
    Path("silnlp/assets/Major-metadata.txt"),
    Path("silnlp/assets/Major-vrefs.txt"),
)

In [None]:
# Language-pair specific inputs; the trailing comments list the alternative values used so far.
src_gloss_path = Path("silnlp/assets/fr-Major-glosses.txt") # en  fr
trg_gloss_path = Path("test_S/MT/terms/bcw-bcw_2024_02_21-Major-renderings.txt") # lmp-lmp_2024_02_16  bcw-bcw_2024_02_21
pair = "fr_bcw" # en_lmp  fr_bcw

# Both JSON files below are written here. Create the directory up front so the
# cell does not fail with FileNotFoundError on a fresh workspace.
out_dir = Path(f"zzz_PN_KTs/{pair}")
out_dir.mkdir(parents=True, exist_ok=True)

# proper_nouns: row index of the term in the four parallel files
#   -> {"glosses": (source glosses, target glosses), "instances": [verse refs]}
proper_nouns = defaultdict(dict)
term_rows = zip(
    load_corpus(metadata_path),
    load_corpus(vrefs_path),
    load_corpus(src_gloss_path),
    load_corpus(trg_gloss_path),
)
for i, (meta, vref, src_gloss, trg_gloss) in enumerate(term_rows):
    term, pt_cat, sem_cat = meta.split("\t") # orig lang term, Paratext category (PN, FL, RE, etc.), semantic category (person, grasses, containers, etc.)
    instances = vref.split("\t") # all occurrences of the term
    src_glosses = src_gloss.split("\t") # all potential glosses for term
    trg_glosses = trg_gloss.split("\t")

    # Keep only proper nouns ("PN") that have at least one target rendering;
    # an empty rendering line splits to [""].
    if pt_cat == "PN" and trg_glosses != [""]:
        proper_nouns[i]["glosses"] = (src_glosses, trg_glosses)
        proper_nouns[i]["instances"] = instances # might want to give this further structure, i.e. be a dict w/ book:chapter:[instances]

# json.dump writes the integer term indices as string keys.
with open(out_dir / "KT_to_vrefs.json", "w", encoding="utf-8") as f:
    json.dump(proper_nouns, f, ensure_ascii=False, indent=4)

# Create verse-to-KTs dict: verse ref -> list of term indices occurring in it.
# A term index is appended once per listed instance, so it can repeat for a verse.
vref_to_KTs = defaultdict(list)
for i, pn_dict in proper_nouns.items():
    for vref in pn_dict["instances"]:
        vref_to_KTs[vref].append(i)
with open(out_dir / "vref_to_KTs.json", "w", encoding="utf-8") as f:
    json.dump(vref_to_KTs, f, ensure_ascii=False, indent=4)

### Fix KTs

In [None]:
from silnlp.common.corpus import load_corpus
from pathlib import Path
from machine.corpora import ScriptureRef
from silnlp.alignment.utils import compute_alignment_scores

# Inputs for aligning a single book. `pair` selects the working sub-directory
# (left blank in the notebook) and `book_name` is the file-name prefix.
pair = ""
book_name = "08RUT"

# Parallel source/target sentence files for the book.
src_path = Path(f"zzz_PN_KTs/{pair}/{book_name}_src_sents.txt")
trg_path = Path(f"zzz_PN_KTs/{pair}/{book_name}_trg_sents.txt")

# One parsed ScriptureRef per line of the book's vrefs file.
vrefs = list(map(ScriptureRef.parse, load_corpus(Path(f"zzz_PN_KTs/{pair}/{book_name}_vrefs.txt"))))

# Author's note: always uses LatinWordTokenizer
sym_align_path = Path(f"zzz_PN_KTs/{pair}/{book_name}_sym-align.txt")
scores = compute_alignment_scores(
    src_path,
    trg_path,
    aligner_id="eflomal",
    sym_align_path=sym_align_path,
)

In [None]:
from machine.tokenization import LatinWordTokenizer
from machine.corpora import TextFileTextCorpus
from machine.scripture import VerseRef
import json
import nltk

# Author's notes on tokenization:
# - treat_apostrophe_as_single_quote=True didn't do anything
# - confirmed that these have the same tokenization as the aligner (for Latin script)
# - aligner uses LatinWordTokenizer + escape_spaces, nfc_normalize, lowercase from TextCorpus
# Source tokens are lowercased here; target tokens keep their original case.
src_corpus = TextFileTextCorpus(src_path).tokenize(LatinWordTokenizer()).lowercase()
src_lines = [row.segment for row in src_corpus]
trg_corpus = TextFileTextCorpus(trg_path).tokenize(LatinWordTokenizer())
trg_lines = [row.segment for row in trg_corpus]
src_lines_raw = load_corpus(src_path)
trg_lines_raw = load_corpus(trg_path)


def _parse_alignment_line(line):
    """Turn one line of the symmetrized alignment file into (src_idx, trg_idx) int pairs.

    Each whitespace-separated item is expected to look like ``S-T`` optionally
    followed by ``:<something>``; everything after the first colon is ignored.
    """
    parsed = []
    for item in line.split():
        idxs = item.split(":")[0].split("-")
        parsed.append((int(idxs[0]), int(idxs[1])))
    return parsed


# One list of (source token index, target token index) pairs per sentence.
align_lines = [_parse_alignment_line(line) for line in load_corpus(sym_align_path)]

# # check for alignment coverage
# # not complete coverage, so can't assume anything about if specific words are aligned
# for i, (ref,src_line,trg_line,align_pairs) in enumerate(zip(vrefs, src_lines, trg_lines, align_lines)):
#     src_idxs = {pair[0] for pair in align_pairs}
#     trg_idxs = {pair[1] for pair in align_pairs}

#     print(i+1, ref)
#     print(f"unaligned SRC: {len(src_line) - len(src_idxs)}")
#     print(f"unaligned TRG: {len(trg_line) - len(trg_idxs)}")

# Language pair and book whose key terms (KTs) are inspected below.
pair = "en_lmp"
book = "RUT"

# Load the two lookup tables produced by the "KT sorting" cell:
#   vref_to_KTs: verse reference string -> list of KT ids
#   KT_to_vrefs: KT id (as a string key) -> {"glosses": [source glosses, target glosses], "instances": [...]}
with open(f"zzz_PN_KTs/{pair}/vref_to_KTs.json", encoding="utf-8") as f:
    vref_to_KTs = json.load(f)
with open(f"zzz_PN_KTs/{pair}/KT_to_vrefs.json", encoding="utf-8") as f:
    KT_to_vrefs = json.load(f)

# Collect every KT id that occurs in `book`, and the verse refs where one occurs.
term_ids = set()
exp_vrefs = set()
for ref, ids in vref_to_KTs.items():
    if VerseRef.from_string(ref).book == book:
        term_ids.update(ids)
        exp_vrefs.add(ref)

# Pool all source/target glosses of those terms.
src_terms = set()
trg_terms = set()
for kt_id in term_ids:  # named kt_id so the builtin `id` is not rebound for later cells
    glosses = KT_to_vrefs[str(kt_id)]["glosses"]  # JSON object keys are strings
    src_terms.update(glosses[0])
    trg_terms.update(glosses[1])
print(src_terms)

# found = defaultdict(list)
# For every verse that has key terms: locate each term in the tokenized source
# verse by fuzzy matching against its source glosses, then print the target
# tokens that the word alignment links to the matched source token.
for ref, src_toks, trg_toks, alignment, raw_trg in zip(vrefs, src_lines, trg_lines, align_lines, trg_lines_raw):
    verse_key = str(ref.verse_ref)
    if verse_key not in vref_to_KTs:
        continue
    # the ScriptureRefs I'm testing with have an empty ScriptureElement in the path so is_verse doesn't work
    if ref.verse_num == 0 or ref.path[0].name != "":
        continue

    found = []  # (source token index, term id) for each term located in this verse
    for term_id in vref_to_KTs[verse_key]:
        glosses = [gloss.lower() for gloss in KT_to_vrefs[str(term_id)]["glosses"][0]]
        # Score every (gloss, token) combination with the edit distance divided by
        # the token length. Tokens already claimed by this same term id are skipped,
        # which matters when the id is listed more than once for the verse.
        # could adjust this to look at n-grams, where n is the number of words in the gloss
        candidates = (
            (gloss_idx, tok_idx, nltk.edit_distance(gloss, tok) / len(tok))
            for gloss_idx, gloss in enumerate(glosses)
            for tok_idx, tok in enumerate(src_toks)
            if (tok_idx, term_id) not in found
        )
        # (gloss idx, tok idx, distance) of the closest match; min() keeps the first
        # of equally close candidates, and (0, 0, 100) stands for "nothing to compare".
        closest = min(candidates, key=lambda cand: cand[2], default=(0, 0, 100))
        # print(glosses[closest[0]], src_toks[closest[1]], closest[2])
        if closest[2] < 0.3:
            found.append((closest[1], term_id))

    # replace word(s) in target text
    # (currently this only prints what would be replaced)
    for src_idx, _term_id in found:
        trg_idxs = [t_idx for s_idx, t_idx in alignment if s_idx == src_idx]
        print(src_idx, trg_idxs)
        print(src_toks[src_idx], [trg_toks[idx] for idx in trg_idxs])
        print(raw_trg)
        print(trg_toks)
        print("\n")


### Construct Test Files
* Remove character markers and delete notes entirely
* Leave paragraph markers alone

In [None]:
from machine.corpora import (
    FileParatextProjectSettingsParser, 
    UsfmFileText, 
    UpdateUsfmParserHandler, 
    UsfmTokenizer, 
    UsfmTokenType, 
    parse_usfm, 
    UsfmParserState,
    UpdateUsfmBehavior
    )
from pathlib import Path

class ParagraphUpdateUsfmParserHandler(UpdateUsfmParserHandler):
    """Update handler that can place a paragraph marker inside the collected text.

    Only ``_collect_tokens`` is overridden; everything else comes from
    ``UpdateUsfmParserHandler``.
    """

    def _collect_tokens(self, state: UsfmParserState) -> None:
        """Move pending and parser tokens into ``self._tokens``.

        Pending ``self._new_tokens`` are flushed first. Parser tokens are then
        appended one at a time up to ``state.index + state.special_token_count``.
        A paragraph token (other than ``rem``) is treated specially: if the
        collected tokens end in a run of two or more TEXT tokens, optionally
        followed by ``rem`` paragraph tokens and their text, the marker is
        inserted right after the first TEXT token of that run instead of being
        appended, and collection stops for this call.
        """
        self._tokens.extend(self._new_tokens)
        self._new_tokens.clear()
        while self._token_index <= state.index + state.special_token_count:
            incoming = state.tokens[self._token_index]
            if incoming.type == UsfmTokenType.PARAGRAPH and incoming.marker != "rem":
                # Scan the collected tokens backwards. `trailing_text` counts the TEXT
                # tokens of the run currently being walked; `rem_span` accumulates the
                # length of every "rem marker + its text" group that has been passed.
                trailing_text = 0
                rem_span = 0
                for collected in reversed(self._tokens):
                    if collected.type == UsfmTokenType.TEXT:
                        trailing_text += 1
                    elif collected.type == UsfmTokenType.PARAGRAPH and collected.marker == "rem":
                        rem_span += trailing_text + 1
                        trailing_text = 0
                    else:
                        break
                if trailing_text >= 2:
                    # The negative index leaves exactly one TEXT token of the run in
                    # front of the inserted paragraph marker.
                    self._tokens.insert(-(rem_span + trailing_text - 1), incoming)
                    self._token_index += 1
                    # NOTE(review): the original author was unsure whether this should be
                    # `continue`. `break` ends collection for this call, so any remaining
                    # tokens are presumably picked up by a later call -- confirm.
                    break
            self._tokens.append(incoming)
            self._token_index += 1

# Run configuration (the first three values are left blank in the notebook).
pair = ""
project = ""
file_suffix = ""

# When True, character/end markers are kept in the output and "_char" is added to its file name.
char_markers = True

book = "JHN"
book_name = f"44{book}"
src_file_path = Path(f"test_S/Paratext/projects/{project}/{book_name}{file_suffix}.SFM")
out_file_path = Path(f"zzz_PN_KTs/{pair}/{book}/{book_name}{file_suffix}_goal{'_char' if char_markers else ''}.SFM")

# Project settings are parsed from the directory that contains the source book file.
project_dir = src_file_path.parent
src_settings = FileParatextProjectSettingsParser(project_dir).parse()
book_id = src_settings.get_book_id(src_file_path.name)
src_file_text = UsfmFileText(
    src_settings.stylesheet,
    src_settings.encoding,
    book_id,
    src_file_path,
    src_settings.versification,
    include_markers=True,
    include_all_text=True,
    project=src_settings.name,
)

# Tokenize the stripped text of every row of the book, and record each row's reference.
# The two lists are built by iterating the text twice and are index-aligned.
tokenizer = UsfmTokenizer(src_settings.stylesheet)
sentence_toks = [tokenizer.tokenize(row.text.strip()) for row in src_file_text]
vrefs = [row.ref for row in src_file_text]

# Character markers whose whole span (opening marker through matching end marker) is dropped.
to_delete = ["fig"]

# out_toks[i] is a list of segments for sentence i; a new segment starts at every
# paragraph marker, and each segment holds the tokens that are kept.
out_toks = []
for toks in sentence_toks:
    segments = [[]]
    ignore_scope = None  # opening token of the span currently being skipped, if any
    for tok in toks: # POSSIBLE TYPES: TEXT, PARAGRAPH, CHARACTER, NOTE, END, ATTRIBUTE
        if ignore_scope is not None:
            # Inside a skipped span: discard everything up to and including the matching
            # end token. Its marker minus the last character (presumably the trailing "*")
            # must equal the opening marker.
            if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
                ignore_scope = None
            continue
        if tok.type == UsfmTokenType.NOTE or (tok.type == UsfmTokenType.CHARACTER and tok.marker in to_delete):
            ignore_scope = tok
            continue
        if tok.type == UsfmTokenType.PARAGRAPH:
            # The paragraph marker itself is not kept; it only starts a new segment.
            segments.append([])
            continue
        is_inline_marker = tok.type in (UsfmTokenType.CHARACTER, UsfmTokenType.END)
        if tok.type == UsfmTokenType.TEXT or (char_markers and is_inline_marker):
            segments[-1].append(tok)
    out_toks.append(segments)

# One (refs, usfm text) row per segment; every segment of a sentence shares that sentence's ref.
rows = []
for ref, sent in zip(vrefs, out_toks):
    rows.extend(([ref], "".join(tok.to_usfm() for tok in segment)) for segment in sent)

# Update file and write out
# Earlier approach, kept for reference:
# rows = [([ref], "".join([tok.to_usfm() for tok in sent])) for ref, sent in zip(vrefs, out_toks)]
# dest_updater = FileParatextProjectTextUpdater(src_file_path.parent)
# usfm_out = dest_updater.update_usfm(
#     src_file_text.id, rows, strip_all_text=True, prefer_existing_text=False
# )
# with open(f"zzz_PN_KTs/{pair}/{book}/{book_name}{file_suffix}_goal.SFM", "w", encoding=src_settings.encoding) as f:
#     f.write(usfm_out)

# Read the original USFM, re-parse it with the paragraph-aware handler so the new rows
# replace the existing text (PREFER_NEW), and serialize the result.
# NOTE(review): the file is read as "utf-8-sig" but written with the project's
# configured encoding -- confirm this is intended for non-UTF-8 projects.
usfm = src_file_path.read_text(encoding="utf-8-sig")
handler = ParagraphUpdateUsfmParserHandler(rows, behavior=UpdateUsfmBehavior.PREFER_NEW)
parse_usfm(usfm, handler, src_settings.stylesheet, src_settings.versification, preserve_whitespace=False)
usfm_out = handler.get_usfm(src_settings.stylesheet)

# The per-pair/per-book output directory may not exist yet; create it so the write cannot
# fail with FileNotFoundError.
out_file_path.parent.mkdir(parents=True, exist_ok=True)
out_file_path.write_text(usfm_out, encoding=src_settings.encoding)

### Evaluation -- no good
* score orig --> no inline markers
* score orig --> only para markers

In [None]:
import sacrebleu
from pathlib import Path

# System output and a single reference stream (paths are left blank in the notebook).
out = list(load_corpus(Path("")))
ref = [list(load_corpus(Path("")))]

# BLEU with the default tokenizer and with the "flores200" tokenizer, both lowercased.
bleu = sacrebleu.corpus_bleu(out, ref, lowercase=True).score
spbleu = sacrebleu.corpus_bleu(out, ref, lowercase=True, tokenize="flores200").score

# chrF variants share these settings; they differ in word n-gram order (none, 1, 2),
# and the two word-order variants also use epsilon smoothing.
chrf_kwargs = dict(char_order=6, beta=3, remove_whitespace=True)
chrf = sacrebleu.corpus_chrf(out, ref, **chrf_kwargs).score
chrfp = sacrebleu.corpus_chrf(out, ref, word_order=1, eps_smoothing=True, **chrf_kwargs).score
chrfpp = sacrebleu.corpus_chrf(out, ref, word_order=2, eps_smoothing=True, **chrf_kwargs).score
print(bleu, spbleu, chrf, chrfp, chrfpp)