Not much time was spent on this, but the original idea was to find and correct instances of proper nouns in drafts using the key terms lists.

In [None]:
from pathlib import Path
import json
from collections import defaultdict
from silnlp.common.corpus import load_corpus

# paths for generic Major KT lists
# Both files are later read line-by-line in parallel, so line N of each is treated as the same key term.
# metadata: one line per term, tab-separated (original-language term, Paratext category, semantic category)
metadata_path = Path("silnlp/assets/Major-metadata.txt")
# vrefs: one line per term, tab-separated list of the verse references where the term occurs
vrefs_path = Path("silnlp/assets/Major-vrefs.txt")

Create two mappings: one from each proper-noun key term (KT) to the verse refs where it occurs, and one from each verse ref to the proper-noun KTs it contains.

In [None]:
# Language-pair specific inputs; the alternatives that have been tried are listed inline.
src_gloss_path = Path("silnlp/assets/fr-Major-glosses.txt")  # en | fr
trg_gloss_path = Path("test_S/MT/terms/bcw-bcw_2024_02_21-Major-renderings.txt")  # lmp-lmp_2024_02_16 | bcw-bcw_2024_02_21
pair = "fr_bcw"  # en_lmp | fr_bcw

# The four files are walked in lockstep: the same line number in each is treated as the same key term.
kt_rows = zip(
    load_corpus(metadata_path),
    load_corpus(vrefs_path),
    load_corpus(src_gloss_path),
    load_corpus(trg_gloss_path),
)

# KT line index -> {"glosses": (source glosses, target renderings), "instances": [verse refs]}
proper_nouns = defaultdict(dict)
for kt_idx, (meta_line, vref_line, src_gloss_line, trg_gloss_line) in enumerate(kt_rows):
    # Metadata columns: orig lang term, Paratext category (PN, FL, RE, etc.),
    # semantic category (person, grasses, containers, etc.). Only the category is needed here.
    _, paratext_cat, _ = meta_line.split("\t")
    trg_renderings = trg_gloss_line.split("\t")

    # Keep proper nouns only, and only those with at least one target rendering.
    if paratext_cat != "PN" or trg_renderings == [""]:
        continue

    proper_nouns[kt_idx] = {
        "glosses": (src_gloss_line.split("\t"), trg_renderings),  # all potential glosses for the term
        # all occurrences of the term; might want further structure, i.e. a dict w/ book:chapter:[instances]
        "instances": vref_line.split("\t"),
    }

with open("KT_to_vrefs.json", "w", encoding="utf-8") as kt_file:
    json.dump(proper_nouns, kt_file, ensure_ascii=False, indent=4)

# Invert the mapping: verse ref -> indices of the proper-noun KTs occurring in that verse.
vref_to_KTs = defaultdict(list)
for kt_idx, kt_entry in proper_nouns.items():
    for verse in kt_entry["instances"]:
        vref_to_KTs[verse].append(kt_idx)

with open("vref_to_KTs.json", "w", encoding="utf-8") as vref_file:
    json.dump(vref_to_KTs, vref_file, ensure_ascii=False, indent=4)

Attempt to correct the translations of known instances of KTs

(This is very preliminary work so I have no idea if it's useful)

In [None]:
from silnlp.common.corpus import load_corpus
from pathlib import Path
from machine.corpora import ScriptureRef
from silnlp.alignment.utils import compute_alignment_scores

# Book being processed; every per-book file below shares this prefix.
book_name = "08RUT"

# Parallel sentence files for the book.
src_path = Path(f"{book_name}_src_sents.txt")
trg_path = Path(f"{book_name}_trg_sents.txt")

# Symmetrized alignment file handed to the aligner (presumably written by it; it is read back later on).
sym_align_path = Path(f"{book_name}_sym-align.txt")

# One ScriptureRef per line of the book's vrefs file.
vrefs = [ScriptureRef.parse(ref_line) for ref_line in load_corpus(Path(f"{book_name}_vrefs.txt"))]

# always uses LatinWordTokenizer
scores = compute_alignment_scores(
    src_path,
    trg_path,
    aligner_id="eflomal",
    sym_align_path=sym_align_path,
)

In [None]:
from machine.tokenization import LatinWordTokenizer
from machine.corpora import TextFileTextCorpus
from machine.scripture import VerseRef
import json
import nltk

# Tokenized sentences, one token list per line. Only the source side is lowercased.
src_lines = [line.segment for line in TextFileTextCorpus(src_path).tokenize(LatinWordTokenizer()).lowercase()]
trg_lines = [line.segment for line in TextFileTextCorpus(trg_path).tokenize(LatinWordTokenizer())]
# Untokenized text of the same two files.
src_lines_raw = load_corpus(src_path)
trg_lines_raw = load_corpus(trg_path)


def _parse_alignment_pair(entry):
    """Convert one alignment entry such as ``"3-5"`` or ``"3-5:0.87"`` into ``(3, 5)``.

    Everything after the first ``:`` is ignored (presumably a score -- confirm against the aligner output).
    """
    indices = entry.split(":")[0].split("-")
    return int(indices[0]), int(indices[1])


# One list of (source token index, target token index) tuples per sentence.
align_lines = [[_parse_alignment_pair(entry) for entry in line.split()] for line in load_corpus(sym_align_path)]

book = "RUT"
# Both maps were written with json.dump, so the dict keys come back as strings.
with open("vref_to_KTs.json", encoding="utf-8") as f:
    vref_to_KTs = json.load(f)
with open("KT_to_vrefs.json", encoding="utf-8") as f:
    KT_to_vrefs = json.load(f)

# Collect the key terms (and the verses containing them) that belong to the selected book.
term_ids = set()
exp_vrefs = set()
for ref, ids in vref_to_KTs.items():
    if VerseRef.from_string(ref).book == book:
        term_ids.update(ids)
        exp_vrefs.add(ref)

# All source glosses / target renderings of those terms.
src_terms = set()
trg_terms = set()
# Named term_id rather than id so the builtin id() is not shadowed for the rest of the session.
for term_id in term_ids:
    term_glosses = KT_to_vrefs[str(term_id)]["glosses"]  # str(): see the note on JSON keys above
    src_terms.update(term_glosses[0])
    trg_terms.update(term_glosses[1])
print(src_terms)

for ref,src_line,trg_line,align_pairs,trg_line_raw in zip(vrefs, src_lines, trg_lines, align_lines,trg_lines_raw):
    if str(ref.verse_ref) not in vref_to_KTs.keys():
        continue
    if ref.verse_num == 0 or ref.path[0].name != "": # the ScriptureRefs I'm testing with have an empty ScriptureElement in the path so is_verse doesn't work
        continue

    found = []
    for term_id in vref_to_KTs[str(ref.verse_ref)]:
        glosses = [gloss.lower() for gloss in KT_to_vrefs[str(term_id)]["glosses"][0]]
        min_dist = (0, 0, 100) # gloss idx of closest match, tok idx of closest match, distance
        for i, gloss in enumerate(glosses): # could adjust this to look at n-grams, where n is the number of words in the gloss
            for j, tok in enumerate(src_line):
                if (j, term_id) in found:
                    continue
                dist = nltk.edit_distance(gloss, tok) / len(tok)
                if dist < min_dist[2]:
                    min_dist = (i, j, dist)
        if min_dist[2] < .3:
            found.append((min_dist[1], term_id))

    # replace word(s) in target text
    for src_idx, term_id in found:
        trg_idxs = [pair[1] for pair in align_pairs if pair[0] == src_idx]