# Make Translations JSONL

Writes the human-made English translations found in the CDLI and ORACC corpora to JSONL files (one per source language) in the `../data` folder.

Reference (ORACC ATF inline tutorial): http://oracc.museum.upenn.edu/doc/help/editinginatf/primer/inlinetutorial/index.html

In [1]:
import os, sys, math
import requests
import json
import time
import random
import shutil
import zipfile
from collections import defaultdict
from glob import glob
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import importlib
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TranslationPipeline


In [2]:
import languages
import cdli
import oracc
import corpi

In [3]:
# Set TOKENIZERS_PARALLELISM=false for this process.
# NOTE(review): presumably this stops the Hugging Face tokenizers library from
# using (and warning about) parallel workers — confirm against its docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Source-language codes this notebook works with.
supported_langs = {"akk", "sux"}

In [5]:
# Build the ORACC corpus object from a local directory.
# The directory defaults to the original external-disk path; set the ORACC_DIR
# environment variable to use a different location without editing the notebook.
# The notebook tqdm is handed in (presumably for progress display — confirm in corpi.ORACC).
oracc_corpus = corpi.ORACC(
    oracc_dir=os.environ.get("ORACC_DIR", "/Volumes/FrankDisk/oracc_zips"),
    tqdm=tqdm,
)

In [6]:
# Build the CDLI corpus object. The notebook tqdm is handed in
# (presumably for progress display while loading — confirm in corpi.CDLI).
cdli_corpus = corpi.CDLI(tqdm=tqdm)

In [7]:
# Convert CDLI lines to paragraphs
# Calls lines_to_paragraphs(<publication language>, "en") on every text area of
# every CDLI publication.
# NOTE(review): presumably this fills the `paragraphs` that later cells read from
# each text area — confirm in the corpus module.
for pub in cdli_corpus.cdli_pubs.values():
    for a in pub.text_areas:
        a.lines_to_paragraphs(pub.language, "en")

## Test Normalized ORACC Transliterations

In [8]:
# Object ids used as spot checks in this section. The trailing comments record
# what each text is meant to exercise. Each id appears only once.
test_object_ids = [
    "Q003230", # Asshur
    "P250815", # szag4 to sza3, kud to ku5
    "P271132", # geme2 to dam
    "P332924", # gab2 to kab, zid to zi, tum12 to tu
    "P271187", # tu4 to tum, ir3 to ARAD2
    "P271030", # pu to bu
    "P228726", # sag10 to saga, gurum to gur2, ah3 to had2
    "P247541", # giggi to kukku5
    "P503256", # Links in cuneiform
    "P237767",
    "P237730",

    "P010627", # Notes: o ii 66

    "Q000041",
    "Q000057",
]

In [9]:
# Smoke test: only the first test object is processed (note the [:1] slice).
# For that ORACC publication, print a separator and its id, then walk each
# text area's paragraphs. The commented-out prints are optional debug output.
for t in test_object_ids[:1]:
    pub = oracc_corpus.oracc_pubs[t]
    print("-"*80)
    print(pub.id)
    for a in pub.text_areas:
        paras = a.paragraphs_to_lines(pub.language, pub.corpus)
        for pi, plines in enumerate(paras):
            # for si, ei, line in plines:
            #     print(line)
            para = a.paragraphs[pi]
            # Indexing "en" fails if the paragraph has no English entry
            # (assuming `languages` behaves like a dict — TODO confirm).
            en_text = a.paragraphs[pi].languages["en"]
            en_lines = en_text.split("\n")
            # print(f"Para lines: {para.start_line_index} - {para.end_line_index} ({para.end_line_index - para.start_line_index}) and {len(en_lines)} en lines")
            # for li in range(para.start_line_index, para.end_line_index):
            #     if li < len(a.lines):
            #         print(f"{pub.language}{li}: {a.lines[li].text}")
            # for li, line in enumerate(en_lines):
            #     print(f"en{li}: {line}")

--------------------------------------------------------------------------------
Q003230


## Show CDLI and ORACC Corpus Stats

In [10]:
# Index, by id, every CDLI publication that reports an English translation.
cdli_translated_pubs = {
    pub.id: pub
    for pub in cdli_corpus.cdli_pubs.values()
    if pub.is_translated("en")
}
# Cell output: (count, label).
len(cdli_translated_pubs), "cdli translated pubs"

(5363, 'cdli translated pubs')

In [11]:

def show_language_stats(src_lang, tgt_lang="en"):
    """Print transliteration and translation counts for one source language.

    Counts are reported for the CDLI corpus, for the ORACC corpus, and for the
    union of both. An id counts as "needing translation" only when it is not
    among the translated ids of either corpus.

    Args:
        src_lang: Language code compared against each publication's ``language``.
        tgt_lang: Language code passed to each publication's ``is_translated``.

    Returns:
        None. The report is written with ``print``.
    """
    def index_by_id(pubs, keep):
        # Keyed by id, so a later publication replaces an earlier one with the same id.
        return {pub.id: pub for pub in pubs if keep(pub)}

    def in_src_lang(pub):
        return pub.language == src_lang

    def has_translation(pub):
        return pub.is_translated(tgt_lang=tgt_lang)

    # Publications written in src_lang, per corpus.
    cdli_by_id = index_by_id(cdli_corpus.cdli_pubs.values(), in_src_lang)
    cdli_ids = set(cdli_by_id)
    oracc_by_id = index_by_id(oracc_corpus.oracc_pubs.values(), in_src_lang)
    oracc_ids = set(oracc_by_id)
    combined_ids = cdli_ids | oracc_ids

    # The subset of those publications that already has a tgt_lang translation.
    cdli_done_ids = set(index_by_id(cdli_by_id.values(), has_translation))
    oracc_done_ids = set(index_by_id(oracc_by_id.values(), has_translation))
    combined_done_ids = cdli_done_ids | oracc_done_ids

    # An id translated in one corpus is not counted as missing in the other.
    cdli_todo_ids = cdli_ids - combined_done_ids
    oracc_todo_ids = oracc_ids - combined_done_ids
    combined_todo_ids = cdli_todo_ids | oracc_todo_ids

    rule_width = 42
    print("=" * rule_width)
    print(f"                   lang: {src_lang}")
    print()
    print(f"    transliterated cdli: {len(cdli_ids):,}")
    print(f"        translated cdli: {len(cdli_done_ids):,}")
    print(f" need translations cdli: {len(cdli_todo_ids):,}")

    print()
    print(f"   transliterated oracc: {len(oracc_ids):,}")
    print(f"       translated oracc: {len(oracc_done_ids):,}")
    print(f"need translations oracc: {len(oracc_todo_ids):,}")
    print("-" * rule_width)
    # "total" and "transliterated total" intentionally print the same figure.
    print(f"                  total: {len(combined_ids):,}")
    print(f"   transliterated total: {len(combined_ids):,}")
    print(f"       translated total: {len(combined_done_ids):,}")
    print(f"need translations total: {len(combined_todo_ids):,}")

# Print the report for Sumerian first, then Akkadian.
for _stats_src_lang in ("sux", "akk"):
    show_language_stats(src_lang=_stats_src_lang)

                   lang: sux

    transliterated cdli: 99,808
        translated cdli: 4,151
 need translations cdli: 95,611

   transliterated oracc: 3,513
       translated oracc: 443
need translations oracc: 3,039
------------------------------------------
                  total: 103,075
   transliterated total: 103,075
       translated total: 4,583
need translations total: 98,492
                   lang: akk

    transliterated cdli: 21,945
        translated cdli: 972
 need translations cdli: 20,937

   transliterated oracc: 12,007
       translated oracc: 9,098
need translations oracc: 2,906
------------------------------------------
                  total: 31,747
   transliterated total: 31,747
       translated total: 10,069
need translations total: 21,678


## Test Tokenization of Sources and Targets

In [12]:
# Model repository id and the exact revision hash whose tokenizer is loaded
# below; the revision is passed as `revision=` to from_pretrained.
model_id = "praeclarum/cuneiform"
model_revision = "7a60be19efe61bf4adf873eb86f864ea7bfb4876"

In [13]:
# Load the tokenizer for the pinned model revision.
# NOTE(review): `device=-1` looks like a pipeline-style argument; confirm that
# AutoTokenizer.from_pretrained actually uses it, otherwise it can be dropped.
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=model_revision, device=-1)
# Maximum length reported by the tokenizer; shown as the cell output.
model_max_length = tokenizer.model_max_length
model_max_length

512

In [14]:
# Re-import the local `languages` module so edits made to languages.py since the
# first import are picked up without restarting the kernel.
importlib.reload(languages)

<module 'languages' from '/Users/fak/Dropbox/Projects/CuneiformTranslators/tools/languages.py'>

In [15]:
def can_tokenize(text):
    """Check whether ``text`` survives an encode/decode round trip through ``tokenizer``.

    Both the input and the decoded string are compared after removing ".",
    "," and ";" and passing them through ``languages.remove_extraneous_space``.
    In the input, an ellipsis character is first expanded to "...".

    Args:
        text: The string to test.

    Returns:
        A tuple ``(ok, normalized_text, normalized_decoded)`` where ``ok`` is
        True when the two normalized strings are equal.
    """
    def without_punctuation(value):
        for mark in (".", ",", ";"):
            value = value.replace(mark, "")
        return languages.remove_extraneous_space(value)

    stripped = text.strip()
    token_ids = tokenizer.encode(stripped)
    round_trip = tokenizer.decode(token_ids).strip()

    # Drop the end-of-sequence marker if the decoder left it in the output.
    eos_marker = "</s>"
    if round_trip.endswith(eos_marker):
        round_trip = round_trip[:-len(eos_marker)]

    expected = without_punctuation(stripped.replace("…", "..."))
    actual = without_punctuation(round_trip)
    return expected == actual, expected, actual

def test_tokenize(text, title="text", verbose=False):
    """Print a diagnostic showing where ``text`` fails to round-trip through the tokenizer.

    Uses ``can_tokenize`` to get the normalized input and decoded strings. When
    they differ, both strings are printed with a caret under the first input
    character that could not be matched, along with its code point.

    Args:
        text: The string to test.
        title: Label used in the printed messages.
        verbose: When True, also print a confirmation for text that tokenizes.

    Returns:
        None. All results are written with ``print``.
    """
    can, text, dec = can_tokenize(text)
    if can:
        if verbose:
            print(f"Can tokenize {title}: {text}")
        return

    # Walk both strings in lockstep to find the problematic character.
    # good_end_index is the index of the last matched character in `text`.
    # It starts at -1 (nothing matched yet) so a mismatch on the very first
    # character is reported at index 0 rather than index 1.
    good_end_index = -1
    ti = 0
    di = 0
    while ti < len(text) and di < len(dec):
        d = dec[di]
        t = text[ti]
        if t == d:
            good_end_index = ti
            ti += 1
            di += 1
        elif d == " " and di + 1 < len(dec) and dec[di+1] == t:
            # The decoder inserted a space before this character; skip it.
            good_end_index = ti
            di += 2
            ti += 1
        elif t == " " and ti + 1 < len(text) and text[ti+1] == d:
            # The decoder dropped a space that the input had; skip it.
            good_end_index = ti + 1
            ti += 2
            di += 1
        else:
            break

    bad_index = good_end_index + 1
    head = f"Can't tokenize {title}: "
    pad = ' ' * len(head)
    print(f"{head}{text}")
    if bad_index < len(text):
        print(f"{pad}{' '*bad_index}^ unicode \\u{ord(text[bad_index]):04x}")
    else:
        # Every input character was matched, so the strings differ only in
        # spacing or in extra decoded output; still report the failure.
        print(f"{pad}{' '*bad_index}^ end of text (difference is in spacing or trailing decoded output)")
    print(f"{pad}{dec}")
# Check the same sentence twice: once as written, and once after it has been
# passed through languages.replace_unsupported_en.
_sample_en = "the gods Aššur, Sîn, Šamaš, Bēl, and Nabû, Ištar of Nineveh, (and) Ištar of Arbela"
test_tokenize(_sample_en, verbose=True)
test_tokenize(languages.replace_unsupported_en(_sample_en), verbose=True)

Can't tokenize text: the gods Aššur Sîn Šamaš Bēl and Nabû Ištar of Nineveh (and) Ištar of Arbela
                               ^ unicode \u0161
                     the gods A<unk>ur Sîn <unk>ama<unk> B<unk>l and Nabû I<unk>tar of Nineveh (and) I<unk>tar of Arbela
Can tokenize text: the gods Ashur Sîn Shamash Bel and Nabû Ishtar of Nineveh (and) Ishtar of Arbela


## Output Translations

In [16]:
# Folder that receives the generated JSONL files, relative to the working
# directory; it is created if it does not exist yet.
translations_out_dir = "../data"
os.makedirs(translations_out_dir, exist_ok=True)

In [21]:
# Re-import the local `languages` module again so the export below runs with
# the current contents of languages.py.
importlib.reload(languages)

<module 'languages' from '/Users/fak/Dropbox/Projects/CuneiformTranslators/tools/languages.py'>

In [22]:
def output_translations(tgt_lang="en"):
    """Write one ``translations_<src>_to_<tgt>.jsonl`` file per source language.

    Every paragraph that has a ``tgt_lang`` entry, in every CDLI and ORACC
    publication whose language is in ``supported_langs``, becomes one JSON
    object ``{<src_lang>: <source text>, <tgt_lang>: <target text>}``. Empty
    sources, targets rejected by ``languages.target_ok``, and sources already
    emitted are skipped. Lines are written sorted, separated by newlines, with
    no trailing newline.

    Args:
        tgt_lang: Language code of the translations to export.

    Returns:
        The total number of JSON lines collected across all source languages.
    """
    seen_sources = set()
    lines_by_lang = defaultdict(list)
    max_src_len = 0

    # CDLI publications are visited before ORACC ones, so for a repeated source
    # text the CDLI pairing is the one that is kept.
    every_pub = [*cdli_corpus.cdli_pubs.values(), *oracc_corpus.oracc_pubs.values()]

    for pub in tqdm(every_pub):
        if pub.language not in supported_langs:
            continue
        for area in pub.text_areas:
            for para in area.paragraphs:
                if tgt_lang not in para.languages:
                    continue

                # Source: the paragraph's lines joined with spaces, then prepared.
                para_lines = area.lines[para.start_line_index:para.end_line_index]
                src = " ".join([ln.text for ln in para_lines])
                src = languages.prep_src_for_nn(src, pub.language, pub.corpus)

                # Target: the paragraph's translation, prepared the same way.
                tgt = languages.prep_tgt_for_nn(para.languages[tgt_lang], tgt_lang, pub.corpus)

                if len(src) == 0 or not languages.target_ok(tgt) or src in seen_sources:
                    continue

                max_src_len = max(len(src), max_src_len)
                lines_by_lang[pub.language].append(json.dumps({pub.language: src, tgt_lang: tgt}))
                seen_sources.add(src)

    print("longest line length:", max_src_len)

    for src_lang, json_lines in lines_by_lang.items():
        path = f"{translations_out_dir}/translations_{src_lang}_to_{tgt_lang}.jsonl"

        with open(path, "wt") as f:
            f.write("\n".join(sorted(json_lines)))

    return sum(map(len, lines_by_lang.values()))
    
# Run the export for English; the cell output is the number of lines collected.
output_translations(tgt_lang="en")


  0%|          | 0/150705 [00:00<?, ?it/s]

longest line length: 5886


116804