# Infer

First, make paragraphs from lines (unwrap), then wrap to the limits of the network

In [1]:
import sys, os, datetime
import json
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TranslationPipeline
from collections import defaultdict
import zipfile

In [2]:
import languages

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
src_langs = set(["akk", "sux"])
tgt_langs = set(["en"])

In [5]:
model_id = "praeclarum/cuneiform"
# model_revision = "1ba74c8dcf6d1839b0a56589a53dfb5c20ca84f2"
model_revision = "7a60be19efe61bf4adf873eb86f864ea7bfb4876"

In [6]:
batch_size = 16
device = "cuda" if torch.has_cuda else "cpu"
device_id = 0 if device == "cuda" else -1

In [7]:
!nvidia-smi

/bin/bash: nvidia-smi: command not found


## Load Existing Translations

In [8]:
output_json_path = "../data/ml_translations.json"
output_zip_path = output_json_path.replace(".json", ".zip")

In [9]:
with open(output_zip_path, "rb") as zipf:
    with zipfile.ZipFile(zipf) as zf:
        name = [x for x in zf.namelist() if x.endswith(".json")][0]
        with zf.open(name) as f:
            old_translations = json.loads(str(f.read(), "utf8"))
old_translations.keys()
len(old_translations["sux_to_en"])

553296

In [10]:
new_translations = dict(old_translations)
new_translations.keys()


dict_keys(['model_id', 'model_revision', 'akk_to_en', 'sux_to_en'])

In [11]:
if new_translations["model_revision"] != model_revision:
    print("Clearing defunct translations")
    for s_lang in src_langs:
        for t_lang in tgt_langs:
            st_key = f"{s_lang}_to_{t_lang}"
            new_translations[st_key] = {}

In [12]:
def sample_translations():
    for s_lang in src_langs:
        for t_lang in tgt_langs:
            st_key = f"{s_lang}_to_{t_lang}"
            if st_key in new_translations:
                translations = new_translations[st_key]
                print(len(translations), f"{st_key} translations")
                print([(x, translations[x]) for x in translations][:10])

In [13]:
sample_translations()

553296 sux_to_en translations
[('" [...] ~ [...] = %a [u4]-du-ru-u2 " [...] ~ [...] = %a {d}en-lil2 " [...] ~ [...] = %a szu-ub-tum', '... ... ... ...'), ('" [...] ~ |_x-x-lal_| = %a na-du-u2-um " [...] ~ |_x-x-si_| = %a ku-ur-ku-u2-um " [...] ~ |_su-lu-sze3_|# = %a lu-up-pu-um', '... ... ... ... ...'), ('" [ba]-e# = %a za-zum " [za-e] = %a at#-ta " [ga2-e] = %a [a]-na#?-ku-u2 " [...] = %a [at]-ta', 'you are'), ('# " ~ _munsub #_ " ~ _alan# #_ " ~ _alan #_ " ~ |_masz-gu2-gar3_| # zubx(|_masz-gu2-gar3_|) " zu-ub ~ |_masz-gu2-gar3_| = %a zu-bu-um', 'a kind of acquatic animal a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of in

## Load the Model

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=model_revision, device=device)
model_max_length = tokenizer.model_max_length
model_max_length

512

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, revision=model_revision, max_length=tokenizer.model_max_length)
model = model.to(device)
# model

In [16]:
pipeline = TranslationPipeline(model=model, tokenizer=tokenizer, batch_size=batch_size, device=device_id)

In [17]:
print(pipeline("translate Akkadian to English: 1(disz){d}szul3-ma-nu-_sag man gal?_-u2 _man_ dan-nu _man kisz_"))
print(pipeline("translate Akkadian to English: ra-bi-isz e-pu-usz"))

[{'translation_text': 'ulmanu-re, great king, strong king, king of the universe,'}]
[{'translation_text': 'I built it anew.'}]


## Load Transliterations to Translate

In [18]:
import corpi
import cdli

In [19]:
import importlib
importlib.reload(corpi)

<module 'corpi' from '/Users/fak/Dropbox/Projects/CuneiformTranslators/tools/corpi.py'>

## Load ORACC

In [20]:
importlib.reload(cdli)

<module 'cdli' from '/Users/fak/Dropbox/Projects/CuneiformTranslators/tools/cdli.py'>

In [21]:
oracc_dir = os.path.abspath(f"/Volumes/FrankDisk/oracc_zips")
# oracc_dir = os.path.abspath(f"/home/fak/nn/Data/oracc_zips")
oracc_dir

'/Volumes/FrankDisk/oracc_zips'

In [22]:
oracc_corpus = corpi.ORACC(oracc_dir)

## Load CDLI

In [23]:
cdli_corpus = corpi.CDLI()

## Merge

In [24]:
all_pubs = corpi.merge_corpus_pubs(
    [
        ("oracc", oracc_corpus.oracc_pubs.values()),
        ("cdli", cdli_corpus.cdli_pubs.values())
    ], src_langs)
print(f"Found {len(all_pubs)} unique publications")

Found 134796 unique publications


## Build Need Translation List

In [25]:
def get_need_translations(src_lang, encoding="ascii", tgt_lang="en"):
    srcs = set()

    max_line_length = 0

    for pub_id in tqdm(all_pubs.keys()):
        pub = all_pubs[pub_id]
        if pub.language != src_lang:
            continue
        corpus = pub.corpus
        for a in pub.text_areas:
            if (corpus == "cdli") and len(a.lines) > 0 and len(a.paragraphs) == 0:
                a.lines_to_paragraphs(src_lang, tgt_lang)
            paras = a.paragraphs_to_lines()
            for p in paras:
                for si,ei,line in p:
                    if len(line) > max_line_length:
                        max_line_length = len(line)
                    srcs.add(line)
    print(src_lang, "max line length", max_line_length)
    return srcs
needs_translation = {     
    "sux": get_need_translations("sux"),
    "akk": get_need_translations("akk"),
}
print(len(needs_translation["sux"]), "sux needs translation")
print(len(needs_translation["akk"]), "akk needs translation")


  0%|          | 0/134796 [00:00<?, ?it/s]

sux max line length 458


  0%|          | 0/134796 [00:00<?, ?it/s]

akk max line length 522
524319 sux needs translation
275741 akk needs translation


In [30]:
list(needs_translation["akk"])[:10]

['{munus}ru-ba-a-ia _ba-usz2_ {munus}a-ba-ra-ka-tum _nig2-szu_ dingir-ka-an',
 "[...] s,e#-e-ni sza2 _ugu buru5_ ma-a'-du [x x]-la# a#-na# _kur_ an-szar2{ki} [...] szu#-sza2-an _iri!(_zu_)-ma_-dak-tu2",
 'ma-la2 t,up-pi2-ka3 ha-ar-mi3-im sza-asz2-qu2-lam5 na-sa3-qum lu ku-a-um t,up-pi2-a szi2-ta-me-a-ma szu-ma a-ma-la2 t,up-pi2-a',
 '[...]-a-szur _ku3-babbar_ [...] a-ta [... i-ku]-pi3#-a-szur [a-pu-uh3 ...] x _ku3-babbar_ [sza]-asz2-qu2-[lim? u3] t,up-pi3-im',
 'za-i-ri-szu i-na-ar gi-me-er-szu-nu isz-ki-isz _dingir_-ni _lugal_ isz-nun-na{ki} la sze-mu a-wa-ti-szu i-ik-mi in {gesz}si-gar',
 'a-hu-szi-na a-na a-pil-i3-li2-szu _dumu-ni_ i-zu-zi u2-ul i-tu-ru-ma be-el hu-bu-li-szu a-hu-szi-na u2-ul i-s,a-ba-tu',
 '_DISZ na ninda gu7 kasz nag#_-ma _na [bi_ ...] _gi-gid2 mur-mesz gig_ {u2#}[x ...] _ka_-szu2 ta-kap-par [...]',
 '{d}utu-na-s,ir',
 'nar#-bi-ka# [lu-sza2-pi da3-li2-li2]-ka# lud-lul',
 "[e-zib] szá lu-'u-ú-tú _ki m_ÁSZ _dib_-_mesz_-ma ú-[le-'u-ú]"]

In [31]:
tgt_lang = "en"

def get_translations():
    for src_lang in src_langs:
        st_key = f"{src_lang}_to_{tgt_lang}"
        news = new_translations[st_key]
        # print(list(olds)[:10])
        for src in tqdm(needs_translation[src_lang]):
            if src not in news:
                yield (src_lang, tgt_lang, src, f"translate {languages.all_languages[src_lang]} to {languages.all_languages[tgt_lang]}: " + src)

num_need_translate = sum([len(x) for x in needs_translation.values()])
print(f"{num_need_translate:,} to translate")
to_translate = list(get_translations())
print(f"{len(to_translate):,} left to translate")
                
def get_translations_gen():
    for src_lang, tft_lang, s, q in tqdm(to_translate):
        yield q

800,060 to translate


  0%|          | 0/524319 [00:00<?, ?it/s]

  0%|          | 0/275741 [00:00<?, ?it/s]

0 left to translate


In [32]:
r = pipeline(get_translations_gen())
for i, tr in enumerate(r):
    src_lang, tft_lang, s, q = to_translate[i]
    st_key = f"{src_lang}_to_{tgt_lang}"
    translations = new_translations[st_key]
    t = tr[0]['translation_text']
    translations[s] = t


0it [00:00, ?it/s]

In [33]:
sample_translations()

553296 sux_to_en translations
[('" [...] ~ [...] = %a [u4]-du-ru-u2 " [...] ~ [...] = %a {d}en-lil2 " [...] ~ [...] = %a szu-ub-tum', '... ... ... ...'), ('" [...] ~ |_x-x-lal_| = %a na-du-u2-um " [...] ~ |_x-x-si_| = %a ku-ur-ku-u2-um " [...] ~ |_su-lu-sze3_|# = %a lu-up-pu-um', '... ... ... ... ...'), ('" [ba]-e# = %a za-zum " [za-e] = %a at#-ta " [ga2-e] = %a [a]-na#?-ku-u2 " [...] = %a [at]-ta', 'you are'), ('# " ~ _munsub #_ " ~ _alan# #_ " ~ _alan #_ " ~ |_masz-gu2-gar3_| # zubx(|_masz-gu2-gar3_|) " zu-ub ~ |_masz-gu2-gar3_| = %a zu-bu-um', 'a kind of acquatic animal a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of insect a kind of in

## Save the Translations

In [30]:
def write_translations(f):
    f.write("{\n")
    f.write(f"\"model_id\":\"{model_id}\",\n")
    f.write(f"\"model_revision\":\"{model_revision}\"")
    for st_key in sorted([x for x in new_translations if not x.startswith("model_")]):
        f.write(f",\n\"{st_key}\":{{\n")
        translations = new_translations[st_key]
        head = ""
        for s in sorted(list(translations.keys())):
            f.write(head)
            f.write(json.dumps(s))
            f.write(": ")
            f.write(json.dumps(translations[s]))
            head = ",\n"
        f.write("}")
    f.write("}\n")
    
# write_translations(sys.stdout)

In [33]:
with open(output_json_path, "wt") as f:
    write_translations(f)

In [37]:
import zipfile

def compress(zip_name, file_to_zip):
    if os.path.exists(zip_name):
        os.unlink(zip_name)
    # Create a new zip file and add files to it
    with zipfile.ZipFile(zip_name, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write(file_to_zip)
    os.unlink(file_to_zip)

compress('../data/ml_translations.zip', '../data/ml_translations.json')

In [39]:
!ls -al ../data

total 105132
drwxrwxr-x  2 fak fak     4096 Jun  5 12:26 .
drwxrwxr-x 10 fak fak     4096 Jun  1 13:42 ..
-rw-rw-r--  1 fak fak 24652099 Jun  5 11:54 cdli_pubs.zip
-rw-rw-r--  1 fak fak    56450 Jul 25  2022 dataset_index.json
-rw-rw-r--  1 fak fak 32382385 Jun  5 12:26 ml_translations.zip
-rw-rw-r--  1 fak fak 13461354 Jun  5 11:54 oracc_pubs.zip
-rw-rw-r--  1 fak fak 29155935 May 31 12:30 translations_akk_to_en.jsonl
-rw-rw-r--  1 fak fak  7927025 May 31 12:30 translations_sux_to_en.jsonl


In [40]:
new_json = json.load(open(output_json_path, "rt"))
print(len(new_json["akk_to_en"]))
print(len(new_json["sux_to_en"]))

FileNotFoundError: [Errno 2] No such file or directory: '../data/ml_translations.json'