# Infer

First, unwrap the source lines into paragraphs, then re-wrap each paragraph into pieces that fit within the model's maximum input length.

In [1]:
import sys, os, datetime
import json
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TranslationPipeline
from collections import defaultdict


2023-06-02 15:02:35.454757: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-02 15:02:35.567594: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-02 15:02:35.966865: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64:
2023-06-02 15:02:35.966909: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: ca

In [2]:
import languages

In [3]:
# Set TOKENIZERS_PARALLELISM to "false" for this process before the tokenizer is loaded.
# NOTE(review): presumably this silences the tokenizers parallelism/fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Language codes to translate from, and the language codes to translate into.
src_langs = {"akk", "sux"}
tgt_langs = {"en"}

In [5]:
# Model identifier and revision passed to the `from_pretrained` calls below.
model_id = "praeclarum/cuneiform"
# Alternative revision, currently disabled:
# model_revision = "1ba74c8dcf6d1839b0a56589a53dfb5c20ca84f2"
model_revision = "7a60be19efe61bf4adf873eb86f864ea7bfb4876"

In [6]:
# Batch size handed to the translation pipeline.
batch_size = 16
# Device the model is moved to with `model.to(device)`.
device = "cuda"

## Load Existing Translations

In [7]:
# JSON file that existing translations are read from and that the new translations are written to.
output_json_path = "../data/ml_translations.json"

In [8]:
# Read the previously saved translations. The file is opened in a `with` block so the
# handle is closed as soon as the bytes are read instead of being left open.
with open(output_json_path, "rb") as f:
    old_translations = json.loads(str(f.read(), "utf8"))
# Last expression of the cell: display the top-level keys of the loaded file.
old_translations.keys()

dict_keys(['model_id', 'model_revision', 'akk_to_en', 'sux_to_en'])

In [9]:
# Start from a shallow copy of the loaded data, so its other top-level keys carry over.
new_translations = dict(old_translations)
new_translations.keys()  # no visible effect: only a cell's last expression is displayed
# NOTE(review): every "<src>_to_<tgt>" entry is replaced with an empty dict, so the
# translations loaded from disk are not reused for these language pairs. Presumably
# this is intentional so that everything is translated again — confirm.
for s_lang in src_langs:
    for t_lang in tgt_langs:
        st_key = f"{s_lang}_to_{t_lang}"
        new_translations[st_key] = {}

In [12]:
def sample_translations():
    """Print the translation count and a small sample for each language pair.

    Reads the notebook-level ``src_langs``, ``tgt_langs`` and ``new_translations``.
    Language pairs that have no entry in ``new_translations`` are skipped. For every
    other pair this prints the number of stored translations, followed by a list of
    at most ten ``(source, translation)`` tuples in dictionary order.
    """
    from itertools import islice  # local import keeps the cell self-contained

    for s_lang in src_langs:
        for t_lang in tgt_langs:
            st_key = f"{s_lang}_to_{t_lang}"
            if st_key not in new_translations:
                continue
            translations = new_translations[st_key]
            print(len(translations), f"{st_key} translations")
            # Take only the first ten items rather than building a list of every
            # (source, translation) pair and slicing it afterwards.
            print(list(islice(translations.items(), 10)))

In [13]:
# Show the per-language-pair counts and samples before any inference has been run.
sample_translations()

0 sux_to_en translations
[]
0 akk_to_en translations
[]


## Load the Model

In [14]:
# Load the tokenizer for the pinned model revision.
# NOTE(review): `device` is forwarded as an extra keyword argument; it is not clear
# from this notebook that the tokenizer makes any use of it — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=model_revision, device=device)
# Keep the tokenizer's maximum sequence length and display it as the cell output.
model_max_length = tokenizer.model_max_length
model_max_length

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

512

In [15]:
# Load the seq2seq model at the same revision, passing the tokenizer's maximum
# length as `max_length`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, revision=model_revision, max_length=tokenizer.model_max_length)
# Move the model to the configured device.
model = model.to(device)
# model

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [76]:
# Wrap the model and tokenizer in a translation pipeline that batches its inputs.
# NOTE(review): device index 0 is hard-coded here rather than derived from the
# `device` setting — confirm this is intended.
pipeline = TranslationPipeline(model=model, tokenizer=tokenizer, batch_size=batch_size, device=0)

In [77]:
# Smoke test: translate two short Akkadian inputs. Each input carries the same
# "translate <source language> to <target language>: " prefix used for the bulk run.
print(pipeline("translate Akkadian to English: 1(disz){d}szul3-ma-nu-_sag man gal?_-u2 _man_ dan-nu _man kisz_"))
print(pipeline("translate Akkadian to English: ra-bi-isz e-pu-usz"))

[{'translation_text': 'ulmanu-re, great king, strong king, king of the universe,'}]
[{'translation_text': 'I built it anew.'}]


## Load Transliterations to Translate

In [22]:
import corpi
import cdli

In [23]:
import importlib
# Reload `corpi` so edits made to the module are picked up without restarting the kernel.
importlib.reload(corpi)

<module 'corpi' from '/home/fak/Projects/CuneiformTranslators/tools/corpi.py'>

## Load ORACC

In [24]:
# Reload `cdli` so edits made to the module are picked up without restarting the kernel.
importlib.reload(cdli)

<module 'cdli' from '/home/fak/Projects/CuneiformTranslators/tools/cdli.py'>

In [25]:
# Directory handed to corpi.ORACC. Two machine-specific locations are listed; the
# inactive one is commented out.
# NOTE(review): hard-coded absolute paths tie the notebook to one machine — consider
# making this location configurable.
oracc_dir = os.path.abspath(f"/Volumes/FrankDisk/oracc_zips")
# oracc_dir = os.path.abspath(f"/home/fak/nn/Data/oracc_zips")
oracc_dir

'/Volumes/FrankDisk/oracc_zips'

In [27]:
# Build the ORACC corpus object from the configured directory.
oracc_corpus = corpi.ORACC(oracc_dir)

## Load CDLI

In [28]:
# Build the CDLI corpus object (the recorded output shows it downloading and parsing an ATF file).
cdli_corpus = corpi.CDLI()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


## Find just the transliterations

In [29]:
def get_all_transliterated_pubs(src_lang):
    """Gather the publications available in ``src_lang`` from both corpora.

    Returns a 3-tuple ``(ids, cdli_index, oracc_index)``: ``ids`` is the set of keys
    present in either index, and the two indexes are whatever ``get_pubs_with_lang``
    returns for the CDLI corpus and the ORACC corpus respectively.
    """
    cdli_index = cdli_corpus.get_pubs_with_lang(src_lang)
    oracc_index = oracc_corpus.get_pubs_with_lang(src_lang)
    combined_ids = set(oracc_index.keys()) | set(cdli_index.keys())
    return combined_ids, cdli_index, oracc_index

# For each source language keep the tuple (all ids, CDLI index, ORACC index), then
# report how many publication ids were found per language.
all_transliterated_pubs = {
    "sux": get_all_transliterated_pubs("sux"),
    "akk": get_all_transliterated_pubs("akk")
}
print("sux", len(all_transliterated_pubs["sux"][0]))
print("akk", len(all_transliterated_pubs["akk"][0]))

sux 104444
akk 32259


## Build the Needs-Translation List

In [30]:
def get_need_translations(src_lang, encoding="ascii", tgt_lang="en", max_length=512):
    """Collect the distinct source lines of every publication in ``src_lang``.

    Args:
        src_lang: Key into the notebook-level ``all_transliterated_pubs``.
        encoding: Accepted for backward compatibility; not used by this function.
        tgt_lang: Forwarded to ``lines_to_paragraphs`` when a CDLI text area has
            lines but no paragraphs yet.
        max_length: Passed to ``paragraphs_to_lines`` as ``max_line_length``.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        The set of line strings produced by ``paragraphs_to_lines`` over all text
        areas of all publications.

    Also prints the length of the longest line that was collected.
    """
    pub_ids, cdli_index, oracc_index = all_transliterated_pubs[src_lang]
    srcs = set()
    longest = 0

    for pub_id in tqdm(pub_ids):
        # A publication that appears in both corpora is read from CDLI.
        from_cdli = pub_id in cdli_index
        pub = cdli_index[pub_id] if from_cdli else oracc_index[pub_id]
        for area in pub.text_areas:
            # Only CDLI areas that have lines but no paragraphs get paragraphs built here.
            if from_cdli and len(area.lines) > 0 and len(area.paragraphs) == 0:
                area.lines_to_paragraphs(src_lang, tgt_lang)
            for para in area.paragraphs_to_lines(max_line_length=max_length):
                # Each entry is a 3-tuple; only its last element, the text, is used.
                for _start, _end, line in para:
                    longest = max(longest, len(line))
                    srcs.add(line)
    print(src_lang, "max line length", longest)
    return srcs
# Collect the distinct source lines per language, then report how many there are.
needs_translation = {     
    "sux": get_need_translations("sux"),
    "akk": get_need_translations("akk"),
}
print(len(needs_translation["sux"]), "sux needs translation")
print(len(needs_translation["akk"]), "akk needs translation")


  0%|          | 0/104444 [00:00<?, ?it/s]

sux max line length 511


  0%|          | 0/32259 [00:00<?, ?it/s]

akk max line length 522
553291 sux needs translation
293029 akk needs translation


In [31]:
# Peek at ten Akkadian lines; the container is a set, so which ten appear is arbitrary.
list(needs_translation["akk"])[:10]

['[x x x mu]-kin-nu# {m}s,il-la-a{*}-[a x x x]',
 '_2(u) [n] u4 iti_ e-bir5#-tin',
 '_{gesz#}szudun#_ an-szar2 sza2 is,-lu-u e-mid-su-nu-ti _{lu2}gar-kur-mesz {lu2}til-gid2-mesz_ szi-kin _szu-min_-ia',
 'szi2-bu-ti2-ni / ni-di2-in _igi_ sza-lim-esz18-dar _igi_ en-um-a-szur _igi_ a-szur-ma-lik',
 '1(u)? _lu2-mesz_ sza _{gesz}szukur_ i-na-as,#-s,a#-ru-szu',
 '5(u) _lim ansze-kur-ra-mesz_ dan-nu-ti s,i-mit-ti# [ni-ri ...]',
 '_be g_ÍR ka-pí-ìs, ina 15 ZÉ BÙR _szub_',
 '[...] la i-sze9-em-me-e 1(u) _gin2 ku3-babbar_ asz-szum a-bi#-ka x [(x)] _ku_? szu-bi-lam ub-ba# [x x x x (x) (x)]-lam',
 'A {disz}na-x#-_su_-{d}_en-lil2-la2_',
 '_igi_ {m}{d}30—_i igi_ {m}_iti-kin_-a-a']

In [90]:
def get_translations(tgt_lang="en"):
    """Yield a work item for every source line that has no translation yet.

    For each language in ``src_langs`` this walks ``needs_translation[src_lang]`` and
    skips lines already present in ``new_translations["<src>_to_<tgt>"]``. Each
    yielded item is ``(src_lang, tgt_lang, src, prompt)``, where ``prompt`` is
    ``"translate <source name> to <target name>: "`` followed by the line, with the
    names looked up in ``languages.all_languages``.
    """
    for src_lang in src_langs:
        already_done = new_translations[f"{src_lang}_to_{tgt_lang}"]
        for src in tqdm(needs_translation[src_lang]):
            if src in already_done:
                continue
            prefix = f"translate {languages.all_languages[src_lang]} to {languages.all_languages[tgt_lang]}: "
            yield (src_lang, tgt_lang, src, prefix + src)

# Total number of distinct source lines across all languages.
num_need_translate = sum([len(x) for x in needs_translation.values()])
print(f"{num_need_translate:,} to translate")
# Materialize the pending work items as a list so results can be matched to them by index.
to_translate = list(get_translations())
print(f"{len(to_translate):,} left to translate")
                
def get_translations_gen():
    """Yield the prompt string of each item in ``to_translate``, in order, behind a progress bar."""
    for _src_lang, _tgt_lang, _src, prompt in tqdm(to_translate):
        yield prompt

# Feed the prompts to the pipeline and store each result under its source line.
# The loop pairs the i-th result with to_translate[i], so it relies on results
# arriving in the same order as the prompts were supplied.
r = pipeline(get_translations_gen())
for i, tr in enumerate(r):
    # Take the target language from the work item itself, so the key below does not
    # depend on a `tgt_lang` global left over from earlier kernel state.
    src_lang, tgt_lang, s, q = to_translate[i]
    st_key = f"{src_lang}_to_{tgt_lang}"
    translations = new_translations[st_key]
    # Each result is indexed as a list whose first entry holds 'translation_text'.
    t = tr[0]['translation_text']
    translations[s] = t


846,320 to translate


  0%|          | 0/553291 [00:00<?, ?it/s]

  0%|          | 0/293029 [00:00<?, ?it/s]

114,385 left to translate


  0%|          | 0/114385 [00:00<?, ?it/s]

In [91]:
# Show the per-language-pair counts and samples again now that inference has run.
sample_translations()

553291 sux_to_en translations
[('kiszib3 ur-{d}iszkur iti diri sze-sag11-ku5 mu {d}szu-{d}suen lugal e2 {d}szara2 umma{ki}-ka mu-du3', 'under seal of Ur-Ikur; extra month: “Harvest,” year: “u-Suen, the king, the house of ara in Umma erected.”'), ('[...] = %a [...] x x [...] [...] _har_-_ku_ he2#-en#-gub#-[gub-bu] = %a [...] lil-la-bi-ib [a-bu-bu]', '... may ... stand there'), ('sza3 mu-kux(_du_) er2 su3-a', 'in the delivery of the red tears;'), ('[...] x [...] [iti szu]-numun-sze3#? [x]-bi 4(u) 4(asz) iti 2(disz)-sze3# szunigin 1(gesz2) 3(u) 4(asz) kasz saga gur#', '... ..., for the month “Sowing,” its ...: 44 gur, for 2 months, total: 94 gur fine beer,'), ('azux2(_zu5_){a}-_du-hub2#_', 'a kind of profession'), ('[x] ninda _ka_-gu7 ba-an-ne2 1(u) du8 [x] ninda bappir3 ba-an-[ne2] 1(u) du8 3(asz@c)#? ninda _ka_-gu7 [ba]-an-ne2 1(u) du8', '... bread ... ... 10 loaves ... bread ... 10 loaves 3? bread ... ... 10 loaves'), ('giri3 ur-{d}gilgamesx(|_bil3-ga-mes_|) dumu al-la', 'via Ur-Gilgam

## Save the Translations

In [111]:
def write_translations(f):
    """Write the model identity and all translations to ``f`` as one JSON object.

    The object starts with ``"model_id"`` and ``"model_revision"`` (taken from the
    notebook-level variables of the same names), followed by one nested object per
    key of ``new_translations`` that does not start with ``"model_"``. Section keys
    and source lines are written in sorted order, one translation per line.

    Args:
        f: Writable text file-like object; only its ``write`` method is used.
    """
    f.write("{\n")
    # json.dumps escapes quotes, backslashes and control characters, so the output
    # stays valid JSON whatever these strings contain.
    f.write(f"\"model_id\":{json.dumps(model_id)},\n")
    f.write(f"\"model_revision\":{json.dumps(model_revision)}")
    section_keys = sorted(k for k in new_translations if not k.startswith("model_"))
    for st_key in section_keys:
        f.write(f",\n{json.dumps(st_key)}:{{\n")
        translations = new_translations[st_key]
        # `head` is empty before the first entry and a separator before every later one.
        head = ""
        for s in sorted(translations):
            f.write(head)
            f.write(json.dumps(s))
            f.write(": ")
            f.write(json.dumps(translations[s]))
            head = ",\n"
        f.write("}")
    f.write("}\n")
    
# write_translations(sys.stdout)

In [112]:
# Write the file as UTF-8 explicitly, so it does not depend on the platform's default
# text encoding and matches the UTF-8 decoding used when the file is loaded.
with open(output_json_path, "wt", encoding="utf8") as f:
    write_translations(f)

In [None]:
import zipfile

def compress(zip_name, file_to_zip):
    """Create ``zip_name`` from scratch with ``file_to_zip`` as its only member.

    Any existing file at ``zip_name`` is deleted first. The member is stored with
    DEFLATE compression under the archive name that ``ZipFile.write`` derives from
    ``file_to_zip``.
    """
    # Remove a stale archive before writing the new one.
    if os.path.exists(zip_name):
        os.remove(zip_name)
    archive = zipfile.ZipFile(zip_name, mode='w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        archive.write(file_to_zip)

# Zip the translations file and the ORACC publications file into the same directory.
# NOTE(review): no cell shown in this notebook writes oracc_pubs.json, so it is
# presumably produced elsewhere and must already exist — confirm.
compress('../data/ml_translations.zip', '../data/ml_translations.json')
compress('../data/oracc_pubs.zip', '../data/oracc_pubs.json')

In [None]:
# List the data directory to check the files written by the cells above.
!ls -al ../data

In [105]:
# Reload the saved file as a sanity check on the per-language translation counts.
# The `with` block closes the handle once the JSON has been parsed.
with open(output_json_path, "rt") as saved:
    new_json = json.load(saved)
print(len(new_json["akk_to_en"]))
print(len(new_json["sux_to_en"]))

293029
553291
