# Infer

First, join the transliteration lines into paragraphs (unwrap), then re-wrap them so that each piece fits within the model's maximum input length.

In [1]:
import sys, os, datetime
import json
import torch
import random
import numpy as np
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TranslationPipeline
from collections import defaultdict
import zipfile

2023-06-08 09:56:31.651575: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-08 09:56:31.765969: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-08 09:56:32.167052: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64:
2023-06-08 09:56:32.167099: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: ca

In [2]:
import languages

In [3]:
# Set TOKENIZERS_PARALLELISM for this process before the tokenizer is loaded further down.
# NOTE(review): presumably this avoids the tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Language codes handled by this notebook: sources to translate from, targets to translate to.
src_langs = {"akk", "sux"}
tgt_langs = {"en"}

In [5]:
# Model repository and the exact revision to load. The revision is also compared with
# the one stored in the cached translations to decide whether the cache is stale.
model_id = "praeclarum/cuneiform"
# Earlier revisions, kept for reference:
# model_revision = "1ba74c8dcf6d1839b0a56589a53dfb5c20ca84f2"
# model_revision = "7a60be19efe61bf4adf873eb86f864ea7bfb4876"
model_revision = "02d6e0940c949f88c70ac3e49dbbf072cf645b92"

In [6]:
# Number of inputs the translation pipeline processes per batch.
batch_size = 16
# torch.cuda.is_available() checks at runtime that a usable GPU is present.
# The previously used torch.has_cuda is deprecated and only reports whether this
# PyTorch build was compiled with CUDA support, so it can be True on a machine
# without a working GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Pipeline device index: 0 selects the first GPU, -1 selects the CPU.
device_id = 0 if device == "cuda" else -1

In [7]:
!nvidia-smi

Thu Jun  8 09:57:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.105.01   Driver Version: 515.105.01   CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
|  0%   33C    P8    26W / 350W |    351MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load Existing Translations

In [8]:
# Intermediate JSON file and the zip archive that is actually kept on disk.
output_json_path = "../data/ml_translations.json"
# Swap only the file extension. str.replace(".json", ".zip") would also rewrite a
# ".json" occurring elsewhere in the path, and would return the path unchanged
# (zip path == json path) if the suffix were ever different.
output_zip_path = os.path.splitext(output_json_path)[0] + ".zip"

In [9]:
# Read the previously saved translations out of the zip archive.
with open(output_zip_path, "rb") as zipf, zipfile.ZipFile(zipf) as zf:
    # Use the first archive entry whose name ends in ".json".
    name = [entry for entry in zf.namelist() if entry.endswith(".json")][0]
    with zf.open(name) as f:
        old_translations = json.loads(f.read().decode("utf8"))
for lang in ("sux", "akk"):
    print(len(old_translations[f"{lang}_to_en"]), lang)

91014 sux
14170 akk


In [10]:
# dict() makes a shallow copy: the top-level mapping is new, but the per-language
# dictionaries are the same objects as in old_translations, so entries added or
# deleted through new_translations are visible in old_translations as well.
new_translations = dict(old_translations)
new_translations.keys()


dict_keys(['model_id', 'model_revision', 'akk_to_en', 'sux_to_en'])

In [11]:
# Cached translations are discarded when they were produced by a different model revision.
if new_translations["model_revision"] != model_revision:
    print("Clearing defunct translations")
    stale_keys = [f"{s_lang}_to_{t_lang}" for s_lang in src_langs for t_lang in tgt_langs]
    new_translations.update({st_key: {} for st_key in stale_keys})

In [12]:
# Remove blank translations
for s_lang in src_langs:
    for t_lang in tgt_langs:
        st_key = f"{s_lang}_to_{t_lang}"
        ts = new_translations[st_key]
        srcs = list(ts.keys())
        for s in srcs:
            if s in ts and len(ts[s].strip()) == 0:
                print("Bad translation: " + s)
                ts.remove(s)
print(len(new_translations["sux_to_en"]), "sux")
print(len(new_translations["akk_to_en"]), "akk")

91014 sux
14170 akk


In [13]:
def sample_translations():
    """Print the size and the first ten (source, translation) pairs per language pair.

    Language pairs that have no entry in ``new_translations`` are skipped.
    """
    pair_keys = [f"{s_lang}_to_{t_lang}" for s_lang in src_langs for t_lang in tgt_langs]
    for st_key in pair_keys:
        if st_key not in new_translations:
            continue
        translations = new_translations[st_key]
        print(len(translations), f"{st_key} translations")
        print(list(translations.items())[:10])

In [14]:
sample_translations()

91014 sux_to_en translations
[('# e3 - = %a hi-a-t,u3-um # = %a ru-ub-bu-u2 # e3 - = %a e-ru-u2-um # e3 - = %a a-ma-((x))-rum # = %a szu-pu-u2-um # e3 - = %a sza-ka-kum # e3 - = %a za-qa2-nu-um # a-ra2 - |_a-du_| = %a t,e-mu-um # a-ra2 - = %a a-la-ak-tum # a-ra2 - = %a a-ru-ru#-u2#-um# # a-ra2 - |_a-x_|# = %a x-x-[...]', 'a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind

## Load the Model

In [15]:
# Load the tokenizer at the pinned revision and record its maximum sequence length.
# NOTE(review): `device` does not look like a tokenizer option — confirm it has any effect here.
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=model_revision, device=device)
model_max_length = tokenizer.model_max_length
model_max_length

512

In [16]:
# Load the seq2seq model at the pinned revision and move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    revision=model_revision,
    max_length=tokenizer.model_max_length,
).to(device)
# model

In [17]:
# Wrap the model and tokenizer in a translation pipeline that processes batch_size inputs at a time.
# device_id was set earlier to 0 when running on CUDA and to -1 otherwise.
pipeline = TranslationPipeline(model=model, tokenizer=tokenizer, batch_size=batch_size, device=device_id)

In [18]:
# Smoke-test the pipeline on two short Akkadian prompts before the full run.
for sample_prompt in (
    "translate Akkadian to English: 1(disz)(d)szul3-ma-nu-_sag man gal?_-u2 _man_ dan-nu _man kisz_",
    "translate Akkadian to English: ra-bi-isz e-pu-usz",
):
    print(pipeline(sample_prompt))

[{'translation_text': 'Shalmaneser, great king, strong king, king of the universe,'}]
[{'translation_text': 'He did it greatly.'}]


## Load Transliterations to Translate

In [19]:
import corpi
import cdli

In [20]:
import importlib
importlib.reload(corpi)

<module 'corpi' from '/home/fak/Projects/CuneiformTranslators/tools/corpi.py'>

## Load ORACC

In [21]:
importlib.reload(cdli)

<module 'cdli' from '/home/fak/Projects/CuneiformTranslators/tools/cdli.py'>

In [22]:
# Directory holding the ORACC zip files. The location differs between machines
# (e.g. /home/fak/nn/Data/oracc_zips), so an ORACC_DIR environment variable can
# override the default instead of editing this cell.
oracc_dir = os.path.abspath(os.environ.get("ORACC_DIR", "/Volumes/FrankDisk/oracc_zips"))
oracc_dir

'/Volumes/FrankDisk/oracc_zips'

In [23]:
# Build the ORACC corpus object from the directory chosen above.
oracc_corpus = corpi.ORACC(oracc_dir)

## Load CDLI

In [24]:
# Build the CDLI corpus object. No path is passed here.
# NOTE(review): the data location is presumably decided inside corpi.CDLI — confirm.
cdli_corpus = corpi.CDLI()

## Merge

In [25]:
# Merge the publications of both corpora into one mapping.
# src_langs is passed through, presumably as a language filter (see corpi.merge_corpus_pubs).
corpus_sources = [
    ("oracc", oracc_corpus.oracc_pubs.values()),
    ("cdli", cdli_corpus.cdli_pubs.values()),
]
all_pubs = corpi.merge_corpus_pubs(corpus_sources, src_langs)
print(f"Found {len(all_pubs)} unique publications")

Found 132155 unique publications


## Build Need Translation List

In [26]:
def get_need_translations(src_lang, encoding="ascii", tgt_lang="en"):
    """Collect the unique source lines of every publication written in ``src_lang``.

    Args:
        src_lang: language code a publication must have to be included.
        encoding: accepted for interface compatibility; not used by this function.
        tgt_lang: target language code forwarded to ``lines_to_paragraphs``.

    Returns:
        A set with the text of every wrapped line found. The length of the longest
        line is printed as a side effect.
    """
    unique_lines = set()
    longest = 0

    for pub in tqdm(all_pubs.values()):
        if pub.language != src_lang:
            continue
        is_cdli = pub.corpus == "cdli"
        for area in pub.text_areas:
            # CDLI areas that only have raw lines get their paragraphs built first.
            if is_cdli and len(area.lines) > 0 and len(area.paragraphs) == 0:
                area.lines_to_paragraphs(src_lang, tgt_lang)
            for para in area.paragraphs_to_lines(src_lang, pub.corpus):
                for _start, _end, text in para:
                    longest = max(longest, len(text))
                    unique_lines.add(text)
    print(src_lang, "max line length", longest)
    return unique_lines
# Collect the unique source lines per language ("sux" first, then "akk").
needs_translation = {lang: get_need_translations(lang) for lang in ("sux", "akk")}
for lang in ("sux", "akk"):
    print(len(needs_translation[lang]), f"{lang} needs translation")


  0%|          | 0/132155 [00:00<?, ?it/s]

sux max line length 612


  0%|          | 0/132155 [00:00<?, ?it/s]

akk max line length 849
458557 sux needs translation
195312 akk needs translation


In [27]:
list(needs_translation["akk"])[:10]

['[(disz)tat]-tan-nu _dumu_ sza2 (disz)ki-din-(d)60 A (disz)_e2-kur_-za-kir ina hu-ud lib3(*)#-[bi]-szu2# szi-isz-[szu-ru-u2] [sza2] u4#-mu ina _u4 7_-_kam2 gisz-szub-ba_-szu2 (lu2)_ku4 e2_(u2)-(tu2) _igi_ (d)_en-lil2_ (d)_idim_ (d)30 (d)#_utu_ (d)_iszkur_ (d)_amar-utu_ (d)na-na-a (d)_gaszan_-sza2(*)-_sag_ u _dingir_-_mesz e2_-szu2-nu gab-bi sza2 _iti_(us)-(su) kal _mu-an-na_ gu-uq-qa-ne2-e _u4-esz-esz_-_mesz_ u mim-ma gab-bi sza2 a-na szi-isz-szi-ru-u2 ina _u4 7_-_kam2 gisz-szub-ba_ [(lu2)]_ku4# e2_(u2)-(tu2)',
 '_u4_ szul3-ma _iti_ hi-du-tu2 _mu-an-na_ he2-gal2-li-sza2',
 'a-na u4-mu s,a-a-tu2 it-ta-din _ku3-babbar_ a4 2 _ma-na szam2 gisz_(*)#-[_szub-ba_-_mesz mu_-_mesz til_-_mesz_ (disz)_nig2-sum-mu_-(d)60]',
 '_sa2-sag_ si-ma-hi-la-ne2',
 '[...] x pa-ni (lu2)_kur2 ta_ (iri)_e2_ x [...]',
 'har-ra-nu i-na u4-me-szu-ma i-na qi2-bit (d)asz-szur _en_-ia (disz)nig2-du-(d)na-hu-un-du _lugal_ (kur)elam-ma(ki) 3(disz) _iti-mesz_ ul u2-mal-li-ma i-na u4-me la szi-im-ti-szu2 ur-ru-hi-isz im-

In [71]:
tgt_lang = "en"

def get_translations():
    """Yield the work items that still need a translation, in shuffled order.

    Each item is ``(src_lang, tgt_lang, source_text, prompt)``. A source line is
    included only when it has no entry yet in ``new_translations``. The shuffle
    uses NumPy's global random state, which is not seeded here.
    """
    pending = []
    for src_lang in src_langs:
        already_done = new_translations[f"{src_lang}_to_{tgt_lang}"]
        pending.extend(
            (src_lang, tgt_lang, src, f"translate {languages.all_languages[src_lang]} to {languages.all_languages[tgt_lang]}: " + src)
            for src in tqdm(needs_translation[src_lang])
            if src not in already_done
        )
    np.random.shuffle(pending)
    yield from pending

# Total number of unique source lines versus those not yet in the cache.
num_need_translate = sum(map(len, needs_translation.values()))
print(f"{num_need_translate:,} to translate")
to_translate = [*get_translations()]
print(f"{len(to_translate):,} left to translate")
                
def get_translations_gen():
    """Yield only the prompt string of each item in ``to_translate``, with a progress bar."""
    for _src_lang, _tgt_lang, _source_text, prompt in tqdm(to_translate):
        yield prompt

653,869 to translate


  0%|          | 0/458557 [00:00<?, ?it/s]

  0%|          | 0/195312 [00:00<?, ?it/s]

156,893 left to translate


In [72]:
# Stream the prompts through the pipeline. Results are paired positionally with
# to_translate, so this relies on the pipeline returning outputs in input order.
r = pipeline(get_translations_gen())
for tr, (src_lang, item_tgt_lang, s, q) in zip(r, to_translate):
    # Use the target language stored with the work item. The original unpacked it
    # into a misspelled, unused variable and silently fell back to the global
    # tgt_lang, which only works while there is a single target language.
    st_key = f"{src_lang}_to_{item_tgt_lang}"
    translations = new_translations[st_key]
    t = tr[0]['translation_text']
    translations[s] = t


  0%|          | 0/156893 [00:00<?, ?it/s]

Your input_length: 480 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 523 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 472 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 485 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 465 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 476 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 523 is bigger than 0.9 * max_length: 512. You

In [73]:
sample_translations()

458557 sux_to_en translations
[('# e3 - = %a hi-a-t,u3-um # = %a ru-ub-bu-u2 # e3 - = %a e-ru-u2-um # e3 - = %a a-ma-((x))-rum # = %a szu-pu-u2-um # e3 - = %a sza-ka-kum # e3 - = %a za-qa2-nu-um # a-ra2 - |_a-du_| = %a t,e-mu-um # a-ra2 - = %a a-la-ak-tum # a-ra2 - = %a a-ru-ru#-u2#-um# # a-ra2 - |_a-x_|# = %a x-x-[...]', 'a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kind of a kin

## Save the Translations

In [74]:
def write_translations(f):
    """Write ``new_translations`` to ``f`` as JSON with one translation per line.

    The document starts with the global ``model_id`` and ``model_revision``,
    followed by one object per language pair. Any key of ``new_translations``
    that starts with ``"model_"`` is skipped. Sections and their entries are
    written in sorted order so that successive runs produce small diffs.

    Args:
        f: a text-mode file-like object with a ``write`` method.
    """
    f.write("{\n")
    # json.dumps escapes quotes and backslashes; plain string interpolation would
    # emit invalid JSON for values containing such characters.
    f.write(f"\"model_id\":{json.dumps(model_id)},\n")
    f.write(f"\"model_revision\":{json.dumps(model_revision)}")
    for st_key in sorted(k for k in new_translations if not k.startswith("model_")):
        f.write(f",\n{json.dumps(st_key)}:{{\n")
        translations = new_translations[st_key]
        head = ""  # separator written before every entry except the first
        for s in sorted(translations):
            f.write(head)
            f.write(json.dumps(s))
            f.write(": ")
            f.write(json.dumps(translations[s]))
            head = ",\n"
        f.write("}")
    f.write("}\n")
    
# write_translations(sys.stdout)

In [75]:
# Write the JSON file; it is compressed into the zip archive (and removed) further down.
with open(output_json_path, mode="wt") as out_file:
    write_translations(out_file)

In [76]:
import zipfile

def compress(zip_name, file_to_zip):
    """Replace ``zip_name`` with an archive containing ``file_to_zip``, then delete the source file.

    The archive is first written under a temporary name and only moved over
    ``zip_name`` once it is complete, so a failure while compressing leaves the
    previous archive and the source file untouched.

    Args:
        zip_name: path of the zip archive to create or replace.
        file_to_zip: path of the file to store; removed after success.
    """
    tmp_zip = zip_name + ".tmp"
    try:
        with zipfile.ZipFile(tmp_zip, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
            # Store the file under its bare name; without arcname a relative path
            # such as "../data/x.json" is recorded in the archive including "../".
            zf.write(file_to_zip, arcname=os.path.basename(file_to_zip))
    except BaseException:
        # Do not leave a half-written temporary archive behind.
        if os.path.exists(tmp_zip):
            os.unlink(tmp_zip)
        raise
    os.replace(tmp_zip, zip_name)
    os.unlink(file_to_zip)

# Reuse the paths configured earlier so the archive written here is the same one
# that the loading cell reads (the literals duplicated those values).
compress(output_zip_path, output_json_path)

In [77]:
!ls -al ../data

total 92988
drwxrwxr-x  2 fak fak     4096 Jun  9 05:55 .
drwxrwxr-x 10 fak fak     4096 Jun  1 13:42 ..
-rw-rw-r--  1 fak fak 24652099 Jun  5 11:54 cdli_pubs.zip
-rw-rw-r--  1 fak fak    56450 Jul 25  2022 dataset_index.json
-rw-rw-r--  1 fak fak 30341325 Jun  9 05:55 ml_translations.zip
-rw-rw-r--  1 fak fak 13153368 Jun  6 12:43 oracc_pubs.zip
-rw-rw-r--  1 fak fak 22458763 Jun  6 12:43 translations_akk_to_en.jsonl
-rw-rw-r--  1 fak fak  4537536 Jun  6 12:43 translations_sux_to_en.jsonl
