# Infer

First, join the lines of each text into paragraphs (unwrap), then re-wrap the paragraphs so that each piece fits within the input limit of the network.

In [1]:
import datetime
import itertools
import json
import os
import random
import sys
from collections import defaultdict

import torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TranslationPipeline


In [2]:
import languages

In [3]:
# Set TOKENIZERS_PARALLELISM to "false" in this process's environment; this
# cell runs before the tokenizer is loaded further down.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Language codes: every source language is paired with every target language
# to form the "<src>_to_<tgt>" keys used throughout the notebook.
src_langs = {"akk", "sux"}
tgt_langs = {"en"}

In [5]:
# Model repository id and the exact revision that are passed to
# `from_pretrained` when the tokenizer and model are loaded, and that are
# written into the header of the saved translations file.
model_id = "praeclarum/cuneiform"
model_revision = "1ba74c8dcf6d1839b0a56589a53dfb5c20ca84f2"

In [6]:
# batch_size: number of source lines grouped into one translation batch.
# device: passed to the tokenizer loader and used for `model.to(device)`.
batch_size = 8
device = "cpu"

## Load Existing Translations

In [7]:
# Translations file: read below as the existing translations and overwritten
# in the "Save the Translations" section.
output_json_path = "../data/ml_translations.json"

In [8]:
# Read the previously saved translations. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open.
with open(output_json_path, "r", encoding="utf8") as f:
    old_translations = json.load(f)
old_translations.keys()

dict_keys(['model_id', 'model_revision', 'akk_to_en', 'sux_to_en'])

In [9]:
# Start the output from the existing translations. Sources that are already
# in old_translations are skipped when batches are built, so they must be
# carried over here or they would disappear from the saved file.
# Each per-pair table is copied so old_translations itself is never mutated.
new_translations = dict(old_translations)
for s_lang in src_langs:
    for t_lang in tgt_langs:
        st_key = f"{s_lang}_to_{t_lang}"
        new_translations[st_key] = dict(old_translations.get(st_key, {}))

In [10]:
def sample_translations():
    """Print the size and the first ten entries of each existing translation table.

    Iterates every "<src>_to_<tgt>" combination of `src_langs` and `tgt_langs`
    and silently skips combinations that are absent from `old_translations`.
    For each table two lines are printed: the entry count followed by the key,
    and a list of up to ten `(source, translation)` tuples in the table's
    iteration order.
    """
    for s_lang in src_langs:
        for t_lang in tgt_langs:
            st_key = f"{s_lang}_to_{t_lang}"
            if st_key in old_translations:
                translations = old_translations[st_key]
                print(len(translations), f"{st_key} translations")
                # islice stops after ten items instead of materialising the
                # whole table as a list just to slice it.
                print(list(itertools.islice(translations.items(), 10)))

In [11]:
# Show the entry count and first ten entries of each existing translation table.
sample_translations()

50000 sux_to_en translations
[('" [...] = %a [at]-ta', '..'), ('" [...] ~ [...] = %a [u4]-du-ru-u2', '..'), ('" [...] ~ [...] = %a szu-ub-tum', '..'), ('" [...] ~ [...] = %a {d}en-lil2', 'of Enlil'), ('" [...] ~ |SU.LU.SZE3|# = %a lu-up-pu-um', '..'), ('" [...] ~ |X.X.LAL| = %a na-du-u2-um', '..'), ('" [...] ~ |X.X.SI| = %a ku-ur-ku-u2-um', '..'), ('" [ba]-e# = %a za-zum', '..'), ('" [ga2-e] = %a [a]-na#?-ku-u2', 'I shall make secure'), ('" [za-e] = %a at#-ta', 'you')]
380004 akk_to_en translations
[('# A2    " x#        ~ A2', '..'), ('# GI    " x#-x#     ~ GI', '..'), ('# GI4   " [...]     ~ GI4', '..'), ('# a2    " [x]-a     ~ A2', '..'), ('# da    " da-a      ~ DA', '..'), ('# gi    " gi        ~ GI', '..'), ('# gilim " gi-li!-im ~ |GI%GI|', 'the reed, the reed, the ...'), ('# kab   " ka-ab#    ~ KAB', '..'), ('# ki2   " ki        ~ GI', '..'), ('# si2   " si        ~ ZI', '..')]


## Load the Model

In [12]:
# Load the tokenizer pinned to the configured model revision.
# NOTE(review): `device` is forwarded as an extra keyword argument here;
# confirm the tokenizer loader actually uses it.
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=model_revision, device=device)
# Keep the tokenizer's maximum length around and display it as the cell output.
model_max_length = tokenizer.model_max_length
model_max_length

256

In [13]:
# Load the seq2seq model at the same pinned revision, passing the tokenizer's
# maximum length as `max_length`, then move it to the configured device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, revision=model_revision, max_length=tokenizer.model_max_length)
model = model.to(device)
# model

In [14]:
# Wrap model and tokenizer in a translation pipeline. No device is passed
# (the `device=0` argument is commented out).
pipeline = TranslationPipeline(model=model, tokenizer=tokenizer)#, device=0)

In [15]:
# Smoke test: translate two short prompts using the same
# "translate <source> to <target>: <text>" prompt format as the batch loop.
print(pipeline("translate Akkadian to English: 1(disz){d}szul3-ma-nu-_sag man gal?_-u2 _man_ dan-nu _man kisz_"))
print(pipeline("translate Akkadian to English: ra-bi-isz e-pu-usz"))

[{'translation_text': 'Shulmanu-sag, great king, strong king, king of the world'}]
[{'translation_text': 'I built in a grand manner'}]


## Load Transliterations to Translate

In [35]:
import corpi
import cdli

In [49]:
# Development helper: re-import the local `corpi` module so edits to its
# source are picked up without restarting the kernel.
import importlib
importlib.reload(corpi)

<module 'corpi' from '/Volumes/home/Projects/CuneiformTranslators/tools/corpi.py'>

## Load ORACC

In [48]:
# Development helper: re-import the local `cdli` module (relies on `importlib`
# having been imported by an earlier cell).
importlib.reload(cdli)

<module 'cdli' from '/Volumes/home/Projects/CuneiformTranslators/tools/cdli.py'>

In [40]:
oracc_json = {}
# Load the ORACC publications JSON from disk (this cell reads the file; it
# does not write it).
with open("../data/oracc_pubs.json", "r") as f:
    oracc_json = json.load(f)
# Display the top-level keys as the cell output.
oracc_json.keys()

dict_keys(['P260212', 'Q005534', 'P228789', 'X201983', 'X600161', 'Q006659', 'P237793', 'P336482', 'Q003918', 'P239402', 'P229222', 'P334719', 'Q006286', 'P461191', 'P010632', 'P334706', 'X500038', 'P229167', 'P223779', 'P452943', 'P392253', 'P503448', 'P237289', 'P335244', 'P394015', 'P336176', 'Q005855', 'P314087', 'Q003857', 'P230912', 'P240296', 'P272248', 'P336078', 'P443829', 'X201880', 'P450354', 'P236989', 'P503421', 'P428065', 'X500022', 'P522839', 'Q009557', 'Q004131', 'Q001098', 'Q003354', 'P526953', 'P335678', 'P272531', 'P244122', 'P522574', 'P336560', 'P348500', 'P228352', 'P338603', 'P225944', 'P296697', 'P393867', 'P332890', 'P514535', 'P271473', 'P228823', 'P334923', 'Q003772', 'P425538', 'P336558', 'X468674', 'P335215', 'P368168', 'P237916', 'P451353', 'P225951', 'P427796', 'Q005809', 'P313440', 'P224484', 'P230565', 'P356343', 'P335874', 'P230125', 'P334158', 'P392267', 'Q005489', 'P228359', 'P422772', 'P336733', 'P373811', 'Q007510', 'P335892', 'P334302', 'P273712',

In [42]:
# Directory handed to corpi.ORACC below. This is a machine-specific absolute
# path; the commented-out line is an alternative location.
oracc_dir = os.path.abspath("/Volumes/FrankDisk/oracc_zips")
# oracc_dir = os.path.abspath(f"/home/fak/nn/Data/oracc_zips")
oracc_dir

'/Volumes/FrankDisk/oracc_zips'

In [50]:
# Open the ORACC corpus from the local directory, with download=False.
oracc_corpus = corpi.ORACC(oracc_dir, download=False)

## Load CDLI

In [51]:
# Build the CDLI corpus with default arguments; the recorded output shows
# this downloading and parsing the ATF file.
cdli_corpus = corpi.CDLI()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


## Find just the transliterations

In [53]:
def get_all_transliterated_pubs(src_lang):
    """Gather the publications available for `src_lang` in both corpora.

    Returns a 3-tuple `(all_ids, cdli_index, oracc_index)`: the union of the
    keys of both indexes, followed by whatever `get_pubs_with_lang(src_lang)`
    returned for the CDLI corpus and for the ORACC corpus.
    """
    cdli_index = cdli_corpus.get_pubs_with_lang(src_lang)
    cdli_ids = set(cdli_index.keys())
    oracc_index = oracc_corpus.get_pubs_with_lang(src_lang)
    oracc_ids = set(oracc_index.keys())
    # Copy the ORACC ids first, then fold in the CDLI ids.
    return set(oracc_ids).union(cdli_ids), cdli_index, oracc_index

# One (all_ids, cdli_index, oracc_index) tuple per source language, built in
# the order sux, akk; then report how many publication ids each language has.
all_transliterated_pubs = {
    code: get_all_transliterated_pubs(code) for code in ("sux", "akk")
}
print("sux", len(all_transliterated_pubs["sux"][0]))
print("akk", len(all_transliterated_pubs["akk"][0]))

sux 104444
akk 32259


## Build the Needs-Translation List

In [54]:
def get_need_translations(src_lang, encoding="ascii", tgt_lang="en", max_line_length=512):
    """Collect the distinct wrapped source lines of every `src_lang` publication.

    Args:
        src_lang: key into `all_transliterated_pubs`; also forwarded to
            `lines_to_paragraphs`.
        encoding: not used by this function; kept so existing callers that
            pass it keep working.
        tgt_lang: forwarded to `lines_to_paragraphs`.
        max_line_length: wrap limit forwarded to `paragraphs_to_lines`
            (previously hard-coded to 512).

    Returns:
        A set containing the text of every line produced by
        `paragraphs_to_lines` across all text areas of all publications.

    Side effects:
        Calls `lines_to_paragraphs` on CDLI text areas that have lines but no
        paragraphs, and prints the length of the longest collected line.
    """
    pub_ids, cdli_index, oracc_index = all_transliterated_pubs[src_lang]
    srcs = set()
    longest = 0  # length of the longest line seen, reported at the end

    for pub_id in tqdm(pub_ids):
        # A publication present in both corpora is taken from CDLI.
        from_cdli = pub_id in cdli_index
        pub = cdli_index[pub_id] if from_cdli else oracc_index[pub_id]
        for area in pub.text_areas:
            # Only CDLI areas are unwrapped here, and only when they have
            # lines but no paragraphs yet.
            if from_cdli and len(area.lines) > 0 and len(area.paragraphs) == 0:
                area.lines_to_paragraphs(src_lang, tgt_lang)
            for para in area.paragraphs_to_lines(max_line_length=max_line_length):
                # Each item is a 3-tuple; only its last element (the text) is used.
                for _si, _ei, line in para:
                    longest = max(longest, len(line))
                    srcs.add(line)
    print(src_lang, "max line length", longest)
    return srcs
# Distinct wrapped source lines per language, computed for sux first and then
# akk, followed by a count for each.
needs_translation = {
    code: get_need_translations(code) for code in ("sux", "akk")
}
print(len(needs_translation["sux"]), "sux needs translation")
print(len(needs_translation["akk"]), "akk needs translation")


  0%|          | 0/104444 [00:00<?, ?it/s]

sux max line length 511


  0%|          | 0/32259 [00:00<?, ?it/s]

akk max line length 522
553291 sux needs translation
293029 akk needs translation


In [55]:
# Preview ten entries in the set's iteration order; islice avoids copying the
# whole set into a list just to slice it.
list(itertools.islice(needs_translation["akk"], 10))

['[x x] x x x# [x] _igi_{?}',
 '{na4}_kiszib_ {disz}ki-din-{d}60',
 '_gesz#_ gu#-up#-ni szu-a-tu-<nu> is,-s,a mal ak-szit,-t,u u2-pah-hir-ma a-na gu-ru-un-ni ag-ru-un-ma i-na {d}bil-gi aq-mu',
 'ki-ma um-mi3-a-ni puzur4-esz18-dar i-ka3-ri-im e-ta-wu',
 '_1(bur3) _gan2_ a-sza3 a-gar3 1(esze3) 3(iku) _gan2_-e_ i-ta _a-sza3_ sig-an-nu-ni-tum _dumu_ lu2-{d}nanna',
 '[...]-u2 [...]-ru#?-ti _un-mesz#_ [...] sza ina qi2#-[...] {d}1(u)-5(disz) sza# [...] ah-bu#-[...] a-na _dingir#-mesz#-[...]_',
 'un-qa un-qa# {disz}{d}60—x-x [{disz}]ba#-la-t,u',
 '[...] _ha_ x [...] _dumu#_ ip-qu2-[...] [_arad_] _an_ [...]',
 '_kusz3_ [...]',
 'qe2-reb hur-sza2-a-ni zaq-ru-ti _a-sza3_ nam-ra-s,i i-na _ansze-kur-ra_ ar-kab-ma _{gesz}gigir giri3-min_-ia i-na ti-ik-ka-ti u2-sza2-asz2-szi asz2-ru szup-szu-qu i-na _giri3-min_-ia2 ri-ma-nisz at-tag-gesz']

In [56]:
def get_translation_batches(tgt_lang="en"):
    """Group the not-yet-translated source lines into batches of `batch_size`.

    For every language in `src_langs`, each line of
    `needs_translation[src_lang]` that is not already a key of the matching
    `old_translations` table becomes a `(src_lang, tgt_lang, text)` item.
    A language pair missing from `old_translations` is treated as having no
    translations yet.

    Returns:
        A list of batches (lists of items). Every batch holds exactly
        `batch_size` items except possibly the last; no batch is empty, and
        the result is `[]` when nothing needs translating.
    """
    pending = []
    for src_lang in src_langs:
        st_key = f"{src_lang}_to_{tgt_lang}"
        # .get: a pair with no saved table simply has nothing translated yet.
        olds = old_translations.get(st_key, {})
        for src in tqdm(needs_translation[src_lang]):
            if src not in olds:
                pending.append((src_lang, tgt_lang, src))
    # Slicing never yields an empty chunk, so there is no trailing empty batch.
    return [pending[i:i + batch_size] for i in range(0, len(pending), batch_size)]

# Build the batches once and report how much work there is.
translation_batches = get_translation_batches()
num_needed_translations = sum(map(len, translation_batches))

print(len(translation_batches), "translation batches")
print(num_needed_translations, "needed translations")

# Translate the batches and record each result in new_translations under its
# "<src>_to_<tgt>" key.
for batch in tqdm(translation_batches):
    # The batch list can contain an empty batch (e.g. when nothing needs
    # translating); never send an empty prompt list to the pipeline.
    if not batch:
        continue
    # Same prompt format as the smoke test above the corpus loading cells:
    # "translate <source language> to <target language>: <text>".
    qs = [f"translate {languages.all_languages[src_lang]} to {languages.all_languages[tgt_lang]}: " + text for src_lang, tgt_lang, text in batch]
    r = pipeline(qs)
    for i, (src_lang, tgt_lang, s) in enumerate(batch):
        st_key = f"{src_lang}_to_{tgt_lang}"
        translations = new_translations[st_key]
        t = r[i]['translation_text']
        print(i, s, t)
        translations[s] = t
    # NOTE(review): this stops after the first non-empty batch, so only one
    # batch is translated per run. It looks like a debugging limit; remove it
    # to translate everything.
    break

  0%|          | 0/553291 [00:00<?, ?it/s]

  0%|          | 0/293029 [00:00<?, ?it/s]

96889 translation batches
775108 needed translations


  0%|          | 0/96889 [00:00<?, ?it/s]

0 im-ku3-sig17 ba-ge6 {gesz}gigir nig2-ba u3 e-sir2 zi-ga Gold-plated chariots, a charioteer, and a ziggurat-wheel
1 ur-du6-ku3-ga szu ba-ti iti udru{duru5} mu ma-da za-ab-sza-li{ki} ba-hul Urdukuga received in the month “Udru” (month 10) the year: “The lands of Zabshali were destroyed,”
2 3(disz) masz2-gal niga du6-ku3 3 bucks, barley-fed, “built-in”
3 mu-un-bar-re-en nam-lugal da-ri2* bala u4 su13-ra2 ma-ni-in-rig7 suhusz* ma-da nam-en-bi ak-de3 ma-an-szum2-ma ma-ni-in-ge4*-en I gave to you the authority of kingship forever, I gave to you the foundation of the country, I gave to you its lordship forever
4 4(disz) sila4 5(disz) masz2 4 lambs, 5 billy goats
5 [n] {kusz}a-ga2-la2 kesz2-ra2 erin2 kun-[zi?-da? i7] {d}isztaran-si-sa2-a-sze3 x x [...] ... leather bags, ... for the troop at the reservoir of Ishtaran-sisa ..
6 szunigin 6(disz) gin2 i3 6(disz) gin2 naga szunigin 7(disz) ku6 szunigin 7(disz) sa szum2 u4 1(u) 7(disz)-kam iti e2-iti-6(disz) total: 6 shekels oil; 6 shekels alkali-

In [32]:
# Print the per-pair counts and first ten entries of old_translations again.
sample_translations()

380004 akk_to_en translations
[('# A2    " x#        ~ A2', '..'), ('# GI    " x#-x#     ~ GI', '..'), ('# GI4   " [...]     ~ GI4', '..'), ('# a2    " [x]-a     ~ A2', '..'), ('# da    " da-a      ~ DA', '..'), ('# gi    " gi        ~ GI', '..'), ('# gilim " gi-li!-im ~ |GI%GI|', 'the reed, the reed, the ...'), ('# kab   " ka-ab#    ~ KAB', '..'), ('# ki2   " ki        ~ GI', '..'), ('# si2   " si        ~ ZI', '..')]
50000 sux_to_en translations
[('" [...] = %a [at]-ta', '..'), ('" [...] ~ [...] = %a [u4]-du-ru-u2', '..'), ('" [...] ~ [...] = %a szu-ub-tum', '..'), ('" [...] ~ [...] = %a {d}en-lil2', 'of Enlil'), ('" [...] ~ |SU.LU.SZE3|# = %a lu-up-pu-um', '..'), ('" [...] ~ |X.X.LAL| = %a na-du-u2-um', '..'), ('" [...] ~ |X.X.SI| = %a ku-ur-ku-u2-um', '..'), ('" [ba]-e# = %a za-zum', '..'), ('" [ga2-e] = %a [a]-na#?-ku-u2', 'I shall make secure'), ('" [za-e] = %a at#-ta', 'you')]


## Save the Translations

In [None]:
def write_translations(f):
    """Write `model_id`, `model_revision` and all translation tables to `f` as JSON.

    The layout is one entry per line so the file diffs well: a header with
    the two model fields, then every key of `new_translations` that does not
    start with "model_" (in sorted order), each as an object whose entries
    are sorted by source text.

    Every key and value is serialised with `json.dumps`, so quotes,
    backslashes and control characters in the model id, revision or table
    names cannot produce invalid JSON.

    Args:
        f: a text stream with a `write(str)` method.
    """
    f.write("{\n")
    f.write(f"\"model_id\":{json.dumps(model_id)},\n")
    f.write(f"\"model_revision\":{json.dumps(model_revision)}")
    for st_key in sorted(x for x in new_translations if not x.startswith("model_")):
        f.write(f",\n{json.dumps(st_key)}:{{\n")
        translations = new_translations[st_key]
        # Written entry by entry so the whole table is never held as one string.
        separator = ""
        for s in sorted(translations):
            f.write(f"{separator}{json.dumps(s)}: {json.dumps(translations[s])}")
            separator = ",\n"
        f.write("}")
    f.write("}\n")
    
# write_translations(sys.stdout)

In [None]:
# Write to a temporary sibling file and then swap it into place, so a failure
# part-way through cannot leave a truncated translations file behind.
# The encoding matches the utf8 decoding used when the file is read.
tmp_json_path = output_json_path + ".tmp"
with open(tmp_json_path, "wt", encoding="utf8") as f:
    write_translations(f)
os.replace(tmp_json_path, output_json_path)

In [None]:
# List the data directory (shell command) to inspect the files in it.
!ls -al ../data

In [None]:
# Read the saved file back to confirm it parses as JSON, then report the
# number of entries per table. The context manager closes the file handle.
with open(output_json_path, "rt", encoding="utf8") as f:
    new_json = json.load(f)
print(len(new_json["akk_to_en"]))
print(len(new_json["sux_to_en"]))