# Infer

In [None]:
import sys, os, datetime
import json
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TranslationPipeline
import cdli

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
model_id = "praeclarum/cuneiform"
model_revision = "1ba74c8dcf6d1839b0a56589a53dfb5c20ca84f2"

In [32]:
batch_size = 8
device = "cuda"

## Load Existing Translations

In [5]:
output_json_path = "../data/ml_translations.json"

In [6]:
input_json = json.loads(str(open(output_json_path, "rb").read(), "utf8"))

In [7]:
translations = input_json["akk_to_en"]
len(translations)

17497

In [8]:
def sample_translations():
    print(len(translations), "translations")
    print([(x, translations[x]) for x in translations][:10])

In [9]:
sample_translations()

17497 translations
[('_1(u) 2(asz) gur sze_ hu-bu-ta-tum', '12 gur barley of the new-year'), ('u2-sze-ti-iq-ma', 'he caused to be evicted, and'), ('_1(asz) gur 1(barig) 4(ban2) masz2_ u2-s,a-ab', '1 gur 1 barig 4 ban2 of goat hair will be removed'), ('_ki_ {d}utu', 'with Shamash'), ('u3 gi-da-nu-um', 'and the scepter'), ('{disz}{d}be-el-gesztu-a-bu-szu', 'Bl-geshtu-abusshu'), ('_dumu_ i3-li2-ma-ra-x', 'son of Ili-mr...'), ('_szu ba#-an-ti#_', 'received'), ('a-na _masz-gan2_-nim#', 'for the harvest'), ('sze-am _i3-ag2-e_', 'the barley he shall weigh out')]


## Load the Model

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=model_revision, device=device)
model_max_length = tokenizer.model_max_length
model_max_length

256

In [16]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, revision=model_revision, max_length=tokenizer.model_max_length)
model = model.to(device)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [20]:
pipeline = TranslationPipeline(model=model, tokenizer=tokenizer, device=0)

In [21]:
print(pipeline("translate Akkadian to English: 1(disz){d}szul3-ma-nu-_sag man gal?_-u2 _man_ dan-nu _man kisz_"))
print(pipeline("translate Akkadian to English: ra-bi-isz e-pu-usz"))

[{'translation_text': 'Shulmanu-sag, great king, strong king, king of the world'}]
[{'translation_text': 'I built in a grand manner'}]


## Load Transliterations to Translate

In [22]:
publications = cdli.get_atf()
len(publications)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3541754/3541754 [00:08<00:00, 405996.11it/s]


134560

In [23]:
publications[5000]

Publication('P005633', 'qpc', [TextArea('obverse', []), TextArea('column 1', [TextLine('1.', '1(N14) , GAR', {}), TextLine('2.', '1(N01@f) , KASZ~a SZE~a', {}), TextLine('3.', ', TU~b GU4 A', {})])])

In [None]:
batch = []
need_translation = [batch]
for p in publications:
    if p.language != "akk":
        continue
    for a in p.text_areas:
        for l in a.lines:
            if len(l.text) > 0 and l.text not in translations:
                batch.append(l.text)
                if len(batch) == batch_size:
                    batch = []
                    need_translation.append(batch)

for batch in tqdm(need_translation):
    qs = ["translate Akkadian to English: " + text for text in batch]
    r = pipeline(qs)
    for i, s in enumerate(batch):
        t = r[i]['translation_text']
#         print(i, s, t)
        translations[s] = t


  0%|          | 0/36505 [00:00<?, ?it/s]

In [34]:
sample_translations()

138478 translations
[('_1(u) 2(asz) gur sze_ hu-bu-ta-tum', '12 gur barley of the new-year'), ('u2-sze-ti-iq-ma', 'he caused to be evicted, and'), ('_1(asz) gur 1(barig) 4(ban2) masz2_ u2-s,a-ab', '1 gur 1 barig 4 ban2 of goat hair will be removed'), ('_ki_ {d}utu', 'with Shamash'), ('u3 gi-da-nu-um', 'and the scepter'), ('{disz}{d}be-el-gesztu-a-bu-szu', 'Bl-geshtu-abusshu'), ('_dumu_ i3-li2-ma-ra-x', 'son of Ili-mr...'), ('_szu ba#-an-ti#_', 'received'), ('a-na _masz-gan2_-nim#', 'for the harvest'), ('sze-am _i3-ag2-e_', 'the barley he shall weigh out')]


## Save the Translations

In [35]:
output_json = json.dumps({ "model_id": model_id, "model_revision": model_revision, "akk_to_en": translations })

In [36]:
with open(output_json_path, "wb") as f:
    f.write(bytes(output_json, "utf8"))