# Infer

In [7]:
import sys, os, datetime
import json
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TranslationPipeline
import cdli

In [None]:
# Turn off the tokenizers library's internal parallelism before any tokenizer
# is used, so that later process forks do not trigger its deadlock warning.
# os.environ is a mapping: it must be assigned by key (calling it raises
# TypeError and leaves the variable unset).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Hub identifier of the model/tokenizer pair loaded below.
model_id = "praeclarum/cuneiform"
# Revision passed to from_pretrained for both the tokenizer and the model, so
# both are loaded from the same pinned snapshot.
model_revision = "1ba74c8dcf6d1839b0a56589a53dfb5c20ca84f2"

In [10]:
# Cache of finished translations: transliterated line text -> English text.
# Re-running this cell discards everything translated so far.
translations = dict()

## Load the Model

In [3]:
# Load the tokenizer for the pinned model revision.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
)

# Keep the tokenizer's model_max_length under its own name and display it as
# the cell result.
model_max_length = tokenizer.model_max_length
model_max_length

256

In [4]:
# Load the seq2seq model at the same pinned revision as the tokenizer, passing
# the tokenizer's model_max_length as the model's max_length setting.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    revision=model_revision,
    max_length=tokenizer.model_max_length,
)

# Display the module tree as the cell result.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [5]:
# Bundle the tokenizer and model into a single callable translation pipeline.
pipeline = TranslationPipeline(tokenizer=tokenizer, model=model)

In [6]:
# Smoke test: translate one sample line. The prompt is the task prefix followed
# by the transliterated text (adjacent literals are joined into one string).
pipeline(
    "translate Akkadian to English: "
    "1(disz){d}szul3-ma-nu-_sag man gal?_-u2 _man_ dan-nu _man kisz_"
)

[{'translation_text': 'Shulmanu-sag, great king, strong king, king of the world'}]

## Load Transliterations to Translate

In [8]:
# Load the publications via the project-local cdli helper.
# NOTE(review): presumably this parses the CDLI ATF dump; confirm in cdli.get_atf.
publications = cdli.get_atf()
# Display how many publications were loaded.
len(publications)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████████████████████████████████████████████████████████████████████████████| 3541754/3541754 [00:14<00:00, 236391.15it/s]


134560

In [9]:
# Peek at one arbitrary entry to see how a Publication nests its text areas
# and lines.
publications[5000]

Publication('P005633', 'qpc', [TextArea('obverse', []), TextArea('column 1', [TextLine('1.', '1(N14) , GAR', {}), TextLine('2.', '1(N01@f) , KASZ~a SZE~a', {}), TextLine('3.', ', TU~b GU4 A', {})])])

In [None]:
# Translate every non-empty line of each Akkadian ("akk") publication.
# Results are stored in `translations` keyed by the line's text, so text that
# has already been translated is never sent through the pipeline again.
for publication in tqdm(publications):
    if publication.language != "akk":
        continue
    for area in publication.text_areas:
        for line in area.lines:
            # Skip empty lines and lines whose text is already cached.
            if len(line.text) == 0 or line.text in translations:
                continue
            query = "translate Akkadian to English: " + line.text
            outputs = pipeline(query)
            translations[line.text] = outputs[0]['translation_text']


  0%|          | 0/134560 [00:00<?, ?it/s]

In [22]:
# Report how many distinct lines have been translated so far, then display the
# first ten (source text, translation) pairs in insertion order.
print(len(translations), "translations")
list(translations.items())[:10]

208 translations


[('_1(u) 2(asz) gur sze_ hu-bu-ta-tum', '12 gur barley of the new-year'),
 ('u2-sze-ti-iq-ma', 'he caused to be evicted, and'),
 ('_1(asz) gur 1(barig) 4(ban2) masz2_ u2-s,a-ab',
  '1 gur 1 barig 4 ban2 of goat hair will be removed'),
 ('_ki_ {d}utu', 'with Shamash'),
 ('u3 gi-da-nu-um', 'and the scepter'),
 ('{disz}{d}be-el-gesztu-a-bu-szu', 'Bl-geshtu-abusshu'),
 ('_dumu_ i3-li2-ma-ra-x', 'son of Ili-mr...'),
 ('_szu ba#-an-ti#_', 'received'),
 ('a-na _masz-gan2_-nim#', 'for the harvest'),
 ('sze-am _i3-ag2-e_', 'the barley he shall weigh out')]