# Train Translator

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [52]:
import sys, os, datetime
import json
import torch
import random
import glob
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset
import cdli

In [66]:
!ls "../results/t5-base-p-akksux-en-20220722-173018"

checkpoint-240500  checkpoint-241000  checkpoint-241500


In [76]:
def get_finetune_model_id(model_id):
    model_dir = f"../results/{model_id}"
    checkpoints = [(os.path.abspath(x), int(os.path.split(x)[1].split("-")[1])) for x in glob.glob(f"{model_dir}/checkpoint-*")]
    checkpoints = sorted(checkpoints, key=lambda x: x[1])[-1]
#     print(checkpoints)
    return checkpoints[0]
    
# get_finetune_model_id("t5-base-p-akksux-en-20220722-173018")

In [77]:
os.environ["WANDB_NOTEBOOK_NAME"] = "TrainTranslator.ipynb"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

source_langs = set(["akk"])

# target_langs = set(["en", "it", "es", "fr", "de"])
target_langs = set(["en"])

base_model_id = "t5-base"
finetune_model_id = None
finetune_model_id = get_finetune_model_id("t5-base-p-akksux-en-20220722-173018")

model_max_length = 512
batch_size = 8 if os.path.basename(base_model_id).startswith("t5-base") else 128

num_train_epochs = 30

is_bi = False
paragraphs = True
is_finetune = finetune_model_id is not None and len(finetune_model_id) > 1

In [79]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
flags = ""
suffix = ""
if is_bi:
    flags += "-bi"
if paragraphs:
    flags += "-p"
if is_finetune:
    flags += "-f"
    suffix += f"-{os.path.basename(os.path.split(finetune_model_id)[0])}-{os.path.basename(finetune_model_id)}"
model_id = f"{os.path.basename(base_model_id)}{flags}-{''.join(sorted(list(source_langs)))}-{''.join(sorted(list(target_langs)))}-{date_id}{suffix}"
model_id

't5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500'

In [80]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x7f5eea740580>)

In [73]:
!nvidia-smi

Mon Jul 25 17:01:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
|  0%   49C    P8    31W / 350W |    168MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load Training Data

In [30]:
avg_src_chars_per_token = 1.8713256996006793
avg_tgt_chars_per_token = 2.577806274115267

In [31]:
lang_full = {
    "akk": "Akkadian",
    "elx": "Elamite",
    "sux": "Sumerian",
    "akkts": "Akkadian",
    "elxts": "Elamite",
    "suxts": "Sumerian",
    "en": "English",
    "it": "Italian",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
}

In [32]:
def get_prefix(src_lang, tgt_lang):
    s = lang_full[src_lang]
    t = lang_full[tgt_lang]
    return f"translate {s} to {t}: "
    
get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [33]:
replacements = [
    ("ā", "a"),
    ("Ā", "a"),
    ("ḫ", "h"),
    ("Ḫ", "H"),
    ("ī", "i"),
    ("Ī", "I"),
#     ("î", "i"),
#     ("Î", "I"),
    ("ř", "r"),
    ("Ř", "R"),
    ("š", "sh"),
    ("Š", "Sh"),
    ("ṣ", "sh"),
    ("Ṣ", "Sh"),
    ("ṭ", "t"),
    ("Ṭ", "T"),
    ("ū", "u"),
    ("Ū", "U"),
]
def replace_unsupported(text):
    r = text
    for s, t in replacements:
        r = r.replace(s, t)
    return r

In [15]:
publications = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


In [34]:
len(publications), "publications"

(134712, 'publications')

In [35]:
def target_ok(target_text):
    if len(target_text) == 0:
        return False
    if len(set(target_text.replace(" ", ""))) < 2:
        return False
    return True
    

def test_target_ok(text):
    ok = target_ok(text)
    print(ok, repr(text))
    
test_target_ok("")
test_target_ok(" ")
test_target_ok("xx xxx x")
test_target_ok(".. . .. ")
test_target_ok("Hi")

False ''
False ' '
False 'xx xxx x'
False '.. . .. '
True 'Hi'


In [36]:
new_sourceandtargets = []

added_sources = set()

wmax_num_tokens = model_max_length - 192

def wrap_paragraph(paragraph, lines, src_lang, tgt_lang):
    ptag, pline_start_index, pline_end_index = paragraph
    wline_ranges = []
    wline_tok_len = 0.0
    
    def start_new_line(pline_index):
#         print("start", pline_index)
        wline_ranges.append((pline_index, pline_index + 1))
        
    def append_line(pline_index):
#         print("append", pline_index)
        r = wline_ranges[-1]
        if r[1] == pline_index:
            wline_ranges[-1] = (r[0], r[1] + 1)
        else:
            print(f"Missing line: got {pline_index}, expected {r[1]}: {wline_ranges}")

    for pline_index in range(pline_start_index, pline_end_index):
        pline_num_toks = len(lines[pline_index].text) / avg_src_chars_per_token + 1.0
        if len(wline_ranges) == 0 or (wline_tok_len + pline_num_toks > wmax_num_tokens):
            start_new_line(pline_index)
            wline_tok_len = 0.0
        else:
            append_line(pline_index)
        wline_tok_len += pline_num_toks
    return wline_ranges

def add_line_ranges(area, b, e):
#                     print("-"*50)
    ls = " ".join([x.text for x in area.lines[b:e]])
    ls = " ".join(ls.split(" "))
    prefixed_ls = st_prefix + ls
    if prefixed_ls in added_sources:
        return
    lt = " ".join([(x.languages[t] if t in x.languages else "") for x in area.lines[b:e]])
    lt = " ".join(lt.split(" "))
    lt = replace_unsupported(lt)
    if not target_ok(lt):
        return
#                     print(ls)
#                     print(lt)
    added_sources.add(prefixed_ls)
    new_sourceandtargets.append((prefixed_ls, lt))
    if is_bi:
        new_sourceandtargets.append((ts_prefix + lt, ls))



for s in source_langs:    
    for t in target_langs:
        print("Preparing", s, "to", t)
        st_prefix = get_prefix(s, t)
        ts_prefix = get_prefix(t, s)
        for pub in tqdm([p for p in publications if p.language==s]):
            for area in pub.text_areas:
                if not any(x for x in area.lines if t in x.languages):
                    continue
                if paragraphs:
                    paragraphs = area.lines_to_paragraphs(s)
                    line_ranges = []                
                    for p in paragraphs:                    
                        wlines = wrap_paragraph(p, area.lines, s, t)
                        line_ranges.extend(wlines)
    #                 print("="*50, len(area.lines))
                    for b, e in line_ranges:
                        add_line_ranges(area, b, e)
                for i, _ in enumerate(area.lines):
                    add_line_ranges(area, i, i + 1)

random.shuffle(new_sourceandtargets)
new_all_translations = Dataset.from_dict({"source": [x[0] for x in new_sourceandtargets], "target": [x[1] for x in new_sourceandtargets]})
new_all_translations

Preparing akk to en


  0%|          | 0/21820 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 18189
})

In [37]:
all_translations = new_all_translations

In [38]:
all_translations[1120:1200]

{'source': ['translate Akkadian to English: {d}ag _uru3_ ku-dur2',
  'translate Akkadian to English: [_a_-szu2 sza2 {disz}x]-x-uri3# {disz}en-szu2-nu#',
  'translate Akkadian to English: [{(d)}na-ra-am-{d}]suen _lugal#_ [ki-ib-ra]-tim# ar#-ba#-im# i3-nu HAR-sza-ma-at{ki} en-a#-ra#-am u3 _am_ in qab2!(DA)-[la2]-NI# ti-ba-ar# _sa-tu_-im su4-ma u-sa-am-qi2-it-su tam2-si4-il-su ib-ni-ma a-na {d}en-lil2 a-bi2#-su _a mu-ru_ sza _dub#_ su4-a u-sa-sa3*-ku#-ni {d}en-lil2 u3 {d}utu _suhusz_-su li-su2-ha u3 _sze-numun_-su li-il-qu3-tam2',
  'translate Akkadian to English: [unu]{ki}',
  'translate Akkadian to English: szum-ma i-na _tur3_',
  'translate Akkadian to English: li-il-qu3#-ta2#',
  'translate Akkadian to English: _u4-mesz_-szu i-gam-ma-ru',
  'translate Akkadian to English: 1(disz) _kusz3 gal_-tu4',
  'translate Akkadian to English: t,up-pu sza szi2-ma-at a-gu5-a',
  'translate Akkadian to English: in 1(disz) _u4_',
  'translate Akkadian to English: mi-na-a ni-i-nu sza2 ni-ib-nu-u2 nu-u

In [39]:
translations = all_translations.train_test_split(test_size=0.1)
translations

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 16370
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 1819
    })
})

In [40]:
original_tests = translations["test"]
original_tests

Dataset({
    features: ['source', 'target'],
    num_rows: 1819
})

In [41]:
test_starts = [f"translate {lang_full[s]} to " for s in source_langs]
print(test_starts)

def should_test(t):
    return any(t["source"].startswith(s) for s in test_starts)

translations["test"] = original_tests.filter(should_test)
translations["test"]

['translate Akkadian to ']


  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 1819
})

## Tokenize the Data

In [42]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [43]:
tokenizer.model_max_length

512

In [44]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [45]:
ccc = 0
sum_src_chars_per_token = 0.0
num_src_chars_per_token = 0
sum_tgt_chars_per_token = 0.0
num_tgt_chars_per_token = 0

def preprocess_function(examples):
    global ccc, sum_src_chars_per_token, sum_tgt_chars_per_token, num_src_chars_per_token, num_tgt_chars_per_token
#     print(examples)
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    nexamples = len(inputs)
    for i in range(nexamples):
        nchar = len(inputs[i])
        ntoks = len(model_inputs["input_ids"][i])
        if ntoks > 0:
            sum_src_chars_per_token += nchar / ntoks
            num_src_chars_per_token += 1
        nchar = len(targets[i])
        ntoks = len(model_inputs["labels"][i])
        if ntoks > 0:
            sum_tgt_chars_per_token += nchar / ntoks
            num_tgt_chars_per_token += 1
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])
        nchar = len(targets[0])
        ntoks = len(model_inputs["labels"][0])
        print(nchar, ntoks, nchar / ntoks)

    return model_inputs

tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations

  0%|          | 0/17 [00:00<?, ?ba/s]

[13959, 4823, 1258, 8603, 12, 1566, 10, 3, 834, 76, 591, 209, 599, 26, 159, 172, 61, 18, 157, 265, 834, 3, 23, 18, 52, 76, 4663, 18, 6306, 76, 115, 908, 1]
[8, 166, 239, 6, 3, 88, 56, 2058, 117, 1]
29 10 2.9


  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16370
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1819
    })
})

In [46]:
avg_src_chars_per_token = sum_src_chars_per_token / num_src_chars_per_token
avg_tgt_chars_per_token = sum_tgt_chars_per_token / num_tgt_chars_per_token
print("avg_src_chars_per_token", "=", avg_src_chars_per_token)
print("avg_tgt_chars_per_token", "=", avg_tgt_chars_per_token)

avg_src_chars_per_token = 1.9726247610618712
avg_tgt_chars_per_token = 2.7860413785145792


In [47]:
tokenized_translations["train"] = tokenized_translations["train"].remove_columns(["source", "target"])
tokenized_translations["test"] = tokenized_translations["test"].remove_columns(["source", "target"])
tokenized_translations

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16370
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1819
    })
})

In [48]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_translations["train"]])
target_max_length = max([len(x["labels"]) for x in tokenized_translations["train"]])
source_max_length, target_max_length

(510, 305)

In [49]:
tokenized_translations["train"][0]["labels"][:10]

[8, 166, 239, 6, 3, 88, 56, 2058, 117, 1]

## Load the Model

In [81]:
model = AutoModelForSeq2SeqLM.from_pretrained(finetune_model_id if len(finetune_model_id)>0 else base_model_id, 
                                              max_length=model_max_length)

In [82]:
model.config

T5Config {
  "_name_or_path": "/home/fak/Projects/CuneiformTranslators/results/t5-base-p-akksux-en-20220722-173018/checkpoint-241500",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "max_length": 512,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping":

## Train

In [83]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# data_collator

In [84]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="epoch",
    learning_rate=2*2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using amp half precision backend


In [None]:
trainer.train()

***** Running training *****
  Num examples = 16370
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 61410
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mpraeclarum[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.0941,0.792806
2,1.0043,0.788036
3,0.9573,0.78501
4,0.868,0.782949
5,0.8268,0.773469
6,0.7644,0.773726
7,0.7209,0.771519
8,0.6868,0.770516
9,0.6681,0.765769
10,0.6034,0.768286


Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-500
Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-500/config.json
Model weights saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-1000
Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-b

Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-5000
Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-5000/config.json
Model weights saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-5000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-3500] due to args.save_total_limit
Saving model checkpoint to ../results/

tokenizer config file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-9000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-9000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-7500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-9500
Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-9500/config.json
Model weights saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-9500/pytorch_model.bin
tokenizer config file saved in ../resu

Deleting older checkpoint [../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-11500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-13500
Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-13500/config.json
Model weights saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-13500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-13500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-13500/special_tokens_map.json
Deleting older checkpoint [../re

Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-17500/config.json
Model weights saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-17500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-17500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-17500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-16000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-18000
Configuration saved in ../result

tokenizer config file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-21500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-21500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-20000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-22000
Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-22000/config.json
Model weights saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-22000/pytorch_model.bin
tokenizer config file saved in .

Deleting older checkpoint [../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-24000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-26000
Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-26000/config.json
Model weights saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-26000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-26000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-26000/special_tokens_map.json
Deleting older checkpoint [../re

Configuration saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-30000/config.json
Model weights saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-30000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-30000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-30000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-28500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-f-akk-en-20220725-170404-t5-base-p-akksux-en-20220722-173018-checkpoint-241500/checkpoint-30500
Configuration saved in ../result

## Sample

In [40]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [33]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x7f40af0d6cb0>

In [34]:
pipeline("translate English to French: hello my name is Frank")

[{'translation_text': 'hu-mu-um dnanna-mu'}]

In [35]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

translate Sumerian to English: {d}nin#-[...] x x AN# [...]
--------------------------------------------------------------------------------
Ninhursag(?) ...


In [36]:
def translate(text):
    return pipeline(text)

translate(source_test)

[{'translation_text': 'Ningublaga, the lady of the mountain range, the mountain range of the mountain range, the mountain of the king, the mountain of the king, the mountain of the king, the mountain of the lands'}]

In [37]:
tests = original_tests
def sample(num_samples=100):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()

------------------------------------------------
QUERY  translate English to Sumerian: The deliberations of the elders and juniors ... before ... the utterance(?) ...
TARGET [...] lu2#? banda3{+da} sza3 kusz2-u3-bi [...] igi-sze3 ka ba x [...]
PRED   ab-ba tur-tur-bi igi-sze3 [...] du11-ga x [...]
------------------------------------------------
QUERY  translate Sumerian to English: {d}nin#-[...] x x AN# [...]
TARGET Ninhursag(?) ...
PRED   Ningublaga, the lady of the mountain range, the mountain range of the mountain range, the mountain of the king, the mountain of the king, the mountain of the king, the mountain of the lands
------------------------------------------------
QUERY  translate Sumerian to English: [u4 ul4-li2-a-sze3] pa bi2-e3 %a an s,e-a-tim u2-szu-[pi2]
TARGET I (thus) made resplendent forever.
PRED   I made resplendent forever.
------------------------------------------------
QUERY  translate Akkadian to English: _e2-gal_ lu2-{d}marduk _lugal_
TARGET the palace of Amē

------------------------------------------------
QUERY  translate English to Akkadian: If a man ... grain ... gave, and then the grain into silver has converted, at harvest the grain and the interest on it, per 1 kor, 1 barig 4 ban2 he shall take
TARGET szum-ma _lu2 sze_-a-am a-na x x x id-di-in-ma _sze_-a-am a-na _ku3-babbar_ i-te-pu!-usz i-na e-bu-ri _sze_-a-am u3 _masz-bi 1(asz) gur 1(barig) 4(ban2)_ i-le-eq-qe2
PRED   szum-ma a-wi-lum _sze_ [...] id-di-in-ma _sze_ a-na _ku3-babbar_ u2-te-ep-pe2 i-na _buru14 sze_ u3 _masz2_-szu _1(asz) gur 1(barig) 4(ban2)_ i-le-qe2
------------------------------------------------
QUERY  translate Sumerian to English: abul 3(disz)-kam-ma ku4-ku4-da-ni-ta
TARGET When she entered the 3rd gate,
PRED   When she entered the 3rd gate,
------------------------------------------------
QUERY  translate Akkadian to English: _i7-i7_-szu
TARGET his rivers
PRED   his river
------------------------------------------------
QUERY  translate Sumerian to English: ki 

------------------------------------------------
QUERY  translate Sumerian to English: ki-la2-bi 1(disz) ma-na 1(u) 8(disz) gin2
TARGET Their weight: 1 ma-na, 18 shekels.
PRED   Its weight: 1 ma-na, 18 shekels.
------------------------------------------------
QUERY  translate Sumerian to English: lu2-mu igi im-mi-du8-am3
TARGET my own man saw it!
PRED   “Did you see my man?”
------------------------------------------------
QUERY  translate English to Sumerian: Shu-Erra, scribe, son of Ishar-beli, is your servant.
TARGET szu-er3-ra dub-sar dumu i-szar-be-li2 ARAD2-zu
PRED   szu-er3-ra dub-sar dumu i-szar-be-li2 ARAD2-zu
------------------------------------------------
QUERY  translate English to Sumerian: Basket-of-tablets: xxx xxx xxx
TARGET pisan-dub-ba nig2-ka9-ak sza3-bi su-ga sza3 ze2 i3-gal2
PRED   pisan-dub-ba nig2-ka9-ak sze erin2 szu-ku6 sza3 gu2-ab-baki
------------------------------------------------
QUERY  translate English to Sumerian: the first-born son
TARGET dumu-sag
PRE

## Save to Huggingface

In [38]:
model_path = os.path.abspath(f"/home/fak/nn/Data/generated/cuneiform/{model_id}-fullytrained")
trainer.save_model(model_path)
model_path

Saving model checkpoint to /home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained
Configuration saved in /home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/config.json
Model weights saved in /home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/pytorch_model.bin
tokenizer config file saved in /home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/tokenizer_config.json
Special tokens file saved in /home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/special_tokens_map.json


'/home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained'

In [39]:
tokenizer.save_pretrained(model_path)

tokenizer config file saved in /home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/tokenizer_config.json
Special tokens file saved in /home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/special_tokens_map.json


('/home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/tokenizer_config.json',
 '/home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/special_tokens_map.json',
 '/home/fak/nn/Data/generated/cuneiform/t5-base-bi-p-akksux-en-20220723-023520-fullytrained/tokenizer.json')