# Train Translator

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os, datetime
import json
import torch
import random
import glob
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset, DatasetDict

In [2]:
import cdli
import languages

In [3]:
def get_finetune_model_id(model_id):
    model_dir = f"../results/{model_id}"
    checkpoints = [(os.path.abspath(x), int(os.path.split(x)[1].split("-")[1])) for x in glob.glob(f"{model_dir}/checkpoint-*")]
    checkpoints = sorted(checkpoints, key=lambda x: x[1])[-1]
    return checkpoints[0]

In [4]:
os.environ["WANDB_NOTEBOOK_NAME"] = "TrainTranslator.ipynb"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

source_langs = set(["akk"])

# target_langs = set(["en", "it", "es", "fr", "de"])
target_langs = set(["en"])

base_model_id = "t5-base"
finetune_model_id = None
# finetune_model_id = get_finetune_model_id("t5-base-p-akksux-en-20220722-173018")

model_max_length = 512
batch_size = 8 if os.path.basename(base_model_id).startswith("t5-base") else 128

num_train_epochs = 30

is_bi = False
use_paragraphs = True
use_lines = True
is_finetune = finetune_model_id is not None and len(finetune_model_id) > 1

In [5]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
flags = ""
suffix = ""
if is_bi:
    flags += "-bi"
if use_paragraphs:
    flags += "-p"
if use_lines:
    flags += "-l"
if is_finetune:
    flags += "-f"
    suffix += f"-{os.path.basename(os.path.split(finetune_model_id)[0])}-{os.path.basename(finetune_model_id)}"
model_id = f"{os.path.basename(base_model_id)}{flags}-{''.join(sorted(list(source_langs)))}-{''.join(sorted(list(target_langs)))}-{date_id}{suffix}"
model_id

't5-base-p-l-akk-en-20220725-224830'

In [6]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x7fc7951fed10>)

In [7]:
!nvidia-smi

Mon Jul 25 22:48:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 30%   50C    P8    31W / 350W |    168MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load Training Data

In [8]:
avg_src_chars_per_token = 1.8713256996006793
avg_tgt_chars_per_token = 2.577806274115267

In [9]:
def get_prefix(src_lang, tgt_lang):
    s = languages.all_languages[src_lang]
    t = languages.all_languages[tgt_lang]
    return f"translate {s} to {t}: "
    
get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [10]:
publications = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


In [11]:
len(publications), "publications"

(134712, 'publications')

In [12]:
def target_ok(target_text):
    if len(target_text) == 0:
        return False
    if len(set(target_text.replace(" ", ""))) < 2:
        return False
    return True
    

def test_target_ok(text):
    ok = target_ok(text)
    print(ok, repr(text))
    
test_target_ok("")
test_target_ok(" ")
test_target_ok("xx xxx x")
test_target_ok(".. . .. ")
test_target_ok("Hi")

False ''
False ' '
False 'xx xxx x'
False '.. . .. '
True 'Hi'


In [13]:
wmax_num_tokens = model_max_length - 192

def wrap_paragraph(paragraph, lines, src_lang, tgt_lang):
    ptag, pline_start_index, pline_end_index = paragraph
    wline_ranges = []
    wline_tok_len = 0.0
    
    def start_new_line(pline_index):
#         print("start", pline_index)
        wline_ranges.append((pline_index, pline_index + 1))
        
    def append_line(pline_index):
#         print("append", pline_index)
        r = wline_ranges[-1]
        if r[1] == pline_index:
            wline_ranges[-1] = (r[0], r[1] + 1)
        else:
            print(f"Missing line: got {pline_index}, expected {r[1]}: {wline_ranges}")

    for pline_index in range(pline_start_index, pline_end_index):
        pline_num_toks = len(lines[pline_index].text) / avg_src_chars_per_token + 1.0
        if len(wline_ranges) == 0 or (wline_tok_len + pline_num_toks > wmax_num_tokens):
            start_new_line(pline_index)
            wline_tok_len = 0.0
        else:
            append_line(pline_index)
        wline_tok_len += pline_num_toks
    return wline_ranges



In [14]:
dataset_index = json.load(open("../data/dataset_index.json", "rt"))
print(dataset_index.keys())

dict_keys(['akk', 'sux'])


In [15]:
print(len(dataset_index["akk"]["train"]), "akk train")
print(len(dataset_index["akk"]["test"]), "akk test")
print(len(dataset_index["sux"]["train"]), "sux train")
print(len(dataset_index["sux"]["test"]), "sux test")

870 akk train
108 akk test
3753 sux train
396 sux test


In [16]:
def get_pubs_targets(dataset):
    new_sourceandtargets = []

    added_sources = set()

    def add_line_ranges(area, b, e):
    #                     print("-"*50)
        ls = " ".join([x.text for x in area.lines[b:e]])
        ls = " ".join(ls.split(" "))
        prefixed_ls = st_prefix + ls
        if prefixed_ls in added_sources:
            return
        lt = " ".join([(x.languages[t] if t in x.languages else "") for x in area.lines[b:e]])
        lt = " ".join(lt.split(" "))
        lt = languages.replace_unsupported(lt)
        if not target_ok(lt):
            return
    #                     print(ls)
    #                     print(lt)
        added_sources.add(prefixed_ls)
        new_sourceandtargets.append((prefixed_ls, lt))
        if is_bi:
            new_sourceandtargets.append((ts_prefix + lt, ls))

    for s in source_langs:
        pub_index = dataset_index[s][dataset]
        for t in target_langs:
            print("Preparing", s, "to", t)            
            st_prefix = get_prefix(s, t)
            ts_prefix = get_prefix(t, s)
            for pub in tqdm([p for p in publications if p.language==s and p.id in pub_index]):
                for area in pub.text_areas:
                    if not any(x for x in area.lines if t in x.languages):
                        continue
                    if use_paragraphs:
                        paragraphs = area.lines_to_paragraphs(s)
                        line_ranges = []                
                        for p in paragraphs:                    
                            wlines = wrap_paragraph(p, area.lines, s, t)
                            line_ranges.extend(wlines)
        #                 print("="*50, len(area.lines))
                        for b, e in line_ranges:
                            add_line_ranges(area, b, e)
                    if use_lines:
                        for i, _ in enumerate(area.lines):
                            add_line_ranges(area, i, i + 1)
    random.shuffle(new_sourceandtargets)
    return Dataset.from_dict({"source": [x[0] for x in new_sourceandtargets], "target": [x[1] for x in new_sourceandtargets]})

train_dataset = get_pubs_targets("train")
test_dataset = get_pubs_targets("test")
print(len(train_dataset), "train")
print(len(test_dataset), "test")

Preparing akk to en


  0%|          | 0/874 [00:00<?, ?it/s]

Preparing akk to en


  0%|          | 0/108 [00:00<?, ?it/s]

16754 train
1728 test


In [17]:
train_dataset

Dataset({
    features: ['source', 'target'],
    num_rows: 16754
})

In [18]:
test_dataset

Dataset({
    features: ['source', 'target'],
    num_rows: 1728
})

In [19]:
test_dataset[1120:1200]

{'source': ['translate Akkadian to English: i-szar-li-im',
  'translate Akkadian to English: sza-ar-ka-szu-um',
  'translate Akkadian to English: [{d}suen]',
  'translate Akkadian to English: sza a-ah na-ri-im',
  'translate Akkadian to English: s,al-lam {d}utu _en zimbir{ki} a-szib-bi e2-babbar-ra s,al-lam {d}utu _en zimbir{ki} a-szib-bi e2-babbar-ra',
  'translate Akkadian to English: su4-nu-ti',
  'translate Akkadian to English: [u3 {d}inanna?]',
  'translate Akkadian to English: _kur_ asz-ma ki-sir-ti',
  'translate Akkadian to English: {d}da-gan',
  'translate Akkadian to English: [AN]-nu-ba-ni-ni',
  'translate Akkadian to English: _1(u) sze gur_',
  'translate Akkadian to English: ki-ma sza-ma?!',
  'translate Akkadian to English: a-na ti-li u3 ka-ar-mi',
  'translate Akkadian to English: [ha]-a-ra-am _dumu_ a-ta-ni-im',
  'translate Akkadian to English: i-na ma-asz-ha-li-im ta-sza-ah-ha-al',
  'translate Akkadian to English: isz-ku8-na-ma',
  'translate Akkadian to English: wa-

In [20]:
translations = DatasetDict({"train": train_dataset, "test": test_dataset})
translations

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 16754
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 1728
    })
})

In [21]:
original_tests = translations["test"]
original_tests

Dataset({
    features: ['source', 'target'],
    num_rows: 1728
})

In [22]:
test_starts = [f"translate {languages.all_languages[s]} to " for s in source_langs]
print(test_starts)

def should_test(t):
    return any(t["source"].startswith(s) for s in test_starts)

translations["test"] = original_tests.filter(should_test)
translations["test"]



['translate Akkadian to ']


  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 1728
})

## Tokenize the Data

In [23]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [24]:
tokenizer.model_max_length

512

In [25]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [26]:
ccc = 0
sum_src_chars_per_token = 0.0
num_src_chars_per_token = 0
sum_tgt_chars_per_token = 0.0
num_tgt_chars_per_token = 0

def preprocess_function(examples):
    global ccc, sum_src_chars_per_token, sum_tgt_chars_per_token, num_src_chars_per_token, num_tgt_chars_per_token
#     print(examples)
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    nexamples = len(inputs)
    for i in range(nexamples):
        nchar = len(inputs[i])
        ntoks = len(model_inputs["input_ids"][i])
        if ntoks > 0:
            sum_src_chars_per_token += nchar / ntoks
            num_src_chars_per_token += 1
        nchar = len(targets[i])
        ntoks = len(model_inputs["labels"][i])
        if ntoks > 0:
            sum_tgt_chars_per_token += nchar / ntoks
            num_tgt_chars_per_token += 1
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])
        nchar = len(targets[0])
        ntoks = len(model_inputs["labels"][0])
        print(nchar, ntoks, nchar / ntoks)

    return model_inputs

tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations

  0%|          | 0/17 [00:00<?, ?ba/s]

[13959, 4823, 1258, 8603, 12, 1566, 10, 3, 2, 26, 2, 7, 76, 35, 18, 76, 357, 4663, 18, 7, 15, 7412, 15, 40, 18, 40, 23, 3, 834, 26, 440, 76, 4663, 834, 206, 18, 450, 7412, 2, 26, 2, 76, 17, 76, 1]
[180, 3851, 18, 302, 7999, 6, 520, 13, 10037, 18, 134, 1483, 3198, 6, 1]
31 15 2.066666666666667


  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16754
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1728
    })
})

In [27]:
avg_src_chars_per_token = sum_src_chars_per_token / num_src_chars_per_token
avg_tgt_chars_per_token = sum_tgt_chars_per_token / num_tgt_chars_per_token
print("avg_src_chars_per_token", "=", avg_src_chars_per_token)
print("avg_tgt_chars_per_token", "=", avg_tgt_chars_per_token)

avg_src_chars_per_token = 1.9792946054117202
avg_tgt_chars_per_token = 2.77862621552965


In [28]:
tokenized_translations["train"] = tokenized_translations["train"].remove_columns(["source", "target"])
tokenized_translations["test"] = tokenized_translations["test"].remove_columns(["source", "target"])
tokenized_translations

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16754
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1728
    })
})

In [29]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_translations["train"]])
target_max_length = max([len(x["labels"]) for x in tokenized_translations["train"]])
source_max_length, target_max_length

(510, 305)

In [30]:
tokenized_translations["train"][0]["labels"][:10]

[180, 3851, 18, 302, 7999, 6, 520, 13, 10037, 18]

## Load the Model

In [31]:
model = AutoModelForSeq2SeqLM.from_pretrained(finetune_model_id if is_finetune else base_model_id, 
                                              max_length=model_max_length)

In [32]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "max_length": 512,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
 

## Train

In [33]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# data_collator

In [34]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="epoch",
    learning_rate=2*2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using amp half precision backend


In [35]:
trainer.train()

***** Running training *****
  Num examples = 16754
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 62850
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mpraeclarum[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,3.0373,2.579712
2,2.6051,2.298664
3,2.36,2.154751
4,2.1704,2.061167
5,1.9921,2.003892
6,1.8862,1.963448
7,1.7719,1.937001
8,1.6988,1.906494
9,1.5673,1.905355
10,1.5239,1.896165


Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-500
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-500/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-1000
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-1000/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpo

Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-7000/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-7000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-7000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-5500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-7500
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-7500/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-7500/tokenizer_config.json
Special tokens file saved in ../results/t5-base

Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-13000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-11500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-13500
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-13500/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-13500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-13500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-13500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-12000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-14000
Configuration saved in ../resu

Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-19500/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-19500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-19500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-19500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-18000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-20000
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-20000/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-20000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-20000/tokenizer_config.json
Special tokens file saved in ../result

Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-25500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-24000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-26000
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-26000/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-26000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-26000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-26000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-24500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-26500
Configuration saved in ../resu

Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-32000/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-32000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-32000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-32000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-30500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-32500
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-32500/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-32500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-32500/tokenizer_config.json
Special tokens file saved in ../result

Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-38000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-36500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-38500
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-38500/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-38500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-38500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-38500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-37000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-39000
Configuration saved in ../resu

Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-44500/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-44500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-44500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-44500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-43000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-45000
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-45000/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-45000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-45000/tokenizer_config.json
Special tokens file saved in ../result

Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-50500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-49000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-51000
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-51000/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-51000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-51000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-51000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-49500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-51500
Configuration saved in ../resu

Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-57000/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-57000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-57000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-57000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-55500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-57500
Configuration saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-57500/config.json
Model weights saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-57500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-p-l-akk-en-20220725-224830/checkpoint-57500/tokenizer_config.json
Special tokens file saved in ../result

TrainOutput(global_step=62850, training_loss=1.4417311488282805, metrics={'train_runtime': 8234.1664, 'train_samples_per_second': 61.041, 'train_steps_per_second': 7.633, 'total_flos': 1.0294543513534464e+17, 'train_loss': 1.4417311488282805, 'epoch': 30.0})

## Sample

In [36]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [37]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x7fc612096e30>

In [38]:
pipeline("translate English to French: hello my name is Frank")

[{'translation_text': 'Bonjour, mon nom est Frank.'}]

In [39]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

translate Akkadian to English: a-na ti-lu-ti-szu-nu
--------------------------------------------------------------------------------
and to their assistance


In [40]:
def translate(text):
    return pipeline(text)

translate(source_test)

[{'translation_text': 'for their cultic duties'}]

In [41]:
tests = original_tests
def sample(num_samples=100):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()

------------------------------------------------
QUERY  translate Akkadian to English: a-na ti-lu-ti-szu-nu
TARGET and to their assistance
PRED   for their cultic duties
------------------------------------------------
QUERY  translate Akkadian to English: dumu ha-ri-ia#
TARGET son of Hariya,
PRED   son of Hariya,
------------------------------------------------
QUERY  translate Akkadian to English: a-na
TARGET For
PRED   to
------------------------------------------------
QUERY  translate Akkadian to English: _musz igi-min_
TARGET two-faced serpent.
PRED   the eyewitness
------------------------------------------------
QUERY  translate Akkadian to English: i-na _e2-gal_-li-ia
TARGET "In my palace"
PRED   in my palace
------------------------------------------------
QUERY  translate Akkadian to English: ma-gi-ir te-es3-li-ti-im
TARGET who is agreeable to petition,
PRED   a penalty of destitution
------------------------------------------------
QUERY  translate Akkadian to English: _lug

------------------------------------------------
QUERY  translate Akkadian to English: dal-ba-at{ki}
TARGET of Dilbat,
PRED   of Dalbat,
------------------------------------------------
QUERY  translate Akkadian to English: u3 u3-sza-ap-szu-t,u2
TARGET or shall cause it to be effaced
PRED   and he shall remove,
------------------------------------------------
QUERY  translate Akkadian to English: ka-an-ku il-qu2-nim!-ma
TARGET sealed, they took and
PRED   the potholder took away, and
------------------------------------------------
QUERY  translate Akkadian to English: [a]-na-ku u2-sza-aq-t,i3-il
TARGET I had slaughtered;
PRED   I have sent. They shall remove
------------------------------------------------
QUERY  translate Akkadian to English: i-din-{d}utu _sanga_ {d}nin-in-si-na _dumu_ ku3-{d}nin-in-si-na _ARAD_ a-bi-e-szu-uh-ke4
TARGET Iddin-Shamash, sanga priest of the goddess Ninisina, son of Ku-Ninisina, servant of Abi-eshuh.
PRED   Iddin-Shamash, sanga priest of Nininsina, son o

------------------------------------------------
QUERY  translate Akkadian to English: _ARAD2_ ha-ia-a-bu-um
TARGET servant of Haya-abum,
PRED   servant of Haia-abum.
------------------------------------------------
QUERY  translate Akkadian to English: {d}a-ia-lum _lugal_ a-ba-at-tim{ki}
TARGET Aialum, king of Abattum
PRED   Ilum, king of the land of
------------------------------------------------
QUERY  translate Akkadian to English: _e2-gal_ {disz}asz-pap-a _man szu2_ _man kur_ asz _a_ geszkim-masz _man kur_ asz-ma
TARGET (Property of) the palace of Assurnasirpal, king of everything, king of Assyria, son of Tukulti-Ninurta, king of Assyria.
PRED   Palace of Assurnasirpal, king of the universe, king of Assyria, son of Geshkimash, king of Assyria.
------------------------------------------------
QUERY  translate Akkadian to English: lugal:dingir-kalam
TARGET of the god Bēl-matim,
PRED   Shar-ilam,
------------------------------------------------
QUERY  translate Akkadian to English: 

## Save to Huggingface

In [42]:
model_path = os.path.abspath(f"/home/fak/nn/Data/generated/cuneiform/{model_id}-fullytrained")
trainer.save_model(model_path)
model_path

Saving model checkpoint to /home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained
Configuration saved in /home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/config.json
Model weights saved in /home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/pytorch_model.bin
tokenizer config file saved in /home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/tokenizer_config.json
Special tokens file saved in /home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/special_tokens_map.json


'/home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained'

In [43]:
tokenizer.save_pretrained(model_path)

tokenizer config file saved in /home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/tokenizer_config.json
Special tokens file saved in /home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/special_tokens_map.json


('/home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/tokenizer_config.json',
 '/home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/special_tokens_map.json',
 '/home/fak/nn/Data/generated/cuneiform/t5-base-p-l-akk-en-20220725-224830-fullytrained/tokenizer.json')