# Train Translator

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os, datetime
import json
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset

In [2]:
os.environ["WANDB_NOTEBOOK_NAME"] = "TrainTranslator.ipynb"

source_langs = set(["akk", "elx", "sux"])

# target_langs = set(["en", "it", "es", "fr", "de"])
target_langs = set(["en"])

base_model_id = "t5-base"

model_max_length = 256
batch_size = 32
num_train_epochs = 60

is_bi = False

In [3]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
flags = ""
if is_bi:
    flags += "-bi"
model_id = f"{base_model_id}{flags}-{''.join(sorted(list(source_langs)))}-{''.join(sorted(list(target_langs)))}-{date_id}"
model_id

't5-base-akkelxsux-en-20220719-204449'

In [4]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x7f24d9f9b0a0>)

In [5]:
!nvidia-smi

Tue Jul 19 20:44:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 30%   40C    P8    29W / 350W |    168MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load Training Data

In [6]:
lang_full = {
    "akk": "Akkadian",
    "elx": "Elamite",
    "sux": "Sumerian",
    "akkts": "Akkadian",
    "elxts": "Elamite",
    "suxts": "Sumerian",
    "en": "English",
    "it": "Italian",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
}

In [7]:
def get_prefix(src_lang, tgt_lang):
    s = lang_full[src_lang]
    t = lang_full[tgt_lang]
    return f"translate {s} to {t}: "
    
get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [8]:
translations = load_dataset("json", data_files="../data/translations.jsonl")
translations

Using custom data configuration default-1f050c38726e6453
Reusing dataset json (/home/fak/.cache/huggingface/datasets/json/default-1f050c38726e6453/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['p', 'a', 'l', 'sux', 'en', 'grc', 'fr', 'de', 'peo', 'grcts', 'akkts', 'es', 'elxts', 'elx', 'akk', 'qpn', 'peots', 'ug', 'ugts', 'it', 'suxts', 'qpnts', 'arc', 'arcts'],
        num_rows: 91566
    })
})

In [9]:
replacements = [
    ("ā", "a"),
    ("Ā", "a"),
    ("ḫ", "h"),
    ("Ḫ", "H"),
    ("ī", "i"),
    ("Ī", "I"),
#     ("î", "i"),
#     ("Î", "I"),
    ("ř", "r"),
    ("Ř", "R"),
    ("š", "sh"),
    ("Š", "Sh"),
    ("ṣ", "sh"),
    ("Ṣ", "Sh"),
    ("ṭ", "t"),
    ("Ṭ", "T"),
    ("ū", "u"),
    ("Ū", "U"),
]
def replace_unsupported(text):
    r = text
    for s, t in replacements:
        r = r.replace(s, t)
    return r

In [10]:
sourceandtargets = []
for s in source_langs:
    print("Preparing", s)
    for t in tqdm(target_langs):
        st_prefix = get_prefix(s, t)
        ts_prefix = get_prefix(t, s)
        for line in translations["train"]:
            ls = line[s]
            lt = line[t]
            if ls is not None and len(ls) > 0 and lt is not None and len(lt) > 0:
                lt = replace_unsupported(lt)
                if lt[-1] == "." or lt[-1] == "!" or lt[-1] == ";" or lt[-1] == ",":
                    lt = lt[:-1]
                sourceandtargets.append((st_prefix + ls, lt))
                if is_bi:
                    sourceandtargets.append((ts_prefix + lt, ls))
                
random.shuffle(sourceandtargets)
all_translations = Dataset.from_dict({"source": [x[0] for x in sourceandtargets], "target": [x[1] for x in sourceandtargets]})
all_translations

Preparing akk


  0%|          | 0/1 [00:00<?, ?it/s]

Preparing elx


  0%|          | 0/1 [00:00<?, ?it/s]

Preparing sux


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 89419
})

In [11]:
translations = all_translations.train_test_split(test_size=0.1)
translations

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 80477
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 8942
    })
})

In [12]:
original_tests = translations["test"]
original_tests

Dataset({
    features: ['source', 'target'],
    num_rows: 8942
})

In [13]:
test_starts = [f"translate {lang_full[s]} to " for s in source_langs]
print(test_starts)

def should_test(t):
    return any(t["source"].startswith(s) for s in test_starts)

translations["test"] = original_tests.filter(should_test)
translations["test"]



['translate Akkadian to ', 'translate Elamite to ', 'translate Sumerian to ']


  0%|          | 0/9 [00:00<?, ?ba/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 8942
})

## Tokenize the Data

In [14]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [15]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [16]:
ccc = 0

def preprocess_function(examples):
    global ccc
#     print(examples)
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])

    return model_inputs

tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations

  0%|          | 0/81 [00:00<?, ?ba/s]

[13959, 4823, 1258, 8603, 12, 1566, 10, 3, 122, 23, 18, 51, 173, 18, 2, 26, 2, 1635, 26, 1598, 1]
[3156, 51, 173, 18, 7286, 26, 1598, 1]


  0%|          | 0/9 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 80477
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8942
    })
})

In [17]:
tokenized_translations["train"].remove_columns(["source", "target"])
tokenized_translations["test"].remove_columns(["source", "target"])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8942
})

In [18]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_translations["train"]])
target_max_length = max([len(x["labels"]) for x in tokenized_translations["train"]])
source_max_length, target_max_length

(189, 108)

In [19]:
tokenized_translations["train"][0]["labels"][:10]

[3156, 51, 173, 18, 7286, 26, 1598, 1]

## Load the Model

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id, max_length=model_max_length)

In [21]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "max_length": 256,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
 

## Train

In [22]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# data_collator

In [23]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="epoch",
    learning_rate=2*2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using amp half precision backend


In [24]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 80477
  Num Epochs = 60
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 150900
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mpraeclarum[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.6621,2.420769
2,2.3359,2.141625
3,2.1459,1.992723
4,1.9952,1.89202
5,1.8734,1.810962
6,1.8046,1.752047
7,1.7105,1.706529
8,1.6429,1.665118
9,1.5748,1.636691
10,1.5184,1.61139


Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-500
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-500/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-1000
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-1000/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-202

Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-6500/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-6500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-6500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-7000
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-7000/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-7000/tokenizer_config.json
Special tokens file saved in 

Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-10500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-12500
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-12500/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-12500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-12500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-11000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGenerati

***** Running Evaluation *****
  Num examples = 8942
  Batch size = 32
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-18000
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-18000/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-18000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-18000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-18000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-16500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-18500
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-18500/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint

tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-23500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-23500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-22000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-24000
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-24000/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-24000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-24000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-24000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-22500] due to args.sav

Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-29500
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-29500/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-29500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-29500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-29500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-28000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-30000
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-30000/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-30000/pytorch_model.bin
tokenizer config file saved in ../results/t5-b

tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-35000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-35000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-33500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8942
  Batch size = 32
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-35500
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-35500/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-35500/pytorch_model.bin
t

tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-40500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-40500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-39000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-41000
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-41000/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-41000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-41000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-41000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-39500] due to args.sav

Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-46500
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-46500/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-46500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-46500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-46500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-45000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-47000
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-47000/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-47000/pytorch_model.bin
tokenizer config file saved in ../results/t5-b

tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-52000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-52000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-50500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-52500
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-52500/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-52500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-52500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-52500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-51000] due to args.sav

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8942
  Batch size = 32
Saving model checkpoint to ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-58000
Configuration saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-58000/config.json
Model weights saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-58000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-58000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-58000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-akkelxsux-en-20220719-204449/checkpoint-56500] due to args.save_total_limit
S

KeyboardInterrupt: 

## Sample

In [25]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [26]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x7f24781c8f70>

In [27]:
pipeline("translate English to French: hello my name is Frank")

[{'translation_text': '(Say) hi, my name is Frank'}]

In [28]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

translate Sumerian to English: {d}szul-gi
--------------------------------------------------------------------------------
Shulgi


In [29]:
def translate(text):
    return pipeline(text)

translate(source_test)

[{'translation_text': 'Shulgi'}]

In [30]:
tests = original_tests
def sample(num_samples=100):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()

------------------------------------------------
QUERY  translate Sumerian to English: {d}szul-gi
TARGET Shulgi
PRED   Shulgi
------------------------------------------------
QUERY  translate Sumerian to English: lugal ki-en-gi ki-uri
TARGET king of Sumer and Akkad
PRED   and king of Sumer and Akkad
------------------------------------------------
QUERY  translate Akkadian to English: ra-bi-isz e-pu-usz
TARGET and I rebuilt them in a grand fashion
PRED   I did a great job
------------------------------------------------
QUERY  translate Akkadian to English: _kiszib3_ qar-dum-{d}iszkur
TARGET seal of Qardum-Adad
PRED   Seal of Qardum-Adad
------------------------------------------------
QUERY  translate Akkadian to English: a-na har-ra-ni sza a-lak-ta-sza2 la ta-a-a-rat
TARGET to the route whose path is one of he who does not return
PRED   to the harrani drum of her ablutions ceased
------------------------------------------------
QUERY  translate Akkadian to English: u2-gal-lib2-ma kal

------------------------------------------------
QUERY  translate Sumerian to English: 8(asz@c)# kusz3#-numun# [sa2]
TARGET 8 seed-cubits squared:
PRED   8 seed-cubits squared:
------------------------------------------------
QUERY  translate Akkadian to English: i-na ne2-kur-ti _lu2_ ha-szi-[im{ki}]
TARGET in hostilities with the Man of Hashshum
PRED   In the midst of the Hashumu people
------------------------------------------------
QUERY  translate Sumerian to English: e2 dingir gal-gal-e-ne-ka
TARGET the temples of the great gods
PRED   (and) the temples of the great gods
------------------------------------------------
QUERY  translate Sumerian to English: {disz}e2-a-tu-ra-am u3 e2-a-ta-a-a-ar
TARGET Ea-turam and Ea-tayar
PRED   Ea-turam and Ea-tura
------------------------------------------------
QUERY  translate Sumerian to English: 4(disz) ad7 gu4
TARGET 4 carcasses of oxen
PRED   4 carcasses of oxen
------------------------------------------------
QUERY  translate Sumerian to

------------------------------------------------
QUERY  translate Sumerian to English: u4-ba
TARGET then
PRED   At that time
------------------------------------------------
QUERY  translate Sumerian to English: e2 an-sze3 1(szar'u) GAN2 ki-sze3 5(szar2) GAN2
TARGET House, 10 shar2 area at its upper end, 5 shar2 area at its lower end
PRED   Temple, towards the sky it is 1 shar2, towards the earth it is 5 shar2
------------------------------------------------
QUERY  translate Sumerian to English: gu2 ki gal2-la-ba ba-e-sug2-esz kur-re ba-ab-DU-DU-na-esz
TARGET They stood in the manner of one whose neck is pushed to the ground(?) the mountain(dwellers?) carried(?) them away
PRED   They were bound up in their folds, they were bound up in the netherworld
------------------------------------------------
QUERY  translate Akkadian to English: u3 u3-sza-asz2-t,a2-ru
TARGET or shall cause it to be written
PRED   and I shall levy
------------------------------------------------
QUERY  translate 

## Save to Huggingface

In [None]:
model_path = os.path.abspath("../../cuneiform")
trainer.save_model(model_path)
model_path

In [None]:
tokenizer.save_pretrained(model_path)