# Finetune

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os, datetime
import json
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset

In [2]:
os.environ["WANDB_NOTEBOOK_NAME"] = "Finetune.ipynb"

source_langs = set(["akk", "sux"])

target_langs = set(["en", "it", "es", "fr", "de"])

base_model_id = "t5-base"

model_max_length = 256
batch_size = 32
num_train_epochs = 40

In [3]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model_id = f"{base_model_id}-bi-{'-'.join(sorted(list(source_langs)))}-{date_id}"
model_id

't5-base-bi-akk-sux-20220719-133120'

In [4]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x7f7b4938ab30>)

In [5]:
!nvidia-smi

Tue Jul 19 13:31:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 30%   46C    P8    34W / 350W |    168MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load Training Data

In [6]:
lang_full = {
    "akk": "Akkadian",
    "sux": "Sumerian",
    "akkts": "Akkadian",
    "suxts": "Sumerian",
    "en": "English",
    "it": "Italian",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
}

In [7]:
def get_prefix(src_lang, tgt_lang):
    s = lang_full[src_lang]
    t = lang_full[tgt_lang]
    return f"translate {s} to {t}: "
    
get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [8]:
translations = load_dataset("json", data_files="../data/translations.jsonl")
translations

Using custom data configuration default-dc8b07d8fd701d7f
Reusing dataset json (/home/fak/.cache/huggingface/datasets/json/default-dc8b07d8fd701d7f/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['p', 'a', 'l', 'sux', 'en', 'akk', 'akkts', 'de', 'suxts', 'fr', 'elx', 'es', 'it'],
        num_rows: 74584
    })
})

In [9]:
replacements = [
    ("ā", "a"),
    ("Ā", "a"),
    ("ḫ", "h"),
    ("Ḫ", "H"),
    ("ī", "i"),
    ("Ī", "I"),
#     ("î", "i"),
#     ("Î", "I"),
    ("ř", "r"),
    ("Ř", "R"),
    ("š", "sh"),
    ("Š", "Sh"),
    ("ṣ", "sh"),
    ("Ṣ", "Sh"),
    ("ū", "u"),
    ("Ū", "U"),
]
def replace_unsupported(text):
    r = text
    for s, t in replacements:
        r = r.replace(s, t)
    return r

In [10]:
sourceandtargets = []
for s in source_langs:
    print("Preparing", s)
    for t in tqdm(target_langs):
        st_prefix = get_prefix(s, t)
        ts_prefix = get_prefix(t, s)
        for line in translations["train"]:
            ls = line[s]
            lt = line[t]
            if ls is not None and lt is not None:
                lt = replace_unsupported(lt)
                if lt[-1] == "." or lt[-1] == "!" or lt[-1] == ";" or lt[-1] == ",":
                    lt = lt[:-1]
                sourceandtargets.append((st_prefix + ls, lt))
                sourceandtargets.append((ts_prefix + lt, ls))
                
random.shuffle(sourceandtargets)
translations = Dataset.from_dict({"source": [x[0] for x in sourceandtargets], "target": [x[1] for x in sourceandtargets]})
translations

Preparing akk


  0%|          | 0/5 [00:00<?, ?it/s]

Preparing sux


  0%|          | 0/5 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 140868
})

In [11]:
translations = translations.train_test_split(test_size=0.1)
translations

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 126781
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 14087
    })
})

In [12]:
tests = translations["test"]
tests

Dataset({
    features: ['source', 'target'],
    num_rows: 14087
})

## Tokenize the Data

In [13]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [14]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [15]:
ccc = 0

def preprocess_function(examples):
    global ccc
#     print(examples)
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])

    return model_inputs

tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations



  0%|          | 0/127 [00:00<?, ?ba/s]

[13959, 1566, 12, 4823, 1258, 8603, 10, 9927, 6, 11, 1]
[3, 23, 18, 526, 18, 15, 7, 172, 18, 51, 9, 1]


  0%|          | 0/15 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 126781
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14087
    })
})

In [16]:
tokenized_translations["train"].remove_columns(["source", "target"])
tokenized_translations["test"].remove_columns(["source", "target"])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 14087
})

In [17]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_translations["train"]])
target_max_length = max([len(x["labels"]) for x in tokenized_translations["train"]])
source_max_length, target_max_length

(201, 194)

In [18]:
tokenized_translations["train"][0]["labels"][:10]

[3, 23, 18, 526, 18, 15, 7, 172, 18, 51]

## Load the Model

In [19]:
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)

In [20]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translatio

## Train

In [21]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
data_collator

DataCollatorForSeq2Seq(tokenizer=PreTrainedTokenizerFast(name_or_path='t5-base', vocab_size=32100, model_max_len=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extr

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="epoch",
    learning_rate=2*2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using amp half precision backend


In [23]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 126781
  Num Epochs = 400
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1584800
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mpraeclarum[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.9424,1.670588
2,1.6154,1.432208
3,1.4618,1.294569
4,1.3288,1.210426
5,1.2554,1.147311
6,1.1752,1.099326
7,1.1201,1.058509
8,1.0746,1.030408
9,1.0191,1.007711
10,0.9635,0.981915


Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-500/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-1000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-1000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpo

Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-7000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-7000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-7000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-7000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-5500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-7500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-7500/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-11000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-13000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-13000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-13000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-13000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-13000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-11500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-13500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-13500/config.json
Model weights saved in ../results/t5-base-bi-akk

Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-19000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-19000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-19000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-19000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-17500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-19500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-19500/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-19500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-19500/tokenizer_config.json
Special tokens file saved in ../result

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-23000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-25000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-25000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-25000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-25000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-25000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-23500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-25500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-25500/config.json
Model weights saved in ../results/t5-base-bi-akk

Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-31000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-31000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-31000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-31000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-29500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-31500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-31500/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-31500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-31500/tokenizer_config.json
Special tokens file saved in ../result

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-35000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-37000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-37000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-37000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-37000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-37000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-35500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-37500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-37500/config.json
Model weights saved in ../results/t5-base-bi-akk

Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-43000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-43000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-43000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-43000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-41500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-43500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-43500/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-43500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-43500/tokenizer_config.json
Special tokens file saved in ../result

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-47000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-49000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-49000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-49000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-49000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-49000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-47500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-49500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-49500/config.json
Model weights saved in ../results/t5-base-bi-akk

Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-55000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-55000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-55000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-55000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-53500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 14087
  Batch size = 32
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-55500
Configuratio

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-59000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-61000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-61000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-61000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-61000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-61000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-59500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-61500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-61500/config.json
Model weights saved in ../results/t5-base-bi-akk

Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-67000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-67000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-67000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-67000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-65500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 14087
  Batch size = 32
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-67500
Configuratio

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-71000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-73000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-73000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-73000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-73000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-73000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-71500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-73500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-73500/config.json
Model weights saved in ../results/t5-base-bi-akk

Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-79000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-79000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-79000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-79000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-77500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 14087
  Batch size = 32
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-79500
Configuratio

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-83000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-85000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-85000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-85000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-85000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-85000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-83500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-85500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-85500/config.json
Model weights saved in ../results/t5-base-bi-akk

Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-91000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-91000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-91000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-91000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-89500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 14087
  Batch size = 32
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-91500
Configuratio

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-95000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-97000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-97000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-97000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-97000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-97000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-95500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-97500
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-97500/config.json
Model weights saved in ../results/t5-base-bi-akk

Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-103000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-103000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-103000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-103000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-103000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-101500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 14087
  Batch size = 32
Saving

Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-108500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-107000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-109000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-109000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-109000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-109000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-109000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-107500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-109500
Configuration saved i

Deleting older checkpoint [../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-113000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 14087
  Batch size = 32
Saving model checkpoint to ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-115000
Configuration saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-115000/config.json
Model weights saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-115000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-115000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-sux-20220719-133120/checkpoint-115000/special_tokens_map.json
Deleti

KeyboardInterrupt: 

## Sample

In [24]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [25]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x7f7a7d584100>

In [26]:
pipeline("translate English to French: hello my name is Frank")

[{'translation_text': 'me-te ma mme nidba'}]

In [27]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

translate Sumerian to English: a2 gesz-ur3-ra
--------------------------------------------------------------------------------
work of harrowing


In [29]:
def translate(text):
    return pipeline(text)

translate("translate Sumerian to English: a2 gesz-ur3-ra")

[{'translation_text': 'labor of harrowing'}]

In [31]:
def sample(num_samples=100):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()

------------------------------------------------
QUERY  translate Sumerian to English: a2 gesz-ur3-ra
TARGET work of harrowing
PRED   labor of harrowing
------------------------------------------------
QUERY  translate English to Akkadian: and
TARGET [u3]
PRED   u3
------------------------------------------------
QUERY  translate Sumerian to English: dumu-[(x)]
TARGET (...) son
PRED   the son(?)
------------------------------------------------
QUERY  translate Sumerian to English: abul# nig2-ku5-da
TARGET Gate of the Nigku taxes
PRED   in the city gate taken
------------------------------------------------
QUERY  translate Sumerian to English: lagasz
TARGET of Lagash
PRED   of Lagash
------------------------------------------------
QUERY  translate English to Sumerian: n sheep, barley-fed, of Lugal-magure
TARGET [n] udu niga lugal-ma2!-gur8-re
PRED   [n] udu niga lugal-ma2-gur8#-re#
------------------------------------------------
QUERY  translate Sumerian to English: e2-mar-uru5-sze3


------------------------------------------------
QUERY  translate English to Sumerian: who was unmarried, ‘You had sexual relations,’
TARGET e2 nu-un-gi4-a gesz3 i3-zu
PRED   nu-tuku-me-en3 a-gin7 ba-tuku
------------------------------------------------
QUERY  translate Akkadian to English: a-na _ka_ s,i-im-da-at
TARGET in accordance with the edict
PRED   to the mouth of the slanderer
------------------------------------------------
QUERY  translate English to Sumerian: ...
TARGET kaskal-mah-x
PRED   geszuzu x-x
------------------------------------------------
QUERY  translate English to Sumerian: 1 sheep
TARGET 1(disz) udu
PRED   1(disz) udu
------------------------------------------------
QUERY  translate Akkadian to English: _lugal kisz_
TARGET king of the world
PRED   king of the world
------------------------------------------------
QUERY  translate Sumerian to English: iti u5-bi2-gu7
TARGET month: “Ubi feast,”
PRED   month: “ubi-feast,”
-------------------------------------------

------------------------------------------------
QUERY  translate Sumerian to English: la2-ia3 1(disz) ur-{d}isztaran dumu du-du
TARGET Deficit: 1 Ur-Ishtaran, son of Dudu
PRED   the deficit: 1: Ur-Ishtaran, son of Dudu
------------------------------------------------
QUERY  translate Sumerian to English: {d}nansze-GIR2@g-gal maszkim
TARGET Nanshe-GIRgal was the requisitioner
PRED   Nanshe-GIRgal was enforcer
------------------------------------------------
QUERY  translate English to Sumerian: and those priests
TARGET gudu4-be2-ne
PRED   en-na-bi
------------------------------------------------
QUERY  translate English to Sumerian: Basket-of-tablets:
TARGET pisan-dub-ba
PRED   pisan-dub-ba
------------------------------------------------
QUERY  translate English to Sumerian: (total:) 13
TARGET ($ blank space $) 1(u) 3(disz)
PRED   1(u) 3(disz)


## Save to Huggingface

In [34]:
model_path = os.path.abspath("../../cuneiform")
trainer.save_model(model_path)
model_path

Saving model checkpoint to /home/fak/Projects/cuneiform
Configuration saved in /home/fak/Projects/cuneiform/config.json
Model weights saved in /home/fak/Projects/cuneiform/pytorch_model.bin
tokenizer config file saved in /home/fak/Projects/cuneiform/tokenizer_config.json
Special tokens file saved in /home/fak/Projects/cuneiform/special_tokens_map.json


'/home/fak/Projects/cuneiform'

In [35]:
tokenizer.save_pretrained(model_path)

tokenizer config file saved in /home/fak/Projects/cuneiform/tokenizer_config.json
Special tokens file saved in /home/fak/Projects/cuneiform/special_tokens_map.json


('/home/fak/Projects/cuneiform/tokenizer_config.json',
 '/home/fak/Projects/cuneiform/special_tokens_map.json',
 '/home/fak/Projects/cuneiform/tokenizer.json')