# Train Translator

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os, datetime
import json
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset

In [14]:
os.environ["WANDB_NOTEBOOK_NAME"] = "TrainTranslator.ipynb"

source_langs = set(["akk", "elx", "sux"])

target_langs = set(["en", "it", "es", "fr", "de"])

base_model_id = "t5-base"

model_max_length = 256
batch_size = 32
num_train_epochs = 60

In [3]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model_id = f"{base_model_id}-bi-{'-'.join(sorted(list(source_langs)))}-{date_id}"
model_id

't5-base-bi-akk-elx-sux-20220719-190709'

In [4]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x7fc1a0614280>)

In [5]:
!nvidia-smi

Tue Jul 19 19:07:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
|  0%   48C    P8    32W / 350W |    168MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load Training Data

In [6]:
lang_full = {
    "akk": "Akkadian",
    "elx": "Elamite",
    "sux": "Sumerian",
    "akkts": "Akkadian",
    "elxts": "Elamite",
    "suxts": "Sumerian",
    "en": "English",
    "it": "Italian",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
}

In [7]:
def get_prefix(src_lang, tgt_lang):
    s = lang_full[src_lang]
    t = lang_full[tgt_lang]
    return f"translate {s} to {t}: "
    
get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [8]:
translations = load_dataset("json", data_files="../data/translations.jsonl")
translations

Using custom data configuration default-1f050c38726e6453
Reusing dataset json (/home/fak/.cache/huggingface/datasets/json/default-1f050c38726e6453/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['p', 'a', 'l', 'sux', 'en', 'grc', 'fr', 'de', 'peo', 'grcts', 'akkts', 'es', 'elxts', 'elx', 'akk', 'qpn', 'peots', 'ug', 'ugts', 'it', 'suxts', 'qpnts', 'arc', 'arcts'],
        num_rows: 91566
    })
})

In [9]:
replacements = [
    ("ā", "a"),
    ("Ā", "a"),
    ("ḫ", "h"),
    ("Ḫ", "H"),
    ("ī", "i"),
    ("Ī", "I"),
#     ("î", "i"),
#     ("Î", "I"),
    ("ř", "r"),
    ("Ř", "R"),
    ("š", "sh"),
    ("Š", "Sh"),
    ("ṣ", "sh"),
    ("Ṣ", "Sh"),
    ("ū", "u"),
    ("Ū", "U"),
]
def replace_unsupported(text):
    r = text
    for s, t in replacements:
        r = r.replace(s, t)
    return r

In [10]:
sourceandtargets = []
for s in source_langs:
    print("Preparing", s)
    for t in tqdm(target_langs):
        st_prefix = get_prefix(s, t)
        ts_prefix = get_prefix(t, s)
        for line in translations["train"]:
            ls = line[s]
            lt = line[t]
            if ls is not None and lt is not None:
                lt = replace_unsupported(lt)
                if lt[-1] == "." or lt[-1] == "!" or lt[-1] == ";" or lt[-1] == ",":
                    lt = lt[:-1]
                sourceandtargets.append((st_prefix + ls, lt))
                sourceandtargets.append((ts_prefix + lt, ls))
                
random.shuffle(sourceandtargets)
all_translations = Dataset.from_dict({"source": [x[0] for x in sourceandtargets], "target": [x[1] for x in sourceandtargets]})
all_translations

Preparing sux


  0%|          | 0/5 [00:00<?, ?it/s]

Preparing elx


  0%|          | 0/5 [00:00<?, ?it/s]

Preparing akk


  0%|          | 0/5 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 184216
})

In [11]:
translations = all_translations.train_test_split(test_size=0.1)
translations

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 165794
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 18422
    })
})

In [12]:
original_tests = translations["test"]
original_tests

Dataset({
    features: ['source', 'target'],
    num_rows: 18422
})

In [13]:
test_starts = [f"translate {lang_full[s]} to " for s in source_langs]
print(test_starts)

def should_test(t):
    return any(t["source"].startswith(s) for s in test_starts)

translations["test"] = original_tests.filter(should_test)
translations["test"]



['translate Sumerian to ', 'translate Elamite to ', 'translate Akkadian to ']


  0%|          | 0/19 [00:00<?, ?ba/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 9273
})

## Tokenize the Data

In [15]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [16]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [17]:
ccc = 0

def preprocess_function(examples):
    global ccc
#     print(examples)
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])

    return model_inputs

tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations

  0%|          | 0/166 [00:00<?, ?ba/s]

[13959, 4823, 1258, 8603, 12, 1566, 10, 16, 18, 29, 9, 18, 9, 26, 18, 52, 76, 18, 29, 603, 18, 51, 9, 3, 29, 9, 18, 152, 18, 107, 76, 18, 1000, 3, 23, 18, 7, 1629, 7412, 17, 159, 172, 357, 1]
[3, 52, 5855, 581, 140, 6, 9706, 114, 3, 9, 1472, 1]


  0%|          | 0/10 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 165794
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9273
    })
})

In [18]:
tokenized_translations["train"].remove_columns(["source", "target"])
tokenized_translations["test"].remove_columns(["source", "target"])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9273
})

In [19]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_translations["train"]])
target_max_length = max([len(x["labels"]) for x in tokenized_translations["train"]])
source_max_length, target_max_length

(201, 194)

In [20]:
tokenized_translations["train"][0]["labels"][:10]

[3, 52, 5855, 581, 140, 6, 9706, 114, 3, 9]

## Load the Model

In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)

In [22]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translatio

## Train

In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
data_collator

DataCollatorForSeq2Seq(tokenizer=PreTrainedTokenizerFast(name_or_path='t5-base', vocab_size=32100, model_max_len=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extr

In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="epoch",
    learning_rate=2*2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using amp half precision backend


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: source, target. If source, target are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 165794
  Num Epochs = 60
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 310920
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mpraeclarum[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Saving model checkpoint to ../results/t5-base-bi-akk-elx-sux-20220719-190709/checkpoint-500
Configuration saved in ../results/t5-base-bi-akk-elx-sux-20220719-190709/checkpoint-500/config.json
Model weights saved in ../results/t5-base-bi-akk-elx-sux-20220719-190709/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-bi-akk-elx-sux-20220719-190709/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-bi-akk-elx-sux-20220719-190709/checkpoint-500/special_tokens_map.json


## Sample

In [None]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [None]:
pipeline

In [None]:
pipeline("translate English to French: hello my name is Frank")

In [None]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

In [None]:
def translate(text):
    return pipeline(text)

translate("translate Sumerian to English: a2 gesz-ur3-ra")

In [None]:
def sample(num_samples=100):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()

## Save to Huggingface

In [34]:
model_path = os.path.abspath("../../cuneiform")
trainer.save_model(model_path)
model_path

Saving model checkpoint to /home/fak/Projects/cuneiform
Configuration saved in /home/fak/Projects/cuneiform/config.json
Model weights saved in /home/fak/Projects/cuneiform/pytorch_model.bin
tokenizer config file saved in /home/fak/Projects/cuneiform/tokenizer_config.json
Special tokens file saved in /home/fak/Projects/cuneiform/special_tokens_map.json


'/home/fak/Projects/cuneiform'

In [35]:
tokenizer.save_pretrained(model_path)

tokenizer config file saved in /home/fak/Projects/cuneiform/tokenizer_config.json
Special tokens file saved in /home/fak/Projects/cuneiform/special_tokens_map.json


('/home/fak/Projects/cuneiform/tokenizer_config.json',
 '/home/fak/Projects/cuneiform/special_tokens_map.json',
 '/home/fak/Projects/cuneiform/tokenizer.json')