# Finetune

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os, datetime
import json
import torch
import random
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset

In [2]:
os.environ["WANDB_NOTEBOOK_NAME"] = "Finetune.ipynb"

source_langs = set(["akk", "sux"])

target_langs = set(["en", "it", "es", "fr", "de"])

base_model_id = "t5-base"

model_max_length = 256
batch_size = 48

In [3]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model_id = f"{base_model_id}-{'-'.join(sorted(list(source_langs)))}-{date_id}"
model_id

't5-base-akk-sux-20220719-014641'

In [4]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x7f03d04fdae0>)

In [5]:
!nvidia-smi

Tue Jul 19 01:46:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 30%   40C    P8    30W / 350W |    168MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load Training Data

In [6]:
lang_full = {
    "akk": "Akkadian",
    "sux": "Sumerian",
    "akkts": "Akkadian",
    "suxts": "Sumerian",
    "en": "English",
    "it": "Italian",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
}

In [7]:
def get_prefix(src_lang, tgt_lang):
    s = lang_full[src_lang]
    t = lang_full[tgt_lang]
    return f"translate {s} to {t}: "
    
get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [8]:
translations = load_dataset("json", data_files="../data/translations.jsonl")
translations

Using custom data configuration default-dc8b07d8fd701d7f
Reusing dataset json (/home/fak/.cache/huggingface/datasets/json/default-dc8b07d8fd701d7f/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['p', 'a', 'l', 'sux', 'en', 'akk', 'akkts', 'de', 'suxts', 'fr', 'elx', 'es', 'it'],
        num_rows: 74584
    })
})

In [9]:
sourceandtargets = []
for s in source_langs:
    print("Preparing", s)
    for t in tqdm(target_langs):
        prefix = get_prefix(s, t)
        for line in translations["train"]:
            ls = line[s]
            lt = line[t]
            if ls is not None and lt is not None:
                if lt[-1] == "." or lt[-1] == "!" or lt[-1] == ";" or lt[-1] == ",":
                    lt = lt[:-1]
                sourceandtargets.append((prefix + ls, lt))
                
random.shuffle(sourceandtargets)
translations = Dataset.from_dict({"source": [x[0] for x in sourceandtargets], "target": [x[1] for x in sourceandtargets]})
translations

Preparing sux


  0%|          | 0/5 [00:00<?, ?it/s]

Preparing akk


  0%|          | 0/5 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 70434
})

In [10]:
translations = translations.train_test_split(test_size=0.1)
translations

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 63390
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 7044
    })
})

In [11]:
tests = translations["test"]
tests

Dataset({
    features: ['source', 'target'],
    num_rows: 7044
})

## Tokenize the Data

In [12]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [13]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [14]:
ccc = 0

def preprocess_function(examples):
    global ccc
#     print(examples)
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])

    return model_inputs

tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations



  0%|          | 0/64 [00:00<?, ?ba/s]

[13959, 4823, 1258, 8603, 12, 1566, 10, 3, 23, 26, 18, 26, 23, 18, 77, 18, 51, 9, 1]
[1891, 6, 11, 1]


  0%|          | 0/8 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 63390
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7044
    })
})

In [15]:
tokenized_translations["train"].remove_columns(["source", "target"])
tokenized_translations["test"].remove_columns(["source", "target"])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 7044
})

In [16]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_translations["train"]])
target_max_length = max([len(x["labels"]) for x in tokenized_translations["train"]])
source_max_length, target_max_length

(201, 107)

In [17]:
tokenized_translations["train"][0]["labels"][:10]

[1891, 6, 11, 1]

## Load the Model

In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)

In [19]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translatio

## Train

In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
data_collator

DataCollatorForSeq2Seq(tokenizer=PreTrainedTokenizerFast(name_or_path='t5-base', vocab_size=32100, model_max_len=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extr

In [21]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="epoch",
    learning_rate=2*2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=400,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using amp half precision backend


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: source, target. If source, target are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 63390
  Num Epochs = 400
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 528400
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mpraeclarum[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.8673,2.365838
2,2.318,2.075084
3,2.1229,1.901576


Saving model checkpoint to ../results/t5-base-akk-sux-20220719-014641/checkpoint-500
Configuration saved in ../results/t5-base-akk-sux-20220719-014641/checkpoint-500/config.json
Model weights saved in ../results/t5-base-akk-sux-20220719-014641/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akk-sux-20220719-014641/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akk-sux-20220719-014641/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../results/t5-base-akk-sux-20220719-014641/checkpoint-1000
Configuration saved in ../results/t5-base-akk-sux-20220719-014641/checkpoint-1000/config.json
Model weights saved in ../results/t5-base-akk-sux-20220719-014641/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-akk-sux-20220719-014641/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-akk-sux-20220719-014641/checkpoint-1000/special_tokens_map.js

## Sample

In [None]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [None]:
pipeline

In [None]:
pipeline("translate English to French: hello my name is Frank")

In [None]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

In [None]:
def translate(text):
    return pipeline(prefix + source_test)

translate("ina ebūrim")

In [None]:
def sample(num_samples=1000):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()