# Finetune

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os
import json
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset

In [2]:
source_lang = "akkts"
source_lang_full = "Akkadian"

target_lang = "en"
target_lang_full = "English"

base_model_id = "t5-small"

model_max_length = 128

In [3]:
model_id = f"{base_model_id}-{source_lang}-{target_lang}"
model_id

't5-small-akkts-en'

In [4]:
prefix = f"translate {source_lang_full} to {target_lang_full}: "
prefix

'translate Akkadian to English: '

In [5]:
has_cuda = False

## Load Training Data

In [6]:
translations = load_dataset("json", data_files="../data/translations.jsonl")
translations

Using custom data configuration default-4aec92ba7e22081f
Reusing dataset json (/Users/fak/.cache/huggingface/datasets/json/default-4aec92ba7e22081f/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['pid', 'a', 'l', 'translation'],
        num_rows: 74588
    })
})

In [7]:
translations = translations.filter(lambda x: x["translation"][source_lang] is not None and x["translation"][target_lang] is not None)
translations



  0%|          | 0/75 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['pid', 'a', 'l', 'translation'],
        num_rows: 7376
    })
})

In [8]:
translations["train"][0]["translation"]

{'sux': None,
 'en': 'When the august Anu,',
 'de': None,
 'fr': None,
 'akk': 'i3-nu an s,i-ru-um',
 'akkts': 'īnu anum ṣīrum',
 'es': None,
 'elx': None,
 'suxts': None,
 'it': None}

In [9]:
translations = translations["train"].train_test_split(test_size=0.1)
translations

DatasetDict({
    train: Dataset({
        features: ['pid', 'a', 'l', 'translation'],
        num_rows: 6638
    })
    test: Dataset({
        features: ['pid', 'a', 'l', 'translation'],
        num_rows: 738
    })
})

## Tokenize the Data

In [10]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [11]:
def preprocess_function(examples):
#     print(examples)
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
tokenized_translations = translations.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
tokenized_translations

DatasetDict({
    train: Dataset({
        features: ['pid', 'a', 'l', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6638
    })
    test: Dataset({
        features: ['pid', 'a', 'l', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 738
    })
})

In [14]:
tokenized_translations["train"][0]["labels"][:10]

[3305, 7, 6, 1]

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)

In [16]:
pipeline = TranslationPipeline(model=model, tokenizer=tokenizer)

In [17]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x1360cfd30>

In [18]:
pipeline("translate English to French: hello my name is Frank")

[{'translation_text': 'Bonjour mon nom est Frank'}]

In [19]:
source_test = translations["test"][0]["translation"][source_lang]
target_test = translations["test"][0]["translation"][target_lang]
print(source_test)
print("-"*80)
print(target_test)

nadītam qadištam
--------------------------------------------------------------------------------
as a nadītu, a qadištu,


In [20]:
def translate(text):
    return pipeline(prefix + source_test)

translate("ina ebūrim")

Your input_length: 23 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


[{'translation_text': 'Akkadian nadtam qaditam'}]

In [21]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: a, pid, l, translation. If a, pid, l, translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6638
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4150


Epoch,Training Loss,Validation Loss


In [None]:
tests = translations["test"]
tests

In [None]:
for i in range(tests.num_rows):
    t = tests[i]
#     print(t)
    src = t[source_lang]
    tgt = t[target_lang]
    query = prefix + src
    pred = pipeline(query)
    print("-"*48)
    print("QUERY ", query)
    print("TARGET", tgt)
    print("PRED  ", pred)
#     break