# Fine-tune T5 for Translation

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import json
import sys, os

import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
# Language pair: short code (also used as the dataset column name) and display name.
source_lang, source_lang_full = "akk", "Akkadian"
target_lang, target_lang_full = "en", "English"

# Pretrained checkpoint that gets fine-tuned.
base_model_id = "t5-small"


In [3]:
# Identifier for the fine-tuned model: "<base model>-<source>-<target>".
model_id = "-".join((base_model_id, source_lang, target_lang))
model_id

't5-small-akk-en'

In [4]:
# Task prefix that preprocess_function prepends to every source sentence.
prefix = f"translate {source_lang_full} to {target_lang_full}: "
prefix

'translate Akkadian to English: '

## Load Training Data

In [5]:
# Load the JSON-lines corpus; the path is relative to the notebook's working directory.
translations = load_dataset("json", data_files="../data/translations.jsonl")
translations

Using custom data configuration default-8a6a4b52133232f6


Downloading and preparing dataset json/default to /Users/fak/.cache/huggingface/datasets/json/default-8a6a4b52133232f6/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /Users/fak/.cache/huggingface/datasets/json/default-8a6a4b52133232f6/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sux', 'en', 'de', 'akk', 'akkts', 'es', 'fr', 'suxts', 'elx', 'it'],
        num_rows: 6996
    })
})

In [6]:
# Keep only rows where both the source and the target column are non-null.
translations = translations.filter(
    lambda row: all(row[lang] is not None for lang in (source_lang, target_lang))
)
translations



  0%|          | 0/7 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['sux', 'en', 'de', 'akk', 'akkts', 'es', 'fr', 'suxts', 'elx', 'it'],
        num_rows: 1307
    })
})

In [7]:
# Drop every column except the source/target pair, continuing with the bare "train" split.
rem_cols = [
    col
    for col in translations["train"].column_names
    if col not in (source_lang, target_lang)
]
translations = translations["train"].remove_columns(rem_cols)
translations

Dataset({
    features: ['en', 'akk'],
    num_rows: 1307
})

In [8]:
# Hold out 10% of the sentence pairs for evaluation. The fixed seed keeps the
# train/test membership identical across kernel restarts, so evaluation
# numbers from different runs are comparable.
translations = translations.train_test_split(test_size=0.1, seed=42)
translations

DatasetDict({
    train: Dataset({
        features: ['en', 'akk'],
        num_rows: 1176
    })
    test: Dataset({
        features: ['en', 'akk'],
        num_rows: 131
    })
})

In [9]:
# Tokenizer matching the base checkpoint. The max-length warning it prints on
# load is handled by passing max_length explicitly when encoding.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
def preprocess_function(examples, max_source_length=128, max_target_length=128):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch mapping column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``. Must contain the
            ``source_lang`` and ``target_lang`` columns.
        max_source_length: Token limit for the prefixed source texts; longer
            inputs are truncated.
        max_target_length: Token limit for the target texts; longer targets
            are truncated.

    Returns:
        The tokenizer output for the prefixed source texts, with an added
        ``"labels"`` entry holding the target token ids.
    """
    inputs = [prefix + text for text in examples[source_lang]]
    targets = list(examples[target_lang])
    model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)

    # Encode the targets inside the tokenizer's target-mode context.
    # NOTE(review): as_target_tokenizer is deprecated in newer transformers
    # releases in favour of tokenizer(..., text_target=...); switch once the
    # installed version is confirmed to support it.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
# Tokenize both splits in batches; the original text columns are kept alongside the new ones.
tokenized_translations = translations.map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Inspect the resulting splits and their columns.
tokenized_translations

DatasetDict({
    train: Dataset({
        features: ['en', 'akk', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1176
    })
    test: Dataset({
        features: ['en', 'akk', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 131
    })
})

In [14]:
# Sanity check: first ten label token ids of the first training example.
tokenized_translations["train"][0]["labels"][:10]

[156, 3, 9, 388, 3, 9, 3, 2, 76, 122]

In [15]:
# Load the pretrained seq2seq checkpoint that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)

In [16]:
# Collator that batches the tokenized examples for the trainer.
# preprocess_function does not pad, so padding is presumably done here per batch — TODO confirm.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [17]:
# Detect CUDA at runtime instead of hard-coding it. The flag is passed to the
# training arguments as fp16, so mixed precision is only requested when a
# CUDA device is actually present.
has_cuda = torch.cuda.is_available()

In [18]:
# Training configuration: a single epoch with batches of 16 for both training
# and evaluation.
training_args = Seq2SeqTrainingArguments(
    # Checkpoints and trainer state are written below this directory.
    output_dir="./results",
    # Run evaluation on the eval dataset once per epoch.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # Presumably caps the number of checkpoints kept on disk — confirm against the transformers docs.
    save_total_limit=3,
    num_train_epochs=1,
    # Mixed precision is only requested when the CUDA flag is set.
    fp16=has_cuda,
)

# Wire model, data splits, tokenizer and collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: akk, en. If akk, en are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1176
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 74
