# Finetune

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os
import json
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset

In [2]:
source_lang = "akkts"
source_lang_full = "Akkadian"

target_lang = "en"
target_lang_full = "English"

base_model_id = "t5-small"

model_max_length = 128

In [3]:
model_id = f"{base_model_id}-{source_lang}-{target_lang}"
model_id

't5-small-akkts-en'

In [4]:
prefix = f"translate {source_lang_full} to {target_lang_full}: "
prefix

'translate Akkadian to English: '

In [5]:
has_cuda = False

## Load Training Data

In [6]:
translations = load_dataset("json", data_files="../data/translations.jsonl")
translations

Using custom data configuration default-b7e58f096bb99928
Reusing dataset json (/Users/fak/.cache/huggingface/datasets/json/default-b7e58f096bb99928/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['p', 'a', 'l', 'en', 'sux', 'de', 'akk', 'akkts', 'es', 'fr', 'suxts', 'elx', 'it'],
        num_rows: 74584
    })
})

In [7]:
translations = translations.filter(lambda x: x[source_lang] is not None and x[target_lang] is not None)
translations



  0%|          | 0/75 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['p', 'a', 'l', 'en', 'sux', 'de', 'akk', 'akkts', 'es', 'fr', 'suxts', 'elx', 'it'],
        num_rows: 7369
    })
})

In [8]:
translations["train"][0]

{'p': 'P464358',
 'a': 'prologue',
 'l': 0,
 'en': 'When the august Anu,',
 'sux': None,
 'de': None,
 'akk': 'i3-nu an s,i-ru-um',
 'akkts': 'īnu anum ṣīrum',
 'es': None,
 'fr': None,
 'suxts': None,
 'elx': None,
 'it': None}

In [9]:
rem_cols = [x for x in translations["train"].column_names if x != source_lang and x != target_lang]
translations = translations["train"].remove_columns(rem_cols)
translations

Dataset({
    features: ['en', 'akkts'],
    num_rows: 7369
})

In [10]:
translations = translations.train_test_split(test_size=0.1)
translations

DatasetDict({
    train: Dataset({
        features: ['en', 'akkts'],
        num_rows: 6632
    })
    test: Dataset({
        features: ['en', 'akkts'],
        num_rows: 737
    })
})

In [27]:
tests = translations["test"]
tests

Dataset({
    features: ['en', 'akkts'],
    num_rows: 737
})

## Tokenize the Data

In [11]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [12]:
def preprocess_function(examples):
#     print(examples)
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'akkts', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6632
    })
    test: Dataset({
        features: ['en', 'akkts', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 737
    })
})

In [14]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_translations["train"]])
target_max_length = max([len(x["labels"]) for x in tokenized_translations["train"]])
source_max_length, target_max_length

(59, 44)

In [15]:
tokenized_translations["train"][0]["labels"][:10]

[499, 3, 322, 26, 225, 214, 55, 1]

## Load the Model

In [16]:
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)

In [17]:
model.config

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "tran

In [29]:
pipeline = TranslationPipeline(model=model, tokenizer=tokenizer, max_length=model_max_length)

In [30]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x12ae3c730>

In [20]:
pipeline("translate English to French: hello my name is Frank")

[{'translation_text': 'Bonjour mon nom est Frank'}]

In [21]:
source_test = translations["test"][0][source_lang]
target_test = translations["test"][0][target_lang]
print(source_test)
print("-"*80)
print(target_test)

ina maḫar ilim
--------------------------------------------------------------------------------
before the god


In [22]:
def translate(text):
    return pipeline(prefix + source_test)

translate("ina ebūrim")

[{'translation_text': 'ina maar ilim'}]

In [35]:
def sample(num_samples=50):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t[source_lang]
        tgt = t[target_lang]
        query = prefix + src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample(10)

------------------------------------------------
QUERY  translate Akkadian to English: ina maḫar ilim
TARGET before the god
PRED   in the sand
------------------------------------------------
QUERY  translate Akkadian to English: šalmāku mimma libbaka
TARGET I am well; your heart at all
PRED   almku mimma libbaka
------------------------------------------------
QUERY  translate Akkadian to English: paṣṣunat
TARGET shall be veiled;
PRED   the king of the king
------------------------------------------------
QUERY  translate Akkadian to English: litēr
TARGET may she turn;
PRED   litr
------------------------------------------------
QUERY  translate Akkadian to English: ... šar(?) māt sumeri u akkade muma’’ir māti
TARGET ... king(?) of the land of Sumer and Akkad, who leads the land;
PRED   ... ar(?) if a man who sumeri u a man
------------------------------------------------
QUERY  translate Akkadian to English: iddunū panūšuma
TARGET they will give, and if he so chooses,
PRED   he shall

## Train

In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



In [25]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: en, akkts. If en, akkts are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6632
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4150


Epoch,Training Loss,Validation Loss
1,No log,3.867995
2,4.351400,3.675492


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: en, akkts. If en, akkts are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 737
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: en, akkts. If en, akkts are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 737
  Batch size = 16


KeyboardInterrupt: 

In [36]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: en, akkts. If en, akkts are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6632
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4150


Epoch,Training Loss,Validation Loss
1,No log,3.563645
2,3.807500,3.483984


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: en, akkts. If en, akkts are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 737
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: en, akkts. If en, akkts are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 737
  Batch size = 16


KeyboardInterrupt: 

In [37]:
sample()

------------------------------------------------
QUERY  translate Akkadian to English: ina maḫar ilim
TARGET before the god
PRED   in the king of the king
------------------------------------------------
QUERY  translate Akkadian to English: šalmāku mimma libbaka
TARGET I am well; your heart at all
PRED   the lord of the swathes
------------------------------------------------
QUERY  translate Akkadian to English: paṣṣunat
TARGET shall be veiled;
PRED   the king
------------------------------------------------
QUERY  translate Akkadian to English: litēr
TARGET may she turn;
PRED   the king
------------------------------------------------
QUERY  translate Akkadian to English: ... šar(?) māt sumeri u akkade muma’’ir māti
TARGET ... king(?) of the land of Sumer and Akkad, who leads the land;
PRED   ... a shammer he shall take, a smear he shall give.
------------------------------------------------
QUERY  translate Akkadian to English: iddunū panūšuma
TARGET they will give, and if he so ch