# Train Transliteration

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os
import json
import torch
import random
import copy
from tqdm.notebook import tqdm
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset, dataset_dict
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

In [2]:
os.environ["WANDB_NOTEBOOK_NAME"] = "TrainTransliteration.ipynb"

source_langs = set(["akk", "sux"])

target_langs = set(["akkts", "suxts"])

base_model_id = "t5-small"

model_max_length = 128
batch_size = 256

In [3]:
model_id = f"{base_model_id}-{'-'.join(source_langs)}"
model_id

't5-small-akk-sux'

In [4]:
has_cuda = torch.cuda.is_available()

In [5]:
device = torch.cuda.device(0) if has_cuda else "cpu"
device

<torch.cuda.device at 0x7ff083f82fb0>

## Load Training Data

In [6]:
all_translations = load_dataset("json", data_files="../data/translations.jsonl")
all_translations

Using custom data configuration default-dc8b07d8fd701d7f
Reusing dataset json (/home/fak/.cache/huggingface/datasets/json/default-dc8b07d8fd701d7f/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['p', 'a', 'l', 'sux', 'en', 'akk', 'akkts', 'de', 'suxts', 'fr', 'elx', 'es', 'it'],
        num_rows: 74584
    })
})

In [7]:
sourceandtargets = []
for line in tqdm(all_translations["train"]):
    for s in source_langs:
        for t in target_langs:
            ls = line[s]
            lt = line[t]
            if ls is not None and lt is not None:
                if lt[-1] == "." or lt[-1] == "!" or lt[-1] == ";" or lt[-1] == ",":
                    lt = lt[:-1]
                sourceandtargets.append((ls, lt))
                
random.shuffle(sourceandtargets)
translations = Dataset.from_dict({"source": [x[0] for x in sourceandtargets], "target": [x[1] for x in sourceandtargets]})
translations

  0%|          | 0/74584 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 7432
})

In [8]:
translations = translations.train_test_split(test_size=0.1)
translations

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 6688
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 744
    })
})

In [9]:
tests = translations["test"]
tests

Dataset({
    features: ['source', 'target'],
    num_rows: 744
})

In [10]:
tests[:10]

{'source': ['a-na di-a-szi-im i-gur',
  '_dumu-mesz_ u2-li-sum2-ma',
  'szum-ma a-wi-lum',
  'szum-ma _lu2 dumu-munus lu2_ im-ha-as,-ma',
  'i-na mi-szi-tim',
  'i-na _e2 a-ba_',
  'u3 mu-ur2-te-di-sza i-gur',
  '_geme2_ a-na mu-ti-sza',
  'a-na _e2_ e-t,e-e szu-bat {d}ir-kal-la',
  'a-szar la a-ma-ri pu-uz-ra'],
 'target': ['ana diāšim īgur',
  'mārī ūlissumma',
  'šumma awīlum',
  'šumma a’īlu mārat a’īle imḫaṣma',
  'ina mīšītim',
  'ina bīt abim',
  'u murteddīša īgur',
  'amtam ana mutiša',
  'ana bīt eṭê šubat irkalla',
  'ašar lā amāri puzra']}

## Train Tokenizer

* https://huggingface.co/blog/how-to-train
* https://huggingface.co/docs/transformers/v4.20.1/en/fast_tokenizers

In [11]:
tokenizer_txt_lines = []

for d in ["train", "test"]:
    for t in tqdm(translations[d]):
        tokenizer_txt_lines.append(t["source"])
        tokenizer_txt_lines.append(t["target"])
        
len(tokenizer_txt_lines)

  0%|          | 0/6688 [00:00<?, ?it/s]

  0%|          | 0/744 [00:00<?, ?it/s]

14864

In [12]:
tokenizer_txt_path = os.path.abspath("tokenizer_training_data.txt")
with open(tokenizer_txt_path, "wb") as f:
    f.write(bytes("\n".join(tokenizer_txt_lines), "utf8"))
tokenizer_txt_path

'/home/fak/Projects/CuneiformTranslators/tools/tokenizer_training_data.txt'

In [13]:
!tail tokenizer_training_data.txt

li-szi-im-szum
lišīmšum
le-u2#-ut-ka i-lu ra-bu!-ut-tum i-da#-al-la-lu
lē'ûtka ilū rabûtum idallalū
i-lu _lugal_-ri2
ilū šarrī
lu-u2 _lu2-sag lugal_
lū ša-rēš šarri
i-na lib3-bi
ina libbi

In [14]:
special_tokens=["<pad>", "</s>", "<unk>"]

In [15]:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(100)]
additional_special_tokens[:3]

['<extra_id_0>', '<extra_id_1>', '<extra_id_2>']

In [16]:
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
trainer = BpeTrainer(special_tokens=special_tokens + additional_special_tokens)

tokenizer.pre_tokenizer = Whitespace()
tokenizer.post_processor = TemplateProcessing(
    single="$0 </s>",
    pair="$A </s> $B:1 </s>:1",
    special_tokens=[("</s>", 1)],
)
tokenizer.model_max_length=model_max_length
files = [tokenizer_txt_path]
tokenizer.train(files, trainer)

print(tokenizer.get_vocab_size(), "vocab_size")

os.makedirs("../results", exist_ok=True)
tokenizer_json_path = os.path.abspath("../results/transliteration_tokenizer.json")
tokenizer.save(tokenizer_json_path)
tokenizer_json_path




12136 vocab_size


'/home/fak/Projects/CuneiformTranslators/results/transliteration_tokenizer.json'

In [17]:
test_txt = tokenizer_txt_lines[3]
test_tokens = tokenizer.encode(test_txt).ids
print(test_txt)
print(test_tokens)

nibīt
[6371, 1]


In [18]:
tokenizer.decode(test_tokens)

'nibīt'

In [19]:
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, model_max_len=model_max_length)
tokenizer.decode(test_tokens)

'nibīt </s>'

In [20]:
# atokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)
# atokenizer.special_tokens_map

In [21]:
# atokenizer.sep_token

In [22]:
tokenizer.add_special_tokens({
    'eos_token': '</s>',
    'unk_token': '<unk>',
    'pad_token': '<pad>',
    "additional_special_tokens": additional_special_tokens })



0

In [23]:
tokenizer.padding_side = "right"

In [24]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [25]:
# tokenizer.special_tokens_map

In [26]:
tokenizer.vocab_size

12136

In [27]:
tokenizer.model_max_length = model_max_length

In [28]:
tokenizer.model_max_length

128

## Tokenize the Data

In [29]:
ccc = 0

def preprocess_function(examples):
    global ccc
#     print(examples)
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])

    return model_inputs

tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations



  0%|          | 0/7 [00:00<?, ?ba/s]

[987, 112, 1223, 157, 112, 207, 221, 112, 353, 1]
[3500, 229, 2716, 1]


  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6688
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 744
    })
})

In [30]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_translations["train"]])
target_max_length = max([len(x["labels"]) for x in tokenized_translations["train"]])
source_max_length, target_max_length

(50, 23)

In [31]:
tokenized_translations["train"][0]["labels"][:10]

[3500, 229, 2716, 1]

## Load the Model

In [32]:
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)

In [33]:
model_config = copy.deepcopy(base_model.config)
model_config

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_

In [34]:
model_config.vocab_size = tokenizer.vocab_size

In [35]:
model = AutoModelForSeq2SeqLM.from_config(model_config)
model.config.vocab_size

12136

## Train

In [36]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [37]:
data_collator

DataCollatorForSeq2Seq(tokenizer=PreTrainedTokenizerFast(name_or_path='', vocab_size=12136, model_max_len=128, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42

In [38]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="epoch",
    learning_rate=0.75*2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=200,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using amp half precision backend


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source, token_type_ids. If target, source, token_type_ids are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6688
  Num Epochs = 200
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 5400
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mpraeclarum[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,6.341805
2,No log,5.818153
3,No log,5.504787
4,No log,5.272661
5,No log,5.123484
6,No log,5.029773
7,No log,4.962452
8,No log,4.924763
9,No log,4.870087
10,No log,4.77533


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source, token_type_ids. If target, source, token_type_ids are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 744
  Batch size = 256
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source, token_type_ids. If target, source, token_type_ids are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 744
  Batch size = 256
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source, token_type_ids. If target, source, token_type_ids are not expected by `T5ForConditionalGene

## Sample

In [None]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [None]:
pipeline

In [None]:
pipeline("translate English to French: hello my name is Frank")

In [None]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

In [None]:
def translate(text):
    return pipeline(text)[0]["translation_text"]

print(translate(source_test))

In [None]:
def sample(num_samples=50):
    for i in range(num_samples):
        t = tests[random.randint(0, tests.num_rows)]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()

## Transliterate All

In [None]:
for t in tqdm(all_translations):
    pass