In [1]:
import os, re, gzip, random, warnings
import unicodedata
from xml.etree.ElementTree import iterparse

import numpy as np
import torch
from datasets import Dataset
import evaluate
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, set_seed
)
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fix the global random seed through transformers' `set_seed` helper so that
# repeated runs of the notebook start from the same random state.
set_seed(42)

In [3]:
# Gzip-compressed TMX file holding the English/Ukrainian translation units;
# the path is relative to the notebook's working directory.
file_path = "uk-en.tmx.gz"

In [4]:
# Backend switches: enable cuDNN autotuning and allow TF32 for cuDNN ops and
# CUDA matmuls.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# Pick the device and the mixed-precision mode: bf16 when the GPU reports
# support for it, fp16 on any other CUDA device, neither of them on CPU.
if torch.cuda.is_available():
    device = "cuda"
    use_bf16 = torch.cuda.is_bf16_supported()
    use_fp16 = not use_bf16
else:
    device = "cpu"
    use_bf16 = False
    use_fp16 = False

In [5]:
device

'cuda'

In [6]:
use_bf16

True

In [7]:
# Cyrillic letters that do not belong to the Ukrainian alphabet; a target
# sentence containing any of them is rejected by read_tmx_pairs.
moskalski = re.compile(r"[ёъыэЁЪЫЭ]")
# Web addresses and e-mail addresses are blanked out of every sentence.
url = re.compile(r"""https?://\S+|www\.\S+""", re.IGNORECASE)
email = re.compile(r"""[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}""", re.IGNORECASE)
# Any run of whitespace is collapsed to a single space.
multispace = re.compile(r"\s+")
# Apostrophe look-alikes; each one is rewritten to `right_apostrophe`.
apostrophes = "’‘`´ʼʹʽ＇＇❛❜ꞌʾ"
apostrophe_re = re.compile(f"[{apostrophes}]")
# Typographic quotation marks; each one is rewritten to an ASCII single quote.
quotes = "«»“”„‟❝❞〝〞＂"
quotes_re = re.compile(f"[{quotes}]")
# Dash variants; each one is rewritten to an ASCII hyphen-minus.
dashes = "–—−‒―"
dashes_re = re.compile(f"[{dashes}]")
# U+02BC MODIFIER LETTER APOSTROPHE, the canonical apostrophe used in the output.
right_apostrophe = "\u02BC"

# Fully-qualified name under which ElementTree exposes the `xml:lang` attribute.
_XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"


def _normalize_sentence(text):
    """Return `text` NFKC-normalised, with URLs/e-mails removed and punctuation unified."""
    sentence = unicodedata.normalize("NFKC", text.strip())
    sentence = url.sub(" ", sentence)
    sentence = email.sub(" ", sentence)
    sentence = multispace.sub(" ", sentence)
    sentence = apostrophe_re.sub(right_apostrophe, sentence)
    sentence = quotes_re.sub("'", sentence)
    sentence = dashes_re.sub("-", sentence)
    # Removing a URL/e-mail at either end leaves a stray space behind.
    return sentence.strip()


def read_tmx_pairs(file_path, limit=None):
    """Read cleaned (English, Ukrainian) sentence pairs from a gzipped TMX file.

    Every `<tu>` element is inspected for one `<tuv xml:lang="en">` and one
    `<tuv xml:lang="uk">` child. Both segments are normalised and the pair is
    kept only when

    * both languages are present and non-empty,
    * the English side has more than two and the Ukrainian side more than one
      whitespace-separated token,
    * the Ukrainian side contains none of the letters matched by `moskalski`,
    * the English sentence has not been emitted before.

    Args:
        file_path: Path to the `.tmx.gz` file.
        limit: Maximum number of pairs to return; `None` reads the whole file,
            a value of zero or less returns an empty list.

    Returns:
        A list of `(english, ukrainian)` string tuples in file order.
    """
    pairs = []
    seen_src = set()
    if limit is not None and limit <= 0:
        return pairs

    with gzip.open(file_path, "rb") as file:
        for _, element in iterparse(file, events=("end",)):
            if element.tag != "tu":
                continue

            # Reset per translation unit so that a unit lacking one language
            # can never inherit the sentence of the previous unit.
            english = None
            ukrainian = None
            for inner_element in element:
                if inner_element.tag != "tuv":
                    continue
                language = inner_element.attrib.get(_XML_LANG)
                seg = inner_element.find("seg")
                # A missing or empty <seg> carries no usable text.
                if seg is None or not seg.text:
                    continue
                sentence = _normalize_sentence(seg.text)
                if language == "en":
                    english = sentence
                elif language == "uk":
                    ukrainian = sentence

            # The strings are extracted; release the subtree to bound memory.
            element.clear()

            if english is None or ukrainian is None:
                continue
            if len(english.split()) <= 2 or len(ukrainian.split()) <= 1:
                continue
            if moskalski.search(ukrainian):
                continue
            # Each English source is kept once, which also makes whole pairs unique.
            if english in seen_src:
                continue

            seen_src.add(english)
            pairs.append((english, ukrainian))
            if limit is not None and len(pairs) >= limit:
                break
    return pairs

# Parse the corpus and keep at most 100,000 filtered, de-duplicated pairs.
pairs = read_tmx_pairs(file_path, 100000)

In [8]:
len(pairs)

100000

In [9]:
pairs[0]

('Developers decided to explore something new after atomizers and developed the simplest kit called Pollux 25.',
 'Після атомайзерів розробники вирішили спробувати себе в чомусь новому - та створили найпростіший набір Pollux 25.')

In [10]:
# Separate the (English, Ukrainian) tuples into parallel source/target lists.
X = [english for english, _ in pairs]
y = [ukrainian for _, ukrainian in pairs]

In [11]:
X[0], y[0]

('Developers decided to explore something new after atomizers and developed the simplest kit called Pollux 25.',
 'Після атомайзерів розробники вирішили спробувати себе в чомусь новому - та створили найпростіший набір Pollux 25.')

In [12]:
# Hold out 10 % of the pairs, then halve the hold-out into validation and
# test sets; both splits use the same fixed random state.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
)

In [13]:
# Wrap every split in a `datasets.Dataset` with "src" / "tgt" text columns.
train_raw, valid_raw, test_raw = (
    Dataset.from_dict({"src": sources, "tgt": targets})
    for sources, targets in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

In [14]:
len(train_raw), len(valid_raw), len(test_raw)

(90000, 5000, 5000)

In [15]:
# Load the pretrained "google/mt5-base" tokenizer and seq2seq model.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
# Make the model config use the same padding token id as the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
# Turn off the generation cache and switch on gradient checkpointing before
# the model is wrapped with LoRA adapters.
# NOTE(review): presumably the cache is disabled because it conflicts with
# gradient checkpointing during training — confirm against the transformers docs.
model.config.use_cache = False
model.gradient_checkpointing_enable()

In [17]:
# LoRA adapter settings: rank-16 adapters (alpha 32, dropout 0.05, no bias
# training) on the listed module names.
peft_cfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q", "k", "v", "o",
        "wi_0", "wi_1", "wo",
    ],
)

In [18]:
# Wrap the base model with the LoRA adapters and report how many parameters
# remain trainable.
# NOTE(review): `model` is rebound to the wrapped model, so running this cell a
# second time would wrap the already-wrapped model — re-run from the model
# loading cell instead.
model = get_peft_model(model, peft_cfg)
model.print_trainable_parameters()

trainable params: 6,782,976 || all params: 589,184,256 || trainable%: 1.1512


In [19]:
# Task prefix that `preprocess` prepends to every English source sentence.
PREFIX = "translate English to Ukrainian: "

In [20]:
def preprocess(batch):
    """Tokenize one batch of sentence pairs for seq2seq training.

    The task prefix is prepended to every entry of `batch["src"]`; sources and
    targets are truncated to 128 tokens and left unpadded. The target token
    ids are stored under the "labels" key of the returned encoding.
    """
    tokenize_kwargs = {"max_length": 128, "truncation": True, "padding": False}

    prefixed_sources = [PREFIX + sentence for sentence in batch["src"]]
    encoded = tokenizer(prefixed_sources, **tokenize_kwargs)
    encoded["labels"] = tokenizer(batch["tgt"], **tokenize_kwargs)["input_ids"]
    return encoded

In [21]:
# Tokenize each split in batches and drop the raw text columns afterwards.
train_ds, valid_ds, test_ds = (
    raw_split.map(preprocess, batched=True, remove_columns=raw_split.column_names)
    for raw_split in (train_raw, valid_raw, test_raw)
)


Map: 100%|██████████| 90000/90000 [00:06<00:00, 14959.89 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 15776.39 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 15652.53 examples/s]


In [22]:
# Batch collator for the examples, which were tokenized with padding = False.
# NOTE(review): passing `model` presumably lets the collator derive decoder
# inputs from the labels — confirm in the DataCollatorForSeq2Seq docs.
collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model)

In [23]:
# Load the two evaluation metrics that `compute_metrics` reports.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns the stripped predictions as a flat list and every stripped label
    wrapped in its own one-element list.
    """
    stripped_preds = []
    for prediction in preds:
        stripped_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return stripped_preds, wrapped_labels

# `sacrebleu` and `chrf` are already loaded at the top of this cell; the
# second, identical pair of `evaluate.load` calls was redundant and is removed.

def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays;
            `predictions` may itself be a tuple whose first item holds the ids.

    Returns:
        A dict with the corpus-level "bleu" and "chrf" scores.
    """
    preds, labels = eval_pred

    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    # Negative ids cannot be decoded, so they are swapped for the pad id.
    # np.where builds new arrays instead of overwriting the caller's buffers.
    preds = np.asarray(preds)
    preds = np.where(preds < 0, pad_id, preds).tolist()

    labels = np.asarray(labels)
    labels = np.where(labels < 0, pad_id, labels).tolist()

    pred_txt = tokenizer.batch_decode(preds, skip_special_tokens = True, clean_up_tokenization_spaces = True)
    ref_txt  = tokenizer.batch_decode(labels, skip_special_tokens = True, clean_up_tokenization_spaces = True)

    # Strip stray whitespace and wrap every reference in its own list, the
    # layout both metrics expect; built once and shared by the two calls.
    pred_txt, references = postprocess_text(pred_txt, ref_txt)

    bleu = sacrebleu.compute(predictions = pred_txt, references = references)["score"]
    chrf_score = chrf.compute(predictions = pred_txt, references = references)["score"]

    return {"bleu": bleu, "chrf": chrf_score}





In [24]:
# NOTE(review): absolute, machine-specific output location.
output_dir = r"D:\ml"
os.makedirs(output_dir, exist_ok=True)

args = Seq2SeqTrainingArguments(
    # Output and checkpointing: no checkpoints are written during training.
    output_dir=output_dir,
    save_safetensors=True,
    save_strategy="no",
    load_best_model_at_end=False,
    metric_for_best_model="eval_chrf",
    greater_is_better=True,

    # Evaluate and log once per epoch.
    eval_strategy="epoch",
    logging_strategy="epoch",

    # Batching: 16 examples per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=10,

    # Optimisation schedule.
    optim="adamw_torch_fused",
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    label_smoothing_factor=0.1,

    # Evaluation generates text so that BLEU / chrF can be computed.
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=4,

    # Mixed precision follows the flags derived from the detected hardware.
    bf16=use_bf16,
    fp16=use_fp16,

    # Memory and data-loading behaviour.
    gradient_checkpointing=True,
    remove_unused_columns=False,
    dataloader_num_workers=0,
    report_to="none",
)

In [25]:
# Build the trainer. The tokenizer is passed as `processing_class`: the
# `tokenizer` keyword is deprecated in the installed transformers release and
# emitted a warning when this cell ran.
trainer = Seq2SeqTrainer(
    model = model,
    args = args,
    train_dataset = train_ds,
    eval_dataset = valid_ds,
    data_collator = collator,
    processing_class = tokenizer,
    compute_metrics = compute_metrics
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning; with the per-epoch eval/logging strategies configured in
# `args`, loss, BLEU and chrF are reported after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Chrf
1,6.0861,3.65907,12.042669,35.390954
2,4.1735,3.542022,14.013178,37.808354
3,4.0219,3.482545,15.090935,39.473074
4,3.9239,3.43642,15.828057,40.471892
5,3.8577,3.407248,16.692737,41.650764
6,3.809,3.374879,17.104681,42.134303
7,3.7756,3.366297,17.328059,42.46198
8,3.7542,3.359988,17.595295,42.913361
9,3.7427,3.35576,17.700842,42.934947


In [None]:
# Destination for the standalone model: the training output path + "_full".
full_dir = f"{output_dir}_full"
os.makedirs(full_dir, exist_ok=True)

# Merge the LoRA adapters into the base model, then store the merged model
# (safetensors format) next to its tokenizer.
merged = model.merge_and_unload()
merged.save_pretrained(full_dir, safe_serialization=True)
tokenizer.save_pretrained(full_dir)

print(f"Full merged model saved to: {full_dir}")

[INFO] Full merged model saved to: C:\ml\m_full
