In [1]:
import os
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [48]:
# --- Paths ---------------------------------------------------------------
CSV_PATH = "full_corpus.csv"             # parallel corpus read with pd.read_csv
OUTPUT_DIR = "./mt5-small-ro-rmy"        # output_dir for the mT5 training arguments

# --- Checkpoints ---------------------------------------------------------
MT5_SMALL_MODEL_NAME = "google/mt5-small"
MODEL_NAME = "google/mt5-base"           # larger alternative; the mT5 cells load google/mt5-small

# --- Prompt / tokenization -----------------------------------------------
TASK_PREFIX = "translate Romanian to Rromani: "
MAX_LEN = 256                            # truncation length for source and target

# --- Optimisation --------------------------------------------------------
LR = 3e-4
EPOCHS = 10
BATCH_SIZE = 8

In [39]:
# Read the parallel corpus and normalise the column names to "ro" / "rmy"
# (the rename does nothing when the CSV already uses those names).
df = pd.read_csv(CSV_PATH).rename(columns={"Text_ro": "ro", "Text_rom": "rmy"})

# Discard pairs where either side is missing or whitespace-only.
df = df.dropna(subset=["ro", "rmy"])
df = df[
    (df["ro"].str.strip() != "") & (df["rmy"].str.strip() != "")
].reset_index(drop=True)

print("Rows after cleaning:", len(df))


Rows after cleaning: 10379


In [40]:
# Convert the cleaned frame to a Hugging Face dataset and hold out 10% of the
# pairs for evaluation; the fixed seed keeps the split reproducible.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = dataset["train"], dataset["test"]


In [16]:
# Load the tokenizer for the same checkpoint the model cell loads, using the
# shared constant instead of a second hard-coded copy of the name.
# (AutoTokenizer is already imported in the first cell.)
tokenizer = AutoTokenizer.from_pretrained(
    MT5_SMALL_MODEL_NAME,
    use_fast=False,   # slow, SentencePiece-backed implementation
    extra_ids=0,      # request no <extra_id_N> sentinel tokens — NOTE(review): confirm this is still wanted
)


In [17]:
# Sanity check: show which concrete tokenizer class AutoTokenizer resolved to.
print(type(tokenizer))



<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>


In [18]:
# Load the mT5 checkpoint and place it on the GPU when one is available,
# falling back to CPU instead of raising on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(MT5_SMALL_MODEL_NAME).to(device)

# Keep the model config's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id

# Last expression of the cell: display where the weights actually live.
next(model.parameters()).device


device(type='cuda', index=0)

In [19]:
# Human-readable names of the source and target languages.
SRC_LANG, TGT_LANG = "Romanian", "Rromani"

def preprocess(batch):
    """Tokenize one batch of sentence pairs for seq2seq training.

    Every source sentence in ``batch["ro"]`` is prefixed with ``TASK_PREFIX``
    and tokenized; the target sentences in ``batch["rmy"]`` are tokenized with
    the same tokenizer. Both sides are truncated to ``MAX_LEN``.

    Returns:
        The source encoding, with the target ``input_ids`` stored under
        ``"labels"``.
    """
    truncate_opts = {"max_length": MAX_LEN, "truncation": True}

    prompts = []
    for sentence in batch["ro"]:
        prompts.append(TASK_PREFIX + sentence)

    encoded = tokenizer(prompts, **truncate_opts)
    encoded["labels"] = tokenizer(batch["rmy"], **truncate_opts)["input_ids"]
    return encoded


In [20]:
# Tokenize both splits with identical settings. The raw columns are dropped so
# only the tokenizer outputs remain, and the datasets cache is bypassed so
# edits to `preprocess` always take effect.
_map_options = {"batched": True, "load_from_cache_file": False}

train_tok = train_ds.map(
    preprocess, remove_columns=train_ds.column_names, **_map_options
)
eval_tok = eval_ds.map(
    preprocess, remove_columns=eval_ds.column_names, **_map_options
)


Map: 100%|██████████| 9341/9341 [00:02<00:00, 3761.18 examples/s]
Map: 100%|██████████| 1038/1038 [00:00<00:00, 4160.80 examples/s]


In [21]:
# Metric objects used by compute_metrics, loaded through the `evaluate` library.
bleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

def compute_metrics(eval_preds):
    """Score predictions against references with BLEU and chrF++.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` may be
            token ids, 3-D logits (reduced with argmax over the last axis), or
            a tuple whose first element is one of those. Either array may use
            ``-100`` as padding.

    Returns:
        dict with the ``"bleu"`` and ``"chrf"`` scores, computed on stripped,
        lower-cased text.
    """
    preds, labels = eval_preds

    # Some models hand back extra outputs next to the predictions.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Logits → token IDs
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    # -100 is a padding/ignore marker, not a token id, so it cannot be
    # decoded. Replace it in the predictions as well as the labels.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decode_opts = {
        "skip_special_tokens": True,
        "clean_up_tokenization_spaces": True,
    }
    decoded_preds = tokenizer.batch_decode(preds, **decode_opts)
    decoded_labels = tokenizer.batch_decode(labels, **decode_opts)

    # Normalize: scores are case-insensitive and ignore surrounding whitespace.
    decoded_preds = [p.strip().lower() for p in decoded_preds]
    decoded_labels = [l.strip().lower() for l in decoded_labels]

    # Both metrics take a list of reference lists (one reference per sentence here).
    references = [[l] for l in decoded_labels]

    bleu_result = bleu.compute(
        predictions=decoded_preds,
        references=references,
        tokenize="intl",
    )

    chrf_result = chrf.compute(
        predictions=decoded_preds,
        references=references,
        word_order=2,  # chrF++
    )

    return {
        "bleu": bleu_result["score"],
        "chrf": chrf_result["score"],
    }

In [22]:
use_cuda = torch.cuda.is_available()

# Training configuration for the mT5 run.
args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate, save and log on a step schedule; save_steps matches eval_steps
    # so the best checkpoint can be reloaded at the end.
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    report_to="tensorboard",
    # Optimisation
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    # Full precision, no gradient checkpointing
    fp16=False,
    bf16=False,
    gradient_checkpointing=False,
    # Generate text during evaluation so BLEU/chrF are computed on real output.
    predict_with_generate=True,
    # Without an explicit limit, evaluation-time generation falls back to the
    # model configuration's max_length, which can cut predictions short
    # relative to references that are tokenized up to MAX_LEN.
    generation_max_length=MAX_LEN,
    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
)


In [23]:
import torch

# Environment report, one value per line: PyTorch version, whether CUDA is
# available, and the CUDA version reported by torch.version.cuda.
for _value in (torch.__version__, torch.cuda.is_available(), torch.version.cuda):
    print(_value)

2.6.0+cu124
True
12.4


In [24]:
import numpy as np

def inspect_batch(dataset_tok, n=3):
    """Print label statistics for the first ``n`` examples of ``dataset_tok``.

    Each printed line shows the example index, the number of labels, how many
    of them equal the ignore value ``-100``, and the smallest and largest
    label id.
    """
    for row in range(n):
        label_ids = np.asarray(dataset_tok[row]["labels"])
        n_ignored = (label_ids == -100).sum()
        lowest, highest = label_ids.min(), label_ids.max()
        print(row, "len:", len(label_ids), "ignored:", n_ignored, "min/max:", lowest, highest)

# Print label statistics for the first five tokenized training examples.
inspect_batch(train_tok, 5)

0 len: 84 ignored: 0 min/max: 1 216916
1 len: 39 ignored: 0 min/max: 1 213557
2 len: 61 ignored: 0 min/max: 1 187530
3 len: 31 ignored: 0 min/max: 1 203783
4 len: 45 ignored: 0 min/max: 1 197791


In [25]:
# Dynamic padding per batch: labels are padded with -100 and sequence lengths
# are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,
)

# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword of the trainer is deprecated and emits a warning on construction.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [26]:
# Peek at the first tokenized training example: the leading 20 ids on each
# side, and whether every label position is the ignore value -100.
sample = train_tok[0]

for _field in ("input_ids", "labels"):
    print(f"{_field}:", sample[_field][:20])
print("All labels -100?", not any(l != -100 for l in sample["labels"]))


input_ids: [37194, 259, 149217, 288, 531, 63562, 266, 267, 960, 1464, 1090, 273, 259, 263, 34833, 99350, 3460, 2485, 438, 447]
labels: [1599, 259, 216916, 259, 592, 886, 13522, 340, 62973, 3507, 340, 47274, 268, 1787, 1917, 59489, 62425, 1176, 379, 35100]
All labels -100? False


In [68]:
# Run fine-tuning with the trainer built earlier in the notebook.
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Chrf
500,0.0,,0.0,0.0
1000,0.0,,0.0,0.0


KeyboardInterrupt: 

In [27]:
model.eval()

# Build the prompt from TASK_PREFIX so inference uses exactly the prefix that
# preprocessing puts in front of every training source sentence. A hand-typed
# prefix can drift from it (e.g. "Romani" vs. "Rromani").
prompt = TASK_PREFIX + "Pavel a stat în picioare în mijlocul Areopagului."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=50,
        decoder_start_token_id=tokenizer.pad_token_id,
    )

# Special tokens are kept in the decoded text so they stay visible.
print(tokenizer.decode(out[0], skip_special_tokens=False))

<pad> <extra_id_0>.</s>


In [147]:
import torch
import numpy as np

def debug_one_example(dataset_tok, idx=0, max_new_tokens=128, num_beams=4):
    """Print loss, source, reference and prediction for one tokenized example.

    Uses the global ``trainer``'s model and the global ``tokenizer``.

    Args:
        dataset_tok: indexable collection of examples with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        idx: position of the example to inspect.
        max_new_tokens: generation length limit passed to ``generate``.
        num_beams: number of beams passed to ``generate``.
    """
    net = trainer.model
    net.eval()
    target_device = net.device

    # Wrap each field in a batch of size one on the model's device.
    example = dataset_tok[idx]
    input_ids, attention_mask, labels = (
        torch.tensor([example[field]], device=target_device)
        for field in ("input_ids", "attention_mask", "labels")
    )

    with torch.no_grad():
        # 1) Forward pass with labels to obtain the loss
        loss = net(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        ).loss

        # 2) Generate a prediction
        gen_ids = net.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            decoder_start_token_id=tokenizer.pad_token_id,
        )

    # 3) Decode; -100 label positions become the pad id so they can be decoded
    #    and are then dropped with the other special tokens.
    label_ids = labels.detach().cpu().numpy()
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    def _to_text(token_ids):
        return tokenizer.decode(token_ids, skip_special_tokens=True)

    src = _to_text(input_ids[0].detach().cpu().tolist())
    ref = _to_text(label_ids[0].tolist())
    pred = _to_text(gen_ids[0].detach().cpu().tolist())

    loss_value = loss if loss is None else float(loss)
    print("DEVICE:", net.device)
    print("LOSS:", loss_value)
    for tag, text in (("SRC", src), ("REF", ref), ("PRED", pred)):
        print(f"\n[{tag}]\n", text)

# Run the debug helper on the first eval example (change idx to inspect others).
debug_one_example(eval_tok, idx=0)


DEVICE: cuda:0
LOSS: 37.58132553100586

[SRC]
 translate Romanian to Rromani: Pavel a stat în picioare în mijlocul Areopagului şi a zis: „Bărbaţi atenieni! În toate privinţele vă găsesc foarte religioşi.

[REF]
 Atunći o Pavel uśtilo ande punrenθe maśkar ol manuśa d‐and‐o Areopago haj phendǎs: — Manuśalen atenienǎ! Me dikhav ke tume den but pakǐv tumare develen.

[PRED]
 <extra_id_0>.


In [None]:
# Write the trainer's model and the tokenizer into the same directory.
_final_dir = "./mt5-ro-rmy-final"
trainer.save_model(_final_dir)
tokenizer.save_pretrained(_final_dir)

In [132]:
# Decode the first tokenized training example back to text, keeping special
# tokens visible. -100 label positions are filtered out before decoding.
sample = train_tok[0]
_kept_label_ids = [tok for tok in sample["labels"] if tok != -100]

print("Decoded input:")
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))

print("\nDecoded labels:")
print(tokenizer.decode(_kept_label_ids, skip_special_tokens=False))


Decoded input:
translate Romanian to Rromani: ca să fiu slujitorul lui Isus Hristos între Neamuri. Eu îmi împlinesc cu scumpătate slujba Evangheliei lui Dumnezeu, pentruca Neamurile să -I fie o jertfă bine primită, sfinţită de Duhul Sfînt.</s>

Decoded labels:
Te avav ek pasturi le Kristosko le narodoske kai Nai Zhiduvuria, ai kadia kerav iek buchi swunto kai angerav e lashi viasta le Devleski, saxke kodola manush kai Nai Zhiduvuria aven lashe le Devleske, ai sai aven leske katar o Swunto Duxo.</s>


In [44]:
import torch
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import LoraConfig, get_peft_model, TaskType

# 1. Model and Tokenizer
import warnings

MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = NllbTokenizer.from_pretrained(MODEL_NAME)

# Set the source and target language codes
# Romanian: ron_Latn | Rromani (Vlax/Generic): rom_Latn
# NOTE(review): confirm that rom_Latn is a language code this checkpoint knows.
tokenizer.src_lang = "ron_Latn"
tokenizer.tgt_lang = "rom_Latn"

# A code that is not a token in the tokenizer's vocabulary is converted to the
# unknown-token id, so the language tag would silently carry no information.
# Surface that instead of letting it pass unnoticed.
for _code in (tokenizer.src_lang, tokenizer.tgt_lang):
    if tokenizer.convert_tokens_to_ids(_code) == tokenizer.unk_token_id:
        warnings.warn(
            f"Language code {_code!r} is not in the vocabulary of {MODEL_NAME}; "
            "it will be encoded as the unknown token."
        )

def preprocess_nllb(examples, max_length=128):
    """Tokenize one batch of sentence pairs with the NLLB tokenizer.

    Source sentences come from ``examples["ro"]`` and targets from
    ``examples["rmy"]``. Targets are passed through ``text_target`` so a
    single tokenizer call encodes both sides; this replaces the deprecated
    ``as_target_tokenizer()`` context manager.

    Args:
        examples: batch mapping with ``"ro"`` and ``"rmy"`` lists of strings.
        max_length: truncation length applied to both sides (default 128).

    Returns:
        The tokenizer output, with the target ids under ``"labels"``.
    """
    model_inputs = tokenizer(
        examples["ro"],
        text_target=examples["rmy"],
        max_length=max_length,
        truncation=True,
    )
    return model_inputs

# Tokenize both splits with the NLLB tokenizer, dropping the raw columns.
train_dataset = train_ds.map(preprocess_nllb, batched=True, remove_columns=train_ds.column_names)
eval_dataset = eval_ds.map(preprocess_nllb, batched=True, remove_columns=eval_ds.column_names)

# 2. Apply LoRA for Efficiency
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, device_map="auto")

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # modules that receive LoRA adapters
    lora_dropout=0.05,
    task_type=TaskType.SEQ_2_SEQ_LM
)
model = get_peft_model(model, lora_config)

# 3. Training Arguments
args = Seq2SeqTrainingArguments(
    output_dir="./nllb-ro-rom-v1",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=5,
    predict_with_generate=True,  # generate text during evaluation for BLEU/chrF
    fp16=True,                   # mixed-precision training
)

# 4. Initialize Trainer
# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword of the trainer is deprecated and emits a warning on construction.
# compute_metrics decodes with the global `tokenizer`, which this cell has
# rebound to the NLLB tokenizer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

# 5. Run evaluation before any training step to get a baseline
print("--- Initializing Baseline Evaluation ---")
baseline_metrics = trainer.evaluate()

print("\nBaseline Results:")
print(f"BLEU: {baseline_metrics['eval_bleu']:.2f}")
print(f"chrF: {baseline_metrics['eval_chrf']:.2f}")

#trainer.train()

Map: 100%|██████████| 9341/9341 [00:02<00:00, 3248.02 examples/s]
Map: 100%|██████████| 1038/1038 [00:00<00:00, 3482.32 examples/s]
  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Initializing Baseline Evaluation ---



Baseline Results:
BLEU: 0.14
chrF: 15.20


In [47]:
args = Seq2SeqTrainingArguments(
    output_dir="./nllb-rom-ron-results",
    # Evaluation Strategy
    eval_strategy="epoch",            # Run eval after each full pass
    save_strategy="epoch",            # Save a checkpoint after each epoch
    logging_steps=50,                 # Log training loss every 50 steps

    # Hyperparameters
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,    # 16 x 4 = 64 examples per optimizer step, per device
    num_train_epochs=5,

    # Hardware/Speed
    fp16=True,                        # Mixed precision; requires GPU support
    predict_with_generate=True,       # Generate text during eval so BLEU/chrF can be computed
)

# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword of the trainer is deprecated and emits a warning on construction.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

# Start the process
trainer.train()

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Bleu,Chrf
1,5.9274,5.320968,0.298713,10.078488
2,5.2667,4.858421,0.506031,11.107254
3,4.9585,4.605099,0.660644,12.678128
4,4.7752,4.480512,0.750095,13.090813
5,4.7193,4.440754,0.851127,13.66596


TrainOutput(global_step=730, training_loss=5.163723065101937, metrics={'train_runtime': 7058.526, 'train_samples_per_second': 6.617, 'train_steps_per_second': 0.103, 'total_flos': 6879264787636224.0, 'train_loss': 5.163723065101937, 'epoch': 5.0})

In [50]:
# Write the trainer's model and the tokenizer into the same directory, then
# report the directory name.
_results_dir = "nllb-rom-ron-results"
trainer.save_model("./" + _results_dir)
tokenizer.save_pretrained("./" + _results_dir)

print("Saved to:", _results_dir)

Saved to: nllb-rom-ron-results
