In [2]:
# ============================================
# 1. Install dependencies
# ============================================
%pip install transformers datasets sentencepiece -q

import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset

# ============================================
# 2. Load lightweight model & tokenizer
# ============================================
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# ============================================
# 3. Tiny toy dataset (multilingual examples)
# ============================================
data = {
    "src_text": [
        "Translate English to Hindi: Hello, how are you?",
        "Translate English to Spanish: I love programming.",
        "Translate Hindi to English: मेरा नाम सुकृति है।",
        "Translate Spanish to English: Me gusta aprender IA."
    ],
    "tgt_text": [
        "नमस्ते, आप कैसे हैं?",
        "Me encanta programar.",
        "My name is Sukirti.",
        "I like learning AI."
    ],
}

dataset = Dataset.from_dict(data)

# ============================================
# 4. Tokenization
# ============================================
def preprocess(batch):
    inputs = tokenizer(batch["src_text"], truncation=True, padding="max_length", max_length=64)
    labels = tokenizer(batch["tgt_text"], truncation=True, padding="max_length", max_length=64)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# ============================================
# 5. Training setup (minimal, no errors)
# ============================================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-small-finetuned",
    per_device_train_batch_size=2,
    num_train_epochs=1,             # just 1 epoch (fast demo)
    logging_steps=2,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(), # GPU if available
)

# ============================================
# 6. Trainer
# ============================================
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ============================================
# 7. Train
# ============================================
trainer.train()

# ============================================
# 8. Quick test
# ============================================
test_text = "Translate English to Hindi: How is the weather today?"
inputs = tokenizer(test_text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=40)
print("Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
Map: 100%|██████████| 4/4 [00:00<00:00,  7.35 examples/s]
  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
2,61.5728


Output: <extra_id_0>abinsk


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
