In [None]:
# Import the necessary libraries
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from huggingface_hub import login
import torch

In [None]:
# Load the dataset used for fine-tuning. Later cells index it by the 'train' and
# 'validation' splits and read the 'input' and 'output' columns.
# NOTE(review): the dataset identifier is an empty string — presumably a redacted
# placeholder; supply the real dataset name or path before running.
dataset = load_dataset("")

In [None]:
# Load the tokenizer, model, and data collator.
# The tokenizer and the model are both loaded from the same pretrained checkpoint.
model_checkpoint = "csebuetnlp/banglat5"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# The collator is built from both the tokenizer and the model; it is handed to the
# trainer further down to assemble batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
def tokenize_label(examples):
    """Tokenize the prompt and target text of ``examples`` for seq2seq training.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with an ``'input'`` entry (the prompt text) and an
            ``'output'`` entry (the expected target text).

    Returns:
        The tokenizer's encoding of the prompt (``max_length=512``, truncation
        enabled) with an added ``"labels"`` entry holding the ``input_ids`` of
        the encoded target (``max_length=128``, truncation enabled).
    """
    # Encode the prompt side.
    encoded = tokenizer(examples['input'], truncation=True, max_length=512)
    # Encode the expected output; it is passed via `text_target=` rather than positionally.
    target_encoding = tokenizer(
        text_target=examples['output'],
        truncation=True,
        max_length=128,
    )
    # Attach the target token ids as the training labels.
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Tokenize every split of the dataset. batched=True hands tokenize_label a batch
# of rows per call (lists of strings under 'input' / 'output') instead of one row
# at a time, which removes most of the per-row call overhead. The resulting
# per-row encodings are unchanged because tokenize_label applies no padding.
tokenized_dataset = dataset.map(tokenize_label, batched=True)

In [None]:
# Per-device batch size, shared by training and evaluation.
batch_size = 8

# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    output_dir="",
    # Optimisation
    optim="adafactor",
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and checkpointing run on the same 10000-step schedule.
    evaluation_strategy="steps",
    eval_steps=10000,
    save_strategy="steps",
    save_steps=10000,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Generation-based prediction during evaluation.
    predict_with_generate=True,
)

# Wire the model, data splits, tokenizer and collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)


In [None]:
# Train the model: run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the trained model, then upload it to the Hugging Face Hub.
# NOTE(review): the save path is an empty string — presumably a redacted
# placeholder; set a real output directory before running.
trainer.save_model('')
# NOTE(review): `login` is imported at the top of the notebook but never called in
# the visible cells, so this push presumably relies on credentials already stored
# on the machine — confirm.
trainer.push_to_hub()