In [None]:
!pip install git+https://github.com/huggingface/transformers.git@master
!pip install git+https://github.com/huggingface/datasets.git@master
!pip install sentencepiece

In [None]:
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration, MBart50TokenizerFast, 
    Seq2SeqTrainingArguments, Seq2SeqTrainer
  )

import torch
from torch.utils.data import random_split

# **Creating Dataset**

In [None]:
# Build the training set from two parallel text files: line i of train.en is
# paired with line i of train.mr. Each example uses the language codes
# ("en_XX" / "mr_IN") that the collator and tokenizer expect as keys.
#
# `train_data` is (re)built from scratch here so the cell works on a fresh
# kernel and re-running it does not duplicate examples.
# The encoding is explicit because the target side is non-ASCII text and the
# platform default encoding is not guaranteed to be UTF-8.
# Note: zip() stops at the shorter file, so a line-count mismatch is silent.
with open("train.en", encoding="utf-8") as f1, open("train.mr", encoding="utf-8") as f2:
    train_data = [
        {
            "translation": {
                "en_XX": src.strip(),
                "mr_IN": tgt.strip()
            }
        }
        for src, tgt in zip(f1, f2)
    ]
print(f'total size of train data is {len(train_data)}')

In [None]:
# Build the validation set from two parallel text files: line i of tun.en is
# paired with line i of tun.mr (presumably the tuning/dev split — confirm).
# Examples have the same {"translation": {"en_XX": ..., "mr_IN": ...}} shape
# as the training examples.
#
# The encoding is explicit because the target side is non-ASCII text and the
# platform default encoding is not guaranteed to be UTF-8.
# Note: zip() stops at the shorter file, so a line-count mismatch is silent.
with open("tun.en", encoding="utf-8") as f1, open("tun.mr", encoding="utf-8") as f2:
    valid_data = [
        {
            "translation": {
                "en_XX": src.strip(),
                "mr_IN": tgt.strip()
            }
        }
        for src, tgt in zip(f1, f2)
    ]
print(f'total size of valid data is {len(valid_data)}')

In [None]:
def data_collator(features: list, max_length: int = 32):
  """Tokenize a list of translation examples into one padded seq2seq batch.

  Relies on the module-level `tokenizer`, which must be defined before the
  collator is first called.

  Args:
    features: examples shaped like
      {"translation": {"en_XX": <source text>, "mr_IN": <target text>}}.
    max_length: truncation limit, in tokens, applied to both the source and
      the target side. Defaults to 32.

  Returns:
    The tokenizer's encoding of the source texts (PyTorch tensors, padded to
    the longest sequence in the batch) with an extra "labels" entry holding
    the target token ids. Padded label positions are set to -100.
  """
  sources = [f["translation"]["en_XX"] for f in features]
  targets = [f["translation"]["mr_IN"] for f in features]

  batch = tokenizer(sources, return_tensors="pt", max_length=max_length, truncation=True, padding=True)
  # NOTE(review): newer transformers releases accept `text_target=` on the
  # tokenizer call in place of this context manager — confirm against the
  # installed version before switching.
  with tokenizer.as_target_tokenizer():
    label_ids = tokenizer(targets, return_tensors="pt", max_length=max_length, truncation=True, padding=True).input_ids

  # Mask padding in the labels: -100 is the index the loss ignores, so padded
  # positions no longer contribute to the training/eval loss.
  label_ids = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

  batch['labels'] = label_ids

  return batch

# **Training**

In [None]:
# Load the tokenizer and the pretrained seq2seq model from the same
# "facebook/mbart-large-50" checkpoint. The tokenizer is configured with
# English (en_XX) as source language and mr_IN as target language.
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50",
    src_lang="en_XX",
    tgt_lang="mr_IN",
)
model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50"
)


In [None]:
# Count the model's parameters in a single pass: every tensor contributes its
# element count to the total, and additionally to the trainable total when it
# requires gradients.
total_params = 0
total_trainable_params = 0
for weight in model.parameters():
    n_elements = weight.numel()
    total_params += n_elements
    if weight.requires_grad:
        total_trainable_params += n_elements

print(f"Total number of parameters: {total_params}")
print(f"Total number of trainable parameters: {total_trainable_params}")

Total number of parameters: 610879488
Total number of trainable parameters: 610879488


In [None]:
# Training configuration for the Seq2SeqTrainer.
args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="en-mr",
    # NOTE(review): absolute path at the filesystem root — confirm this is
    # intended rather than a directory relative to the notebook.
    logging_dir="/logs",
    # Run both training and evaluation; evaluate and checkpoint once per
    # epoch, keeping only one checkpoint on disk.
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Batch sizes per device; gradients are accumulated over 4 steps before
    # each optimizer update.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=4,
    # Optimisation schedule.
    # NOTE(review): 5e-4 looks high for fine-tuning a pretrained model of
    # this size — confirm it is deliberate.
    learning_rate=5e-4,
    num_train_epochs=10,
)

In [None]:
# Wire the model, training arguments, custom collator and the two in-memory
# example lists into a Seq2SeqTrainer. The datasets hold raw text, so all
# tokenization happens per batch inside `data_collator`.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    data_collator=data_collator,
)

In [None]:
# Start fine-tuning with the trainer configured above; evaluation and
# checkpointing follow the per-epoch strategies set in `args`.
trainer.train()