In [None]:
from transformers import (
    MBartForConditionalGeneration, MBartTokenizer, 
    Seq2SeqTrainingArguments, Seq2SeqTrainer
  )

import torch
from torch.utils.data import random_split

In [None]:
import pandas as pd

# Read the parallel-corpus CSV into a DataFrame and preview its first two rows.
data = pd.read_csv(
    filepath_or_buffer='../input/english-to-hindi-parallel-dataset/newdata.csv',
)
data.head(n=2)

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

In [None]:
# Convert the DataFrame into a list of records shaped like
# {"translation": {"hi": <Hindi sentence>, "en": <English sentence>}},
# which is the structure `data_collator` reads.
# Iterating the two columns directly avoids DataFrame.iterrows(), which builds
# a full Series object for every row and is very slow on large frames.
file = [
    {"translation": {"hi": hindi_sentence, "en": english_sentence}}
    for hindi_sentence, english_sentence in zip(
        data['hindi_sentence'], data['english_sentence']
    )
]
print(f'total size of data is {len(file)}')

In [None]:
def data_collator(features: list, max_length: int = 32, max_target_length: int = 32):
    """Turn a list of translation records into one padded tensor batch.

    Args:
        features: Records of the form
            ``{"translation": {"hi": <source text>, "en": <target text>}}``.
        max_length: Maximum number of source tokens; longer inputs are truncated.
        max_target_length: Maximum number of target tokens; longer labels are
            truncated.

    Returns:
        The tokenizer's batch (``input_ids``, ``attention_mask``, ``labels``)
        with every entry as a PyTorch tensor. Padding positions in ``labels``
        are set to -100.
    """
    sources = [f["translation"]["hi"] for f in features]
    targets = [f["translation"]["en"] for f in features]

    # Relies on the module-level `tokenizer`. Asking for PyTorch tensors
    # directly replaces the manual per-key torch.tensor(...) conversion.
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=sources,
        src_lang="hi_IN",
        tgt_texts=targets,
        tgt_lang="en_XX",
        max_length=max_length,
        max_target_length=max_target_length,
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )

    # -100 is the index the cross-entropy loss ignores; without this the model
    # is also trained (and evaluated) on predicting padding tokens.
    labels = batch["labels"]
    batch["labels"] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    return batch

In [None]:
# Fraction of the records assigned to the evaluation subset.
# NOTE(review): with split = 0.98 the training subset receives only the
# remaining ~2% of the records and evaluation ~98% — confirm this is intended.
split = 0.98

# Derive one length from the other so the two always add up to len(file).
# Truncating both fractions independently (and adding 1) overshoots by one
# whenever split * len(file) is a whole number, which makes random_split raise.
eval_size = int(split * len(file))
train_size = len(file) - eval_size

# A seeded generator makes the split identical on every fresh run.
train_dataset, eval_dataset = random_split(
    file,
    lengths=[train_size, eval_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Load the pretrained mBART checkpoint and its tokenizer.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")

# Configure the language pair up front: without src_lang the tokenizer tags
# plain tokenizer(...) calls with its default source language instead of
# Hindi unless the collator happens to have run first.
tokenizer = MBartTokenizer.from_pretrained(
    "facebook/mbart-large-cc25",
    src_lang="hi_IN",
    tgt_lang="en_XX",
)

In [None]:
# Training configuration: train and evaluate, evaluating once per epoch,
# for a single epoch with batches of 16 per device.
# NOTE(review): logging_dir is an absolute path at the filesystem root
# ("/logs") — confirm this location is intended and writable.
args = Seq2SeqTrainingArguments(
    # where outputs and logs are written
    output_dir="indic-mbart",
    logging_dir="/logs",
    # which phases run, and how often evaluation happens
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    # optimisation settings
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
# Wire the model, training arguments, datasets and the custom collator
# into a Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

In [None]:
# Save the trainer's model under the "mbart-hin-eng" directory.
trainer.save_model("mbart-hin-eng")

In [None]:
# List the contents of the saved-model directory.
!ls  mbart-hin-eng

In [None]:
# Reload the model from the local "mbart-hin-eng" directory written by trainer.save_model above.
model = MBartForConditionalGeneration.from_pretrained('mbart-hin-eng')

In [None]:
# Sample Hindi sentence used to try out the reloaded model.
T1 = "अंतिम प्रविष्ट घटना को हाइलाइट करो"
# Tokenize it, requesting PyTorch tensors, and display the encoded result.
inputs = tokenizer(T1, return_tensors="pt")
inputs

In [None]:
# Use the first GPU when one is available; fall back to the CPU otherwise so
# the cell does not crash on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Model and inputs must live on the same device for generation.
model = model.to(device)
inputs = inputs.to(device)

In [None]:
# Generate the English translation of the tokenized sentence.
# data_collator builds labels with the tokenizer's target-language format,
# which ends in the en_XX code; the model shifts that code to the first
# decoder position during training. Generation therefore has to start from
# the same code, via decoder_start_token_id rather than forced_bos_token_id.
translated_tokens = model.generate(
    **inputs,
    decoder_start_token_id=tokenizer.lang_code_to_id["en_XX"],
)


In [None]:
# Display the generated token ids.
translated_tokens

In [None]:
# Decode the first generated sequence back to text. Special tokens (language
# code, end-of-sequence, padding) are dropped so only the translation is shown.
tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]