In [1]:
!pip install transformers sentencepiece torch

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch



In [2]:
# Checkpoint directory (under the Kaggle input mount) that holds both the
# model weights and the tokenizer files.
model_path = "/kaggle/input/english-nagamese-mbart-50-model/results/checkpoint-800"

# Pick the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and move it to the chosen device in one step; the tokenizer
# is read from the same checkpoint directory.
model = MBartForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)

print("Model and Tokenizer Loaded Successfully!")

Model and Tokenizer Loaded Successfully!


In [3]:
# Register "nagm" as an additional special token on the tokenizer.
# NOTE(review): the recorded output of this cell reports that new embeddings
# were initialised and shows a 250055-row embedding, which suggests the
# checkpoint's embedding matrix did not already contain this token. If so, its
# embedding is not a trained one - confirm against the training setup.
new_tokens = ["nagm"]
special_tokens = {"additional_special_tokens": new_tokens}
tokenizer.add_special_tokens(special_tokens)

# Keep the model's embedding matrix the same size as the tokenizer vocabulary.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


MBartScaledWordEmbedding(250055, 1024, padding_idx=1)

In [4]:
def translate_english_to_nagamese(text):
    """Translate English text to Nagamese using the notebook's loaded model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` objects.

    Args:
        text: English input handed directly to the tokenizer. If a batch is
            passed, only the first decoded sequence is returned.

    Returns:
        The first decoded sequence of the generated batch, with special
        tokens stripped.

    Raises:
        ValueError: If the "nagm" token is not known to the tokenizer.
    """
    tokenizer.src_lang = "en_XX"  # Use the correct English token

    # Resolve the target-language token before doing any tokenisation or
    # generation work, so a misconfigured tokenizer fails fast.
    nagm_token_id = tokenizer.convert_tokens_to_ids("nagm")

    # A missing token is reported as the unknown-token id rather than None,
    # so both cases must be rejected; otherwise generation would silently be
    # forced to start with the unknown token.
    if nagm_token_id is None or nagm_token_id == tokenizer.unk_token_id:
        raise ValueError("Error: 'nagm' token not found in tokenizer. Check if it was added properly.")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Move every encoded tensor to the device the model lives on.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Force "nagm" as the first generated token.
    output_tokens = model.generate(**inputs, forced_bos_token_id=nagm_token_id)

    return tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]


In [5]:
# Sample English inputs used to spot-check the translator.
english_sentences = [
    "For God so loved the world that He gave His only Son, that whoever believes in Him should not perish but have eternal life.",
    "I am going to the market.",
    "Can you help me with this?",
    "The roads are difficult to travel during the rainy season.",
    "Do to others as you would have them do to you."
]

# Translate one sentence at a time; results stay aligned with the inputs.
translations = list(map(translate_english_to_nagamese, english_sentences))

# Print each source sentence next to its translation, separated by a blank line.
for idx, eng in enumerate(english_sentences):
    nag = translations[idx]
    print(f"English: {eng}\nNagamese: {nag}\n")

English: For God so loved the world that He gave His only Son, that whoever believes in Him should not perish but have eternal life.
Nagamese: Kilemane Isor itu prithibi ke morom korise, Tai laga ekjon morom thaka Chokra ke, jun Tai ke biswas koribo, khotom nahobo, kintu anonto jibon pabo.

English: I am going to the market.
Nagamese: Moi bemarkhan te jai ase.

English: Can you help me with this?
Nagamese: Aru itu ami ke kiba modot koribo?

English: The roads are difficult to travel during the rainy season.
Nagamese: Itu homoi te rasta khan bisi digdar hoise.

English: Do to others as you would have them do to you.
Nagamese: Itu nisena dusra khan ke bi eneka he koribi.

