In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import (
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    pipeline, DataCollatorForSeq2Seq, EarlyStoppingCallback
)
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from safetensors.torch import load_file

In [2]:
# Read the CSV file
data = pd.read_csv("/kaggle/input/eng-naga/eng-naga.csv")

In [3]:
data.shape

(7950, 2)

In [4]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [5]:
target_texts = list(data["English"])  
source_texts = list(data["Nagamese"]) 



# Split the data into training and validation sets
train_source_texts, val_source_texts, train_target_texts, val_target_texts = train_test_split(
    source_texts, target_texts, test_size=0.20, random_state=40
)


In [6]:
print(len(train_source_texts))
print(len(val_source_texts))

6360
1590


In [7]:
from datasets import Dataset

# Tokenize function
def tokenize_function(examples):
    model_inputs = tokenizer(examples["source_texts"], return_tensors="pt",max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(examples["target_texts"], return_tensors="pt",max_length=128, truncation=True, padding="max_length")["input_ids"]
    # labels = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels]
    model_inputs["labels"]=labels
    return model_inputs

# Convert lists to Hugging Face Dataset
train_dataset = Dataset.from_dict({"source_texts": train_source_texts, "target_texts": train_target_texts})
val_dataset = Dataset.from_dict({"source_texts": val_source_texts, "target_texts": val_target_texts})

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["source_texts", "target_texts"])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["source_texts", "target_texts"])



Map:   0%|          | 0/6360 [00:00<?, ? examples/s]

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

In [8]:
print(train_dataset.shape)
print(val_dataset.shape)

(6360, 3)
(1590, 3)


In [9]:
print(train_dataset["input_ids"][0])
print(train_dataset["input_ids"][1])

print(train_dataset["labels"][0])
print(train_dataset["labels"][1])

[250004, 51730, 15025, 61653, 1121, 12292, 67, 333, 1645, 104035, 74473, 964, 4, 6098, 6327, 298, 3238, 298, 61217, 12292, 67, 87, 4970, 196, 11, 54, 19678, 8086, 150, 45, 61217, 4, 16362, 77748, 6327, 3080, 11, 298, 3238, 70144, 298, 61217, 12292, 67, 5, 36501, 12292, 67, 4, 4419, 311, 333, 8753, 14, 15025, 10, 184, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[250004, 36501, 311, 28, 319, 775, 669, 24, 3238, 301, 15025, 67732, 14, 21238, 74, 200, 18461, 931, 10706, 11, 35518, 1098, 11, 75, 2347, 5, 36501, 311, 1630, 18837, 196, 11, 67732, 14, 40, 74, 200, 18461, 739, 13089, 35518, 1098, 11, 75, 2347, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [10]:
!pip install sacrebleu
import sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1


In [11]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode tokenized outputs into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # SacreBLEU expects reference translations as a **list of lists**
    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])

    return {"bleu": bleu.score}

In [12]:
# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    logging_strategy="epoch", 
    eval_strategy="epoch", 
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    seed = 40,
    save_total_limit=2,
    num_train_epochs=25,
    predict_with_generate=True,
    optim="adafactor", 
    report_to=[],
    fp16=True,
    logging_dir="./logs",
    overwrite_output_dir=True,
    load_best_model_at_end=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=1)],
    # compute_metrics=compute_metrics,
)

# Start Training
trainer.train()


Epoch,Training Loss,Validation Loss
1,4.2837,0.411973
2,0.3687,0.354896
3,0.2841,0.352357
4,0.2035,0.361629


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1592, training_loss=1.2849944129062059, metrics={'train_runtime': 2414.6467, 'train_samples_per_second': 65.848, 'train_steps_per_second': 4.121, 'total_flos': 6891472662036480.0, 'train_loss': 1.2849944129062059, 'epoch': 4.0})

In [13]:
model.save_pretrained("./mbart_model")
tokenizer.save_pretrained("./mbart_model")

('./mbart_model/tokenizer_config.json',
 './mbart_model/special_tokens_map.json',
 './mbart_model/sentencepiece.bpe.model',
 './mbart_model/added_tokens.json',
 './mbart_model/tokenizer.json')

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = model.to(device)

Using device: cuda


In [15]:
source_texts = val_source_texts
reference_texts = val_target_texts

In [16]:
# Translate source texts using the model
model.eval()
machine_translations = []
batch_size=32
for i in range(0, len(source_texts), batch_size):
    batch_texts = source_texts[i : i + batch_size]  # Slice batch
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    
    with torch.no_grad():
        outputs = model.generate(**inputs)
    
    batch_translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    machine_translations.extend(batch_translations)

In [17]:

wrapped_reference_texts = [reference_texts] 
bleu = sacrebleu.corpus_bleu(machine_translations, wrapped_reference_texts)
print("BLEU Score:", bleu.score)
ter = sacrebleu.corpus_ter(machine_translations, wrapped_reference_texts)
print("TER Score:", ter.score)
chrf = sacrebleu.corpus_chrf(machine_translations, wrapped_reference_texts)
print("CHRF Score:", chrf.score)

BLEU Score: 28.6628823426343
TER Score: 64.01130563635365
CHRF Score: 47.62316556973598


In [18]:
naga_sentences = [
    "Tai exam bhal para likhi se",
    "Itu laga jawab tho ki hobo na",
    "Toi laga naam ki ase",
    "Thoi ki kuri sheii",
    "aapuni kimaan baajite aahibo"
]
for text in naga_sentences:
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(translated_text)

He wrote the exam well
What will be your answer?
What is the name of the tree?
Whatever you see
how many times will come upon the earth
