In [1]:
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    pipeline, DataCollatorForSeq2Seq, EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
import torch
import os

!pip install sacrebleu
import sacrebleu

os.environ["WANDB_DISABLED"] = "true"

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1


In [2]:
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [3]:
data = pd.read_csv("/kaggle/input/english-nagamese-raw-data/raw_data.csv")
# data = data[:100]
data.rename(columns={"English": "en", "Nagamese": "nagamese"}, inplace=True)
dataset = Dataset.from_pandas(data)

In [4]:
# Split dataset into train and validation sets
train_test_split = dataset.train_test_split(test_size=0.2, seed=40)
dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})


In [5]:
# Load Pretrained NLLB Model and Tokenizer
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [6]:
# Modify dropout rates (Joint Dropout)
model.config.dropout = 0.3  # Dropout rate for the whole model
model.config.attention_dropout = 0.3  # Dropout rate for attention layers
model.config.activation_dropout = 0.3  # Dropout in activation functions
model.config.encoder_layerdrop = 0.3  # Dropout in encoder layers
model.config.decoder_layerdrop = 0.3  # Dropout in decoder layers

In [7]:
# Add a new language token for Nagamese
new_lang_token = "__nag__"

# Ensure the token is added to the tokenizer
tokenizer.add_tokens([new_lang_token])  # Add Nagamese token to tokenizer


# Define source and target languages
tgt_lang = "__eng_Latn__"  
src_lang = new_lang_token  

tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang

In [8]:
# Tokenize Dataset
def preprocess_function(examples):
    inputs = examples["nagamese"]
    targets = examples["en"]
    model_inputs = tokenizer(inputs, text_target=targets, truncation=True, max_length=128)
    return model_inputs

# Apply preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/6360 [00:00<?, ? examples/s]

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

In [9]:
# Check the dataset structure
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['en', 'nagamese', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6360
    })
    validation: Dataset({
        features: ['en', 'nagamese', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1590
    })
})


In [10]:
# Define Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps", 
    save_strategy="steps",
    eval_steps=50,
    save_steps=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_steps=200,
    seed = 40,
    weight_decay=0.01,
    save_total_limit=6,
    num_train_epochs=20,
    predict_with_generate=True,
    optim="adafactor", 
    report_to=[],
    fp16=True,
    logging_dir="./logs",
    logging_steps=50,
    overwrite_output_dir=True,
    load_best_model_at_end=True,
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

# Start Training
trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
50,3.8469,3.148268
100,2.8952,2.415066
150,2.4477,2.095598
200,2.0796,1.738081
250,1.7485,1.48058
300,1.5873,1.34112
350,1.4443,1.242018
400,1.3587,1.173767
450,1.1606,1.132076
500,1.148,1.090613


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1400, training_loss=1.2225524861471995, metrics={'train_runtime': 2791.0, 'train_samples_per_second': 45.575, 'train_steps_per_second': 1.426, 'total_flos': 7127860993916928.0, 'train_loss': 1.2225524861471995, 'epoch': 7.035175879396985})

In [11]:
# Save the Fine-tuned Model
model.save_pretrained("./fine_tuned_nllb")
tokenizer.save_pretrained("./fine_tuned_nllb")


('./fine_tuned_nllb/tokenizer_config.json',
 './fine_tuned_nllb/special_tokens_map.json',
 './fine_tuned_nllb/sentencepiece.bpe.model',
 './fine_tuned_nllb/added_tokens.json',
 './fine_tuned_nllb/tokenizer.json')

In [12]:
# Translation Pipeline
translator = pipeline(
    "translation",
    model="./fine_tuned_nllb",
    tokenizer="./fine_tuned_nllb",
    device=0 if torch.cuda.is_available() else -1  
)


# Test Translation
translated = translator(
    "How are you?",
    tgt_lang="eng_Latn",  
    src_lang="nag"        
)
print(translated)

Device set to use cuda:0


[{'translation_text': 'How are you?'}]


In [13]:
!pip install transformers sacrebleu
import os
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import sacrebleu
import pandas as pd



In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [15]:
model = model.to(device)

In [16]:
# Extract source and reference texts from the validation set
source_texts = dataset["validation"]["nagamese"]  # English source texts
reference_texts = dataset["validation"]["en"]  # Nagamese target texts

# Check the extracted texts
print("Source Texts (Nagamese):", source_texts[:5])
print("Reference Texts (English):", reference_texts[:5])


Source Texts (Nagamese): ['Titia Tai sorgo phale sai se aru jor pora awaj di kena koise, “Ephphatha!” -Motlob ase “Khuli jabi!”', 'Moi, John- apnikhan laga bhai aru ekjon jun Tai laga rajyo nimite eke logote mili kena dukh korise, aru nomro kori kena Jisu nimite sob kaam korise- Moi Patmos laga ekta majuli majote thakise, Isor laga kotha aru Jisu laga gawahi nimite.', 'Jodi itu prithibi te amikhan laga jibon to Khrista laga asha te jinda thake, tinehoile itu prithibi te sob pora mon dukh manu khan to amikhan he hobo.', 'Moi apni khan logote gaw pora nathaki le bi, moi apni khan logote atma te ase. Moi apni khan laga takot aru Khrista uporte biswas aru bhal kaam khan kori thaka sai kena khushi kori ase.', 'Ekta jal phela nisena, jiman manu duniya te thaki ase taikhan uporte ahibo.']
Reference Texts (English): ['Then he looked up to heaven, sighed, and said to him, "Ephphatha," that is to say, "Open!"', 'I, John—your brother and the one who shares with you in the suffering and kingdom an

In [17]:
# Ensure tokenizer recognizes `__nag__` for forced decoding
model.eval()
machine_translations = []

for text in source_texts:
    inputs = tokenizer(text, return_tensors="pt", max_length=64, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang))
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    machine_translations.append(translated_text)


In [18]:
# Compute BLEU Score
wrapped_reference_texts = [reference_texts]  # Ensure correct format
bleu = sacrebleu.corpus_bleu(machine_translations, wrapped_reference_texts)
print("BLEU Score:", bleu.score)

BLEU Score: 41.138693905003635
