In [1]:
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    pipeline, DataCollatorForSeq2Seq
)
from datasets import Dataset, DatasetDict
import torch
import os

!pip install sacrebleu
import sacrebleu

os.environ["WANDB_DISABLED"] = "true"

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1


In [2]:
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [3]:
data = pd.read_csv("/kaggle/input/english-nagamese-raw-data/raw_data.csv")
# data = data[:100]
data.rename(columns={"English": "en", "Nagamese": "nagamese"}, inplace=True)
dataset = Dataset.from_pandas(data)

In [4]:
# Split dataset into train and validation sets
train_test_split = dataset.train_test_split(test_size=0.2, seed=40)
dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})


In [5]:
# Load Pretrained NLLB Model and Tokenizer
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [6]:
# Add a new language token for Nagamese
new_lang_token = "__nag__"

# Ensure the token is added to the tokenizer
# if new_lang_token not in list(tokenizer.lang_code_to_id.keys()):
#     tokenizer.lang_code_to_id[new_lang_token] = len(tokenizer.lang_code_to_id)
tokenizer.add_tokens([new_lang_token])  # Add Nagamese token to tokenizer


# Define source and target languages
src_lang = "__eng_Latn__"  # English in Latin script
tgt_lang = new_lang_token  # Nagamese custom token

tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang

In [7]:
# Tokenize Dataset
def preprocess_function(examples):
    inputs = examples["en"]
    targets = examples["nagamese"]
    model_inputs = tokenizer(inputs, text_target=targets, truncation=True, max_length=128)
    return model_inputs

# Apply preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/6360 [00:00<?, ? examples/s]

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

In [8]:
# Check the dataset structure
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['en', 'nagamese', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6360
    })
    validation: Dataset({
        features: ['en', 'nagamese', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1590
    })
})


In [9]:
# Define Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps", 
    save_strategy="steps",
    eval_steps=10,
    save_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=20,
    seed = 40,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    optim="adafactor", 
    report_to=[],
    fp16=True,
    logging_dir="./logs",
    logging_steps=50,
    overwrite_output_dir=True,
    load_best_model_at_end=True,
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start Training
trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
10,No log,7.255644
20,No log,6.189346
30,No log,5.213388
40,No log,4.607205
50,6.085500,4.243982
60,6.085500,3.903365
70,6.085500,3.662715
80,6.085500,3.478337
90,6.085500,3.351778
100,3.880300,3.239403


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1194, training_loss=2.3510703273754023, metrics={'train_runtime': 4830.6351, 'train_samples_per_second': 3.95, 'train_steps_per_second': 0.247, 'total_flos': 2166782922522624.0, 'train_loss': 2.3510703273754023, 'epoch': 3.0})

In [10]:
# Save the Fine-tuned Model
model.save_pretrained("./fine_tuned_nllb")
tokenizer.save_pretrained("./fine_tuned_nllb")


('./fine_tuned_nllb/tokenizer_config.json',
 './fine_tuned_nllb/special_tokens_map.json',
 './fine_tuned_nllb/sentencepiece.bpe.model',
 './fine_tuned_nllb/added_tokens.json',
 './fine_tuned_nllb/tokenizer.json')

In [11]:
# Translation Pipeline
translator = pipeline(
    "translation",
    model="./fine_tuned_nllb",
    tokenizer="./fine_tuned_nllb",
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)


# Test Translation
translated = translator(
    "How are you?",
    src_lang="eng_Latn",  # Source language (English in Latin script)
    tgt_lang="nag"        # Target language (Nagamese)
)
print(translated)

Device set to use cuda:0


[{'translation_text': 'nagor to?'}]


In [12]:
!pip install transformers sacrebleu
import os
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import sacrebleu
import pandas as pd
# from sklearn.model_selection import train_test_split



In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [14]:
model = model.to(device)

In [15]:
# Extract source and reference texts from the validation set
source_texts = dataset["validation"]["en"]  # English source texts
reference_texts = dataset["validation"]["nagamese"]  # Nagamese target texts

# Check the extracted texts
print("Source Texts (English):", source_texts[:5])
print("Reference Texts (Nagamese):", reference_texts[:5])


Source Texts (English): ['Then he looked up to heaven, sighed, and said to him, "Ephphatha," that is to say, "Open!"', 'I, John—your brother and the one who shares with you in the suffering and kingdom and patient endurance that are in Jesus—was on the island called Patmos because of the word of God and the testimony about Jesus.', 'If only in this life we hope in Christ, of all people we are most to be pitied.', 'Although I am not with you in the flesh, yet I am with you in spirit. I rejoice to see your good order and the strength of your faith in Christ.', 'For it will come upon everyone living on the face of the whole earth.']
Reference Texts (Nagamese): ['Titia Tai sorgo phale sai se aru jor pora awaj di kena koise, “Ephphatha!” -Motlob ase “Khuli jabi!”', 'Moi, John- apnikhan laga bhai aru ekjon jun Tai laga rajyo nimite eke logote mili kena dukh korise, aru nomro kori kena Jisu nimite sob kaam korise- Moi Patmos laga ekta majuli majote thakise, Isor laga kotha aru Jisu laga gawah

In [16]:
# tokenizer.lang_code_to_id["nag"] = tokenizer.lang_code_to_id.get("eng_Latn") 

In [17]:
# Ensure tokenizer recognizes `__nag__` for forced decoding
model.eval()
machine_translations = []

for text in source_texts:
    inputs = tokenizer(text, return_tensors="pt", max_length=64, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids(new_lang_token))
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    machine_translations.append(translated_text)


In [18]:
# Compute BLEU Score
wrapped_reference_texts = [reference_texts]  # Ensure correct format
bleu = sacrebleu.corpus_bleu(machine_translations, wrapped_reference_texts)
print("BLEU Score:", bleu.score)

BLEU Score: 13.978757065196794
