## Part 3: Transformer

1. Importing necessary libraries

In [3]:
import os
import random
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import evaluate
os.environ["WANDB_DISABLED"] = "true"

2. Loading and defining Dataset

In [6]:
with open("/kaggle/input/seq2seq1/fra.txt", encoding="utf-8") as f:
    lines = f.read().strip().split("\n")
pairs = [line.split('\t')[:2] for line in lines if '\t' in line and len(line.split('\t')) >= 2]
random.seed(42)
random.shuffle(pairs)
pairs = pairs[:200000]

In [9]:
dataset_dict = {
    "translation": [{"en": en.strip(), "fr": fr.strip()} for en, fr in pairs]
}

In [10]:
raw_dataset = Dataset.from_list(dataset_dict["translation"])
raw_dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)

3. Model Selection

In [None]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

4. Tokenization

In [14]:
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = tokenizer(examples["en"], max_length=max_input_length, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["fr"], max_length=max_target_length, padding="max_length", truncation=True)
    inputs["labels"] = labels["input_ids"]
    return inputs

In [None]:
tokenized_datasets = raw_dataset.map(preprocess_function, batched=True)

5. Evaluation Metrics Function definition

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
bleu = evaluate.load("sacrebleu")

In [17]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_labels = [[label] for label in decoded_labels]
    return {"bleu": bleu.compute(predictions=decoded_preds, references=decoded_labels)["score"]}

6. Training Model

In [26]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./transformer-nmt",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_strategy="epoch",
    fp16=True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [27]:
trainer.train()



Step,Training Loss
5625,0.0274
11250,0.0276
16875,0.0222




TrainOutput(global_step=16875, training_loss=0.025743223741319446, metrics={'train_runtime': 7980.8635, 'train_samples_per_second': 67.662, 'train_steps_per_second': 2.114, 'total_flos': 1.830511706112e+16, 'train_loss': 0.025743223741319446, 'epoch': 3.0})

7. Final BLEU Score

In [31]:
metrics = trainer.evaluate()
print(f"\n Final BLEU Score: {metrics['eval_bleu']:.2f}")


 Final BLEU Score: 56.80
