<a href="https://colab.research.google.com/github/r-m-steffi/Bart_EAMT/blob/main/Bart_semeval_EAMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Problem Statement : The task is to develop machine translation systems that can accurately translate s named entities in the input sentence to the target language.
Here, the source language is English and target language is Arabic.


*  Named entities are entities that are referred to by proper names, such as people, organizations, locations, dates, and more.
* Named entities are often challenging even for human translators, as sometimes there are cultural or domain-specific references that are not easily translatable.

#Dataset:
The dataset we are using is from the source, mintaka.
For our project we are using training and validation data

#Install required libraries

In [15]:

!pip install transformers datasets sentencepiece evaluate sacrebleu --quiet


#Various imports


In [16]:
# Data handling
import pandas as pd
import json

# Model & tokenizer handling
import torch
from datasets import Dataset
from transformers import (
    MBartForConditionalGeneration,         # Pretrained multilingual translation model
    MBart50TokenizerFast,                  # Fast tokenizer for mbart50 model
    Seq2SeqTrainer,                        # Trainer class for sequence-to-sequence models
    Seq2SeqTrainingArguments,              # Training args specific for seq2seq
    DataCollatorForSeq2Seq                 # Dynamically pads sequences during batching
)

# Metric loading for BLEU
import evaluate


#Download Data then unzip data

In [17]:
'''Download data'''
def download_data_and_prep(url,filename):
  import requests

  response = requests.get(url)
  with open(filename, 'wb') as f:
    f.write(response.content)

  print(f"{filename} downloaded successfully.")
  '''Unzip the zip file then delete the zip file'''
  import zipfile
  import os
  extract_folder = filename[:-4]  # You can rename this
  os.makedirs(extract_folder, exist_ok=True)

  with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

  print(f"Extracted to: {extract_folder}")


  # Delete the zip file

  os.remove(filename)
  print(f"Deleted archive: {filename}")

#Give train and validation data url and file name

In [18]:
train_url = 'https://sapienzanlp.github.io/ea-mt/assets/files/semeval.train.v2-e0d1c28b78c8dd4969d25eea5d3bc9cc.zip'
train_filename = 'train_data.zip'
val_url = 'https://sapienzanlp.github.io/ea-mt/assets/files/semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685.zip'
val_filename = 'val_data.zip'

#Download train and validation data

In [19]:
download_data_and_prep(train_url,train_filename)
download_data_and_prep(val_url,val_filename)

train_data.zip downloaded successfully.
Extracted to: train_data
Deleted archive: train_data.zip
val_data.zip downloaded successfully.
Extracted to: val_data
Deleted archive: val_data.zip


#Convert json data to pandas dataframe

In [20]:
'''Convert json to dataframe'''
def json_to_df(path):
  import json
  import pandas as pd
  jsonl_path = path
  with open(jsonl_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

  # Convert to DataFrame for inspection
  df = pd.DataFrame(data)
  return df

#Store result in train_df and val_df

In [21]:
#Json to df for both train and test
train_df = json_to_df("train_data/semeval/train/fr/train.jsonl")
val_df = json_to_df("val_data/validation/fr_FR.jsonl")


#Explore Validation dataset

In [22]:
val_df['target'] = val_df['targets']
val_df.drop('targets',axis=1, inplace= True)
val_df['target']

Unnamed: 0,target
0,[{'translation': 'Quelle est la portée de la N...
1,[{'translation': 'Comment la Nomenclature stat...
2,[{'translation': 'Pourquoi la Nomenclature sta...
3,[{'translation': 'Comment fonctionne l'évoluti...
4,[{'translation': 'Qu'est-ce que l'évolution de...
...,...
719,[{'translation': 'Quel type d'endroit est la m...
720,[{'translation': ': Pouvez-vous fournir une br...
721,[{'translation': 'Qui est l'auteur du Concerto...
722,[{'translation': 'Comment décririe-vous le gen...


In [23]:
print(type(train_df["target"][0]))
print(train_df["target"][0])


<class 'str'>
Quelle est la septième plus haute montagne d’Amérique du Nord ?


#Since validation set has multiple targets for multiple entities, flaten the target columnn

#Define the model to use (mBART supports 50+ languages)
Load tokenizer and model from Hugging Face
Set the tokenizer's source and target language for encoding input text.

In [32]:
# For train_df: Assume target is a single string already
#flat_train_df = train_df.rename(columns={"source": "input", "target": "target"})
# Wrap Train in entity tags using spacy
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to wrap detected entities with <entity> tags
def tag_entities_spacy(text):
    doc = nlp(text)
    for ent in reversed(doc.ents):  # Reverse to avoid offset issues
        text = text[:ent.start_char] + f"<entity>{ent.text}</entity>" + text[ent.end_char:]
    return text

# Apply to training data
train_df["input"] = train_df["source"].apply(tag_entities_spacy)

# If train_df["target"] is already clean, keep it
train_df["target"] = train_df["target"]  # Or rename as needed


# For val_df: Flatten list of translations (target) per source
def flatten_val_df(df):
    flat_data = []
    for _, row in df.iterrows():
        #for tgt in row["target"]:  # Each entry is a dict with 'mention' and 'translation'
        tgt = row["target"][0] # Take only the first translation
        # Use XML-style tags for the entity
        tagged_input = row["source"].replace(tgt["mention"], f"<entity>{tgt['mention']}</entity>")
        flat_data.append({
            "input": tagged_input,
            "target": tgt["translation"]
        })
    return pd.DataFrame(flat_data)

flat_val_df = flatten_val_df(val_df)


In [36]:
# Define the model to use (mBART supports 50+ languages)
model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Load tokenizer and model from Hugging Face
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Define source and target languages.
# These should be ISO language codes supported by mBART50.
SRC_LANG = "en_XX"
TGT_LANG = "fr_FR"  # Change this to the appropriate target language

# Set the tokenizer's source language for encoding input text.
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG


# Function to tokenize each example

In [37]:
# Function to tokenize each example
def tokenize_fn(example):
    tokenizer.src_lang = "en_XX"
    model_inputs = tokenizer(example["input"], padding="max_length", truncation=True, max_length=128)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["target"], padding="max_length", truncation=True, max_length=128)

    model_inputs["labels"] = labels["input_ids"]  # Add labels for training
    return model_inputs

# Convert pandas to Hugging Face dataset and tokenize
train_dataset = Dataset.from_pandas(train_df).map(tokenize_fn)
val_dataset = Dataset.from_pandas(flat_val_df).map(tokenize_fn)


Map:   0%|          | 0/5531 [00:00<?, ? examples/s]



Map:   0%|          | 0/724 [00:00<?, ? examples/s]

In [38]:
from evaluate import load

# Step 1: Load the BLEU metric from Hugging Face evaluate library
bleu = load("sacrebleu")

# Step 2: Define a compute_metrics function to pass into Seq2SeqTrainer
def compute_metrics(eval_pred):
    predictions, labels = eval_pred  # Unpack predictions and labels from evaluation output

    # Step 3: Decode the predicted token IDs to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Step 4: Decode label token IDs to text (labels may include padding or -100)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Step 5: BLEU expects references as list of lists. So wrap each label string in a list.
    decoded_labels = [[label] for label in decoded_labels]

    # Step 6: Compute BLEU score using sacrebleu
    return bleu.compute(predictions=decoded_preds, references=decoded_labels)


In [29]:
# Training configuration
'''training_args = Seq2SeqTrainingArguments(
    output_dir="./ea_mt_model",              # Where to save model
    per_device_train_batch_size=4,           # Adjust based on RAM
    per_device_eval_batch_size=4,
    num_train_epochs=6,                      # Total training epochs
    learning_rate=5e-5,
    logging_dir="./logs",                    # For TensorBoard
    save_strategy="epoch",                   # Save after each epoch
    eval_strategy="epoch",            # Evaluate on val each epoch
    predict_with_generate=True,              # Required for translation
    fp16=torch.cuda.is_available(),          # Use FP16 if GPU available
    report_to="none",                         # Disable wandb logging

)'''
from transformers import EarlyStoppingCallback
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./ea_mt_model",
    per_device_train_batch_size=4,
    num_train_epochs=10,                          # Use more epochs; early stopping will prevent overfitting
    learning_rate=1e-5,
    logging_dir="./logs",
    eval_strategy="epoch",                 # BLEU evaluated each epoch
    save_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,
    load_best_model_at_end=True,                 # Restore best model using BLEU
    metric_for_best_model="eval_score",           # Use BLEU to determine best model
    greater_is_better=True,                      # Higher BLEU is better
    report_to="none",                            # Disable wandb
    fp16=torch.cuda.is_available(),
    generation_max_length=128,
    generation_num_beams=4,

)

model.config.forced_bos_token_id = tokenizer.lang_code_to_id["fr_XX"]


In [30]:
# Trainer will handle training loop, eval, saving, etc.
'''trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model)
)

# Train the model
trainer.train()'''
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  #  Stop if BLEU doesn't improve for 2 epochs
)
trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Score,Counts,Totals,Precisions,Bp,Sys Len,Ref Len
1,0.0994,0.251144,37.006808,"[5357, 3195, 2067, 1319]","[8195, 7471, 6747, 6023]","[65.36912751677852, 42.76535938963994, 30.63583815028902, 21.899385688195252]",1.0,8195,8125
2,0.0584,0.265434,34.367274,"[5285, 3041, 1939, 1238]","[8383, 7659, 6935, 6211]","[63.0442562328522, 39.70492231361796, 27.959625090122568, 19.93237803896313]",1.0,8383,8125
3,0.0408,0.295097,35.193013,"[5292, 3091, 1985, 1239]","[8288, 7564, 6840, 6116]","[63.851351351351354, 40.86462189317821, 29.02046783625731, 20.25833878351864]",1.0,8288,8125


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=4149, training_loss=0.2708240455246799, metrics={'train_runtime': 3441.1162, 'train_samples_per_second': 16.073, 'train_steps_per_second': 4.019, 'total_flos': 4494898029920256.0, 'train_loss': 0.2708240455246799, 'epoch': 3.0})

In [39]:
print("Train size:", len(train_df))
print("Val size:", len(flat_val_df))
print("\nSample train:")
print(train_df.sample(3))

print("\nSample val:")
print(flat_val_df.sample(3))

Train size: 5531
Val size: 724

Sample train:
            id source_locale target_locale  \
2391  91ebb809            en            fr   
3133  7983ef63            en            fr   
1157  f6523b5b            en            fr   

                                                 source  \
2391              How many husbands did Cleopatra have?   
3133  How many times did the U.S. soccer team play i...   
1157    Is the Nile River longer than the Amazon River?   

                                                 target entities     from  \
2391         Combien de maris Cléopâtre avait-elle eu ?   [Q635]  mintaka   
3133  Combien de fois l’équipe des États-Unis de foo...    [Q30]  mintaka   
1157            Le Nil est-il plus long que l’Amazone ?  [Q3392]  mintaka   

                                                  input  
2391  How many husbands did <entity>Cleopatra</entit...  
3133  How many times did the <entity>U.S.</entity> s...  
1157  Is the <entity>Nile River</entity> longer t

In [34]:
# Generate predictions on validation set
results = trainer.predict(val_dataset)

# Decode predictions and labels
decoded_preds = tokenizer.batch_decode(results.predictions, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(results.label_ids, skip_special_tokens=True)


In [44]:
# Use sacreBLEU to evaluate translation performance
bleu = evaluate.load("sacrebleu")

# Wrap references in a list of one ref per prediction
refs = [[ref] for ref in decoded_labels]

# Compute BLEU score
bleu_result = bleu.compute(predictions=decoded_preds, references=refs)
print("BLEU Score:", bleu_result["score"])


BLEU Score: 37.00680812347288


# Work with Base Model to compare with finetuned model

In [40]:
# Define the model to use (mBART supports 50+ languages)
base_model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Load tokenizer and model from Hugging Face
base_tokenizer = MBart50TokenizerFast.from_pretrained(base_model_name)
base_model = MBartForConditionalGeneration.from_pretrained(base_model_name)

# Define source and target languages.
# These should be ISO language codes supported by mBART50.
SRC_LANG = "en_XX"
TGT_LANG = "fr_XX"  # Change this to the appropriate target language

# Set the tokenizer's source language for encoding input text.
base_tokenizer.src_lang = SRC_LANG
base_tokenizer.tgt_lang = TGT_LANG


In [41]:
from transformers import AutoModelForSeq2SeqLM
import torch
from evaluate import load

# Step 1: Load base (untrained) mBART model
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")
base_model.eval()

# Step 2: Generate translations using base model
base_predictions = []

for example in val_dataset:
    # Tokenize input and move to device
    inputs = tokenizer(
        example["input"],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(base_model.device)

    # Generate translation (French target)
    with torch.no_grad():
        output = base_model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"]
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    base_predictions.append(decoded)


In [47]:
# Build references: list of [target] for each input

references = [[t] for t in flat_val_df["target"]]
# Evaluate BLEU
bleu = load("sacrebleu")
bleu_base = bleu.compute(predictions=base_predictions, references=references)

print("BLEU score (Base model):", bleu_base["score"])


BLEU score (Base model): 39.492900673834455
