<a href="https://colab.research.google.com/github/sashkoangelov/NLP_final_project/blob/main/model2_backtranslation_overlap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [1]:
import torch
import random
import transformers
import copy
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, TrainingArguments, Trainer, AdamW
from torch.utils.data import Dataset, DataLoader
import os
import json
from pathlib import Path
from tqdm import tqdm

In [2]:
print(transformers.__version__)

4.38.2


#Data loading

In [3]:
def read_data(path):
    with open(path, 'r', encoding='utf-8') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']

                # Check if 'answers' is present and has entries
                if 'answers' in qa and qa['answers']:
                    for answer in qa['answers']:
                        contexts.append(context)
                        questions.append(question)
                        answers.append(answer)
                else:
                    # Handle unanswerable questions
                    contexts.append(context)
                    questions.append(question)
                    answers.append({"text": "", "answer_start": -1})

    return contexts, questions, answers

In [4]:
train_contexts, train_questions, train_answers = read_data('/content/drive/MyDrive/NLP/train-v2.0.json')
bt_contexts, bt_questions, bt_answers = read_data('/content/drive/MyDrive/NLP/train-v2.0-with-back-translation-fr.json')
val_contexts, val_questions, val_answers = read_data('/content/drive/MyDrive/NLP/dev-v2.0.json')

In [37]:
print(sum(1 for answer in train_answers if answer["answer_start"] == -1) + sum(1 for answer in val_answers if answer["answer_start"] == -1))

49443


# Data visualization

## Displaying random examples

In [5]:
def display_random_examples(contexts, questions, answers, num_examples=3):
    for _ in range(num_examples):
        rand = random.randint(0, len(contexts) - 1)
        context = contexts[rand]
        question = questions[rand]
        answer_text = answers[rand]['text'] if answers[rand]['text'] != '' else 'No answer'
        answer_start = answers[rand]['answer_start'] if answers[rand]['answer_start'] > -1 else -1

        print(f"Question: {question}\n")
        print(f"Answer: {answer_text}")
        print(f"Answer Starts at: {answer_start}\n")

        if answer_start != -1:
            pre_highlight = context[:answer_start]
            highlight = context[answer_start:answer_start+len(answer_text)]
            post_highlight = context[answer_start+len(answer_text):]

            # Apply coloring outside the string concatenation to avoid escape sequence disruption
            highlighted_context = pre_highlight + "\033[1;31m" + highlight + "\033[0m" + post_highlight
        else:
            highlighted_context = context

        wrapped_context = "\n".join([highlighted_context[i:i+80] for i in range(0, len(highlighted_context), 80)])
        print(f"Context:\n{wrapped_context}\n")
        print("-"*80)


In [6]:
display_random_examples(train_contexts, train_questions, train_answers, 3)

Question: Where was Fauconnier's studio located? 

Answer: Boulevard de Montparnasse
Answer Starts at: 417

Context:
In contrast, the Salon Cubists built their reputation primarily by exhibiting re
gularly at the Salon d'Automne and the Salon des Indépendants, both major non-ac
ademic Salons in Paris. They were inevitably more aware of public response and t
he need to communicate. Already in 1910 a group began to form which included Met
zinger, Gleizes, Delaunay and Léger. They met regularly at Henri le Fauconnier's
 studio near the [1;31mBoulevard de Montparnasse[0m. These soirées often inclu
ded writers such as Guillaume Apollinaire and André Salmon. Together with other 
young artists, the group wanted to emphasise a research into form, in opposition
 to the Neo-Impressionist emphasis on color.

--------------------------------------------------------------------------------
Question: What is the name of the Presbyterian church in Brazil with Dutch origins?

Answer: The Evangelical

## Display a single example

In [7]:
def display_example(contexts, questions, answers, example):

    rand = example
    context = contexts[rand]
    question = questions[rand]

    answer_text = answers[rand]['text'] if answers[rand]['text'] != '' else 'No answer'
    answer_start = answers[rand]['answer_start'] if answers[rand]['answer_start'] != '' else -1

    print(f"Question: {question}\n")
    print(f"Answer: {answer_text}")
    print(f"Answer Starts at: {answer_start}\n")

    # Highlighting answer in context
    if answer_start != -1:
        highlighted_context = context[:answer_start] + "\033[1;31m" + context[answer_start:answer_start+len(answer_text)] + "\033[0m" + context[answer_start+len(answer_text):]
    else:
        highlighted_context = context

    # Wrapping context for readability
    wrapped_context = "\n".join([highlighted_context[i:i+80] for i in range(0, len(highlighted_context), 80)])
    print(f"Context:\n{wrapped_context}\n")
    print("-"*80)

In [8]:
display_example(train_contexts, train_questions, train_answers, 19)

Question: When did Beyonce take a hiatus in her career and take control of her management?

Answer: 2010
Answer Starts at: 586

Context:
Following the disbandment of Destiny's Child in June 2005, she released her seco
nd solo album, B'Day (2006), which contained hits "Déjà Vu", "Irreplaceable", an
d "Beautiful Liar". Beyoncé also ventured into acting, with a Golden Globe-nomin
ated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2
006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta Jam
es in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (
2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-set
ting six Grammy Awards in [1;31m2010[0m, including Song of the Year for "Singl
e Ladies (Put a Ring on It)". Beyoncé took a hiatus from music in 2010 and took 
over management of her career; her fourth album 4 (2011) was subsequently mellow
er in tone, exploring 1970s funk, 1980s pop, and 1990

In [9]:
display_example(bt_contexts, bt_questions, bt_answers, 19)

Question: When did Beyonce take a break in her career and take control of her leadership?

Answer: 2010
Answer Starts at: 586

Context:
After the dismantling of Destiny's Child in June 2005, she released her second s
olo album, B'Day (2006), which contains hits "Déjà Vu", "Irreplaceable" and "Bea
utiful Liar". Beyoncé also ventured into the performance, with a nominated Golde
n Globe performance in Dreamgirls (2006), and featured roles in The Pink Panther
 (2006) and Obsessed (2009). Her marriage to rapper Jay Z and the performance of
 Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha
 Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and obtained a
 record of six Grammy Awar[1;31mds i[0mn 2010, including Song of the Year for 
"Sing of the Single Ladies (Put a Ring on It)". Beyoncé took a hiatus of music i
n 2010 and took over the direction of her career; her fourth album 4 (2011) was 
then melliferous in tone, exploring the 1970s pop, and

In [14]:
display_example(updated_contexts, updated_questions, updated_answers, 18)

Question: When did Beyonce take a hiatus in her career and take control of her management?

Answer: 2010
Answer Starts at: 592

Context:
After the dismantling of Destiny's Child in June 2005, she released her second s
olo album, B'Day (2006), which contains hits "Déjà Vu", "Irreplaceable" and "Bea
utiful Liar". Beyoncé also ventured into the performance, with a nominated Golde
n Globe performance in Dreamgirls (2006), and featured roles in The Pink Panther
 (2006) and Obsessed (2009). Her marriage to rapper Jay Z and the performance of
 Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha
 Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and obtained a
 record of six Grammy Awards in [1;31m2010[0m, including Song of the Year for 
"Sing of the Single Ladies (Put a Ring on It)". Beyoncé took a hiatus of music i
n 2010 and took over the direction of her career; her fourth album 4 (2011) was 
then melliferous in tone, exploring the 1970s pop, an

# Assign new start positions for the answers in the back translated contexts if it exists

In [11]:
def assign_new_start_pos(bt_contexts, train_questions, train_answers):
    updated_contexts = []
    updated_questions = []
    updated_answers = []

    for context, question, answer in zip(bt_contexts, train_questions, train_answers):
        try:

            # Attempt to find the non-translated answer in the translated context
            new_start = context.index(answer['text'])

            # If found, update the entry
            updated_contexts.append(context)
            updated_questions.append(question)
            updated_answers.append({'text': answer['text'], 'answer_start': new_start})

        except ValueError:
            continue

    return updated_contexts, updated_questions, updated_answers

In [12]:
updated_contexts, updated_questions, updated_answers = assign_new_start_pos(bt_contexts, train_questions, train_answers)

In [13]:
print("Preserved samples:", round((len(updated_contexts) / len(bt_contexts))*100, 2), "%")

Preserved samples: 77.31 %


## Discarding identical triplets of contexts, questions and answers

In [15]:
def discard_duplicate_triplets(train_contexts, train_questions, train_answers, updated_contexts, updated_questions, updated_answers):
    seen = set()
    unique_triplets = []

    combined_triplets = list(zip(train_contexts, train_questions, train_answers)) + list(zip(updated_contexts, updated_questions, updated_answers))

    for context, question, answer in combined_triplets:

        triplet_tuple = (context, question, answer['text'])  # Convert to a tuple for hashability

        if triplet_tuple not in seen:
            seen.add(triplet_tuple)
            # Append the original items
            unique_triplets.append((context, question, answer))

    unique_contexts, unique_questions, unique_answers = zip(*unique_triplets)

    return list(unique_contexts), list(unique_questions), list(unique_answers)


In [16]:
unique_contexts, unique_questions, unique_answers = discard_duplicate_triplets(train_contexts, train_questions, train_answers, updated_contexts, updated_questions, updated_answers)

In [17]:
print("Preserved samples:", round(((len(unique_contexts) - len(train_contexts)) / len(bt_contexts))*100, 2), "%")

Preserved samples: 77.21 %


# Data pre-processing

In [18]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

## Prepare features V2

In [19]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting is needed.

In [20]:
def prepare_train_features(contexts, questions, answers):
    # Strip leading and trailing whitespace
    questions = [question.strip() for question in questions]

    # Tokenize the question and context pairs, with special handling for long texts.
    tokenized = tokenizer(
        questions,
        contexts,
        truncation="only_second",  # only truncate the context if necessary
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",  # pad sequences to `max_length`
        return_tensors="pt"
    )

    # necessary for locating answer positions
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    tokenized["start_positions"] = []
    tokenized["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        # Find the index of the CLS token, used as a fallback answer position
        cls_index = (input_ids == tokenizer.cls_token_id).nonzero(as_tuple=True)[0][0].item()
        sequence_ids = tokenized.sequence_ids(i)
        sample_index = sample_mapping[i]
        answer = answers[sample_index]

        # If no answer is provided, use the CLS token index as both start and end positions
        if answer["answer_start"] == -1:
            tokenized["start_positions"].append(cls_index)
            tokenized["end_positions"].append(cls_index)
        else:
            start_char = answer["answer_start"]
            end_char = start_char + len(answer["text"])

            # Find the token index where the context starts
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # Find the token index where the context ends
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # If the answer is not fully contained within the current span, use CLS token positions as fallback
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized["start_positions"].append(cls_index)
                tokenized["end_positions"].append(cls_index)
            else:
                # adjust start and end token positions to tightly enclose the answer text
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized["start_positions"].append(token_start_index - 1)

                while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized["end_positions"].append(token_end_index + 1)

    return tokenized


In [21]:
# define a custom class for the data loader
class QADataset(torch.utils.data.Dataset):
    def __init__(self, encodings, answers=None):
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [22]:
train_tokenized = prepare_train_features(unique_contexts, unique_questions, unique_answers)
train_dataset = QADataset(train_tokenized)

val_tokenized = prepare_train_features(val_contexts, val_questions, val_answers)
val_dataset = QADataset(val_tokenized)

#Fine-tuning

In [23]:
model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Number of epochs
n_epochs = 1

for epoch in range(n_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{n_epochs}'):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Training Loss: {avg_loss}')

    # Validation phase
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Validation Loss: {avg_val_loss}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 1/1: 100%|██████████| 14731/14731 [1:21:30<00:00,  3.01it/s]


Epoch 1/1, Training Loss: 1.0041225527937743
Epoch 1/1, Validation Loss: 1.2539261535094066


In [25]:
directory_path = '/content/drive/MyDrive/NLP'
os.makedirs(directory_path, exist_ok=True)

In [27]:
model_path = os.path.join(directory_path, 'model2_backtranslation_overlapV3.pth')
torch.save(model, model_path)