<a href="https://colab.research.google.com/github/sashkoangelov/NLP_final_project/blob/main/model2_overlap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [None]:
import torch
import random
import transformers
import copy
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, TrainingArguments, Trainer, AdamW
from torch.utils.data import Dataset, DataLoader
import os
import json
from pathlib import Path
from tqdm import tqdm

In [None]:
# Show the installed transformers version (useful when reproducing this run).
print(transformers.__version__)

4.38.2


# Data loading

In [None]:
# Colab-only: mount Google Drive at /content/drive. The dataset files read below
# and the model checkpoint written at the end both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def read_data(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    Args:
        path: location of a JSON file shaped like
            ``{"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}``.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
        A question with several reference answers yields one row per answer.
        A question whose ``'answers'`` entry is missing or empty yields a
        single row whose answer is ``{"text": "", "answer_start": -1}``.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        squad = json.load(handle)

    contexts, questions, answers = [], [], []

    for article in squad['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                # A missing key and an empty list both mean "unanswerable":
                # substitute one placeholder answer so the row is still emitted.
                references = qa.get('answers') or [{"text": "", "answer_start": -1}]
                for reference in references:
                    contexts.append(passage_text)
                    questions.append(question_text)
                    answers.append(reference)

    return contexts, questions, answers

In [None]:
# Read the train and dev JSON files from the mounted Drive folder.
# Each call returns three parallel lists: contexts, questions and answers.
train_contexts, train_questions, train_answers = read_data('/content/drive/MyDrive/NLP/train-v2.0.json')
val_contexts, val_questions, val_answers = read_data('/content/drive/MyDrive/NLP/dev-v2.0.json')

## Displaying random examples

In [None]:
def display_random_examples(contexts, questions, answers, num_examples=3):
    """Print randomly sampled question/answer/context triples with the answer highlighted.

    Args:
        contexts: list of context strings.
        questions: list of question strings, parallel to ``contexts``.
        answers: list of dicts with ``'text'`` and ``'answer_start'`` keys,
            parallel to ``contexts``; ``answer_start == -1`` marks a question
            without an answer.
        num_examples: number of examples to print (sampled with replacement).

    The context is wrapped at 80 visible characters per line. The ANSI colour
    codes are inserted *after* wrapping, so a line break can never fall inside
    an escape sequence.
    """
    if not contexts:
        # random.randint(0, -1) would raise ValueError on an empty dataset.
        print("No examples to display.")
        return

    line_width = 80
    highlight_on, highlight_off = "\033[1;31m", "\033[0m"

    for _ in range(num_examples):
        rand = random.randint(0, len(contexts) - 1)
        context = contexts[rand]
        question = questions[rand]
        raw_text = answers[rand]['text']
        answer_text = raw_text if raw_text != '' else 'No answer'
        # Any negative start is reported as -1 ("no answer").
        answer_start = max(answers[rand]['answer_start'], -1)

        print(f"Question: {question}\n")
        print(f"Answer: {answer_text}")
        print(f"Answer Starts at: {answer_start}\n")

        # Wrap the plain context first.
        wrapped_context = "\n".join(
            context[i:i + line_width] for i in range(0, len(context), line_width)
        )

        # Use the real answer length (not the 'No answer' placeholder) and
        # clamp the span to the context so the closing code is always emitted.
        answer_end = min(answer_start + len(raw_text), len(context))
        if 0 <= answer_start < answer_end:
            # Character p of the context sits at index p + p // line_width in
            # the wrapped text, because one newline precedes it per full line.
            on_at = answer_start + answer_start // line_width
            last_char = answer_end - 1
            off_at = last_char + last_char // line_width + 1
            wrapped_context = (
                wrapped_context[:on_at] + highlight_on
                + wrapped_context[on_at:off_at] + highlight_off
                + wrapped_context[off_at:]
            )

        print(f"Context:\n{wrapped_context}\n")
        print("-"*80)


In [None]:
# Spot-check a few randomly chosen training examples (3 by default).
display_random_examples(train_contexts, train_questions, train_answers)

Question: What French department store is part of the same group as Galeries Lafayette?

Answer: BHV (Bazar de l'Hotel de Ville)
Answer Starts at: 470

Context:
France's major upscale department stores are Galeries Lafayette and Le Printemps
, which both have flagship stores on Boulevard Haussmann in Paris and branches a
round the country. The first department store in France, Le Bon Marché in Paris,
 was founded in 1852 and is now owned by the luxury goods conglomerate LVMH. La 
Samaritaine, another upscale department store also owned by LVMH, closed in 2005
. Mid-range department stores chains also exist in France such as the [1;31mBHV
 (Bazar de l'Hotel de Ville)[0m, part of the same group as Galeries Lafayette.

--------------------------------------------------------------------------------
Question: What type of climate was normal in the Ordovician period?

Answer: stable greenhouse conditions
Answer Starts at: 150

Context:
The most-commonly accepted theory is that these event

# Data pre-processing

In [None]:
# Load the pretrained roberta-base tokenizer.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# Guard: the feature preparation below requests offset mappings and sequence ids,
# so make sure we really got a fast tokenizer.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

## Prepare features V2

In [None]:
max_length = 384 # Maximum number of tokens in one feature (question + context window).
doc_stride = 128 # Number of tokens shared by two consecutive windows when a long context has to be split.

In [None]:
def prepare_train_features(contexts, questions, answers):
    """Tokenize question/context pairs and attach token-level answer labels.

    Long contexts are split into overlapping windows, so one input example can
    produce several features. Relies on the module-level ``tokenizer``,
    ``max_length`` and ``doc_stride``.

    Args:
        contexts: list of context strings.
        questions: list of question strings, parallel to ``contexts``.
        answers: list of dicts with ``'text'`` and ``'answer_start'`` (character
            offset into the context, ``-1`` when there is no answer), parallel
            to ``contexts``.

    Returns:
        The tokenizer output (without ``overflow_to_sample_mapping`` and
        ``offset_mapping``) extended with ``start_positions`` and
        ``end_positions``: Python lists holding one token index per feature.
        Features with no answer, or whose window does not fully contain the
        answer, are labelled with the CLS token index.
    """
    # Strip leading and trailing whitespace
    questions = [question.strip() for question in questions]

    # Tokenize the question and context pairs, with special handling for long texts.
    tokenized = tokenizer(
        questions,
        contexts,
        truncation="only_second",  # only truncate the context if necessary
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",  # pad sequences to `max_length`
        return_tensors="pt"
    )

    # Feature -> source example index, and per-token character offsets; both
    # are needed to locate the answer and are not model inputs.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        # Find the index of the CLS token, used as a fallback answer position
        cls_index = (input_ids == tokenizer.cls_token_id).nonzero(as_tuple=True)[0][0].item()
        sequence_ids = tokenized.sequence_ids(i)
        answer = answers[int(sample_mapping[i])]

        # If no answer is provided, use the CLS token index as both start and end positions
        if answer["answer_start"] == -1:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        # First token of the context (sequence id 1) in this window
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1

        # Last token of the context in this window (skips trailing special/padding tokens)
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # If the answer is not fully contained within the current span, use CLS token positions as fallback
        if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk forward to the last context token that starts at or before the answer.
        # The scan is bounded by context_end: special and padding tokens carry
        # offset (0, 0), so an unbounded scan would run through all of them
        # whenever the answer starts in the final context token and label the
        # feature with a padding position.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Walk backward to the first context token that ends at or after the answer,
        # never leaving the context span.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    return tokenized


In [None]:
class QADataset(torch.utils.data.Dataset):
    """Map-style dataset over the features built by ``prepare_train_features``.

    Args:
        encodings: mapping from field name to an indexable collection with one
            entry per feature (tensors or plain Python lists). It must contain
            ``'input_ids'``, which defines the dataset length.
        answers: optional extra data kept on the instance as ``self.answers``;
            it is not used when building items.
    """

    def __init__(self, encodings, answers=None):
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        """Return a dict of independent tensors for feature ``idx``."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(tensor) copy-constructs and emits a UserWarning;
                # clone().detach() is the documented way to copy a tensor.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [None]:
# Tokenize each split into fixed-length features with answer labels and wrap
# the result in a torch Dataset.
train_tokenized = prepare_train_features(train_contexts, train_questions, train_answers)
train_dataset = QADataset(train_tokenized)

# The dev split goes through the same pipeline; it is used for validation loss during fine-tuning.
val_tokenized = prepare_train_features(val_contexts, val_questions, val_answers)
val_dataset = QADataset(val_tokenized)

# Fine-tuning

In [None]:
# Load roberta-base into a question-answering model. As the load message below
# reports, the qa_outputs weights are newly initialised and must be trained.
model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train on GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated.
# eps and weight_decay are pinned to the defaults transformers.AdamW used
# (1e-6 and 0.0) so the optimizer configuration stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Number of epochs
n_epochs = 2


def _qa_loss(batch):
    """Move one batch to `device`, run the model with answer labels and return its loss."""
    outputs = model(
        batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        start_positions=batch['start_positions'].to(device),
        end_positions=batch['end_positions'].to(device),
    )
    return outputs.loss


for epoch in range(n_epochs):
    # Training phase
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{n_epochs}'):
        optimizer.zero_grad()

        loss = _qa_loss(batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Training Loss: {avg_loss}')

    # Validation phase: eval mode and no gradient tracking
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            total_val_loss += _qa_loss(batch).item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Validation Loss: {avg_val_loss}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 1/2: 100%|██████████| 8239/8239 [45:48<00:00,  3.00it/s]


Epoch 1/2, Training Loss: 1.140290895537752
Epoch 1/2, Validation Loss: 1.166481977660362


Epoch 2/2: 100%|██████████| 8239/8239 [45:46<00:00,  3.00it/s]


Epoch 2/2, Training Loss: 0.8125494404558361
Epoch 2/2, Validation Loss: 1.136212824907613


# Save the model

In [None]:
# Output folder on the mounted Drive; created if missing (no error when it already exists).
directory_path = '/content/drive/MyDrive/NLP'
os.makedirs(directory_path, exist_ok=True)

In [None]:
# Serialize the whole fine-tuned model object to Drive.
# NOTE(review): torch.save(model, ...) pickles the entire module, so loading it
# presumably needs compatible torch/transformers class definitions; saving
# model.state_dict() or calling model.save_pretrained() is usually more portable.
# Confirm how downstream code loads this file before changing the format.
model_path = os.path.join(directory_path, 'model2_overlapV4.pth')
torch.save(model, model_path)