<a href="https://colab.research.google.com/github/sashkoangelov/NLP_final_project/blob/main/model2_overlap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [2]:
%%capture


In [4]:
import pandas as pd
import torch
import random
#import accelerate
import transformers
import textwrap
import numpy as np
import matplotlib.pyplot as plt
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, TrainingArguments, Trainer, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import os
import json
from pathlib import Path
from tqdm import tqdm

In [5]:
# Record the installed transformers version alongside the results
print(transformers.__version__)

4.38.2


# Data loading

In [8]:
# Colab-only: mount Google Drive so files under /content/drive are reachable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
def read_data(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    One entry is produced per element of every question's ``answers`` list,
    so a question with several answers is repeated and a question whose
    ``answers`` list is empty contributes no entry at all.

    Args:
        path: Location of the JSON file (string or path-like).

    Returns:
        Tuple ``(contexts, questions, answers)`` of equal-length lists holding
        the paragraph text, the question text and the raw answer object.
    """
    with Path(path).open('rb') as handle:
        squad_dict = json.load(handle)

    def _iter_rows():
        # Walk article -> paragraph -> question -> answer, yielding flat rows
        for article in squad_dict['data']:
            for paragraph in article['paragraphs']:
                passage_text = paragraph['context']
                for qa in paragraph['qas']:
                    question_text = qa['question']
                    for answer in qa['answers']:
                        yield passage_text, question_text, answer

    contexts, questions, answers = [], [], []
    for passage_text, question_text, answer in _iter_rows():
        contexts.append(passage_text)
        questions.append(question_text)
        answers.append(answer)

    return contexts, questions, answers

In [9]:
# Flatten the train and dev JSON files stored on the mounted Drive into parallel lists
train_contexts, train_questions, train_answers = read_data('/content/drive/MyDrive/NLP/train-v2.0.json')
val_contexts, val_questions, val_answers = read_data('/content/drive/MyDrive/NLP/dev-v2.0.json')

## Displaying random examples

In [10]:
def display_random_examples(contexts, questions, answers, num_examples=3):
    """Print randomly chosen question/answer/context triples.

    The answer span is wrapped in ANSI bold-red escape codes inside the
    context, and the context is then cut into fixed 80-character lines
    (escape codes count towards that width).

    Args:
        contexts: List of context strings.
        questions: List of question strings, parallel to ``contexts``.
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys,
            parallel to ``contexts``.
        num_examples: How many examples to print (drawn with replacement).
    """
    line_width = 80

    for _ in range(num_examples):
        rand = random.randint(0, len(contexts) - 1)
        context, question, record = contexts[rand], questions[rand], answers[rand]

        # An empty string in either field is displayed as "no answer"
        answer_text = record['text']
        if answer_text == '':
            answer_text = 'No answer'
        answer_start = record['answer_start']
        if answer_start == '':
            answer_start = -1

        print(f"Question: {question}\n")
        print(f"Answer: {answer_text}")
        print(f"Answer Starts at: {answer_start}\n")

        # Surround the answer span with bold-red / reset escape sequences
        highlighted_context = context
        if answer_start != -1:
            answer_stop = answer_start + len(answer_text)
            highlighted_context = "".join([
                context[:answer_start],
                "\033[1;31m",
                context[answer_start:answer_stop],
                "\033[0m",
                context[answer_stop:],
            ])

        # Hard-wrap into fixed-width chunks for readability
        chunks = []
        for offset in range(0, len(highlighted_context), line_width):
            chunks.append(highlighted_context[offset:offset + line_width])
        wrapped_context = "\n".join(chunks)
        print(f"Context:\n{wrapped_context}\n")
        print("-"*80)


In [11]:
# Print three randomly drawn training examples with the answer span highlighted
display_random_examples(train_contexts, train_questions, train_answers, 3)

Question: Where is Bond brought after he is kidnapped?

Answer: the old MI6 building
Answer Starts at: 524

Context:
Bond and Swann return to London where they meet M, Bill Tanner, Q, and Moneypenn
y; they intend to arrest C and stop Nine Eyes from going online. Swann leaves Bo
nd, telling him she cannot be part of a life involving espionage, and is subsequ
ently kidnapped. On the way, the group is ambushed and Bond is kidnapped, but th
e rest still proceed with the plan. After Q succeeds in preventing the Nine Eyes
 from going online, a brief struggle between M and C ends with the latter fallin
g to his death. Meanwhile, Bond is taken to [1;31mthe old MI6 building[0m, whi
ch is scheduled for demolition, and frees himself. Moving throughout the ruined 
labyrinth, he encounters a disfigured Blofeld, who tells him that he has three m
inutes to escape the building before explosives are detonated or die trying to s
ave Swann. Bond finds Swann and the two escape by boat as the building co

# Data pre-processing

In [12]:
# Load the pretrained roberta-base tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# Guard: the feature-preparation code asks for offset mappings, so fail early if this is not a fast tokenizer
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

## Prepare features V2

In [24]:
# Peek at one raw answer record to see its fields
train_answers[0]

{'text': 'in the late 1990s', 'answer_start': 269}

In [13]:
max_length = 384 # Maximum length of one feature (question plus context window); passed to the tokenizer as max_length
doc_stride = 128 # Overlap between two consecutive parts of a context when it has to be split; passed to the tokenizer as stride

In [39]:
def prepare_train_features(contexts, questions, answers):
    """Tokenize question/context pairs into windows and label answer tokens.

    Long contexts are split into overlapping windows (``max_length`` /
    ``doc_stride``), so one input example may yield several features. Each
    feature gets a start and end token index for the answer; when the
    answer is missing or does not lie fully inside the feature's context
    window, both labels point at the CLS token.

    Relies on the notebook-level ``tokenizer``, ``max_length`` and
    ``doc_stride``.

    Args:
        contexts: List of context strings.
        questions: List of question strings, parallel to ``contexts``.
        answers: List of dicts with ``'text'`` and ``'answer_start'``
            (character offset into the context), parallel to ``contexts``.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and ``start_positions`` /
        ``end_positions`` (lists of ints, one per feature) added.
    """
    # Strip leading and trailing whitespace from the questions
    questions = [question.strip() for question in questions]

    # Tokenize with truncation on the context, allowing for overflow to handle long texts
    tokenized = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Extract and remove utility mappings from tokenized output
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        # Use the FIRST CLS occurrence so a stray extra match cannot make .item() fail
        cls_index = (input_ids == tokenizer.cls_token_id).nonzero(as_tuple=True)[0][0].item()
        sequence_ids = tokenized.sequence_ids(i)
        answer = answers[int(sample_mapping[i])]

        answer_text = answer.get("text") if answer else None
        start_char = answer.get("answer_start") if answer else None

        # "No answer" means the record is missing/empty. An answer_start of 0
        # is a perfectly valid answer at the very beginning of the context
        # and must NOT be treated as unanswerable.
        if not answer_text or start_char is None or start_char < 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        end_char = start_char + len(answer_text)

        # First and last token of this feature that belong to the context (sequence id 1)
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1

        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # Answer not fully inside this window -> label the feature with CLS
        if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Advance to the first token starting after the answer start, then step back one.
        # The scan is bounded by the context window: special and padding tokens
        # have offset (0, 0) and would otherwise keep the loop running to the
        # end of the padded sequence when the answer starts on the last context token.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Mirror image for the end: walk left to the last token ending before
        # the answer end, then step forward one; never leave the context window.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    return tokenized

In [37]:
class QADataset(torch.utils.data.Dataset):
    """Dataset view over a mapping of per-feature columns.

    ``encodings`` maps column names to indexable sequences; indexing the
    dataset returns one row as a dict, leaving out the tokenizer's utility
    columns. ``answers`` is stored as given and only acts as a switch that
    makes ``__getitem__`` copy the label columns explicitly.
    """

    # Bookkeeping columns that are never handed to the model
    _UTILITY_KEYS = ('overflow_to_sample_mapping', 'offset_mapping')

    def __init__(self, encodings, answers=None):
        self.answers = answers
        self.encodings = encodings

    def __len__(self):
        # Number of features equals the number of input_ids rows
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            if key in self._UTILITY_KEYS:
                continue
            item[key] = val[idx]

        if self.answers is not None:
            # Take the labels straight from the preprocessed columns
            for label_key in ('start_positions', 'end_positions'):
                item[label_key] = self.encodings[label_key][idx]
        return item


In [40]:
# Turn the training split into labelled features and wrap them as a Dataset
train_tokenized = prepare_train_features(train_contexts, train_questions, train_answers)
train_dataset = QADataset(train_tokenized)

# Same preprocessing for the validation split
val_tokenized = prepare_train_features(val_contexts, val_questions, val_answers)
val_dataset = QADataset(val_tokenized)

## Prepare features V1

In [None]:
def add_end_idx(answers, contexts):
    """Add an ``'answer_end'`` character offset to every answer, in place.

    The recorded ``'answer_start'`` is sometimes one or two characters too
    large, so the answer text is looked for at the recorded position and at
    one and two characters earlier; on a match ``'answer_start'`` is
    corrected as well.

    If no alignment matches, ``'answer_end'`` falls back to
    ``answer_start + len(text)`` so that every answer carries the key and
    later cells do not fail with ``KeyError``.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'``;
            mutated in place.
        contexts: List of context strings, parallel to ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two;
        # shifts that would move the start below 0 are skipped because a
        # negative slice start would silently index from the end of the context
        matched_shift = next(
            (
                shift
                for shift in (0, 1, 2)
                if start_idx - shift >= 0
                and context[start_idx - shift:end_idx - shift] == gold_text
            ),
            None,
        )

        if matched_shift is None:
            # No alignment found: keep the recorded start, still provide an end
            answer['answer_end'] = end_idx
        else:
            answer['answer_start'] = start_idx - matched_shift
            answer['answer_end'] = end_idx - matched_shift

In [None]:
# Annotate both answer lists in place with their end character offsets
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
# Encode (context, question) pairs for the V1 pipeline. The results are bound
# to train_encodings / val_encodings because those are the names the following
# V1 cells (add_token_positions, SquadDataset) read.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
def add_token_positions(encodings, answers):
    """Convert character-level answer spans to token indices, in place.

    For each answer the start character and the last answer character
    (``answer_end - 1``) are mapped through ``encodings.char_to_token``.
    A ``None`` result is replaced by ``tokenizer.model_max_length``. The two
    resulting lists are stored on ``encodings`` under ``'start_positions'``
    and ``'end_positions'``.

    Args:
        encodings: Tokenizer output providing ``char_to_token`` and ``update``.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``.
    """
    def _token_for(sample_idx, char_idx):
        token_idx = encodings.char_to_token(sample_idx, char_idx)
        # if None, the answer passage has been truncated
        return tokenizer.model_max_length if token_idx is None else token_idx

    start_positions, end_positions = [], []
    for sample_idx in range(len(answers)):
        span = answers[sample_idx]
        start_positions.append(_token_for(sample_idx, span['answer_start']))
        end_positions.append(_token_for(sample_idx, span['answer_end'] - 1))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [None]:
# Store start/end token positions on both encodings (updated in place)
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Dataset that serves each row of ``encodings`` as a dict of tensors."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One item per row of input_ids (read as an attribute of the encodings)
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        item = {}
        for key, column in self.encodings.items():
            # Each column is indexed by row and converted to a fresh tensor
            item[key] = torch.tensor(column[idx])
        return item

In [None]:
# Wrap the V1 encodings as Datasets (rebinds train_dataset / val_dataset)
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

# Fine-tuning

In [29]:
# Load roberta-base into a question-answering model; it still has to be fine-tuned below
model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
import copy

# Run on the GPU when torch can see one, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Epoch budget and early-stopping bookkeeping
n_epochs = 10
patience = 2  # Number of consecutive non-improving epochs tolerated before stopping
best_val_loss = float('inf')
best_model = None
patience_counter = 0


def _batch_loss(batch):
    """Move one batch to `device`, run the model on it and return the loss tensor."""
    keyword_tensors = {
        name: batch[name].to(device)
        for name in ('attention_mask', 'start_positions', 'end_positions')
    }
    return model(batch['input_ids'].to(device), **keyword_tensors).loss


for epoch in range(n_epochs):
    # ---- training pass: one optimizer step per batch ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{n_epochs}'):
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Training Loss: {avg_loss}')

    # ---- validation pass: no gradients, eval mode ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            total_val_loss += _batch_loss(batch).item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Validation Loss: {avg_val_loss}')

    # ---- early stopping: snapshot the weights on every strict improvement ----
    improved = avg_val_loss < best_val_loss
    if improved:
        best_val_loss = avg_val_loss
        best_model = copy.deepcopy(model.state_dict())
        patience_counter = 0
        print("Validation loss decreased, saving model...")
    else:
        patience_counter += 1
        print(f'Validation loss did not decrease, patience counter: {patience_counter}')

    if patience_counter >= patience:
        print("Stopping early due to lack of improvement in validation loss.")
        break

# Restore the weights from the epoch with the lowest validation loss
model.load_state_dict(best_model)

Epoch 1/10:   0%|          | 0/5487 [00:00<?, ?it/s]

Epoch 1/10, Training Loss: 1.0863919592803133
Epoch 1/10, Validation Loss: 1.053797155401981
Validation loss decreased, saving model...


Epoch 2/10:   0%|          | 0/5487 [00:00<?, ?it/s]

Epoch 2/10, Training Loss: 0.8049255220172986
Epoch 2/10, Validation Loss: 1.0078285601533927
Validation loss decreased, saving model...


Epoch 3/10:   0%|          | 0/5487 [00:00<?, ?it/s]

Epoch 3/10, Training Loss: 0.6511369250064364
Epoch 3/10, Validation Loss: 1.067973772290942
Validation loss did not decrease, patience counter: 1


Epoch 4/10:   0%|          | 0/5487 [00:00<?, ?it/s]

Epoch 4/10, Training Loss: 0.5460568185714407
Epoch 4/10, Validation Loss: 1.2335976451030504
Validation loss did not decrease, patience counter: 2


Epoch 5/10:   0%|          | 0/5487 [00:00<?, ?it/s]

Epoch 5/10, Training Loss: 0.47305200736700176
Epoch 5/10, Validation Loss: 1.2352900299026943
Validation loss did not decrease, patience counter: 3
Stopping early due to lack of improvement in validation loss.


<All keys matched successfully>

In [None]:
# Output folder on the mounted Drive; created if it does not exist yet
directory_path = '/content/drive/MyDrive/NLP'
os.makedirs(directory_path, exist_ok=True)

In [None]:
# Persist the fine-tuned model. Note that this serialises the whole model
# object, not only its state_dict.
model_path = os.path.join(directory_path, 'model2_overlap.pth')
torch.save(model, model_path)