<a href="https://colab.research.google.com/github/sashkoangelov/NLP_final_project/blob/main/model2_overlap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [24]:
import torch
import random
import transformers
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, TrainingArguments, Trainer, AdamW
from torch.utils.data import Dataset, DataLoader
import os
import json
from pathlib import Path
from tqdm import tqdm

In [2]:
print(transformers.__version__)

4.38.2


#Data loading

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def read_data(path):
    with open(path, 'r', encoding='utf-8') as f:  # Use 'r' for reading text files, and specify encoding
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']

                # Check if 'answers' is present and has entries
                if 'answers' in qa and qa['answers']:
                    for answer in qa['answers']:
                        contexts.append(context)
                        questions.append(question)
                        answers.append(answer)
                else:
                    # Handle unanswerable questions
                    contexts.append(context)
                    questions.append(question)
                    answers.append({"text": "", "answer_start": -1})

    return contexts, questions, answers

In [5]:
train_contexts, train_questions, train_answers = read_data('/content/drive/MyDrive/NLP/train-v2.0.json')
val_contexts, val_questions, val_answers = read_data('/content/drive/MyDrive/NLP/dev-v2.0.json')

## Displaying random examples

In [6]:
def display_random_examples(contexts, questions, answers, num_examples=3):
    for _ in range(num_examples):
        rand = random.randint(0, len(contexts) - 1)
        context = contexts[rand]
        question = questions[rand]
        answer_text = answers[rand]['text'] if answers[rand]['text'] != '' else 'No answer'
        answer_start = answers[rand]['answer_start'] if answers[rand]['answer_start'] > -1 else -1

        print(f"Question: {question}\n")
        print(f"Answer: {answer_text}")
        print(f"Answer Starts at: {answer_start}\n")

        if answer_start != -1:
            pre_highlight = context[:answer_start]
            highlight = context[answer_start:answer_start+len(answer_text)]
            post_highlight = context[answer_start+len(answer_text):]

            # Apply coloring outside the string concatenation to avoid escape sequence disruption
            highlighted_context = pre_highlight + "\033[1;31m" + highlight + "\033[0m" + post_highlight
        else:
            highlighted_context = context

        wrapped_context = "\n".join([highlighted_context[i:i+80] for i in range(0, len(highlighted_context), 80)])
        print(f"Context:\n{wrapped_context}\n")
        print("-"*80)


In [23]:
display_random_examples(train_contexts, train_questions, train_answers, 3)

Question: What did Bortolazzi say about the sound? 

Answer: less pleasing...hard, zither-like tone
Answer Starts at: 430

Context:
In his 1805 mandolin method, Anweisung die Mandoline von selbst zu erlernen nebs
t einigen Uebungsstucken von Bortolazzi, Bartolomeo Bortolazzi popularised the C
remonese mandolin, which had four single-strings and a fixed bridge, to which th
e strings were attached. Bortolazzi said in this book that the new wire strung m
andolins were uncomfortable to play, when compared with the gut-string instrumen
ts. Also, he felt they had a "[1;31mless pleasing...hard, zither-like tone[0m"
 as compared to the gut string's "softer, full-singing tone." He favored the fou
r single strings of the Cremonese instrument, which were tuned the same as the N
eapolitan.

--------------------------------------------------------------------------------
Question: What did Bakker see as destabalizing?

Answer: No answer
Answer Starts at: -1

Context:
The revisionist paleontologis

# Data pre-processing

In [8]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

## Prepare features V2

In [9]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [13]:
def prepare_train_features(contexts, questions, answers):
    # Strip leading and trailing whitespace from the questions
    questions = [question.strip() for question in questions]

    # Tokenize with truncation on the context, allowing for overflow to handle long texts
    tokenized = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
        #return_tensors="pt"
    )

    # Extract and remove utility mappings from tokenized output
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    # Initialize position lists
    tokenized["start_positions"] = []
    tokenized["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized.sequence_ids(i)
        sample_index = sample_mapping[i]
        answer = answers[sample_index]

        # Check if the answer is provided
        if answer["answer_start"] == -1:
            tokenized["start_positions"].append(cls_index)
            tokenized["end_positions"].append(cls_index)
        else:
            start_char = answer["answer_start"]
            end_char = start_char + len(answer["text"])

            # Find tokens that overlap with the answer
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Check if the answer is within the span
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized["start_positions"].append(cls_index)
                tokenized["end_positions"].append(cls_index)
            else:
                # Adjust start and end positions within the token offsets
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized["start_positions"].append(token_start_index - 1)

                while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized["end_positions"].append(token_end_index + 1)

    return tokenized

In [17]:
class QADataset(torch.utils.data.Dataset):
    def __init__(self, encodings, answers=None):
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [18]:
train_tokenized = prepare_train_features(train_contexts, train_questions, train_answers)
train_dataset = QADataset(train_tokenized)

val_tokenized = prepare_train_features(val_contexts, val_questions, val_answers)
val_dataset = QADataset(val_tokenized)

#Fine-tuning

In [15]:
model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
import copy

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Number of epochs and early stopping parameters
n_epochs = 3
best_val_loss = float('inf')
best_model = None
patience = 1  # How many epochs to wait after last time validation loss improved.
patience_counter = 0

for epoch in range(n_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{n_epochs}'):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Training Loss: {avg_loss}')

    # Validation phase
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Validation Loss: {avg_val_loss}')

    # Early Stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model = copy.deepcopy(model.state_dict())
        patience_counter = 0
        print("Validation loss decreased, saving model...")
    else:
        patience_counter += 1
        print(f'Validation loss did not decrease, patience counter: {patience_counter}')

    if patience_counter > patience:
        print("Stopping early due to lack of improvement in validation loss.")
        break

# Load the best model found during training
model.load_state_dict(best_model)

Epoch 1/3: 100%|██████████| 8239/8239 [46:33<00:00,  2.95it/s]


Epoch 1/3, Training Loss: 1.1240454097911996
Epoch 1/3, Validation Loss: 1.0875867099244372
Validation loss decreased, saving model...


Epoch 2/3: 100%|██████████| 8239/8239 [46:29<00:00,  2.95it/s]


Epoch 2/3, Training Loss: 0.7997703331640542
Epoch 2/3, Validation Loss: 1.1251634506152597
Validation loss did not decrease, patience counter: 1


Epoch 3/3: 100%|██████████| 8239/8239 [46:29<00:00,  2.95it/s]


Epoch 3/3, Training Loss: 0.6427354060139655
Epoch 3/3, Validation Loss: 1.2355130884570067
Validation loss did not decrease, patience counter: 2
Stopping early due to lack of improvement in validation loss.


<All keys matched successfully>

In [20]:
directory_path = '/content/drive/MyDrive/NLP'
os.makedirs(directory_path, exist_ok=True)

In [22]:
model_path = os.path.join(directory_path, 'model2_overlapV2.pth')
torch.save(model, model_path)