<a href="https://colab.research.google.com/github/sashkoangelov/NLP_final_project/blob/main/model2_trunctuation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [1]:
%%capture

!pip install transformers datasets
!pip install datasets
!pip install pyngrok
!pip install git+https://github.com/huggingface/accelerate

In [2]:
import pandas as pd
import torch
import random
import accelerate
import transformers
import textwrap
import numpy as np
import matplotlib.pyplot as plt
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, TrainingArguments, Trainer, AdamW
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset, load_dataset, load_metric
from pyngrok import ngrok
from torch.utils.tensorboard import SummaryWriter
import os
import json
from pathlib import Path
from tqdm import tqdm

In [3]:
print(transformers.__version__)

4.38.2


#Data loading

In [4]:
def read_data(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

In [5]:
train_contexts, train_questions, train_answers = read_data('/content/drive/MyDrive/NLP/train-v2.0.json')
val_contexts, val_questions, val_answers = read_data('/content/drive/MyDrive/NLP/dev-v2.0.json')

## Displaying random examples

In [6]:
def display_random_examples(contexts, questions, answers, num_examples=3):

    for _ in range(num_examples):
        rand = random.randint(0, len(contexts) - 1)
        context = contexts[rand]
        question = questions[rand]

        answer_text = answers[rand]['text'] if answers[rand]['text'] != '' else 'No answer'
        answer_start = answers[rand]['answer_start'] if answers[rand]['answer_start'] != '' else -1

        print(f"Question: {question}\n")
        print(f"Answer: {answer_text}")
        print(f"Answer Starts at: {answer_start}\n")

        # Highlighting answer in context
        if answer_start != -1:
            highlighted_context = context[:answer_start] + "\033[1;31m" + context[answer_start:answer_start+len(answer_text)] + "\033[0m" + context[answer_start+len(answer_text):]
        else:
            highlighted_context = context

        # Wrapping context for readability
        wrapped_context = "\n".join([highlighted_context[i:i+80] for i in range(0, len(highlighted_context), 80)])
        print(f"Context:\n{wrapped_context}\n")
        print("-"*80)


In [7]:
display_random_examples(train_contexts, train_questions, train_answers, 3)

Question: Who was the Grand Service originally made for?

Answer: Prince of Wales
Answer Starts at: 346

Context:
State banquets also take place in the Ballroom; these formal dinners are held on
 the first evening of a state visit by a foreign head of state. On these occasio
ns, for up to 170 guests in formal "white tie and decorations", including tiaras
, the dining table is laid with the Grand Service, a collection of silver-gilt p
late made in 1811 for the [1;31mPrince of Wales[0m, later George IV. The large
st and most formal reception at Buckingham Palace takes place every November whe
n the Queen entertains members of the diplomatic corps. On this grand occasion, 
all the state rooms are in use, as the royal family proceed through them, beginn
ing at the great north doors of the Picture Gallery. As Nash had envisaged, all 
the large, double-mirrored doors stand open, reflecting the numerous crystal cha
ndeliers and sconces, creating a deliberate optical illusion of space and li

# Data pre-processing

In [8]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

In [9]:
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [10]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [12]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [13]:
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [14]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [15]:
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

#Fine-tuning

In [16]:
model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Number of epochs
n_epochs = 3

for epoch in range(n_epochs):
    total_loss = 0  # Track total loss for the epoch

    # Wrap train_loader with tqdm for a progress bar
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{n_epochs}')

    for batch in progress_bar:
        optimizer.zero_grad()

        # Transfer to GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Model forward pass
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss

        # Backward pass and optimizer step
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # Update total loss

        # Update progress bar with the instantaneous loss
        progress_bar.set_postfix({'loss': loss.item()})

    # Calculate and print the average loss for the epoch outside the progress bar
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Average Loss: {avg_loss}')

model.eval()

Epoch 1/3: 100%|██████████| 5427/5427 [41:24<00:00,  2.18it/s, loss=1.41]


Epoch 1/3, Average Loss: 1.091257085422027


Epoch 2/3: 100%|██████████| 5427/5427 [41:22<00:00,  2.19it/s, loss=0.612]


Epoch 2/3, Average Loss: 0.794190366313298


Epoch 3/3: 100%|██████████| 5427/5427 [41:19<00:00,  2.19it/s, loss=1.04]

Epoch 3/3, Average Loss: 0.6682244309963652





RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [21]:
directory_path = '/content/drive/MyDrive/NLP/second'
os.makedirs(directory_path, exist_ok=True)

In [23]:
model_path = os.path.join(directory_path, 'second.pth')  # It's good practice to add a file extension
torch.save(model, model_path)