# BERT Integration in Question-Answering (QA) Systems

This notebook assumes the use of Google Colab. To run it locally, make sure all required packages are installed in your environment.

# Install & import necessary packages

In [None]:
!pip install transformers



In [None]:
import torch
import os
from tqdm import tqdm
import json

from google.colab import drive
# Mount Google Drive at /content/drive using the google.colab `drive` helper imported above.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Remove pretraining warnings, if desired

In [None]:
from transformers import logging
# Restrict transformers logging to error level so warning messages are not shown.
logging.set_verbosity_error()

Check GPU/cuda

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using {device}')

Using cuda


# Define desired save/load path

In [None]:
# Directory under the mounted Google Drive that the fine-tuned model and tokenizer
# are saved to (and optionally loaded from) later in this notebook.
source_path = '/content/drive/MyDrive/CS7641'

Create the path if it does not already exist

In [None]:
# Create the save directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(source_path, exist_ok=True)

# Get SQuAD Dataset

In [None]:
# Download the SQuAD v2.0 train and dev files into the working directory.
# -nc (no-clobber) skips the download when the file is already present.
!wget -nc https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
!wget -nc https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

File ‘train-v2.0.json’ already there; not retrieving.

File ‘dev-v2.0.json’ already there; not retrieving.



# Separate contexts, questions, and answers

In [None]:
def load_data(file_path):
    """Read a SQuAD-format JSON file and flatten it into parallel lists.

    One entry is produced per (question, answer) pair, so a question with
    several answers yields several entries and a question whose `answers`
    list is empty yields none.

    Each answer dict is updated in place with an `answer_end` key (exclusive
    character offset). Because the labelled `answer_start` can be one or two
    characters too far right, the offset is shifted left by up to two
    characters when that makes `context[start:end]` equal the answer text.
    If no shift matches, the labelled start is kept unchanged.

    Args:
        file_path: Path to a JSON file with the layout
            data -> paragraphs -> (context, qas -> (question, answers)).

    Returns:
        Tuple `(contexts, questions, answers)` of equal-length lists, where
        each answer is a dict holding at least `text`, `answer_start` and
        `answer_end`.
    """
    with open(file_path, 'rb') as f:
        dataset = json.load(f)

    contexts, questions, answers = [], [], []

    for data in dataset['data']:
        for paragraph in data['paragraphs']:
            context = paragraph['context']
            for qas in paragraph['qas']:
                question = qas['question']
                for answer in qas['answers']:
                    text = answer['text']
                    answer_start = answer['answer_start']

                    # Try the labelled offset first, then one and two characters
                    # to the left. Negative starts are skipped because a negative
                    # slice index would wrap around to the end of the context.
                    for shift in (0, 1, 2):
                        candidate = answer_start - shift
                        if candidate >= 0 and context[candidate:candidate + len(text)] == text:
                            answer_start = candidate
                            break

                    answer['answer_start'] = answer_start
                    answer['answer_end'] = answer_start + len(text)

                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

In [None]:
# The paths below point at the files downloaded above; edit them to load
# SQuAD from another location instead.
train_path = 'train-v2.0.json'
test_path = 'dev-v2.0.json'

# Flatten each split into parallel context / question / answer lists.
train_contexts, train_questions, train_answers = load_data(train_path)
test_contexts, test_questions, test_answers = load_data(test_path)

# Define tokenizer & encode data

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Both splits are tokenized as (context, question) pairs with the same settings.
tokenizer_options = dict(truncation=True, padding=True)

train_encodings = tokenizer(train_contexts, train_questions, **tokenizer_options)
test_encodings = tokenizer(test_contexts, test_questions, **tokenizer_options)

Convert each answer's character start/end offsets into token start/end positions

In [None]:
def answers_char_to_token(answers, encodings):
    """Attach token-level answer positions to `encodings` in place.

    For example `i`, the character offsets `answers[i]['answer_start']` and
    `answers[i]['answer_end'] - 1` (the last character of the answer) are
    mapped to token indices with `encodings.char_to_token`. When the lookup
    returns None (treated here as a truncated answer passage), the
    notebook-level `tokenizer.model_max_length` is stored instead.

    The resulting lists are added to `encodings` under the keys
    'answer_start_tokens' and 'answer_end_tokens'. Nothing is returned.
    """
    start_tokens = []
    end_tokens = []

    for example_idx, answer in enumerate(answers):
        first_token = encodings.char_to_token(example_idx, answer['answer_start'])
        last_token = encodings.char_to_token(example_idx, answer['answer_end'] - 1)

        # None means no token covers that character; substitute the sentinel value.
        if first_token is None:
            first_token = tokenizer.model_max_length
        if last_token is None:
            last_token = tokenizer.model_max_length

        start_tokens.append(first_token)
        end_tokens.append(last_token)

    encodings.update({'answer_start_tokens': start_tokens,
                      'answer_end_tokens': end_tokens})

In [None]:
# Add token-level answer positions to both splits (train first, then test).
for split_answers, split_encodings in ((train_answers, train_encodings),
                                       (test_answers, test_encodings)):
    answers_char_to_token(split_answers, split_encodings)

# Create Torch Dataset for SQuAD

In [None]:
class Custom_Dataset(torch.utils.data.Dataset):
    """Torch dataset that serves examples out of a tokenizer encodings object.

    `encodings` must expose `.items()` yielding (name, per-example sequence)
    pairs and an `input_ids` attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        super().__init__()
        self.encodings = encodings

    def __len__(self):
        # One dataset item per row of input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build a dict with one tensor per encoding field for example `idx`.
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        return example

In [None]:
# Wrap each split's encodings in a torch Dataset.
train_dataset, test_dataset = map(Custom_Dataset, (train_encodings, test_encodings))

# Define hyperparameters

In [None]:
# Training settings read by train() and by the evaluation cell.
hyperparameters = {
    'N_EPOCHS': 5,
    'learning_rate': 5e-5,
    'weight_decay': 0.01,
    'batch_size': 16,
}

# Get pretrained base model

In [None]:
from transformers import AutoModelForQuestionAnswering

# Load the pretrained "distilbert-base-uncased" checkpoint as a question-answering model.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# Train

In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader

def train(model, hyperparameters, train_dataset, device=None):
    """Fine-tune `model` in place on `train_dataset`.

    Args:
        model: Model called as `model(input_ids, attention_mask=...,
            start_positions=..., end_positions=...)`; the first element of
            its output is used as the loss.
        hyperparameters: Mapping with the keys 'learning_rate',
            'weight_decay', 'batch_size' and 'N_EPOCHS'.
        train_dataset: Dataset whose batches contain 'input_ids',
            'attention_mask', 'answer_start_tokens' and 'answer_end_tokens'.
        device: Device to train on. When None, CUDA is used if available,
            otherwise the CPU, and the choice is printed.

    Returns:
        None. The model's weights are updated as a side effect.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f'Using {device}')

    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(),
                      lr=hyperparameters['learning_rate'],
                      weight_decay=hyperparameters['weight_decay'])

    loader = DataLoader(train_dataset,
                        batch_size=hyperparameters['batch_size'],
                        shuffle=True)

    n_epochs = hyperparameters['N_EPOCHS']
    for epoch_idx in range(n_epochs):
        progress = tqdm(loader, leave=True)
        for batch in progress:
            optimizer.zero_grad()

            # Move only the tensors the model consumes onto the training device.
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            start_targets = batch['answer_start_tokens'].to(device)
            end_targets = batch['answer_end_tokens'].to(device)

            # Supplying the target positions makes the model return its loss first.
            outputs = model(ids,
                            attention_mask=mask,
                            start_positions=start_targets,
                            end_positions=end_targets)
            loss = outputs[0]

            loss.backward()
            optimizer.step()

            # Show the current epoch and the latest batch loss on the progress bar.
            progress.set_description(f'Epoch {epoch_idx+1}')
            progress.set_postfix(loss=loss.item())

In [None]:
# Fine-tune the model on the training dataset using the device selected earlier.
train(model, hyperparameters, train_dataset, device)

Epoch 1: 100%|██████████| 5427/5427 [21:28<00:00,  4.21it/s, loss=2.43]
Epoch 2: 100%|██████████| 5427/5427 [21:25<00:00,  4.22it/s, loss=1.51]
Epoch 3: 100%|██████████| 5427/5427 [21:27<00:00,  4.21it/s, loss=0.04]
Epoch 4: 100%|██████████| 5427/5427 [21:25<00:00,  4.22it/s, loss=0.263]
Epoch 5: 100%|██████████| 5427/5427 [21:24<00:00,  4.23it/s, loss=0.806]


# Save trained model, or load model from source path

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

def save_model(path, model, tokenizer):
    """Save `model` and then `tokenizer` to `path` via their `save_pretrained` methods."""
    for artifact in (model, tokenizer):
        artifact.save_pretrained(path)

def load_model(path, device=None):
    """Load a question-answering model and its tokenizer from `path`.

    Args:
        path: Location passed to `from_pretrained` for both the model and
            the tokenizer.
        device: Device to move the model to. When None, CUDA is used if
            available, otherwise the CPU, and the choice is printed.

    Returns:
        Tuple `(model, tokenizer)` with the model already moved to `device`.
    """
    loaded_model = AutoModelForQuestionAnswering.from_pretrained(path)
    loaded_tokenizer = AutoTokenizer.from_pretrained(path)

    if device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        print(f'Using {device}')

    return loaded_model.to(device), loaded_tokenizer

Comment out whichever of `save_model` or `load_model` you do not want to run

In [None]:
# Write the fine-tuned model and tokenizer to source_path.
save_model(source_path, model, tokenizer)
# Alternative: reload a previously saved model and tokenizer from source_path.
# model, tokenizer = load_model(source_path)

# Test accuracy of model

In [None]:
from torch.utils.data import DataLoader

def model_accuracy(model, test_dataset, batch_size):
    """Measure exact-match accuracy of predicted answer start/end tokens.

    Correct predictions are counted per example over the whole dataset, so a
    smaller final batch is weighted by its real size instead of counting as
    much as a full batch. Tensors are moved to the device the model's
    parameters are on, so the function does not rely on a notebook-level
    `device` variable.

    Args:
        model: Model called as `model(input_ids, attention_mask=...)` whose
            output provides 'start_logits' and 'end_logits'.
        test_dataset: Dataset whose batches contain 'input_ids',
            'attention_mask', 'answer_start_tokens' and 'answer_end_tokens'.
        batch_size: Number of examples per evaluation batch.

    Returns:
        Tuple `(start_accuracy, end_accuracy, accuracy)` of Python floats,
        where `accuracy` is the mean of the start and end accuracies.

    Raises:
        ZeroDivisionError: If `test_dataset` yields no examples.
    """
    # Follow the model's own device; fall back to CPU for a parameter-less model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    start_correct, end_correct, n_examples = 0, 0, 0

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader):
            attention_mask = batch['attention_mask'].to(model_device)
            input_ids = batch['input_ids'].to(model_device)
            outputs = model(input_ids, attention_mask=attention_mask)

            start_positions_true = batch['answer_start_tokens'].to(model_device)
            end_positions_true = batch['answer_end_tokens'].to(model_device)

            # The predicted position is the highest-scoring token along dim 1.
            start_positions_pred = torch.argmax(outputs['start_logits'], dim=1)
            end_positions_pred = torch.argmax(outputs['end_logits'], dim=1)

            start_correct += (start_positions_pred == start_positions_true).sum().item()
            end_correct += (end_positions_pred == end_positions_true).sum().item()
            n_examples += len(start_positions_pred)

    start_accuracy = start_correct / n_examples
    end_accuracy = end_correct / n_examples
    accuracy = (start_correct + end_correct) / (2 * n_examples)

    return start_accuracy, end_accuracy, accuracy

In [None]:
# Evaluate on the test split with the same batch size that was used for training.
eval_batch_size = hyperparameters['batch_size']
start_accuracy, end_accuracy, accuracy = model_accuracy(model, test_dataset, eval_batch_size)

# The blank line separates the metrics from the progress-bar output.
print()
print(f'Start Accuracy: {start_accuracy}')
print(f'End Accuracy: {end_accuracy}')
print(f'Accuracy: {accuracy}')

100%|██████████| 1269/1269 [01:38<00:00, 12.84it/s]


Start Accuracy: 0.6116247326487909
End Accuracy: 0.6617696724180352
Accuracy: 0.636697202533413



