# **Importing Libraries**

In [82]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [83]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from pprint import pprint
import random
from transformers import AutoTokenizer
import os, json


# **Question Answering using ALBERT Model**

In [129]:
# Function to read input from the specified path

def read_input(file: str) -> tuple:    
    location = os.path.join(os.getcwd(), file)
    with open(location, "rb") as json_input:
        dictionary = json.load(json_input)
    contexts, questions, answers = list(), list(), list()
    
# Extracting the contexts, questions and answers from the SQUAD 2.0 dataset

    for sample in dictionary['data']:
        for passage in sample['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                
# Answers and Plausible answers separation

                access = "plausible_answers" if "plausible_answers" in qa.keys() else 'answers'
                for answer in qa[access]:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    
    return contexts, questions, answers

In [130]:
train_contexts, train_questions, train_answers = read_input('train-v2.0.json')
valid_contexts, valid_questions, valid_answers = read_input('dev-v2.0.json')

In [131]:
# Randomly printing 5 contexts, questions and answers from the dataset

random.seed(89)

ind = random.sample(range(0, len(train_contexts)), 5)
for index in ind:
    print(f'Q:  {train_questions[index]}\n')
    print("Context:\n")
    pprint(train_contexts[index])
    print(f"\nAnswer:[{train_answers[index]}]\n")
    print("-" * 100)

Q:   What style came before Modernism?

Context:

('Architects such as Mies van der Rohe, Philip Johnson and Marcel Breuer '
 'worked to create beauty based on the inherent qualities of building '
 'materials and modern construction techniques, trading traditional historic '
 'forms for simplified geometric forms, celebrating the new means and methods '
 'made possible by the Industrial Revolution, including steel-frame '
 'construction, which gave birth to high-rise superstructures. By mid-century, '
 'Modernism had morphed into the International Style, an aesthetic epitomized '
 "in many ways by the Twin Towers of New York's World Trade Center designed by "
 'Minoru Yamasaki.')

Answer:[{'text': 'International', 'answer_start': 464}]

----------------------------------------------------------------------------------------------------
Q:  What label avoided this scene?

Context:

('The innovative production techniques devised by post-punk producers such as '
 'Martin Hannett and Denni

In [132]:
# Since the dataset only has start_index, this function calculates the End index and appends it to the train_answers and valid_answers variables

def answer_end(answers: list, contexts: list) -> list:
    _answers = answers.copy()
    for answer, context in zip(_answers, contexts):
        answer_bound = answer['text']
        start_idx = answer['answer_start']
        answer['answer_end'] = start_idx + len(answer_bound)
    return _answers

In [133]:
train_answers = answer_end(train_answers, train_contexts)
valid_answers = answer_end(valid_answers, valid_contexts)

In [134]:
# Intializing the autotokenizer library for the model 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained('albert-base-v2', use_fast=True)

In [135]:
# Encoding the dataset contexts, questions and answers

def encode_data(contexts: list, questions: list, answers: list) -> dict:

# Tokenizing the input and extracting the encodings 

    encodings = tokenizer(contexts, questions, truncation=True, padding=True, return_tensors="pt")
    start_positions, end_positions = list(), list()

    for index in range(len(answers)):
        start_value = encodings.char_to_token(index, answers[index]['answer_start'])
        end_value   = encodings.char_to_token(index, answers[index]['answer_end'])

# if start position is None, answer is truncated

        if start_value is None:
            start_value = tokenizer.model_max_length
        
# If end position cannot be found, shift position until found

        shift = 1
        while end_value is None:
            end_value = encodings.char_to_token(index, answers[index]['answer_end'] - shift)
            shift += 1

        start_positions.append(start_value)
        end_positions.append(end_value)

    encodings.update({
        'start_positions': start_positions, 'end_positions': end_positions
    })
    return encodings

In [136]:
# Training the entire datset was taking more than 6 days to train. So we have reduced the input dataset size to 10000 contexts and evaluation size to 500 questions for simplicity

train_encodings = encode_data(train_contexts[0:5000], train_questions[0:5000], train_answers[0:5000])
valid_encodings = encode_data(valid_contexts[0:500], valid_questions[0:500], valid_answers[0:500])

In [137]:
# Deleting the old variables containing non-encoded data

del train_contexts, train_questions, train_answers
del valid_contexts, valid_questions, valid_answers

In [138]:
# Importing the pytorch for model bulding and intitializing dataset

import torch
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings: dict) -> None:
        self.encodings = encodings

    def __getitem__(self, index: int) -> dict:
        return {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings['input_ids'])

In [139]:
train_ds = SquadDataset(train_encodings)
valid_ds = SquadDataset(valid_encodings)

In [140]:
from transformers import AutoModelForQuestionAnswering
model = AutoModelForQuestionAnswering.from_pretrained('albert-base-v2')
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForQuestionAnswering: ['predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN t

AlbertForQuestionAnswering(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias

In [141]:
"""
This cell is adopted from `https://github.com/michaelrzhang/lookahead/blob/master/lookahead_pytorch.py`, which is the
source code of `Lookahead Optimizer: k steps forward, 1 step back` paper (https://arxiv.org/abs/1907.08610).
"""

from collections import defaultdict

import torch
from torch.optim.optimizer import Optimizer


class Lookahead(Optimizer):
    r"""PyTorch implementation of the lookahead wrapper.
    Lookahead Optimizer: https://arxiv.org/abs/1907.08610
    """

    def __init__(self, optimizer, la_steps=5, la_alpha=0.8, pullback_momentum="none"):
        """optimizer: inner optimizer
        la_steps (int): number of lookahead steps
        la_alpha (float): linear interpolation factor. 1.0 recovers the inner optimizer.
        pullback_momentum (str): change to inner optimizer momentum on interpolation update
        """
        self.optimizer = optimizer
        self._la_step = 0  # counter for inner optimizer
        self.la_alpha = la_alpha
        self._total_la_steps = la_steps
        pullback_momentum = pullback_momentum.lower()
        assert pullback_momentum in ["reset", "pullback", "none"]
        self.pullback_momentum = pullback_momentum

        self.state = defaultdict(dict)

        # Cache the current optimizer parameters
        for group in optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['cached_params'] = torch.zeros_like(p.data)
                param_state['cached_params'].copy_(p.data)
                if self.pullback_momentum == "pullback":
                    param_state['cached_mom'] = torch.zeros_like(p.data)

    def __getstate__(self):
        return {
            'state': self.state,
            'optimizer': self.optimizer,
            'la_alpha': self.la_alpha,
            '_la_step': self._la_step,
            '_total_la_steps': self._total_la_steps,
            'pullback_momentum': self.pullback_momentum
        }

    def zero_grad(self):
        self.optimizer.zero_grad()

    def get_la_step(self):
        return self._la_step

    def state_dict(self):
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        self.optimizer.load_state_dict(state_dict)

    def _backup_and_load_cache(self):
        """Useful for performing evaluation on the slow weights (which typically generalize better)
        """
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['backup_params'] = torch.zeros_like(p.data)
                param_state['backup_params'].copy_(p.data)
                p.data.copy_(param_state['cached_params'])

    def _clear_and_load_backup(self):
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                p.data.copy_(param_state['backup_params'])
                del param_state['backup_params']

    @property
    def param_groups(self):
        return self.optimizer.param_groups

    def step(self, closure=None):
        """Performs a single Lookahead optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = self.optimizer.step(closure)
        self._la_step += 1

        if self._la_step >= self._total_la_steps:
            self._la_step = 0
            # Lookahead and cache the current optimizer parameters
            for group in self.optimizer.param_groups:
                for p in group['params']:
                    param_state = self.state[p]
                    p.data.mul_(self.la_alpha).add_(param_state['cached_params'], alpha=1.0 - self.la_alpha)  # crucial line
                    param_state['cached_params'].copy_(p.data)
                    if self.pullback_momentum == "pullback":
                        internal_momentum = self.optimizer.state[p]["momentum_buffer"]
                        self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.la_alpha).add_(
                            1.0 - self.la_alpha, param_state["cached_mom"])
                        param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"]
                    elif self.pullback_momentum == "reset":
                        self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data)

        return loss

**Model training**

In [142]:
# Initialize adam optimizer with weight decay to minimize overfit

from transformers import AdamW

base  = AdamW(model.parameters(), lr=1e-4)
optim = Lookahead(base)

In [143]:
from torch.utils.data import DataLoader
from tqdm import tqdm

import warnings
warnings.simplefilter("ignore")


# Initialize data loader for training data

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)


for epoch in range(5):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                        start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 313/313 [08:31<00:00,  1.63s/it, loss=1.43]
Epoch 1: 100%|██████████| 313/313 [08:28<00:00,  1.62s/it, loss=0.341]
Epoch 2: 100%|██████████| 313/313 [08:29<00:00,  1.63s/it, loss=0.515]
Epoch 3: 100%|██████████| 313/313 [08:30<00:00,  1.63s/it, loss=0.297]
Epoch 4: 100%|██████████| 313/313 [08:29<00:00,  1.63s/it, loss=0.0659]


In [144]:
# Saving the model in a local directory

MODEL_DIR = "./model"
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
model.save_pretrained(MODEL_DIR)
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
MODEL_DIR = "./model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DIR)

**Model Evaluation**

In [145]:
from torch.utils.data import DataLoader

# Switching model to evaluation mode

model.eval()
model = model.to(device)
val_loader = DataLoader(valid_ds, batch_size=16)
acc = list()
for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        
# Calculating average accuracy in total

print(f"Score of the model based on EM: {sum(acc)/len(acc)}")

Score of the model based on EM: 0.5234375


In [146]:
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score

# Switching model to evaluation mode

model.eval()
model = model.to(device)
val_loader = DataLoader(valid_ds, batch_size=16)
start_true_all, end_true_all = [], []
start_pred_all, end_pred_all = [], []
for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        start_true_all.extend(start_true.tolist())
        end_true_all.extend(end_true.tolist())
        start_pred_all.extend(start_pred.tolist())
        end_pred_all.extend(end_pred.tolist())

# Calculating F1 score for start and end positions

start_f1 = f1_score(start_true_all, start_pred_all, average='macro')
end_f1 = f1_score(end_true_all, end_pred_all, average='macro')
overall_f1 = (start_f1 + end_f1) / 2

print(f"F1 score of the model: {overall_f1:.3f}")


F1 score of the model: 0.378


In [147]:
# Returns answer to a given a question
def answer_to_questions(context: str, questions: list) -> list:
    encodings = tokenizer([context]*len(questions), questions, truncation=True, padding=True, return_tensors="pt")
    encodings = encodings.to(device)
    outputs = model(**encodings)
    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)  
    answers = list()
    for index, (start_idx, end_idx) in enumerate(zip(start_pred, end_pred)):
        tokens = tokenizer.convert_ids_to_tokens(encodings['input_ids'][index][start_idx:end_idx+1])
        answers.append(tokenizer.convert_tokens_to_string(tokens) )
    print("Context:")
    pprint(context)
    print()
    for question, answer in zip(questions, answers):
        print(f"Q:  {question}")
        print(f"A:  {answer}")
        print("-"*60)
    return answers

**Sample answers from the model**

In [148]:
context = "The modern Olympic Games or Olympics (French: Jeux olympiques)[1][2] are leading international sporting events featuring summer and winter sports competitions in which thousands of athletes from around the world participate in a variety of competitions. The Olympic Games are considered the world's foremost sports competition with more than 200 nations participating.[3] The Olympic Games are normally held every four years, alternating between the Summer and Winter Olympics every two years in the four-year period."
questions = [
    "How often do the Olympic games hold?",
    "How many nations do participate in each Olympic?","what is olympics in french called as?"
]

_ = answer_to_questions(context, questions)

Context:
('The modern Olympic Games or Olympics (French: Jeux olympiques)[1][2] are '
 'leading international sporting events featuring summer and winter sports '
 'competitions in which thousands of athletes from around the world '
 'participate in a variety of competitions. The Olympic Games are considered '
 "the world's foremost sports competition with more than 200 nations "
 'participating.[3] The Olympic Games are normally held every four years, '
 'alternating between the Summer and Winter Olympics every two years in the '
 'four-year period.')

Q:  How often do the Olympic games hold?
A:  every four years,
------------------------------------------------------------
Q:  How many nations do participate in each Olympic?
A:  more than 200
------------------------------------------------------------
Q:  what is olympics in french called as?
A:  jeux olympiques)
------------------------------------------------------------


In [149]:
context = "Vikings is the modern name given to seafaring people primarily from Scandinavia (present-day Denmark, Norway and Sweden), who from the late 8th to the late 11th centuries raided, pirated, traded and settled throughout parts of Europe. They also voyaged as far as the Mediterranean, North Africa, the Middle East, and North America. In some of the countries they raided and settled in, this period is popularly known as the Viking Age, and the term \"Viking\" also commonly includes the inhabitants of the Scandinavian homelands as a collective whole. The Vikings had a profound impact on the Early medieval history of Scandinavia, the British Isles, France, Estonia, and Kievan Rus'."
questions = [
    "When vikings started raided?","who are vikings?","Vikings had impact on which period?"
]

_ = answer_to_questions(context, questions)

Context:
('Vikings is the modern name given to seafaring people primarily from '
 'Scandinavia (present-day Denmark, Norway and Sweden), who from the late 8th '
 'to the late 11th centuries raided, pirated, traded and settled throughout '
 'parts of Europe. They also voyaged as far as the Mediterranean, North '
 'Africa, the Middle East, and North America. In some of the countries they '
 'raided and settled in, this period is popularly known as the Viking Age, and '
 'the term "Viking" also commonly includes the inhabitants of the Scandinavian '
 'homelands as a collective whole. The Vikings had a profound impact on the '
 'Early medieval history of Scandinavia, the British Isles, France, Estonia, '
 "and Kievan Rus'.")

Q:  When vikings started raided?
A:  late 8th to the late 11th centuries
------------------------------------------------------------
Q:  who are vikings?
A:  seafaring people
------------------------------------------------------------
Q:  Vikings had impact on which