In [1]:
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

--2021-06-03 03:25:56--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.110.153, 185.199.109.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2021-06-03 03:25:58 (67.6 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2021-06-03 03:25:58--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.109.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2021-06-03 03:25:59 (22.8 MB/s) - ‘squad/dev-v2.0.json’ saved [4370528/4370528]



In [2]:
!ls squad

dev-v2.0.json  train-v2.0.json


In [None]:
from transformers import BertTokenizerFast
import json
from pathlib import Path
import torch
from transformers import BertForQuestionAnswering
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score
from transformers import get_scheduler
from tqdm.auto import tqdm

In [3]:
def read_squad(path):
    if 'train' in path:
        limit = 21705
    elif 'dev' in path:
        limit = 5075
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    
    #contexts = contexts[:limit]
    #questions = questions[:limit]
    #answers = answers[:limit]

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')
# context means the passage here

In [5]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [6]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [8]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions}) # updating the dictionary

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [9]:
print(train_encodings.keys())
print(train_answers[0]['answer_start'], train_answers[0]['answer_end'], train_encodings.char_to_token(0, train_answers[0]['answer_start']))

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])
269 286 67


In [10]:
# as ususal convert encodings to torch.utils.data.Dataset format
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [12]:
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [13]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optimizer = AdamW(model.parameters(), lr=5e-5)


num_epochs = 1
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  0%|          | 0/5427 [00:00<?, ?it/s]

In [42]:
eval_loader = DataLoader(val_dataset, batch_size=8)

In [45]:
model.eval()
count = 0
actual_starts = []
predicted_starts = []
actual_ends = []
predicted_ends = []
progress_bar = tqdm(range(len(eval_loader)))
for batch in eval_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions']#.to(device)
    end_positions = batch['end_positions']#.to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)#, start_positions=start_positions, end_positions=end_positions)
    #print(outputs.keys())
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    predictions_start = torch.argmax(start_logits, dim=-1)
    predictions_end = torch.argmax(end_logits, dim=-1)
    actual_starts.append(start_positions.tolist())
    predicted_starts.append(predictions_start.tolist())
    actual_ends.append(end_positions.tolist())
    predicted_ends.append(predictions_end.tolist())
    progress_bar.update(1)

start_accuracy = 0.0
end_accuracy = 0.0
for ast, pst in zip(actual_starts, predicted_starts):
    start_accuracy += accuracy_score(ast, pst)
print(start_accuracy/len(actual_starts))
for aend, pend in zip(actual_ends, predicted_ends):
    end_accuracy += accuracy_score(aend, pend)
print(end_accuracy/len(actual_ends))

  0%|          | 0/2538 [00:00<?, ?it/s]

0.6779944838455477
0.7152777777777778


In [47]:
def answer_the_question(contexts, questions):
    '''
    Given contexts and questions, give the answer for each pair of context and question
    '''
    encodings = tokenizer(contexts, questions, truncation=True, padding=True)
    dataset = SquadDataset(encodings)
    loaded_data = DataLoader(dataset, batch_size=8)
    answers = []
    for batch in loaded_data:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        predictions_start = torch.argmax(start_logits, dim=-1)
        predictions_end = torch.argmax(end_logits, dim=-1)
        
        for input_id, pred_start, pred_end in zip(input_ids, predictions_start, predictions_end):
            tokens = tokenizer.convert_ids_to_tokens(input_id)
            answer = ''
            for i in range(pred_start+1, pred_end+1):
                if tokens[i][0:2] == '##':
                    answer += tokens[i][2:]
                else:    
                    answer += ' '+tokens[i]
            answers.append(answer)
    return answers

questions = val_questions[500:517]
contexts = val_contexts[500:517]
answers = answer_the_question(contexts, questions)
for context, question, answer in zip(contexts, questions, answers):
    print(context)
    print('-'*70)
    print(question)
    print('-'*70)
    print(answer)
    print('-'*70)

However, some computational problems are easier to analyze in terms of more unusual resources. For example, a non-deterministic Turing machine is a computational model that is allowed to branch out to check many different possibilities at once. The non-deterministic Turing machine has very little to do with how we physically want to compute algorithms, but its branching exactly captures many of the mathematical models we want to analyze, so that non-deterministic time is a very important resource in analyzing computational problems.
----------------------------------------------------------------------
What is the most critical resource in the analysis of computational problems associated with non-deterministic Turing machines?
----------------------------------------------------------------------
 - deterministic time
----------------------------------------------------------------------
For a precise definition of what it means to solve a problem using a given amount of time and spac