In [None]:
# Install dependencies & download training files
!pip install transformers
!pip install torch

!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O squad/train-v1.1.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O squad/dev-v1.1.json


Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[?25l[K     |                                | 10 kB 25.7 MB/s eta 0:00:01[K     |▏                               | 20 kB 29.0 MB/s eta 0:00:01[K     |▎                               | 30 kB 31.3 MB/s eta 0:00:01[K     |▍                               | 40 kB 22.5 MB/s eta 0:00:01[K     |▌                               | 51 kB 14.6 MB/s eta 0:00:01[K     |▋                               | 61 kB 12.7 MB/s eta 0:00:01[K     |▊                               | 71 kB 14.0 MB/s eta 0:00:01[K     |▉                               | 81 kB 15.4 MB/s eta 0:00:01[K     |█                               | 92 kB 13.9 MB/s eta 0:00:01[K     |█                               | 102 kB 14.4 MB/s eta 0:00:01[K     |█▏                              | 112 kB 14.4 MB/s eta 0:00:01[K     |█▎                              | 122 kB 14.4 MB/s eta 0:00:01[K     |█▍                              | 133 kB 14.4

In [1]:
# Data processing
# Note: it seems like the questions have IDs but the contexts don't
# Perhaps we can just hash the contents of the context as the ID?
import csv
import json


with open("./squad/train-v1.1.json") as json_file:
  train_data = json.load(json_file)["data"]
with open("./squad/dev-v1.1.json") as json_file:
  dev_data = json.load(json_file)["data"]

with open('contexts.csv', 'w') as csvfile:
    fieldnames = ['id', 'context']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for i in range(0, len(train_data)):
      for j in range(0, len(train_data[i]["paragraphs"])):
        writer.writerow({'id': str(i) + "x" + str(j), 'context': train_data[i]["paragraphs"][j]["context"]})

with open('qa.csv', 'w') as csvfile:
    fieldnames = ['question', 'answer', 'context_id', 'start_pos']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for i in range(0, len(train_data)):
      for j in range(0, len(train_data[i]["paragraphs"])):
        for l in range(0, len(train_data[i]["paragraphs"][j]["qas"])):
          writer.writerow({'context_id': str(i) + "x" + str(j), 'question': train_data[i]["paragraphs"][j]["qas"][l]["question"],  'answer': train_data[i]["paragraphs"][j]["qas"][l]["answers"][0]["text"], "start_pos": train_data[i]["paragraphs"][j]["qas"][l]["answers"][0]["answer_start"]})

with open('testset.csv', 'w') as csvfile:
    fieldnames = ['question', 'answers', 'context_id']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for i in range(0, len(train_data)):
      for j in range(0, len(train_data[i]["paragraphs"])):
        for l in range(0, len(train_data[i]["paragraphs"][j]["qas"])):
          a = "|".join(list(set(map(lambda x: x["text"], train_data[i]["paragraphs"][j]["qas"][l]["answers"]))))
          writer.writerow({'context_id': str(i) + "x" + str(j), 'question': train_data[i]["paragraphs"][j]["qas"][l]["question"],  'answers': a})


In [5]:
# Training + dataset loading code
# Code modified from https://huggingface.co/transformers/custom_datasets.html
import json
import torch
import pandas as pd

def read_squad():
    contexts = pd.read_csv("contexts.csv").set_index('id').to_dict()['context']
    questions = pd.read_csv("qa.csv")
    
    contexts_list = list(map(lambda q: contexts[q[1]['context_id']], questions.iterrows()))
    questions_list = list(questions['question'])
    answers = list(map(lambda q: {'start': q[1]['start_pos'], 'text': q[1]['answer']}, questions.iterrows()))
    
    return contexts_list, questions_list, answers

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

def add_end_idx(answers, contexts):
    to_delete = list()
    for i, (answer, context) in enumerate(zip(answers, contexts)):
        gold_text = answer['text']
        if isinstance(answer['text'], float):
            print(answer['text'])
            to_delete.append(i)
            continue
            
        start_idx = answer['start']
        end_idx = start_idx + len(gold_text)
        
        answer['end'] = end_idx

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['start'] = start_idx - 1
            answer['end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['start'] = start_idx - 2
            answer['end'] = end_idx - 2     # When the gold label is off by two characters
    for i in to_delete:
        del answers[i]
        del contexts[i]


from transformers import BertTokenizerFast, BertConfig, BertModel


def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start': start_positions, 'end': end_positions})

from torch.utils.data import DataLoader
from transformers import AdamW


def train(train_dataset):
  # Initializing a BERT bert-base-uncased style configuration
  config = BertConfig()

  # Initializing a model from the bert-base-uncased style configuration
  model = BertModel(config)

  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  model.to(device)
  model.train()

  train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

  optim = AdamW(model.parameters(), lr=5e-5)

  for epoch in range(3):
      for batch in train_loader:
          optim.zero_grad()
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start'].to(device)
          end_positions = batch['end'].to(device)
          outputs = model(input_ids, attention_mask=attention_mask, start=start_positions, end=end_positions)
          loss = outputs[0]
          loss.backward()
          optim.step()

  model.eval()
  return model


In [None]:
# Train model on baseline data

train_contexts, train_questions, train_answers = read_squad()

add_end_idx(train_answers, train_contexts)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
add_token_positions(train_encodings, train_answers)

train_dataset = SquadDataset(train_encodings)
model = train(train_dataset)
torch.save(model.state_dict(), "base_model.json")

nan


In [2]:
# Generate adversarial examples

In [None]:
# Train model on adversairal examples

In [None]:
# Eval baseline and adversarial models