# CS 525 Assignment 3
Sirut Buasai, sbuasai2@wpi.edu

### Imports and Downloads

In [None]:
import numpy as np
import pandas as pd
import torch
import json
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import BertForQuestionAnswering
from transformers import BertTokenizerFast
from tqdm import tqdm

### Data Retrieval and Processing
### Process JSON Data Format into Dataframe

In [None]:
# declare load data function for preprocessing
def load_data(path):
  """Flatten a SQuAD-style JSON file into a dataframe with one row per answer.

  Args:
    path: Path to a JSON file laid out as data -> paragraphs -> qas -> answers.

  Returns:
    A pandas DataFrame with the columns 'id', 'context', 'question' and
    'answer'. A question with several answers yields one row per answer; a
    question with an empty 'answers' list yields a single row whose answer
    is a placeholder dict with empty text.
  """
  # load the json file (explicit encoding so the platform default is not used)
  with open(path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

  # initialize return lists
  ids = []
  contexts = []
  questions = []
  answers = []

  # iterate through the json file and place each data into their respective lists
  for data in raw_data['data']:
    for topic in data['paragraphs']:
      context = topic['context']
      for qa in topic['qas']:
        # Unanswerable question: build a fresh placeholder for this row so
        # later in-place edits of one answer dict cannot leak into other rows.
        # NOTE(review): the placeholder offset of 10 is kept from the original
        # code; confirm it is the intended "no answer" position.
        qa_answers = qa['answers'] or [{'text': '', 'answer_start': 10}]

        for answer in qa_answers:
          ids.append(qa['id'])
          contexts.append(context)
          questions.append(qa['question'])
          answers.append(answer)

  # initialize dataframe
  return pd.DataFrame({
    'id': ids,
    'context': contexts,
    'question': questions,
    'answer': answers
  })

# dataset file locations (training split and dev split used for testing)
train_json, test_json = 'train-v2.0.json', 'dev-v2.0.json'

# parse each split into its own dataframe
train_data = load_data(train_json)
test_data = load_data(test_json)

### Sample Subset of Data

In [None]:
# keep only the leading 10% of the training rows for faster training
train_size = int(len(train_data) * 0.1)
train_data = train_data.iloc[:train_size]

# the test split is kept at full size for the final predictions
test_size = len(test_data)
test_data = test_data.iloc[:test_size]

### Create End Index for Each Answer

In [None]:
# declare function that adds an end index to each answer
def add_end_idx(answers, contexts):
  """Add an 'answer_end' character offset to every answer dict, in place.

  The end offset is 'answer_start' plus the length of the answer text. If the
  text is not found at the labelled position, the span is retried one and two
  characters to the left and 'answer_start' is corrected when a match is
  found. If nothing matches, the unadjusted span is kept so that
  'answer_end' is always present.

  Args:
    answers: Iterable of dicts with the keys 'text' and 'answer_start'.
    contexts: Iterable of context strings, aligned with `answers`.

  Returns:
    None. Each answer dict is mutated.
  """
  for answer, context in zip(answers, contexts):
    answer_text = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(answer_text)

    # default to the labelled span so the key exists even if nothing matches
    answer['answer_end'] = end_idx

    # labels can be off by one or two characters, so try shifts of 0, 1, 2
    for shift in range(3):
      shifted_start = start_idx - shift
      if shifted_start < 0:
        # a negative start would wrap around to the end of the context
        break

      shifted_end = end_idx - shift
      if context[shifted_start:shifted_end] == answer_text:
        answer['answer_start'] = shifted_start
        answer['answer_end'] = shifted_end
        break

# annotate the answers of both splits with their end index (train first, then test)
for _split in (train_data, test_data):
  add_end_idx(_split['answer'], _split['context'])

### Tokenize Dataset Based on Context and Questions

In [None]:
# load the pretrained fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# encode every (context, question) pair, with truncation and padding enabled
train_encodings = tokenizer(
  train_data['context'].tolist(),
  train_data['question'].tolist(),
  truncation=True,
  padding=True,
)
test_encodings = tokenizer(
  test_data['context'].tolist(),
  test_data['question'].tolist(),
  truncation=True,
  padding=True,
)

### Process Each Answer Starting and Ending Index Positions as Encodings

In [None]:
# declare function that adds answer token positions to the encodings
def add_token_positions(encodings, answers, max_length=None):
  """Attach the token-level start and end positions of each answer to `encodings`.

  Args:
    encodings: Tokenizer output providing `char_to_token(batch_index,
      char_index)` and `update(mapping)`.
    answers: Iterable of dicts with 'answer_start' and 'answer_end' character
      offsets, in the same order as the encoded examples.
    max_length: Position stored when a character offset maps to no token.
      Defaults to `tokenizer.model_max_length`.

  Returns:
    None. `encodings` gains the keys 'starts' and 'ends'.
  """
  if max_length is None:
    max_length = tokenizer.model_max_length

  # initialize starting and ending encoding positions
  starts = []
  ends = []

  # iterate positionally so the result does not depend on the index labels
  # of a pandas Series passed as `answers`
  for i, answer in enumerate(answers):
    start_token = encodings.char_to_token(i, answer['answer_start'])
    # 'answer_end' is exclusive, so the last answer character is one before it
    end_token = encodings.char_to_token(i, answer['answer_end'] - 1)

    # handle truncated answers: only a missing mapping (None) is replaced,
    # a legitimate token index of 0 is kept
    starts.append(max_length if start_token is None else start_token)
    ends.append(max_length if end_token is None else end_token)

  encodings.update({'starts': starts, 'ends': ends})

# attach answer token positions to the training encodings, then the testing encodings
for _encodings, _split in ((train_encodings, train_data), (test_encodings, test_data)):
  add_token_positions(_encodings, _split['answer'])

### Define Dataloaders for Training

In [None]:
# create custom Dataset class for torch Dataloader
class QnA_Dataset(torch.utils.data.Dataset):
  """Torch Dataset that serves one example at a time from tokenizer encodings."""

  def __init__(self, encodings):
    # keep a reference to the encodings; nothing is copied here
    self.encodings = encodings

  def __len__(self):
    # one example per entry of input_ids
    return len(self.encodings.input_ids)

  def __getitem__(self, idx):
    # collect the idx-th entry of every encoding field as a tensor
    example = {}
    for field, values in self.encodings.items():
      example[field] = torch.tensor(values[idx])
    return example

# wrap both sets of encodings in the custom torch Dataset
train_dataset, test_dataset = QnA_Dataset(train_encodings), QnA_Dataset(test_encodings)

# batch the datasets; only the training loader shuffles
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

## Fine-Tuning Question and Answering BERT

In [None]:
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# initialize model and optimizer
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
optim = AdamW(model.parameters(), lr=5e-5)
model.to(device)
model.train()

# training loop
# the epoch count has its own name so the loop variable does not overwrite it
num_epochs = 5
for epoch in range(num_epochs):
  loop = tqdm(train_loader, leave=True)
  # the label is constant within an epoch, so set it once rather than per batch
  loop.set_description(f'Epoch {epoch+1}')
  for batch in loop:
    optim.zero_grad()

    # move the batch onto the same device as the model
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    starts = batch['starts'].to(device)
    ends = batch['ends'].to(device)

    # the first output is the loss, since start and end positions are passed
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=starts, end_positions=ends)
    loss = outputs[0]
    loss.backward()
    optim.step()

    loop.set_postfix(loss=loss.item())

## Evaluating Question and Answering BERT
### Declare Evaluation Functions for SQuAD Dataset

In [None]:
def get_prediction(context, question):
  """Return the answer span the fine-tuned model predicts for one question.

  Args:
    context: Passage in which to look for the answer.
    question: Question about the passage.

  Returns:
    The decoded text of the tokens from the highest-scoring start position
    through the highest-scoring end position. The string is empty when the
    predicted end precedes the predicted start.
  """
  # encode as (context, question), the same order as the training encodings;
  # feeding (question, context) would not match what the model was tuned on
  inputs = tokenizer(context, question, return_tensors='pt', truncation=True, max_length=512).to(device)

  # inference only: turn off dropout and skip gradient bookkeeping
  model.eval()
  with torch.no_grad():
    outputs = model(**inputs)

  # the end index is made exclusive for slicing
  answer_start = torch.argmax(outputs[0])
  answer_end = torch.argmax(outputs[1]) + 1

  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

  return answer

def normalize_text(s):
  """Lowercase the text, strip punctuation, drop the articles a/an/the and collapse whitespace."""
  import re
  import string

  # lowercase first so the article pattern only needs lowercase forms
  lowered = s.lower()

  # delete every ASCII punctuation character
  without_punc = lowered.translate(str.maketrans("", "", string.punctuation))

  # blank out standalone articles
  without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punc, flags=re.UNICODE)

  # squeeze runs of whitespace into single spaces and trim the ends
  return " ".join(without_articles.split())

def compute_f1(prediction, truth):
  """Compute the token-level F1 score between a prediction and a reference answer.

  Both strings are normalized with `normalize_text` and split on whitespace.
  Overlap is counted per occurrence, so a token that appears twice in both
  strings counts twice.

  Args:
    prediction: Predicted answer text.
    truth: Reference answer text.

  Returns:
    1 or 0 when either side has no tokens after normalization (1 only if both
    are empty), 0 when no tokens are shared, otherwise the F1 score rounded
    to two decimals.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if not pred_tokens or not truth_tokens:
    return int(pred_tokens == truth_tokens)

  # multiset intersection: each shared token counts as often as it occurs in both
  overlap = Counter(pred_tokens) & Counter(truth_tokens)
  num_same = sum(overlap.values())

  # if there are no common tokens then f1 = 0
  if num_same == 0:
    return 0

  prec = num_same / len(pred_tokens)
  rec = num_same / len(truth_tokens)

  return round(2 * (prec * rec) / (prec + rec), 2)

### Convert Predictions into JSON File

In [None]:
# get predictions: the dataframe has one row per answer, so a question id can
# repeat; run the model once per unique id and reuse the result for its rows
unique_questions = test_data.drop_duplicates(subset='id')
predictions_json = {
  row['id']: get_prediction(row['context'], row['question'])
  for _, row in unique_questions.iterrows()
}
test_data['predictions'] = test_data['id'].map(predictions_json)

# convert predictions into json (id -> predicted answer text)
with open('pred.json', 'w', encoding='utf-8') as f:
  json.dump(predictions_json, f)