In [None]:
#Importing all necessary and important libraries that would be used in this project
!pip install -U -q PyDrive
!pip install -U sentence-transformers
!pip install transformers
!pip install wandb
!pip install pytorch_lightning
!pip install matplotlib
!wandb login  #Requires the user to create an account on Weights & Biases tool
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os
from google.colab import files
import zipfile
import shutil
import glob
import pickle
import collections
import math
import random
import wandb
import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, LongformerTokenizerFast, AutoModelForQuestionAnswering
import collections
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AdamW
import time

In [None]:
#Initializing tokenizer and QA Longformer models that would be used in the subsequent cells/functions
# Everything below is a module-level global that later cells read directly.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

#LOAD INFORMATION RETRIEVAL MODEL - this ranks passages in a larger context in order to construct a smaller context
#Distilroberta trained on MS MARCO (open-domain QA dataset)
model_ir = SentenceTransformer('distilroberta-base-msmarco-v2').to(device)

#LOAD QUESTION ANSWERING MODEL - this performs the traditional QA task, which right now is span return
#Longformer trained on SQUaD 2.0; the tokenizer and the model come from the same checkpoint
model_name_or_path = "mrm8488/longformer-base-4096-finetuned-squadv2"
tokenizer_qa = LongformerTokenizerFast.from_pretrained(model_name_or_path)
model_qa = AutoModelForQuestionAnswering.from_pretrained(model_name_or_path).to(device)

In [None]:
#This function allows colab to hook up Google Drive and use it as local storage
def setup():
  """Authenticate, mount Google Drive, and prompt for the dataset upload.

  Side effects (all interactive / file-system):
    * authenticates the Colab user and builds a PyDrive client,
    * mounts Drive at /content/drive (forcing a remount),
    * recreates an empty "RulesDataset" folder under "My Drive" -- any
      existing folder of that name is deleted first,
    * changes the working directory into that folder,
    * opens the Colab file-upload widget.

  Returns:
    None.
  """
  # Authenticate and create the PyDrive client.
  print("Step 1. Authenticating Google account user")
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  pydrive_client = GoogleDrive(gauth)  # constructed but not used afterwards

  #Authenticate access to Google drive
  print("Step 2. Authenticating access to Google drive")
  from google.colab import drive as colab_drive
  colab_drive.mount('/content/drive', force_remount=True)
  os.chdir("/content/drive/My Drive")

  # Always start from an empty dataset folder: wipe a previous copy, then create it.
  directory_path = os.getcwd()+"/RulesDataset"
  if os.path.exists(directory_path):
    shutil.rmtree(directory_path)
  os.makedirs(directory_path)
  os.chdir(directory_path)

  #Upload zip file containing .txt files
  print("Step 3. Please upload only two files: InitialTest.zip and saved_csv_dict.p")
  files.upload()

In [None]:
#This function reads in the files uploaded in setup() and imports the finetuning data into variables
def process_data(zip_path="InitialTest.zip",
                 directory_path="/content/drive/My Drive/RulesDataset/InitialTest",
                 pickle_path="saved_csv_dict.p"):
  """Extract the rulebook archive and load the rulebook texts plus the pickled Q/A data.

  Args:
    zip_path: archive to extract; it is extracted into the current working directory.
    directory_path: folder that holds the extracted ``*.txt`` rulebook files.
    pickle_path: pickle file to load and return as ``csv_dict``.

  Returns:
    (data_dict, csv_dict) where ``data_dict`` maps each text file's base name
    (the part before the first '.') to the list of its non-empty lines, and
    ``csv_dict`` is whatever object the pickle file contains.
  """
  with zipfile.ZipFile(zip_path, "r") as zip_file:
    zip_file.extractall()

  data_dict = {}
  # The name is derived from the path itself. Pairing glob() results with a
  # separate os.listdir() call could attach a file's content to another file's
  # name, because the two listings are neither ordered nor filtered alike.
  for filepath in sorted(glob.glob(os.path.join(directory_path, "*.txt"))):
    with open(filepath, 'r') as file:
      lines = file.read().splitlines()
    name = os.path.basename(filepath).split('.')[0]
    data_dict[name] = [line for line in lines if line]  # drop empty lines

  # SECURITY: pickle.load can execute arbitrary code -- only load a file you created yourself.
  with open(pickle_path, "rb") as pickle_file:
    csv_dict = pickle.load(pickle_file)
  return data_dict, csv_dict

In [None]:
#This function returns the embeddings of the context passages (rules in the rulebook)
def get_context_embeddings(context_sentences):
  """Encode the passages with the global ``model_ir`` and return the result as a torch tensor.

  Args:
    context_sentences: passages handed unchanged to ``model_ir.encode``.

  Returns:
    ``torch.from_numpy`` applied to the encoder output.
  """
  encoded = model_ir.encode(context_sentences)
  return torch.from_numpy(encoded)

In [None]:
#This function reduces the size of the context by selecting top k passages in the context and concatenates them
def get_final_context_strings(questions, answers, context, top_k=3):
  """Build a reduced context for each question from its most similar passages.

  Questions and passages are embedded with the global ``model_ir``; passages are
  ranked per question by cosine similarity, the best ``top_k`` are kept, and the
  kept passages are re-ordered into document order before being joined.

  Args:
    questions: list of question strings.
    answers: not used; kept so existing callers do not need to change.
    context: list of passage strings in document order.
    top_k: number of passages to keep per question (3 by default). It is
      clamped to ``len(context)`` so short rulebooks do not make ``torch.topk`` fail.

  Returns:
    A list with one string per question: the selected passages joined by a single space.
  """
  num_questions = len(questions)
  if num_questions == 0 or len(context) == 0:
    # Nothing to rank: one empty context per question.
    return ["" for _ in range(num_questions)]

  k = min(top_k, len(context))
  with torch.no_grad():
    context_embeddings = get_context_embeddings(context)
    question_embeddings = torch.from_numpy(model_ir.encode(questions)) # -> Tensor, Size: Number of Questions x Embedding Size

  # Cosine similarity of every question against every passage -> Size: Number of Questions x Number of Passages
  cos_sim_sentences = util.pytorch_cos_sim(question_embeddings, context_embeddings)
  top_k_indices = torch.topk(cos_sim_sentences, k)[1] # keep the indices, discard the similarity values
  # Sort the chosen indices so passages appear in document order; [0] keeps the sorted values.
  sorted_indices_list = torch.sort(top_k_indices, dim=-1)[0].tolist()

  # Index into the `context` argument (not a notebook global) so the function is self-contained.
  return [" ".join(context[chunk] for chunk in row) for row in sorted_indices_list]

In [None]:
#Naive search for answer span start and end characters in a batch. Not reliable versus multiple possible span returns, but the answer spans in the fine-tuning set only have one possible gold string.

def get_answer_spans(questions, answers, final_context_strings):
  """Locate each gold answer in its reduced context and return token-level span labels.

  The contexts are tokenized on their own with the global ``tokenizer_qa`` and
  character offsets are mapped to tokens with ``char_to_token``. The returned
  indices are therefore positions within an encoding that begins with the
  context, which is how this notebook builds its training encodings
  (``tokenizer_qa(train_contexts, train_questions, ...)``).
  # assumes a context tokenized alone yields the same leading tokens as a
  # context passed as the first sequence of a pair -- TODO confirm

  Args:
    questions: list of question strings; only its length is used.
    answers: list of gold answer strings, parallel to ``questions``.
    final_context_strings: list of context strings, parallel to ``questions``.

  Returns:
    (ans_starts, ans_ends): two lists of ints with one entry per question.
    ``ans_starts[i]`` is the token index of the first answer token and
    ``ans_ends[i]`` is one past the token holding the last answer character.
    When the answer is empty, is not found in the context, or cannot be mapped
    to a token, the pair is (0, 1) -- the "no answer" label used before.

    NOTE(review): Hugging Face QA heads appear to treat ``end_positions`` as the
    index of the last answer token; confirm the one-past-the-end convention
    kept here is intended before training on these labels.
  """
  num_questions = len(questions)
  ans_starts = [0] * num_questions # Token index for answer start (default: no answer)
  ans_ends = [1] * num_questions # Token index for answer end (default: no answer)
  if num_questions == 0:
    return ans_starts, ans_ends

  # Tokenizing only the contexts keeps character offsets unambiguous; in a
  # (question, context) pair the same offset also exists in the question.
  context_encodings = tokenizer_qa(final_context_strings, truncation=True)

  for i in range(num_questions):
    answer = answers[i]
    # An empty answer would "match" at offset 0, so treat it as not found.
    ans_char_start = final_context_strings[i].find(answer) if answer else -1
    if ans_char_start == -1:
      continue # keep the no-answer label
    ans_char_end = ans_char_start + len(answer) # exclusive character index

    token_start = context_encodings.char_to_token(i, ans_char_start)
    token_last = context_encodings.char_to_token(i, ans_char_end - 1)
    if token_start is None or token_last is None:
      # The character maps to no token (e.g. whitespace or a truncated region).
      continue
    ans_starts[i] = token_start
    ans_ends[i] = token_last + 1

  return ans_starts, ans_ends

In [None]:
#This function computes F1 score between the gold sentence and the predicted sentence (sourced from SQuAD)
def compute_f1(a_gold, a_pred, tokenizer=None):
  """Token-overlap F1 between a gold answer string and a predicted answer string.

  Args:
    a_gold: gold answer text.
    a_pred: predicted answer text.
    tokenizer: callable returning a mapping with an ``'input_ids'`` list when
      called as ``tokenizer(text, add_special_tokens=False)``. Defaults to the
      notebook's global ``tokenizer_qa``.

  Returns:
    1 or 0 when either side has no tokens (1 only if both are empty), 0 when
    no token is shared, otherwise the harmonic mean of precision and recall.
  """
  if tokenizer is None:
    tokenizer = tokenizer_qa
  # Special tokens are excluded: if both sides carried the same begin/end
  # markers they would always overlap and the score could never reach 0.
  gold_ids = tokenizer(a_gold, add_special_tokens=False)['input_ids']
  pred_ids = tokenizer(a_pred, add_special_tokens=False)['input_ids']

  # Compare token counts (not the number of keys in the encoding).
  if len(gold_ids) == 0 or len(pred_ids) == 0:
    return int(gold_ids == pred_ids)

  common = collections.Counter(gold_ids) & collections.Counter(pred_ids)
  num_same = sum(common.values())
  if num_same == 0:
    return 0
  precision = 1.0 * num_same / len(pred_ids)
  recall = 1.0 * num_same / len(gold_ids)
  return (2 * precision * recall) / (precision + recall)

In [None]:
#Performs split on the rulebook data into training, evaluation and test with 80/10/10 split ratio
random.seed(123) #for reproducing results
def three_split(questions):
  """Randomly partition the question indices into train / validation / test ids.

  Sizes: train gets ceil(0.8 * n); validation gets ceil(half of the rest);
  test gets whatever is left. Sampling uses the module-level ``random`` state.

  Args:
    questions: sequence whose length n defines the index range 0..n-1.

  Returns:
    (train_ids, val_ids, test_ids). The first two are in sampling order; the
    test ids keep ascending order.
  """
  total = len(questions)
  train_count = math.ceil(float(0.8 * total))
  val_count = math.ceil(float(0.5 * (total - train_count)))

  pool = list(range(total))

  train_ids = random.sample(pool, train_count)
  picked = set(train_ids)
  pool = [idx for idx in pool if idx not in picked]  # remaining ids, order kept

  val_ids = random.sample(pool, val_count)
  picked = set(val_ids)
  test_ids = [idx for idx in pool if idx not in picked]

  return train_ids, val_ids, test_ids

In [None]:
#This function computes and prints the bi-gram statistics for a given input list of questions
def compute_stats(list_of_questions):
  """Print how often each leading word pair ("first-second") starts a question.

  Counts are printed in descending order; pairs with equal counts keep the
  order in which they were first seen. Questions with fewer than two
  whitespace-separated words have no leading bigram and are skipped.

  Args:
    list_of_questions: iterable of question strings.

  Returns:
    None; the statistics are written to stdout.
  """
  bigram_counts = collections.Counter()
  for question in list_of_questions:
    words = question.split()
    if len(words) < 2:
      continue # nothing to pair up
    bigram_counts[words[0]+"-"+words[1]] += 1

  print("The bigram statistics are:\n")
  # most_common() sorts by count, descending, and needs no extra import.
  for bigram, count in bigram_counts.most_common():
    print(bigram, count)
In [None]:
setup() #Perform setup: authenticate, mount Google Drive, recreate the RulesDataset folder and upload the dataset files (interactive)

In [None]:
context_dict, csv_dict = process_data() #Read in the Rulebook dataset: context_dict holds the rulebook text lines per file, csv_dict is loaded from saved_csv_dict.p

In [None]:
#Reducing size of context for each (question, answer, context) triplet in the dataset
keys = ["Pandemic", "Twilight_Struggle", "Terra_Mystica", "Food_Chain_Magnate", "Great_Western_Trail", "Catan", "Carcassonne", "Terraforming_Mars", "Power_Grid_Recharged"]

for key in keys:
  print("Processing: ",key)
  sentences = context_dict[key]
  questions = csv_dict[key][0]
  answers = csv_dict[key][1]
  final_context_strings = get_final_context_strings(questions, answers, sentences)
  # Store the reduced contexts in slot 2. Slice assignment (rather than append)
  # keeps the cell idempotent: re-running it replaces slot 2 and drops anything
  # previously stored after it, instead of shifting the slot indices that the
  # following cells rely on.
  csv_dict[key][2:] = [final_context_strings]

In [None]:
#Computing answer spans for each (question, answer, context) triplet in the dataset
for key in keys:
  print("Processing: ",key)
  questions = csv_dict[key][0]
  answers = csv_dict[key][1]
  final_context_strings = csv_dict[key][2]
  ans_starts, ans_ends = get_answer_spans(questions, answers, final_context_strings)
  # Slots 3 and 4 hold the span labels. Slice assignment keeps the cell
  # idempotent: a re-run overwrites these slots (and drops later ones) instead
  # of appending duplicates that would shift the slot indices used below.
  csv_dict[key][3:] = [ans_starts, ans_ends]

In [None]:
#Splitting the dataset into training, evaluation and test sets
# Re-seed here so the split does not depend on how many times (or in which
# order) earlier cells consumed the `random` module's state.
random.seed(123)
for key in keys:
  print("Processing: ",key)
  questions = csv_dict[key][0]
  train_ids, val_ids, test_ids = three_split(questions)
  # Slots 5, 6 and 7 hold the split ids. Slice assignment keeps the cell
  # idempotent: a re-run replaces them instead of appending a second set.
  csv_dict[key][5:] = [train_ids, val_ids, test_ids]

Layout of each `csv_dict[key]` list after the cells above: 0 - questions, 1 - answers, 2 - final context strings, 3 - ans_start, 4 - ans_end, 5 - train_ids, 6 - val_ids, 7 - test_ids

In [None]:
#Creating the actual train, evaluation and test dataset splits

# One flat (questions, contexts, answers) triple of lists per split, filled across all games.
train_questions, train_contexts, train_answers = [], [], []
eval_questions, eval_contexts, eval_answers = [], [], []
test_questions, test_contexts, test_answers = [], [], []

for key in keys:
  print("Processing: ",key)
  entry = csv_dict[key]
  questions, answers, contexts = entry[0], entry[1], entry[2]
  ans_start, ans_end = entry[3], entry[4]
  train_labels, eval_labels, test_labels = entry[5], entry[6], entry[7]

  # Each row pairs a split's question ids with the three lists that receive its examples.
  destinations = (
      (train_labels, train_questions, train_contexts, train_answers),
      (eval_labels, eval_questions, eval_contexts, eval_answers),
      (test_labels, test_questions, test_contexts, test_answers),
  )
  for labels, out_questions, out_contexts, out_answers in destinations:
    for i in labels:
      out_questions.append(questions[i])
      out_contexts.append(contexts[i])
      out_answers.append({"text": answers[i], 'answer_start': ans_start[i], 'answer_end': ans_end[i]})

In [None]:
#Obtain encodings for training and evaluation datasets
# Each call tokenizes (context, question) pairs with truncation and padding switched on.
# NOTE(review): contexts are passed as the first sequence here, while train_eval()
# tokenizes its test inputs with the question first -- confirm the ordering is intended.
train_encodings = tokenizer_qa(train_contexts, train_questions, truncation=True, padding=True)
eval_encodings = tokenizer_qa(eval_contexts, eval_questions, truncation=True, padding=True)

In [None]:
#This function adds token positions to the encodings based on dataset type
def add_token_positions(encodings, label_type):
  """Attach 'start_positions' and 'end_positions' to ``encodings`` in place.

  Reads the notebook globals ``keys`` and ``csv_dict``. For every game, the
  ids stored in ``csv_dict[key][label_type]`` select entries from the answer
  start list (slot 3) and answer end list (slot 4), in game order.

  Args:
    encodings: mapping with an ``update`` method; it is modified in place.
    label_type: slot index in ``csv_dict[key]`` that holds the ids of the
      wanted split (the notebook passes 5 for training and 6 for evaluation).

  Returns:
    None.
  """
  start_positions, end_positions = [], []
  for key in keys:
    print("Processing: ",key)
    entry = csv_dict[key]
    starts, ends, selected_ids = entry[3], entry[4], entry[label_type]
    spans = [(starts[i], ends[i]) for i in selected_ids]
    start_positions.extend(span[0] for span in spans)
    end_positions.extend(span[1] for span in spans)
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [None]:
#Defining custom class for the Rulebook Dataset
class BoardGameDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings stored as a mapping of column -> per-example values.

    ``encodings`` must provide ``items()`` and an ``'input_ids'`` entry; every
    value must be indexable by example position.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per encoding column."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the 'input_ids' column."""
        # Key lookup (instead of attribute access) also works for a plain dict,
        # consistent with __getitem__, which only needs a mapping.
        return len(self.encodings["input_ids"])

In [None]:
#Adding token position to training and evaluation dataset encodings and also creating the training and evaluation BoardGameDataset objects
# The second argument is the csv_dict slot that holds the ids of the split.
add_token_positions(train_encodings, 5)  # slot 5 = train_ids
add_token_positions(eval_encodings, 6)  # slot 6 = val_ids

# These two globals are read by train_eval() when it builds its DataLoaders.
train_dataset = BoardGameDataset(train_encodings)
eval_dataset = BoardGameDataset(eval_encodings)

In [None]:
#Defining the configuration file which will be used to perform hyperparameter sweep experiments
sweep_config = {
    'method': 'grid', #grid, random
    'metric': {
      # Must name a key that is actually passed to wandb.log. This notebook logs
      # "training loss", "eval loss" and "Average F1 score" -- never a bare "loss".
      'name': 'eval loss',
      'goal': 'minimize'
    },
    'parameters': {
        'accumulation_steps': {
            'values': [8, 16, 32]
        },
        'learning_rate': {
            'values': [1e-5,2e-5,3e-5,4e-5,5e-5]
        }
    }
}

# Defaults handed to wandb.init(config=...) in train_eval().
config_defaults = {
        # Middle of the sweep grid above, so a run started outside a sweep still
        # fine-tunes with a sensible step size (the previous default was 0.1).
        'learning_rate': 3e-5,
        'accumulation_steps' : 8,
        'train_batch_size' : 1,
        'eval_batch_size' : 2,
        'epochs' : 6

    }

#Initializing Weights & Biases project for performing hyperparameter sweep experiments
sweep_id = wandb.sweep(sweep_config, project="nlp_project_sweep_test")
model_list = [] #For storing each of the finetuned models

In [None]:
# Use this if you need to clear GPU memory.
# Removes the global QA model created in the initialization cell; train_eval() loads
# its own copy from model_name_or_path. Running this cell a second time raises
# NameError because model_qa no longer exists.
del model_qa
torch.cuda.empty_cache()

In [None]:
#Defining evaluation loss function
def eval_loss(eval_loader, model_temp):
  """Compute the mean loss over ``eval_loader`` and log it once as "eval loss".

  The model is put in eval mode for the pass and then restored to whatever mode
  it was in on entry, so a caller in the middle of a training loop keeps
  training with dropout enabled.

  Args:
    eval_loader: iterable of batches; each batch is a mapping with tensors under
      'input_ids', 'attention_mask', 'start_positions' and 'end_positions'.
    model_temp: model whose first output, when called with those tensors, is the loss.

  Returns:
    The example-weighted mean loss as a float, or None when the loader yields
    no batches (nothing is logged in that case).
  """
  was_training = model_temp.training
  model_temp.eval()
  total_loss = 0.0
  total_examples = 0
  try:
    with torch.no_grad():
      for batch in eval_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model_temp(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # Weight by batch size so a smaller final batch does not skew the mean.
        batch_size = input_ids.size(0)
        total_loss += outputs[0].item() * batch_size
        total_examples += batch_size
  finally:
    model_temp.train(was_training) # hand the model back in its original mode

  if total_examples == 0:
    return None
  mean_loss = total_loss / total_examples
  # One scalar per evaluation pass instead of one tensor per batch.
  wandb.log({"eval loss": mean_loss})
  return mean_loss

In [None]:
#Defining the function which performs finetuning of the Longformer model on the Rulebook dataset, the reference for which will be passed into Weights & Biases for performing hyperparameter sweep experiments
def train_eval():
  """Fine-tune a fresh QA model for one W&B run, then score it on the test split.

  Reads notebook globals: ``model_name_or_path``, ``device``, ``config_defaults``,
  ``train_dataset``, ``eval_dataset``, ``tokenizer_qa``, ``test_questions``,
  ``test_contexts``, ``test_answers`` and ``model_list``.

  Steps:
    1. Load the model and start a W&B run; hyperparameters come from ``wandb.config``.
    2. Train with gradient accumulation. After every optimizer step the mean
       training loss of the accumulation window is logged and ``eval_loss`` runs
       on the evaluation set.
    3. Predict a span for every test question and log the average token F1.
    4. Save the weights to 'model.h5', upload that file to W&B, and append the
       model to ``model_list``.

  Returns:
    None.
  """
  model_qa = AutoModelForQuestionAnswering.from_pretrained(model_name_or_path)
  model_qa = model_qa.to(device)
  torch.manual_seed(123)
  wandb.init(config=config_defaults)
  config = wandb.config
  accumulation_steps = config.accumulation_steps

  train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size, shuffle=True)
  eval_loader = DataLoader(eval_dataset, batch_size=config.eval_batch_size, shuffle=True)
  optim = AdamW(model_qa.parameters(), lr=config.learning_rate)

  wandb.watch(model_qa, log="all")

  optim.zero_grad()

  for epoch in range(config.epochs):
    print("At epoch: ",epoch)
    accumulator = 0 # Initialize accumulator for gradient accumulation
    window_loss = 0.0 # Sum of normalized losses = mean raw loss over the current window
    model_qa.train()
    for batch in train_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      outputs = model_qa(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      loss = outputs[0] / accumulation_steps # Normalize loss
      loss.backward()
      window_loss += loss.item()
      accumulator += 1
      if accumulator % accumulation_steps == 0: # If we've finished accumulating gradient, update model and evaluate.
        # Log the window mean rather than the last micro-batch's scaled loss, so
        # runs with different accumulation_steps are comparable.
        wandb.log({"training loss": window_loss})
        window_loss = 0.0
        optim.step()
        optim.zero_grad()
        eval_loss(eval_loader, model_qa) # get evaluation loss
        model_qa.train() # eval_loss switches the model to eval mode; resume training mode

    if accumulator % accumulation_steps != 0:
      # Apply the gradients of the incomplete last window so they do not leak
      # into the first update of the next epoch.
      optim.step()
      optim.zero_grad()

  #Computing Average F1 score between the set of predicted test answers and gold test answers
  model_qa.eval() # inference mode for the test pass
  f1_scores = []
  # NOTE(review): inputs here are (question, context), while train_encodings /
  # eval_encodings are built as (context, question) -- confirm this is intended.
  for current_test_question, current_test_context, current_test_answer in zip(test_questions, test_contexts, test_answers):
    test_inputs = tokenizer_qa(current_test_question, current_test_context, return_tensors='pt') # Tokenize the input using the QA model.
    with torch.no_grad():
      test_inputs = test_inputs.to(device)
      outputs = model_qa(**test_inputs) # Get outputs from QA model.
      # Index instead of tuple-unpacking: iterating a dict-style model output
      # would yield its keys rather than the score tensors.
      answer_start_scores, answer_end_scores = outputs[0], outputs[1]
      answer_start = torch.argmax(answer_start_scores)
      answer_end = torch.argmax(answer_end_scores) + 1
      inference_answer = tokenizer_qa.convert_tokens_to_string(tokenizer_qa.convert_ids_to_tokens(test_inputs["input_ids"][0][answer_start:answer_end])).lstrip(' ') # Remove leading whitespace
    # test_answers entries are dicts; the gold string is stored under "text".
    f1_scores.append(compute_f1(current_test_answer["text"], inference_answer))

  if f1_scores: # an empty test split has no average to report
    average_f1_score = float(sum(f1_scores) / len(f1_scores))
    wandb.log({"Average F1 score": average_f1_score})

  torch.save(model_qa.state_dict(), 'model.h5')
  wandb.save('model.h5')

  # NOTE(review): every run's model stays referenced here on `device`, so
  # empty_cache() below cannot release its memory -- confirm this fits on the
  # GPU for the whole sweep.
  model_list.append(model_qa)
  torch.cuda.empty_cache()

In [None]:
#Weights and Biases function callback to perform the actual hyperparameter sweep experiment
# Hands train_eval to the sweep created above (sweep_id); each invocation is one training run.
wandb.agent(sweep_id, function=train_eval)