<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/ai-powered-search/14_3_qa_model_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In this notebook, we're going to install a transformer model, analyze the embedding output, and compare some vectors.

In [None]:
#outdoors
# Clone the outdoors dataset repository only when no 'outdoors' directory exists yet.
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
# Update an already existing checkout.
! cd outdoors && git pull
# Join the split archive parts back into a single outdoors.tgz.
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
# Extract the archive into data/outdoors/ next to the 'outdoors' checkout.
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

Cloning into 'outdoors'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 25 (delta 0), reused 22 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 491.39 MiB | 16.98 MiB/s, done.
Updating files: 100% (23/23), done.
Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-sq

In [None]:
%%capture

# Install the `datasets` package (imported below for Dataset / DatasetDict);
# the %%capture magic on the first line hides the installer output.
!pip install datasets

In [None]:
import sys
import os
import time
sys.path.append("../..")
import pandas as pd
import numpy as np
import pickle
import json
import tqdm
import random

import torch

from transformers import pipeline, RobertaTokenizerFast, RobertaForQuestionAnswering
from transformers import RobertaForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk


from IPython.display import display,HTML

In [None]:
def get_processor_type():
    """Return the torch device that computations should run on.

    Returns:
        torch.device: ``cuda:0`` when CUDA is available, otherwise ``cpu``.
    """
    # A torch.device object is always truthy, so `gpu_device or cpu_device`
    # would hand back the GPU device even on a machine without CUDA.
    # Ask torch explicitly instead.
    if torch.cuda.is_available():
        return torch.device("cuda:0")
    return torch.device("cpu")

def get_processor_device():
    """Return the device index for a pipeline: 0 with CUDA, -1 without."""
    if torch.cuda.is_available():
        return 0
    return -1

In [None]:
# Report which device the helpers above resolved to.
print(f"Processor: {get_processor_type()}")
print(f"Device id: {get_processor_device()}")

Processor: cuda:0
Device id: 0


In [None]:
# loading a question-answering dataset
# Each row pairs a question with a context passage, plus an id and source url
# (see the displayed head). The path is relative to the working directory.
contexts = pd.read_csv("data/outdoors/question-answer-seed-contexts.csv")
contexts.head()

Unnamed: 0,id,question,context,url
0,4410,Who places the anchors that rock climbers use?,There are two distinct styles of free rock cli...,https://outdoors.stackexchange.com/questions/4410
1,5347,"Who places the bolts on rock climbing routes, ...",What you're talking about is Sport climbing. G...,https://outdoors.stackexchange.com/questions/5347
2,20662,Who gets the bill if you activate a PLB to hel...,"Almost always the victim gets the bill, but as...",https://outdoors.stackexchange.com/questions/2...
3,7623,What knot is this one? What are its purposes?,Slip knot It's undoubtably a slip knot that's ...,https://outdoors.stackexchange.com/questions/7623
4,11587,"What sort of crane, and what sort of snake?","To answer the snake part of it, looking at som...",https://outdoors.stackexchange.com/questions/1...


## Creating silver dataset

In [None]:
def get_processor_device():
    """Return 0 when CUDA is available and -1 otherwise.

    NOTE: this re-declares the helper of the same name from an earlier cell;
    both definitions have the same contract.
    """
    cuda_ready = torch.cuda.is_available()
    return 0 if cuda_ready else -1

def display_guesses(guesses):
    """Render the first ten guesses as an HTML table without the index column."""
    preview = pd.DataFrame(guesses[0:10])
    table_html = preview.to_html(index=False)
    display(HTML(table_html))

In [None]:
# Checkpoint name handed to pipeline() as both model and tokenizer in answer_questions.
model_name = "deepset/roberta-base-squad2"
# Pipeline device index: 0 when CUDA is available, -1 otherwise.
device = get_processor_device()

In [None]:
def answer_questions(contexts, k=10):
  """Answer the leading rows of ``contexts`` with the pretrained QA pipeline.

  Args:
    contexts: DataFrame providing ``question`` and ``context`` columns.
    k: Upper bound on the number of leading rows to answer.

  Returns:
    A list with one pipeline result per processed row, in row order.
  """
  nlp = pipeline('question-answering', model=model_name, tokenizer=model_name, device=device)
  # Slice once so the progress-bar total reflects the rows actually iterated,
  # even when k is larger than the number of available rows.
  selected = contexts[0:k]
  guesses = []
  for _, row in tqdm.tqdm(selected.iterrows(), total=len(selected)):
    results = nlp({
        'context': row['context'],
        'question': row['question']
    })
    guesses.append(results)
  return guesses

In [None]:
# Automatically labeling data from a pretrained model
# k=len(contexts) runs the model over every row instead of the default first 10.
guesses = answer_questions(contexts, k=len(contexts))

In [None]:
# Show the first ten guesses as an HTML table.
display_guesses(guesses)

score,start,end,answer
0.278927,474,516,a local enthusiast or group of enthusiasts
0.200848,81,117,the person who is creating the climb
0.018632,14,24,the victim
0.222317,29,38,slip knot
0.000551,1255,1262,aquatic
0.374997,15,40,a high-tech treasure hunt
0.563755,192,232,"a tube of lightweight, stretchy material"
0.110915,125,154,the cheapest one of the three
0.805174,68,76,blocking
0.247008,227,265,the traditional longbow made from wood


In [None]:
# Attach each guess to its row; this sets the "answers" column on contexts in place.
contexts["answers"] = guesses
# Write the questions, contexts and guesses to CSV without the DataFrame index.
contexts.to_csv("data/outdoors/qa-squad2-guesses.csv", index=False)

## Human-in-the-loop training

In [None]:
random.seed(0)

def get_training_data(filename, seed=0):
    """Build shuffled train/test/validation datasets from a golden-answers CSV.

    A row is kept only when it carries a ``class`` label and every
    ``|``-separated gold answer occurs verbatim in the row's context.

    Args:
        filename: Path to the CSV. The code reads the ``class``, ``gold``,
            ``context`` and ``question`` columns and selects ``id`` and ``url``
            for the output.
        seed: Seed for the row shuffle, so the splits are reproducible.

    Returns:
        DatasetDict with ``train``, ``test`` and ``validation`` splits.
    """
    golden_answers = pd.read_csv(filename)
    # `series != None` evaluates to True for every element (NaN included), so
    # it never filtered anything; notna() really drops the unlabeled rows.
    golden_answers = golden_answers[golden_answers["class"].notna()]
    qa_data = []
    for _, row in golden_answers.iterrows():
        answers = row["gold"].split("|")
        starts = [row["context"].find(a) for a in answers]
        # str.find returns -1 when an answer does not occur in the context;
        # such rows cannot be labeled with a character offset, so skip them.
        if -1 in starts:
            continue
        row["title"] = row["question"]
        row["answers"] = {"text": answers, "answer_start": starts}
        qa_data.append(row)
    columns = ["id", "url", "title", "question", "context", "answers"]
    # random.seed() does not influence pandas' sampling, so the shuffle is
    # seeded explicitly through random_state.
    df = pd.DataFrame(qa_data, columns=columns).sample(frac=1, random_state=seed)
    # First 75% of the rows train; the remainder is divided so that roughly
    # 80% of it becomes the test split and the rest the validation split.
    train_split = int(len(df) * 0.75)
    eval_split = (int((len(df) - train_split) / 1.25) +
                  train_split - 1)
    train_dataset = Dataset.from_pandas(df[:train_split])
    test_dataset = Dataset.from_pandas(df[train_split:eval_split])
    validation_dataset = Dataset.from_pandas(df[eval_split:])
    return DatasetDict({"train": train_dataset, "test": test_dataset,"validation": validation_dataset})

In [None]:
#It took about 2-3 hours to label 200 question/answer rows
#Doing so will give you a deeper appreciation for the difficulty of the NLP task.
#I *highly* encourage you to label even more documents, and re-run the fine-tuning tasks coming up.
datadict = get_training_data("data/outdoors/outdoors_golden_answers.csv")
# Despite its name, model_path is the directory the dataset splits are saved to.
model_path = "data/question-answering/question-answering-training-set"

datadict.save_to_disk(model_path)

# Last expression of the cell: displays the splits with their features and row counts.
datadict

Saving the dataset (0/1 shards):   0%|          | 0/1243 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/331 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/84 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 1243
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 331
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 84
    })
})

## Tokenizer

In [None]:
# Tokenizer loaded from the 'roberta-base' checkpoint; tokenize_dataset and the Trainer below both use it.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [None]:

#This method adopted from the following example notebook:
#https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb

# Reload the train/test/validation splits from the directory they were saved to earlier.
file = "data/question-answering/question-answering-training-set"
datadict = load_from_disk(file)

In [None]:
def tokenize_dataset(examples, maximum_tokens=384, document_overlap=128):
    """Tokenize a batch of QA examples and label the answer token positions.

    Long contexts are split into several overlapping features. Each feature
    gets a ``start_positions`` / ``end_positions`` entry pointing at the first
    answer's tokens, or at the CLS token when the example has no answer or the
    answer lies outside that feature's slice of the context.

    Args:
        examples: Batch mapping with ``question``, ``context`` and ``answers``
            columns; each answers entry has ``text`` and ``answer_start`` lists.
        maximum_tokens: Maximum feature length, counting question AND context tokens.
        document_overlap: Number of overlapping tokens (stride) between
            consecutive features of the same context.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added and ``overflow_to_sample_mapping`` / ``offset_mapping`` removed.
    """
    pad_on_right = tokenizer.padding_side == "right"
    # The question and context are passed in an order that depends on the
    # padding side, so the sequence id that marks context tokens does too.
    context_sequence_id = 1 if pad_on_right else 0

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=maximum_tokens,
        stride=document_overlap,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the (first) answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_sequence_id:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_sequence_id:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
"""
To apply this function on all the sentences (or pairs of sentences) in our dataset,
we just use the map method of our dataset object we created earlier.
This will apply the function on all the elements of all the splits in dataset,
so our training, validation and testing data will be preprocessed in one single command.
Since our preprocessing changes the number of samples, we need to remove the old columns when applying it.
"""
# batched=True hands tokenize_dataset whole batches; the original columns of the
# train split are listed in remove_columns so only the tokenized features remain.
tokenized_datasets = datadict.map(tokenize_dataset, batched=True, remove_columns=datadict["train"].column_names)

In [None]:
# Save the tokenized splits to disk.
tokenized_datasets.save_to_disk("data/question-answering/qa-training-set-tokenized")

## Finetuning

In [None]:
# Fine-tuning starts from the same checkpoint name that produced the silver guesses earlier.
model = RobertaForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')

In [None]:
# Training configuration. The "test" split is used for the per-epoch evaluation
# during training; the "validation" split is held back for the final
# trainer.evaluate() call in a later cell.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    evaluation_strategy="epoch",                          # evaluate loss per epoch
    num_train_epochs=3,                                   # total # of training epochs
    per_device_train_batch_size=16,                       # batch size per device during training
    per_device_eval_batch_size=64,                        # batch size for evaluation
    warmup_steps=500,                                     # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                    # strength of weight decay
    logging_dir="data/question-answering/logs",           # directory for storing logs
    output_dir="data/question-answering/results")         # output directory

trainer = Trainer(
    model=model,                                          # the instantiated 🤗 Transformers model to be trained
    args=training_args,                                   # training arguments, defined above
    data_collator=default_data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],            # training dataset
    eval_dataset=tokenized_datasets["test"])              # evaluation dataset

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# model_name is reassigned here: it now holds the local directory of the
# fine-tuned model rather than the pretrained checkpoint name used earlier.
model_name = "data/question-answering/roberta-base-squad2-fine-tuned"
trainer.save_model(model_name)

In [None]:
# Evaluate on the "validation" split, which was not passed to the Trainer as train or eval data.
evaluation = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
display(evaluation)

## Building the reader

In [None]:
# Pipeline device index: 0 when CUDA is available, -1 otherwise.
device = get_processor_device()

# Loading the fine-tuned outdoors question-answering model
# The model and tokenizer are both read from the directory written by trainer.save_model.
model_name = "data/question-answering/roberta-base-squad2-fine-tuned"
qa_reader = pipeline('question-answering', model=model_name, tokenizer=model_name, device=device)

In [None]:
# Evaluating the fine-tuned question-answering model
def answer_questions(examples):
    """Run the fine-tuned reader over labeled examples and print exact-match results.

    NOTE: this redefines the ``answer_questions(contexts, k=10)`` helper from
    an earlier cell; once this cell runs, the earlier signature is shadowed.

    Args:
        examples: Sized collection of examples read as ``example[column][0]``
            for the ``question``, ``context`` and ``answers`` columns.

    Returns:
        None. Each question, its first gold answer and the model's answer are
        printed, followed by the number of exact matches.
    """
    success = 0
    for example in examples:
        question = {"question": example["question"][0],
                    "context": example["context"][0]}
        answer = qa_reader(question)
        # Only the first gold answer is compared against the prediction.
        label = example["answers"][0]["text"][0]
        result = answer["answer"]
        print(question["question"])
        print("Label:", label)
        print("Result:", result)
        print("----------")
        success += (1 if (label == result) else 0)
    print(f"{success}/{len(examples)} correct")

In [None]:
# Switch the validation split to pandas-formatted output; answer_questions
# reads every collected example as example[column][0].
datadict["validation"].set_format(type="pandas", output_all_columns=True)
validation_examples = [example for example in datadict["validation"]]
answer_questions(validation_examples)

## Incorporating the retriever