<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/ai-powered-search/14_2_qa_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In this notebook, we're going to install a transformer model, analyze the embedding output, and compare some vectors.

In [1]:
#outdoors
# Download the outdoors dataset repository (shallow clone); skipped when an 'outdoors' directory already exists.
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
# Bring an existing checkout up to date.
! cd outdoors && git pull
# The archive is stored in pieces (outdoors.tgz.part*); concatenate them back into one outdoors.tgz.
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
# Create data/outdoors/ beside the checkout and extract the archive into it (-v lists every extracted file).
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

Cloning into 'outdoors'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 25 (delta 0), reused 22 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 491.39 MiB | 24.03 MiB/s, done.
Updating files: 100% (23/23), done.
Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-sq

In [6]:
import sys
import os
import time
sys.path.append("../..")
import pandas as pd
import numpy as np
import pickle
import json
import tqdm

import torch

from transformers import pipeline


from IPython.display import display,HTML

In [4]:
# Read the seed question/context pairs of the question-answering dataset
# and preview the first rows.
contexts = pd.read_csv(
    filepath_or_buffer="data/outdoors/question-answer-seed-contexts.csv",
)
contexts.head()

Unnamed: 0,id,question,context,url
0,4410,Who places the anchors that rock climbers use?,There are two distinct styles of free rock cli...,https://outdoors.stackexchange.com/questions/4410
1,5347,"Who places the bolts on rock climbing routes, ...",What you're talking about is Sport climbing. G...,https://outdoors.stackexchange.com/questions/5347
2,20662,Who gets the bill if you activate a PLB to hel...,"Almost always the victim gets the bill, but as...",https://outdoors.stackexchange.com/questions/2...
3,7623,What knot is this one? What are its purposes?,Slip knot It's undoubtably a slip knot that's ...,https://outdoors.stackexchange.com/questions/7623
4,11587,"What sort of crane, and what sort of snake?","To answer the snake part of it, looking at som...",https://outdoors.stackexchange.com/questions/1...


## Creating the silver set

In [10]:
def get_processor_device():
    """Return the device index to use: 0 when CUDA is available, otherwise -1.

    The value is later passed as the ``device`` argument of the transformers
    pipeline (where -1 is the conventional value for CPU).
    """
    if torch.cuda.is_available():
        return 0
    return -1

def display_guesses(guesses):
    """Render the first ten guesses as an HTML table, omitting the index column."""
    preview = pd.DataFrame(guesses[0:10])
    markup = preview.to_html(index=False)
    display(HTML(markup))

In [11]:
# Device index chosen by get_processor_device(); handed to the pipeline as `device`.
device = get_processor_device()
# Model identifier used for both the model and the tokenizer of the question-answering pipeline.
model_name = "deepset/roberta-base-squad2"

In [12]:
def answer_questions(contexts, k=10):
    """Run the question-answering pipeline over the first ``k`` rows of ``contexts``.

    Args:
        contexts: DataFrame with at least a ``question`` and a ``context`` column.
        k: number of leading rows to process (default 10).

    Returns:
        A list with one pipeline result per processed row, in row order.

    Note:
        The pipeline is built from the module-level ``model_name`` and ``device``
        on every call, so each call reloads the model.
    """
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name, device=device)
    subset = contexts[0:k]
    guesses = []
    # Size the progress bar from the rows actually selected: the slice may hold
    # fewer than k rows (e.g. when k exceeds len(contexts)).
    for _, row in tqdm.tqdm(subset.iterrows(), total=len(subset)):
        results = nlp({
            'context': row['context'],
            'question': row['question']
        })
        guesses.append(results)
    return guesses

In [14]:
# Automatically labeling data from a pretrained model
# k=len(contexts) processes every row instead of answer_questions' default of the first 10.
guesses = answer_questions(contexts, k=len(contexts))

Device set to use cuda:0
100%|██████████| 1662/1662 [00:57<00:00, 28.80it/s]


In [15]:
# Preview the guesses as an HTML table (display_guesses renders only the first 10).
display_guesses(guesses)

score,start,end,answer
0.278927,474,516,a local enthusiast or group of enthusiasts
0.200848,81,117,the person who is creating the climb
0.018632,14,24,the victim
0.222317,29,38,slip knot
0.000551,1255,1262,aquatic
0.374997,15,40,a high-tech treasure hunt
0.563755,192,232,"a tube of lightweight, stretchy material"
0.110915,125,154,the cheapest one of the three
0.805174,68,76,blocking
0.247008,227,265,the traditional longbow made from wood


In [16]:
# Attach one guess per row as a new "answers" column (this modifies `contexts` in place),
# then write the labeled frame to CSV without the DataFrame index.
# NOTE(review): each guess looks like a dict (score/start/end/answer), so the CSV cell will
# hold its string form — confirm the downstream reader parses it accordingly.
# NOTE(review): the "Unnamed: 0" column loaded from the seed CSV is written out as well.
contexts["answers"] = guesses
contexts.to_csv("data/outdoors/qa-squad2-guesses.csv", index=False)

## Human-in-the-loop training

In [None]:
from transformers import RobertaTokenizerFast,PreTrainedTokenizerFast

# Load the fast RoBERTa tokenizer for the 'roberta-base' checkpoint.
tokenizer2 = RobertaTokenizerFast.from_pretrained('roberta-base')
# Fail early unless this is a fast tokenizer.
# NOTE(review): presumably required because tokenize_dataset requests
# return_offsets_mapping, which needs a fast tokenizer — confirm.
assert isinstance(tokenizer2, PreTrainedTokenizerFast)

In [None]:
def tokenize_dataset(examples, maximum_tokens=384, document_overlap=128, tokenizer=None):
    """Tokenize question/context pairs into fixed-length, overlapping windows.

    Args:
        examples: mapping with "question" and "context" entries.
        maximum_tokens: number of tokens per window, counting BOTH the question
            and the context (passed to the tokenizer as ``max_length``).
        document_overlap: when a context has to be split into several windows,
            this many tokens overlap between consecutive windows (passed as ``stride``).
        tokenizer: tokenizer to use; when omitted, the module-level ``tokenizer2``
            is used.

    Returns:
        The tokenizer's output, including overflowing windows and offset mappings.
    """
    # Use one and the same tokenizer for the padding-side check and the call below
    # (the check previously read an undefined global named `tokenizer`).
    if tokenizer is None:
        tokenizer = tokenizer2
    pad_on_right = tokenizer.padding_side == "right"

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    # The context is always the sequence that gets truncated: it goes second when
    # padding is on the right, first otherwise.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=maximum_tokens,
        stride=document_overlap,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    return tokenized_examples

In [None]:
# Tokenize one question/context pair into small overlapping windows and show
# every window as one row of an HTML table, one token per cell.
# NOTE(review): `question`, `context` and `clean_token` are not defined in the
# visible part of this notebook — confirm they exist before this cell runs.
example = {"question" : question, "context" : context}
tokenized_examples = tokenize_dataset(example, maximum_tokens=16, document_overlap=3)
windowed_inputs = tokenized_examples["input_ids"]

windows = ["<table cellpadding=0 cellspacing=0>"]
for token_ids in windowed_inputs:
    cells = "".join(
        f'<td style="font-size:0.9em;font-family:courier;margin:0;padding:0;">{clean_token(token)}</td>'
        for token in tokenizer2.convert_ids_to_tokens(token_ids)
    )
    windows.append("<tr>" + cells + "</tr>")
windows.append("</table>")

display(HTML("\n".join(windows)))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
<s>,What,_are,_minimalist,_shoes,</s>,</s>,There,_was,_actually,_a,_project,_done,_on,_the,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_done,_on,_the,_definition,_of,_what,_a,_minimalist,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_what,_a,_minimalist,_shoe,_is,_and,_the,_result,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_and,_the,_result,_was,"_""",Foot,wear,_providing,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,Foot,wear,_providing,_minimal,_interference,_with,_the,_natural,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_with,_the,_natural,_movement,_of,_the,_foot,_due,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_the,_foot,_due,_to,_its,_high,_flexibility,",",</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_high,_flexibility,",",_low,_heel,_to,_toe,_drop,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_to,_toe,_drop,",",_weight,_and,_stack,_height,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_and,_stack,_height,",",_and,_the,_absence,_of,</s>
