# Unsupervised Question + Answer Generation

In [23]:
# Show the GPU attached to this runtime (driver/CUDA versions, memory in use).
!nvidia-smi

Sat Apr  9 16:56:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    30W /  70W |   6216MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Loading Models

In [24]:
# Install the packages that provide the `haystack` and `datasets` imports used below.
!pip install farm-haystack datasets



### Questions Model

In [25]:
from haystack.nodes import QuestionGenerator

# Name (or path) of the question-generation model. Exactly one assignment should be
# active; the commented-out lines are alternatives the author tried.
QG_model = 'valhalla/t5-base-e2e-qg'  # default
# QG_model = 'valhalla/t5-small-e2e-qg'  # small
# QG_model = 'allenai/unifiedqa-t5-base' # n/a

# Haystack node that produces questions from document text; it is later combined
# with a reader into the question/answer generation pipeline.
question_generator = QuestionGenerator(model_name_or_path=QG_model)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1


### Questions and Answers Pipeline

In [26]:
from haystack.pipeline import QuestionAnswerGenerationPipeline
from haystack.nodes import FARMReader

# A SQuAD-trained reader (per the model name) is used to extract the answers
# for the generated questions.
reader = FARMReader("deepset/roberta-base-squad2")

# Chain question generation and answer extraction; invoked further down through
# `qag_pipeline.run(documents=...)`.
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 2 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0     0  
INFO - haystack.modeling.infer -  /w\   /w\ 
INFO - haystack.modeling.infer -  /'\   / \ 


### Loading Data

In [27]:
import io
from dataclasses import dataclass

# qag pipeline expects an object with content and id properties for some reason, so we fake that here:
@dataclass
class Doc:
    """Lightweight stand-in for a document: just an `id` and its text `content`.

    The question/answer pipeline is fed these objects instead of full
    framework documents.
    """
    id: int
    content: str

    def __repr__(self):
        # Keep the representation short: only the first and last three
        # characters of the content are shown.
        head, tail = self.content[:3], self.content[-3:]
        return "<Doc {} - {}...{}>".format(self.id, head, tail)

In [28]:
from datasets import load_dataset
import pandas as pd
# Load the dataset and turn its 'train' split into a DataFrame.
_dataset = load_dataset('GroNLP/ik-nlp-22_slp')
df_train = pd.DataFrame(_dataset['train'])

# Keep only the rows whose 'section' column equals "Summary"; these are the
# texts the documents are built from below.
df_train_summaries = df_train.loc[df_train['section'] == "Summary"]



  0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
%%script echo v1, non-cleaned
# NOTE: the `%%script echo` magic on the first line disables this cell: the body is
# handed to `echo` instead of being run as Python (the cell output is just the echoed
# text). Kept for reference only; the "v2, cleaned" cell builds `docs` instead.

docs = {}

# v1 used each summary row as one document, keyed by its DataFrame index.
# for index, row in df_train.iterrows():
for index, row in df_train_summaries.iterrows():
    docs[str(index)] = Doc(index, row['text'])

v1, non-cleaned


In [30]:
# %%script echo v2, cleaned

# v2, removes bullet point character and splits sentences on it
# and remove '-' or ' ' at the start of the sentence

# Build one document per bullet point of every summary.
# `docs` maps str(index) -> Doc; `index` is a running counter that later cells
# keep incrementing when they append paraphrased documents.
docs = {}

index = 0
for _, row in df_train_summaries.iterrows():
    for fragment in row['text'].split('•'):
        # Clean *before* filtering: a fragment consisting only of spaces and/or
        # hyphens is non-empty as-is but becomes '' once stripped, and would
        # otherwise end up as an empty document.
        text = fragment.strip(' -')
        if not text:
            continue

        index += 1
        docs[str(index)] = Doc(index, text)

print("amount of docs:", len(docs))

amount of docs: 71


## Paraphrasing Input Data

In [31]:
import torch
from transformers import pipeline

PARAPHRASE_MODEL = 'tuner007/pegasus_paraphrase'

# Use the first GPU (device 0) when CUDA is available and fall back to the CPU
# (device -1) otherwise, so the cell does not fail on a runtime without a GPU.
paraphrase_device = 0 if torch.cuda.is_available() else -1
paraphrase_pipeline = pipeline("text2text-generation", model=PARAPHRASE_MODEL, device=paraphrase_device)

In [32]:
def paraphrase(text, n=4):
    """Return the generated texts of `n` paraphrases of `text`.

    Every sequence returned by `paraphrase_pipeline` is kept; no filtering
    (e.g. on a trailing '?') is applied here.
    """
    generated = paraphrase_pipeline(text, num_return_sequences=n)

    alternatives = []
    for candidate in generated:
        alternatives.append(candidate['generated_text'])
    return alternatives

In [33]:
import spacy
nlp = spacy.load('en_core_web_sm')

def split_sentences(sent):
    """Run the spaCy pipeline `nlp` over `sent` and return its `.sents`.

    The items are spaCy objects, not strings; callers convert them with `str()`.
    """
    parsed = nlp(sent)
    return parsed.sents

In [34]:
%%script echo v1
# NOTE: the `%%script echo` magic on the first line disables this cell: the body is
# handed to `echo` instead of being run as Python. Kept for reference only.

# extra: extending the documents with paraphrases

# Iterate over a snapshot (`list(...)`) because `docs` is extended inside the loop.
for doc in list(docs.values()):
    for sentence in split_sentences(doc.content):

        # max length of paraphrase() is 55 tokens, cut off after:
        # (the slice is applied to the sentence object before converting it to a string)
        for alternative in paraphrase(str(sentence[:55]), 3):
            index += 1 # continued from previous block
            docs[str(index)] = Doc(index, alternative)


print("amount of docs:", len(docs))

v1


In [35]:
# %%script echo v2, cleaned

# extra: extending the documents with paraphrases (improved, cleaned)

# Extend `docs` with three paraphrases of every sentence of every document.
# Iterate over a snapshot (`list(...)`) because `docs` is extended inside the loop.
# NOTE: this cell is not idempotent. `index` continues from the document-loading
# cell, and running this cell a second time also paraphrases the paraphrases added
# by the first run; re-run the loading cell first to start from a clean `docs`.
for doc in list(docs.values()):
    for sentence in split_sentences(doc.content):
        # convert to string and clean:
        sentence = str(sentence).strip(' -')

        # Nothing left after cleaning (the sentence was only spaces/hyphens):
        # there is nothing to paraphrase, so do not feed '' to the model.
        if not sentence:
            continue

        # max length of paraphrase() is 60, so we split on sentences to prevent long input
        for alternative in paraphrase(sentence, 3):
            index += 1 # continued from previous block
            docs[str(index)] = Doc(index, alternative)


print("amount of docs:", len(docs))



amount of docs: 175


In [36]:
from time import time
from IPython.display import clear_output

def generate_qa_pairs(d):
    """Run the question/answer pipeline on `d` and flatten its output.

    `d` is passed as `documents=` to `qag_pipeline.run`. For every result that
    has at least one answer, only the first answer is used.

    Returns a list of dicts with the keys 'question', 'answer', 'context'
    (the full content of the matching entry in the global `docs`) and
    'start_pos' (start of the answer's first offset in the document).
    """
    output = qag_pipeline.run(documents=d)

    collected = []
    for entry in output['results']:
        answers = entry['answers']
        if answers:
            first = answers[0]

            # The answer's document id can carry a '-<suffix>' part (e.g. '0-0');
            # the part before the first '-' is the key into `docs`.
            doc_key = first.document_id
            if '-' in doc_key:
                doc_key = doc_key.split('-')[0]
            source = docs[doc_key]

            # `first.context` is very short, so the full document content is
            # stored as context instead.
            record = dict(
                question=entry['query'],
                answer=first.answer,
                context=source.content,
                start_pos=first.offsets_in_document[0].start,
            )
            collected.append(record)

    return collected

In [37]:
%%script echo skipping
# NOTE: the `%%script echo` magic on the first line disables this cell: the body is
# handed to `echo` instead of being run as Python. Kept for reference only; the
# "method 2" cell produces `result` instead.

# method 1: pass all docs at once
# note: this method seems to yield only a few qa pairs
t = time()

result = generate_qa_pairs(docs.values())

# Replace any output produced while generating with the two summary lines.
clear_output(wait=True)
print(len(result), "pairs generated")
print("Generating took:", time() - t)

skipping


In [38]:
# method 2: pass document by document
# Each document is sent through the pipeline on its own and the resulting
# pairs are accumulated in `result`.
t = time()

result = []
d_n = len(docs)
# Count from 1 so the progress line runs from "1/N" to "N/N" (counting from 0
# never reached the total).
for idx, doc in enumerate(docs.values(), start=1):
    _result = generate_qa_pairs([doc])
    result.extend(_result)
    print(f"{idx}/{d_n}: +{len(_result)} = {len(result)}")

# Replace the per-document progress lines with the two summary lines.
clear_output(wait=True)
print(len(result), "pairs generated")
print("Generating took:", time() - t)

274 pairs generated
Generating took: 128.46997022628784


In [39]:
from csv import DictWriter

# Export the generated pairs to qa.csv, keeping only real questions.
# newline='' is what the csv module requires for files handed to a writer
# (otherwise line endings can be translated a second time on some platforms);
# the explicit encoding keeps the file independent of the platform default.
with open('qa.csv', 'w', newline='', encoding='utf-8') as f:
    writer = DictWriter(f, fieldnames=['question', 'answer', 'context', 'start_pos'])
    writer.writeheader()
    # skip non-questions (generated text that does not end with '?')
    writer.writerows(pair for pair in result if pair['question'].endswith('?'))

In [40]:
# Display every generated pair; unlike qa.csv, this list is not filtered on a trailing '?'.
result

[{'answer': 'the regular expression',
  'context': "This chapter introduced a fundamental tool in language processing, the regular expression, and showed how to perform basic text normalization tasks including word segmentation and normalization, sentence segmentation, and stemming. We also introduced the important minimum edit distance algorithm for comparing strings. Here's a summary of the main points we covered about these ideas:",
  'question': ' What is a fundamental tool in language processing?',
  'start_pos': 67},
 {'answer': 'basic text normalization tasks',
  'context': "This chapter introduced a fundamental tool in language processing, the regular expression, and showed how to perform basic text normalization tasks including word segmentation and normalization, sentence segmentation, and stemming. We also introduced the important minimum edit distance algorithm for comparing strings. Here's a summary of the main points we covered about these ideas:",
  'question': ' What di