# Question Generation
Code taken from https://haystack.deepset.ai/tutorials/13_question_generation

In [1]:
from pprint import pprint

from haystack import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader
from haystack.pipelines import RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline
from haystack.utils import print_questions
from tqdm.auto import tqdm

2023-09-26 15:20:40.479708: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Two short passages (genetics and chemistry) that serve as the source documents.
text1 = "Mendel postulated that genes (characteristics) are inherited as pairs of alleles (traits) that behave in a dominant and recessive pattern. Alleles segregate into gametes such that each gamete is equally likely to receive either one of the two alleles present in a diploid individual. In addition, genes are assorted into gametes independently of one another. That is, alleles are generally not more likely to segregate into a gamete with a particular allele of another gene. A dihybrid cross demonstrates independent assortment when the genes in question are on different chromosomes or distant from each other on the same chromosome. For crosses involving more than two genes, use the forked line or probability methods to predict offspring genotypes and phenotypes rather than a Punnett square."
text2 = "A chemical equation shows the reactants and the products of a chemical reaction. Balancing chemical equations is necessary because the same atoms should be present before and after the reaction, just in different arrangements. A balanced chemical equation is useful in showing us the ratios of reactants and products"
docs = [{"content": passage} for passage in (text1, text2)]

# Set up the document store, drop whatever earlier runs left in it,
# then index the documents of this run.
document_store = ElasticsearchDocumentStore()
document_store.delete_documents()
document_store.write_documents(docs)

# Question generator shared by the pipelines below.
question_generator = QuestionGenerator(
    split_length=100,
    split_overlap=20,
    use_gpu=True,
)

  return self.fget.__get__(instance, owner)()
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Using sep_token, but it is not set yet.


# Question Answer Generation
Input: Documents<br>
Output: Question, Answers<br>
Procedure:
- Questions are generated using `valhalla/t5-base-e2e-qg`.
- Answers to the generated questions are produced using `deepset/roberta-base-squad2`.
- Both pretrained models (T5 and RoBERTa base) are fine-tuned on SQuAD v2.

In [None]:
# Build the question-answer generation pipeline from the question generator and a reader.
reader = FARMReader("deepset/roberta-base-squad2", use_gpu=True)
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

# Process the stored documents one at a time, with a progress bar.
for idx, document in enumerate(tqdm(document_store)):
    preview = document.content[:50]  # first 50 characters, shown in the progress message
    print(f"\n * Generating questions and answers for document {idx}: {preview}...\n")
    result = qag_pipeline.run(documents=[document])
    print_questions(result)

# Retriever Question Generation Pipeline
Input: Document, Query<br>
Output: QA Pairs from documents belonging to the query<br>
Procedure:
- Filter documents using a BM25, TF-IDF, or embedding-based retriever.
- Generate QA pairs on the filtered documents.


In [None]:
# Retrieve the documents that match a query with BM25, then generate questions for them.
retriever = BM25Retriever(document_store=document_store)
rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)

# Keep the query in a single variable so the progress message always names the
# query that is actually executed (the message used to mention a different one).
query = "genes"
print(f"\n * Generating questions for documents matching the query '{query}'\n")
result = rqg_pipeline.run(query=query)
result

In [3]:
# Reader used to answer the generated questions in the cells that follow.
reader = FARMReader(
    model_name_or_path="deepset/deberta-v3-large-squad2",
    use_gpu=True,
)

In [None]:
# Alternative checkpoint (the `base` DeBERTa-v3 SQuAD2 model); uncomment to load it
# as the reader instead.
#reader = FARMReader("deepset/deberta-v3-base-squad2", use_gpu=True)

In [4]:
# (Re)create the question generator and the BM25 retriever over the document store.
question_generator = QuestionGenerator(
    split_length=100,
    split_overlap=20,
    use_gpu=True,
)
retriever = BM25Retriever(document_store=document_store)

Using sep_token, but it is not set yet.


In [None]:
# Hand-wired pipeline: retrieve documents -> generate questions -> run the reader.
# `Pipeline` is imported in the notebook's import cell rather than mid-notebook,
# so a fresh "Restart & Run All" exposes every dependency up front.
# NOTE(review): the reader node presumably receives the original query ("genes")
# rather than the generated questions — confirm before relying on `res`.
p = Pipeline()
p.add_node(component=retriever, name="BM25Retriever", inputs=["Query"])
p.add_node(component=question_generator, name="QGenerator", inputs=["BM25Retriever"])
p.add_node(component=reader, name="Reader", inputs=["QGenerator"])
res = p.run(query="genes")

In [5]:
# Rebind `docs` (until now the raw list of dicts) to the documents retrieved for the query.
docs = retriever.retrieve(query="genes")

In [11]:
# QuestionAnswerGenerationPipeline is already imported in the notebook's first cell,
# so it is not re-imported here.
# The pipeline generates questions for the given documents and answers them with `reader`.
pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

In [None]:
# Display the pipeline object. The previous content of this cell was a dangling
# attribute access (`pipeline.`), which is a SyntaxError and stops "Run All".
pipeline

In [14]:
# Inspect the documents returned by the retriever.
docs

[<Document: {'content': 'Mendel postulated that genes (characteristics) are inherited as pairs of alleles (traits) that behave in a dominant and recessive pattern. Alleles segregate into gametes such that each gamete is equally likely to receive either one of the two alleles present in a diploid individual. In addition, genes are assorted into gametes independently of one another. That is, alleles are generally not more likely to segregate into a gamete with a particular allele of another gene. A dihybrid cross demonstrates independent assortment when the genes in question are on different chromosomes or distant from each other on the same chromosome. For crosses involving more than two genes, use the forked line or probability methods to predict offspring genotypes and phenotypes rather than a Punnett square.', 'content_type': 'text', 'score': 0.5343001172019939, 'meta': {}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '23a9177c84ad0781b4e65fe4e523a91c'}>]

In [15]:
# Run the question-answer generation pipeline on the retrieved documents.
results = pipeline.run(documents=docs)

Inferencing Samples: 100%|██████████| 1/1 [00:06<00:00,  6.06s/ Batches]


In [17]:
# Show the pipeline output (generated queries and their answers).
results

{'queries': ['Mendel postulated that genes are inherited as pairs of alleles that behave in what pattern?',
  'Alleles segregate into what?',
  'Genes are assorted into what independently of one another?',
  'A dihybrid cross demonstrates independent assortment when the genes in question are on what chromosomes?',
  'What is assortment when the genes in question are on different chromosomes or distant from each other?',
  'For crosses involving more than two genes, use the forked line or probability methods to predict offspring genotypes and what else?'],
 'answers': [[<Answer {'answer': 'dominant and recessive', 'type': 'extractive', 'score': 0.7278926968574524, 'context': 'cs) are inherited as pairs of alleles (traits) that behave in a dominant and recessive pattern. Alleles segregate into gametes such that each gamete i', 'offsets_in_document': [{'start': 107, 'end': 129}], 'offsets_in_context': [{'start': 64, 'end': 86}], 'document_ids': ['23a9177c84ad0781b4e65fe4e523a91c'], 'meta'

In [10]:
# For each retrieved document: generate questions from its text, then print every
# question followed by the reader's prediction for it on that same document.
for doc in docs:
    questions = question_generator.generate(doc.content)
    for question in questions:
        print(question)
        prediction = reader.predict(query=question, documents=[doc])
        print(prediction, end='\n--\n')
        

Mendel postulated that genes are inherited as pairs of alleles that behave in what pattern?


Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.25 Batches/s]


{'query': 'Mendel postulated that genes are inherited as pairs of alleles that behave in what pattern?', 'no_ans_gap': 14.597965002059937, 'answers': [<Answer {'answer': 'dominant and recessive', 'type': 'extractive', 'score': 0.7278921604156494, 'context': 'cs) are inherited as pairs of alleles (traits) that behave in a dominant and recessive pattern. Alleles segregate into gametes such that each gamete i', 'offsets_in_document': [{'start': 107, 'end': 129}], 'offsets_in_context': [{'start': 64, 'end': 86}], 'document_ids': ['23a9177c84ad0781b4e65fe4e523a91c'], 'meta': {}}>]}
--
Alleles segregate into what?


Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.33 Batches/s]


{'query': 'Alleles segregate into what?', 'no_ans_gap': 17.14580476284027, 'answers': [<Answer {'answer': 'gametes', 'type': 'extractive', 'score': 0.9946974515914917, 'context': 'that behave in a dominant and recessive pattern. Alleles segregate into gametes such that each gamete is equally likely to receive either one of the t', 'offsets_in_document': [{'start': 162, 'end': 169}], 'offsets_in_context': [{'start': 72, 'end': 79}], 'document_ids': ['23a9177c84ad0781b4e65fe4e523a91c'], 'meta': {}}>]}
--
Genes are assorted into what independently of one another?


Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.33 Batches/s]


{'query': 'Genes are assorted into what independently of one another?', 'no_ans_gap': 18.16763973236084, 'answers': [<Answer {'answer': 'gametes', 'type': 'extractive', 'score': 0.886199951171875, 'context': 's present in a diploid individual. In addition, genes are assorted into gametes independently of one another. That is, alleles are generally not more ', 'offsets_in_document': [{'start': 321, 'end': 328}], 'offsets_in_context': [{'start': 72, 'end': 79}], 'document_ids': ['23a9177c84ad0781b4e65fe4e523a91c'], 'meta': {}}>]}
--
A dihybrid cross demonstrates independent assortment when the genes in question are on what chromosomes?


Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.33 Batches/s]


{'query': 'A dihybrid cross demonstrates independent assortment when the genes in question are on what chromosomes?', 'no_ans_gap': 13.008788824081421, 'answers': [<Answer {'answer': 'different chromosomes', 'type': 'extractive', 'score': 0.7684093713760376, 'context': 'strates independent assortment when the genes in question are on different chromosomes or distant from each other on the same chromosome. For crosses ', 'offsets_in_document': [{'start': 562, 'end': 583}], 'offsets_in_context': [{'start': 65, 'end': 86}], 'document_ids': ['23a9177c84ad0781b4e65fe4e523a91c'], 'meta': {}}>]}
--
What is assortment when the genes in question are on different chromosomes or distant from each other?


Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.33 Batches/s]


{'query': 'What is assortment when the genes in question are on different chromosomes or distant from each other?', 'no_ans_gap': 4.974249839782715, 'answers': [<Answer {'answer': 'independent', 'type': 'extractive', 'score': 0.5395277738571167, 'context': 'th a particular allele of another gene. A dihybrid cross demonstrates independent assortment when the genes in question are on different chromosomes o', 'offsets_in_document': [{'start': 505, 'end': 516}], 'offsets_in_context': [{'start': 70, 'end': 81}], 'document_ids': ['23a9177c84ad0781b4e65fe4e523a91c'], 'meta': {}}>]}
--
For crosses involving more than two genes, use the forked line or probability methods to predict offspring genotypes and what else?


Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.32 Batches/s]

{'query': 'For crosses involving more than two genes, use the forked line or probability methods to predict offspring genotypes and what else?', 'no_ans_gap': 19.046285390853882, 'answers': [<Answer {'answer': 'phenotypes', 'type': 'extractive', 'score': 0.9995484352111816, 'context': ' involving more than two genes, use the forked line or probability methods to predict offspring genotypes and phenotypes rather than a Punnett square.', 'offsets_in_document': [{'start': 756, 'end': 766}], 'offsets_in_context': [{'start': 110, 'end': 120}], 'document_ids': ['23a9177c84ad0781b4e65fe4e523a91c'], 'meta': {}}>]}
--





In [None]:
# For each retrieved document: print the generated questions, then print only the
# list of answers the reader finds for each question.
for doc in docs:
    questions = question_generator.generate(doc.content)
    print(questions)
    for q in questions:
        # `reader.predict` returns a plain dict, so the answers are read with the
        # "answers" key; attribute access (`.answers`) raises AttributeError.
        print(reader.predict(query=q, documents=[doc])["answers"])