# Data Preparation 

In [1]:
import helper
import pandas as pd
from pprint import pprint
from tqdm.auto import tqdm
from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline
from haystack.utils import print_questions, export_answers_to_csv

2023-08-30 18:27:15.958452: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Openstax Biology
topic = 'test'
#docs = helper.openstax_to_doc(path='data/openstax_biology_sample.csv')
docs = helper.csv_to_doc(path='data/openstax_biology_sample.csv', title='summary_heading', 
                        subject='subject', content='summary_text')

In [3]:
doc_store = helper.add_to_docstore(docs, index=topic, delete_docs=True)

In [4]:
doc_store = helper.classify_docs(labels=['physics', 'chemistry', 'biology'],
                                doc_store=doc_store, index=topic)

Classifying documents: 100%|██████████| 10/10 [00:02<00:00,  4.10it/s]


In [5]:
question_generator = QuestionGenerator(model_name_or_path='valhalla/t5-base-e2e-qg',
                                       max_length=420, split_length=75, 
                                       split_overlap=20, use_gpu=True)

  return self.fget.__get__(instance, owner)()
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Using sep_token, but it is not set yet.


In [6]:
#reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", 
#                    use_gpu=True, confidence_threshold=0.70)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", 
                    top_k=1, use_gpu=True)

In [7]:
pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

In [8]:
#docs = doc_store.get_all_documents(filters={'classification': ['l']})
#docs = doc_store.get_all_documents()
#docs = doc_store.get_all_documents(filters={'classification': {'label': ['loda']}})
#filterx = {"classification": {'score': 0.9655949473381042}}
filterx = {"classification.label": "biology"} 
#filterx = {'topic': ' Synthesis of Biological Macromolecules'}
d = doc_store.get_all_documents(filters=filterx)
#x = doc_store.query(query=filter)
d

[<Document: {'content': 'Biology is the science that studies living organisms and their interactions with one another and their environments. Science attempts to describe and understand the nature of the universe in whole or in part by rational means. Science has many fields. Those fields related to the physical world and its phenomena are natural sciences.', 'content_type': 'text', 'score': None, 'meta': {'subject': 'biology', 'topic': ' The Science of Biology', 'classification': {'score': 0.9655949473381042, 'details': {'chemistry': 0.010395056568086147, 'biology': 0.9655949473381042, 'physics': 0.024009989574551582}, 'label': 'biology'}}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'd6659290230e3cad35f25fd7af40139f'}>,
 <Document: {'content': 'Biology is the science of life. All living organisms share several key properties such as order, sensitivity or response to stimuli, reproduction, growth and development, regulation, homeostasis, and energy processing. Living things 

In [110]:
docs

[<Document: {'content': 'Biology is the science that studies living organisms and their interactions with one another and their environments. Science attempts to describe and understand the nature of the universe in whole or in part by rational means. Science has many fields. Those fields related to the physical world and its phenomena are natural sciences.', 'content_type': 'text', 'score': None, 'meta': {'subject': 'biology', 'topic': ' The Science of Biology', 'classification': {'score': 0.9655949473381042, 'details': {'chemistry': 0.010395056568086147, 'biology': 0.9655949473381042, 'physics': 0.024009989574551582}, 'label': 'biology'}}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'd6659290230e3cad35f25fd7af40139f'}>,
 <Document: {'content': 'Biology is the science of life. All living organisms share several key properties such as order, sensitivity or response to stimuli, reproduction, growth and development, regulation, homeostasis, and energy processing. Living things 

In [120]:
import pprint
pprint.pprint({'content': 'This is a biology text\n', 'content_type': 'text', 'score': None, 'meta': {'subject': 'biology', 'topic': 'Biology', 'classification': {'score': 0.9655949473381042, 'details': {'chemistry': 0.010395056568086147, 'biology': 0.9655949473381042, 'physics': 0.024009989574551582}, 'label': 'biology'}}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'd6659290230e3cad35f25fd7af40139f'})

{'content': 'This is a biology text\n',
 'content_type': 'text',
 'embedding': None,
 'id': 'd6659290230e3cad35f25fd7af40139f',
 'id_hash_keys': ['content'],
 'meta': {'classification': {'details': {'biology': 0.9655949473381042,
                                         'chemistry': 0.010395056568086147,
                                         'physics': 0.024009989574551582},
                             'label': 'biology',
                             'score': 0.9655949473381042},
          'subject': 'biology',
          'topic': 'Biology'},
 'score': None}


In [121]:
x = {'content': 'This is a biology text\n',
 'content_type': 'text',
 'embedding': None,
 'id': 'd6659290230e3cad35f25fd7af40139f',
 'id_hash_keys': ['content'],
 'meta': {'classification': {'details': {'biology': 0.9655949473381042,
                                         'chemistry': 0.010395056568086147,
                                         'physics': 0.024009989574551582},
                             'label': 'biology',
                             'score': 0.9655949473381042},
          'subject': 'biology',
          'topic': 'Biology'},
 'score': None}

In [123]:
x['meta']['classification']['label']

'biology'

In [28]:
filterx = {'meta': {'topic': ' The Science of Biology'}}

In [56]:
d = doc_store.get_all_documents()

In [69]:
for doc in d:
    print(doc.meta['classification']['label'])

biology
biology
chemistry
biology
biology
biology
biology
biology
biology
biology


In [71]:
for doc in docs:
    result = pipeline.run(documents=[doc], params={'filters': 
                                                        {'meta': 
                                                            {'classification': 
                                                                {
                                                                    'label': ['biology']
                                                                }
                                                            }
                                                        }
                                                    }
                                                      )
    #result = pipeline.run(documents=[doc], params={'content_type': 'text'})
    #result = pipeline.run(documents=[doc], metadata={'label': 'biology'})
    print(result)
    break

ValueError: No node(s) or global parameter(s) named filters found in pipeline.

In [15]:
result

{'queries': ['What is the science that studies living organisms and their interactions with one another and their environments?',
  'Science attempts to describe and understand the nature of the universe in whole or in part by what means?',
  'What fields related to the physical world and its phenomena are natural sciences?'],
 'answers': [[<Answer {'answer': 'Biology', 'type': 'extractive', 'score': 0.9669034481048584, 'context': 'Biology is the science that studies living organisms and their interactions with one another and their environments. Science attempts to describe and ', 'offsets_in_document': [{'start': 0, 'end': 7}], 'offsets_in_context': [{'start': 0, 'end': 7}], 'document_ids': ['d6659290230e3cad35f25fd7af40139f'], 'meta': {'subject': 'biology', 'topic': 'The Science of Biology'}}>],
  [<Answer {'answer': 'rational', 'type': 'extractive', 'score': 0.8733130693435669, 'context': 'cribe and understand the nature of the universe in whole or in part by rational means. Scienc

In [23]:
for query_content, answer_content, document_content in zip(result['queries'], result['answers'], result['documents']):
        answer = answer_content[0]
        document = document_content[0]
        if answer.score > 0.75:
            print(answer.answer)
            print(query_content)
            print(document.content, end='\n--\n')
            
            #generated_ques.append(query_content)
            #generated_ans.append(answer.answer)
            #doc_contexts.append(document.content)

Biology
What is the science that studies living organisms and their interactions with one another and their environments?
Biology is the science that studies living organisms and their interactions with one another and their environments. Science attempts to describe and understand the nature of the universe in whole or in part by rational means. Science has many fields. Those fields related to the physical world and its phenomena are natural sciences.
--
rational
Science attempts to describe and understand the nature of the universe in whole or in part by what means?
Biology is the science that studies living organisms and their interactions with one another and their environments. Science attempts to describe and understand the nature of the universe in whole or in part by rational means. Science has many fields. Those fields related to the physical world and its phenomena are natural sciences.
--


: 

In [None]:
docs = doc_store.get_all_documents()
results = pipeline.run(documents=docs)

In [None]:
generated_ques = []
generated_ans = []
doc_contexts = []

for query_content, answer_content, document_content in zip(results['queries'], results['answers'], results['documents']):
    answer = answer_content[0]
    document = document_content[0]
    if answer.score > 0.75:
        generated_ques.append(query_content)
        generated_ans.append(answer.answer)
        doc_contexts.append(document.content)
        #doc_contexts.append(document)
        #print(query_content)
        #print(answer.answer, answer.score)
        #print(document, end='\n--\n')

In [None]:
df_gen_qa = pd.DataFrame(data={'generated_question':generated_ques, 'generated_answer':generated_ans,
                               'document_context':doc_contexts})

In [None]:
df_gen_qa.to_csv(f'data/{topic}_generated_QA.csv', index=False)

In [12]:
x = [1,2,3]
y = ['a', 'b']

In [13]:
x.extend(y)

In [14]:
print(x)

[1, 2, 3, 'a', 'b']


In [15]:
int('d')

ValueError: invalid literal for int() with base 10: 'd'