FARM-Haystack QnA

INPUT-autoencoded representation using pipelined vectors

In [None]:
### Autoencoder
import keras
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


class Autoencoder:
    """Training wrapper around a Keras autoencoder model.

    The model is built lazily: the first call to :meth:`fit` invokes
    ``self._compile(n_features)`` when no model has been set yet.

    NOTE(review): ``_compile`` is not defined in the visible part of this
    class. It has to be provided (here or in a subclass) and is expected to
    assign the compiled model to ``self.autoencoder`` -- confirm.
    """

    # Class-level defaults so that `fit` can test for a missing model on a
    # fresh instance instead of failing with AttributeError.
    autoencoder = None  # model object; must expose a Keras-style ``fit``
    his = None  # return value of the most recent ``autoencoder.fit`` call

    def fit(self, X):
        """Train the autoencoder to reconstruct the rows of ``X``.

        Args:
            X: 2-D array-like; ``X.shape[1]`` is passed to ``_compile`` as
                the input dimension when the model has to be built.

        The data is split with ``train_test_split`` (library defaults); the
        training part is used as both input and target, and the held-out
        part as validation data. The value returned by the model's ``fit``
        is stored in ``self.his``. Nothing is returned.
        """
        if self.autoencoder is None:
            self._compile(X.shape[1])
        X_train, X_test = train_test_split(X)
        self.his = self.autoencoder.fit(X_train, X_train,
                                        epochs=200,
                                        batch_size=128,
                                        shuffle=True,
                                        validation_data=(X_test, X_test), verbose=0)

In [None]:
# Notebook shell escapes: install FARM and haystack from GitHub, each pinned
# to a specific commit hash, then ipywidgets, and enable its notebook extension.
!pip install "git+https://github.com/deepset-ai/FARM.git@0749dcb8fb46dace0d0987d6fdedf5f28120a461"
!pip install "git+https://github.com/deepset-ai/haystack.git@0cffc6cb1df8782bbf1bbc336807de355687584b"
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import logging
from haystack import Finder
from haystack.database.sql import SQLDocumentStore
from haystack.retriever.tfidf import TfidfRetriever
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
import json
import os
from pathlib import Path
import re
import tqdm
from xml.etree import ElementTree as ET
from zipfile import ZipFile

# Limit the "haystack" logger to ERROR and above.
logger = logging.getLogger("haystack")
logger.setLevel(logging.ERROR)


def write_documents_to_db(
    document_store,
    autoencoder,
    clean_func=None,
    only_empty_db=False,
    split_paragraphs=False,
    abstracts_only=False,
):
    """Prepare to index documents into ``document_store``.

    Args:
        document_store: Store object; only ``get_document_count()`` is
            called on it in this function.
        autoencoder: Not used by the code shown here.
        clean_func: Not used by the code shown here.
        only_empty_db: When true, do nothing if the store already reports
            at least one document.
        split_paragraphs: Not used by the code shown here.
        abstracts_only: Not used by the code shown here.

    Returns:
        Always ``None``.

    NOTE(review): the body only sets up two accumulators and never reads
    or writes any document -- the actual indexing logic appears to be
    missing from this function.
    """
    # Skip all work when asked to fill only an empty store and the store
    # already has content. The count is queried only in that mode.
    if only_empty_db and document_store.get_document_count() > 0:
        return None

    # Accumulators for the (absent) read-and-add step.
    docs_to_index = []
    count = 1
   
 def get_finder(full_text, abstracts_only=False):
   
    document_store = SQLDocumentStore(autoencoder)
    write_documents_to_db(document_store=document_store, 
        document_dir=data_directory, only_empty_db=True, 
        abstracts_only=abstracts_only)
    retriever = TfidfRetriever(document_store=document_store)
    reader = FARMReader(
        model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2", 
        use_gpu=True)
        #model_name_or_path="gdario/biobert_bioasq", 
        #use_gpu=False)
    finder = Finder(reader, retriever)
    return finder

class Result:
    """One question-answering hit, scoped to a single document section.

    Attributes (all stored exactly as passed to the constructor):
        title: Title of the document.
        url: Link to the document.
        authors: Author string of the document.
        section_title: Title of the section the text comes from.
        text: Text shown for the hit.
        spans: Sequence of ``{"start": ..., "end": ...}`` offsets into
            ``text`` marking the answer(s).
    """

    def __init__(self, title, url, authors, section_title, text, spans):
        # Document-level fields.
        self.title, self.url, self.authors = title, url, authors
        # Section-level fields.
        self.section_title, self.text, self.spans = section_title, text, spans

def get_results(finder, top_k_retriever, top_k_reader, candidate_doc_ids, question):
    """Answer ``question`` and return the hits as ``Result`` objects.

    Args:
        finder: Object exposing ``retriever`` (with ``retrieve`` and
            ``document_store``) and ``reader`` (with ``predict``).
        top_k_retriever: ``top_k`` passed to the retriever.
        top_k_reader: ``top_k`` passed to the reader.
        candidate_doc_ids: Passed through to the retriever unchanged.
        question: The question string.

    Returns:
        A list with one ``Result`` per entry of the reader's ``"answers"``;
        an empty list when the retriever returns no paragraphs.

    Raises:
        ValueError: If a stored document name does not split into exactly
            four ``|||``-separated fields.
    """
    paragraphs, meta_data = finder.retriever.retrieve(
        question, top_k=top_k_retriever, candidate_doc_ids=candidate_doc_ids)

    # Nothing retrieved: skip the reader entirely.
    if len(paragraphs) == 0:
        return []

    # Apply the reader to get granular answer(s).
    predictions = finder.reader.predict(question=question,
        paragraphs=paragraphs,
        meta_data_paragraphs=meta_data,
        top_k=top_k_reader)

    document_store = finder.retriever.document_store
    results = []
    for prediction in predictions["answers"]:
        # The document name packs four fields separated by "|||".
        document = document_store.get_document_by_id(prediction["document_id"])
        title, section_title, authors, paper_id = document["name"].split("|||")
        url = "https://cord-19.apps.allenai.org/paper/%s" % paper_id
        spans = [{
            "start": prediction["offset_start"],
            "end": prediction["offset_end"],
        }]
        results.append(
            Result(title, url, authors, section_title, prediction["context"], spans)
        )

    return results

def add_search_result_element(container, result):
    """Append the markup for one search result to ``container``.

    Four ``div`` children are added, in order: a title link, the authors
    (bold), the section title (bold, grey) and a snippet paragraph in which
    every span of ``result.spans`` is highlighted.

    Args:
        container: ``ElementTree`` element that receives the new children.
        result: Object with ``url``, ``title``, ``authors``,
            ``section_title``, ``text`` and ``spans`` attributes, where
            ``spans`` is a sequence of ``{"start": int, "end": int}``
            offsets into ``text``. Spans are assumed to be sorted and
            non-overlapping.

    Exactly one snippet paragraph is emitted regardless of how many spans
    there are; with no spans the whole text is shown without highlighting.
    """
    # Title
    title_div = ET.SubElement(container, "div")
    link = ET.SubElement(title_div, "a", href=result.url, target="_blank")
    link.text = result.title

    # Authors
    authors_div = ET.SubElement(container, "div")
    authors_el = ET.SubElement(authors_div, "b")
    authors_el.text = result.authors

    # Section Title
    section_div = ET.SubElement(container, "div")
    section_el = ET.SubElement(section_div, "b", style="color: grey;")
    section_el.text = result.section_title

    # Snippet: the paragraph is created up front so it exists even when
    # there are no spans.
    snippet_div = ET.SubElement(container, "div")
    paragraph = ET.SubElement(snippet_div, "p")
    cursor = 0
    for span in result.spans:
        # Plain text between the previous highlight (or the start) and this
        # one; slicing from `cursor` avoids repeating earlier text.
        plain = ET.SubElement(paragraph, "span")
        plain.text = result.text[cursor:span["start"]]
        highlight = ET.SubElement(paragraph, "span", style="background-color: #DCDCDC; border-radius: 5px; padding: 5px;")
        highlight.text = result.text[span["start"]:span["end"]]
        cursor = span["end"]
    # Remaining text after the last highlight.
    if cursor < len(result.text):
        tail = ET.SubElement(paragraph, "span")
        tail.text = result.text[cursor:]

def generate_html(question, results):
    """Render a question and its results as an HTML fragment.

    Args:
        question: Text placed in an ``<h1>`` heading.
        results: Iterable of result objects; each is rendered with
            ``add_search_result_element`` and followed by an ``<hr>``.

    Returns:
        The serialized fragment as a ``str``, rooted at a single ``<div>``.
    """
    container = ET.Element("div")

    # Add question
    heading = ET.SubElement(ET.SubElement(container, "div"), "h1")
    heading.text = question

    # Add answers
    for result in results:
        add_search_result_element(container, result)
        ET.SubElement(container, "hr")

    # Serialize straight to text. Slicing the repr of the bytes output would
    # leave newlines, backslashes and some quotes as escape sequences.
    # method="html" writes empty elements as explicit open/close pairs
    # (e.g. "<b></b>") instead of XML self-closing tags, which HTML parsers
    # treat as unclosed.
    return ET.tostring(container, encoding="unicode", method="html")

In [None]:
# Build the retriever/reader pipeline once; the query loop below reuses it.
finder = get_finder("/kaggle/input/", abstracts_only=False)

In [None]:
# Query settings passed to get_results() for every question.
top_k_retriever = 50      # `top_k` given to the retriever
top_k_reader = 5          # `top_k` given to the reader
candidate_doc_ids = None  # forwarded to the retriever as-is

# Questions to run. Every entry needs its own trailing comma: adjacent string
# literals without one are silently concatenated into a single question.
questions = [
    "Is there any difference in symptoms of coronavirus disease for pregnant women ?",
    "What are the Treatment Options for Pregnant Women Infected with COVID-19 ? ",
    "Is there any difference in symptoms of coronavirus disease for neonates?",
    "Who is most at risk for COVID-19 pregnant women or non-pregnant women?",
    "How can pregnant women protect themselves against COVID-19?",
    "Do pregnant women with suspected or confirmed COVID-19 need to give birth by caesarean section?",
    "Can COVID-19 be passed from a woman to her unborn or newborn baby?",
    "What are considerations for neonates at risk for COVID-19?",
    "Will COVID-19 be transmitted vertically to the fetus from the pregnant mother ",
    "will pregnant or recently pregnant women with COVID-19 give birth prematurely?",
    "What are the risks for pregnant women with COVID-19 alongside other co-morbidities?",
    "Will I be able to breastfeed my baby if I have suspected or confirmed coronavirus?",
]

In [None]:
from IPython.core.display import display
from IPython.core.display import HTML
from IPython.utils import io

# Run every question through the pipeline, display its HTML in the notebook
# and accumulate the HTML of all questions in `html_string`.
html_string = ""
for question in questions:
    # Output produced while querying is captured instead of being shown in
    # the cell; `captured` is not inspected in the visible code.
    with io.capture_output() as captured:
        results = get_results(finder, top_k_retriever, top_k_reader, candidate_doc_ids, question)
    current_html_string = generate_html(question, results)
    html_string += current_html_string
    display(HTML(current_html_string))