## Using Neural Textual Question Answering to get the latest status 

Inspired by https://www.kaggle.com/aotegi/neural-question-answering-for-cord19-task8

This notebook updates the dataset and the models used in the previous work.  
It uses the latest CORD-19 dataset, with DPR (Dense Passage Retrieval) as the retriever and UnifiedQA as the machine reading comprehension (MRC) model.  
Hopefully this gives better results.  


1. Install packages and load libraries

In [None]:
pip install farm-haystack

In [None]:
import json
import os

import pandas as pd
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from transformers import pipeline

2. Load info from metadata file

In [None]:
# Location of the CORD-19 metadata table.
path_mdata = '/kaggle/input/CORD-19-research-challenge/metadata.csv'

# Columns read from the CSV; 'cord_uid' is used as the DataFrame index.
fields = [
    'cord_uid',
    'title',
    'publish_time',
    'abstract',
    'journal',
    'url',
    'pdf_json_files',
    'pmc_json_files',
    'pmcid',
    'sha',
]

df_mdata = pd.read_csv(
    path_mdata,
    usecols=fields,
    index_col='cord_uid',
    skipinitialspace=True,
)

# Keep only the first row of every repeated cord_uid.
is_repeated = df_mdata.index.duplicated(keep='first')
df_mdata = df_mdata[~is_repeated]

print("Number of papers loaded from metadata (after filtering out the repeated ones):", len(df_mdata))

In [None]:
# Publication-date prefixes to keep: October-December 2020 and all of 2021.
times = [
    '2020-10',
    '2020-11',
    '2020-12',
    '2021'
]

# Cast to str so a missing date (NaN) becomes the text 'nan', which matches
# none of the prefixes and is therefore dropped.
publish_time = df_mdata['publish_time'].astype(str)

# Start with every paper excluded, then switch on the papers whose date starts
# with one of the wanted prefixes. A prefix match (rather than a substring /
# regex search) anchors the comparison to the beginning of the date.
is_recent = pd.Series(False, index=df_mdata.index, dtype=bool)
for t in times:
    is_recent |= publish_time.str.startswith(t)

# Keep only the recently published papers.
df_mdata = df_mdata[is_recent]

# Sanity check
print("After filtering, number of papers in metadata published in the selected period:", len(df_mdata))

In [None]:
# Build the list of documents (one dict per abstract / full-text paragraph)
# that is later written to the document store.
#
# Results of this cell:
#   firtered_dict : list of {'meta': {'name': ...}, 'text': ...} documents
#   not_indexed   : cord_uids for which nothing could be added
#   indexed_sha   : shas of the PDF parses already added (avoids duplicates)
#   missing_json  : full-text files named in the metadata but not found on disk

# Root folder of the dataset: the directory that holds metadata.csv.
# NOTE(review): assumes the paths in 'pmc_json_files' / 'pdf_json_files' are
# relative to this folder (e.g. 'document_parses/pdf_json/<sha>.json') and that
# several paths may be joined with ';' — confirm against the dataset docs.
path_dataset = os.path.dirname(path_mdata)


def _split_paths(cell):
    """Return the file paths stored in one metadata cell.

    The cell is either missing (NaN) or a string with one or more paths
    separated by ';'. Missing or blank cells give an empty list.
    """
    if pd.isnull(cell):
        return []
    return [part.strip() for part in str(cell).split(';') if part.strip()]


def _read_paragraphs(path_json):
    """Return the 'text' of every entry in the 'body_text' list of a JSON file."""
    with open(path_json, 'r', encoding='utf-8') as j:
        jsondata = json.load(j)
    return [paragraph['text'] for paragraph in jsondata.get('body_text', [])]


not_indexed = []
indexed_sha = set()   # set: membership test stays cheap as the collection grows
missing_json = []
firtered_dict = []

for ind in df_mdata.index:
    indexed = False

    abstract = df_mdata.at[ind, 'abstract']
    has_abstract = not pd.isnull(abstract)
    pmc_paths = _split_paths(df_mdata.at[ind, 'pmc_json_files'])
    pdf_paths = _split_paths(df_mdata.at[ind, 'pdf_json_files'])

    # Papers that contribute documents get an empty title instead of NaN.
    if (has_abstract or pmc_paths or pdf_paths) and pd.isnull(df_mdata.at[ind, 'title']):
        df_mdata.at[ind, 'title'] = ""

    # If paper has an abstract, index the abstract
    if has_abstract:
        firtered_dict.append({'meta': {'name': ind + "##abs"}, 'text': abstract})
        indexed = True

    # Prefer the PMC parse; fall back to the PDF parse(s) only when there is none.
    if pmc_paths:
        source, rel_paths = 'pmc', pmc_paths
    else:
        source, rel_paths = 'pdf', pdf_paths

    # Running paragraph number across all files of this paper, so that the
    # document names stay unique when a paper has more than one parse file.
    p = 0
    for rel_path in rel_paths:
        if source == 'pdf':
            # The file name (without extension) of a PDF parse is used as its sha.
            sha = os.path.splitext(os.path.basename(rel_path))[0]
            # Skip PDF parses that were already added for another paper.
            if sha in indexed_sha:
                continue
            indexed_sha.add(sha)

        path_json = os.path.join(path_dataset, rel_path)
        if not os.path.isfile(path_json):
            # Record the gap instead of aborting the whole (long) loop.
            missing_json.append(path_json)
            continue

        # Index each paragraph of the body text as a separate document.
        for text in _read_paragraphs(path_json):
            firtered_dict.append({'meta': {'name': ind + "##" + source + "-" + str(p)}, 'text': text})
            p += 1
            indexed = True

    if not indexed:
        not_indexed.append(ind)

Initialize DPR

In [None]:
from haystack.document_store.faiss import FAISSDocumentStore

# FAISS-backed document store, created with the "Flat" index factory string.
faiss_settings = {"faiss_index_factory_str": "Flat"}
document_store = FAISSDocumentStore(**faiss_settings)

In [None]:
# Store every abstract / paragraph document collected in `firtered_dict`.
document_store.write_documents(documents=firtered_dict)

In [None]:
from haystack.retriever.dense import DensePassageRetriever

# Settings for the Dense Passage Retriever: the question and passage encoders
# are the single-NQ DPR checkpoints; the remaining values are passed through
# unchanged to the retriever constructor.
dpr_settings = {
    "query_embedding_model": "facebook/dpr-question_encoder-single-nq-base",
    "passage_embedding_model": "facebook/dpr-ctx_encoder-single-nq-base",
    "max_seq_len_query": 64,
    "max_seq_len_passage": 256,
    "batch_size": 16,
    "use_gpu": True,
    "embed_title": True,
    "use_fast_tokenizers": True,
}

# The retriever works on the documents held in `document_store`.
retriever = DensePassageRetriever(document_store=document_store, **dpr_settings)

In [None]:
# Compute the embeddings of the stored documents with the DPR retriever.
document_store.update_embeddings(retriever=retriever)

Question

In [None]:
# Questions to ask, grouped by task. Each entry is a dict with:
#   'task'      : the task title
#   'questions' : the list of question strings sent to the retriever and the QA model
tasks = [
    {
        'task': "Task 1 - What is known about transmission, incubation, and environmental stability?",
        'questions': [
            "Range of incubation periods for the disease in humans",
            "Range of incubation periods for the disease in humans depending on age",
            "Range of incubation periods for the disease in humans depending on health status",
            "How long individuals are contagious?",
            "Prevalence of asymptomatic shedding and transmission",
            "Prevalence of asymptomatic shedding and transmission in children",
            "Seasonality of transmission",
            "Charge distribution",
            "Adhesion to hydrophilic/phobic surfaces",
            "Environmental survival to inform decontamination efforts for affected areas",
            "Viral shedding",
            "Persistence and stability on nasal discharge",
            "Persistence and stability on sputum",
            "Persistence and stability on urine",
            "Persistence and stability on fecal matter",
            "Persistence and stability on blood",
            "Persistence of virus on surfaces of different materials",
            "Persistence of virus on copper",
            "Persistence of virus on stainless steel",
            "Persistence of virus on plastic",
            "Natural history of the virus",
            "Shedding the virus from an infected person",
            "Implementation of diagnostics to improve clinical processes",
            "Implementation of products to improve clinical processes",
            "Disease models, including animal models for infection, disease and transmission",
            "Tools to monitor phenotypic change and potential adaptation of the virus",
            "Studies to monitor phenotypic change and potential adaptation of the virus",
            "Immune response and immunity",
            "Effectiveness of movement control strategies to prevent secondary transmission in health care and community settings",
            "Effectiveness of personal protective equipment (PPE) and its usefulness to reduce risk of transmission in health care and community settings",
            "Role of the environment in transmission"
        ]
    },
    {
        'task': "Task 2 - What do we know about COVID-19 risk factors?",
        'questions': [
            "Which are the main risk factors?",
            "Does smoking increase risk for COVID-19?",
            "Is a pre-existing pulmonary disease a risk factor for COVID-19?",
            "Do co-infections increase risk for COVID-19?",
            "Does a respiratory or viral infection increase risk for COVID-19?",
            "Are neonates at increased risk of COVID-19?",
            "Are pregnant women at increased risk of COVID-19?",
            "Is there any socio-economic factor associated with increased risk for COVID-19?",
            "Is there any behavioral factor associated with increased risk for COVID-19?",
            "What is the basic reproductive number?",
            "What is the incubation period?",
            "What are the modes of transmission?",
            "What are the environmental factors?",
            "Risk of fatality among symptomatic hospitalized patients",
            "Risk of fatality among high-risk patient groups",
            "Susceptibility of populations",
            "Public health mitigation measures that could be effective for control"
        ]
    },
    {
        'task': "Task 3 - What do we know about virus genetics, origin, and evolution?",
        'questions': [
            "Real-time tracking of whole genomes to inform the development of diagnostics",
            "Real-time tracking of whole genomes to inform the development of therapeutics",
            "Real-time tracking of whole genomes to track variations of the virus over time",
            "Mechanism for coordinating the rapid dissemination of whole genomes to inform the development of diagnostics",
            "Mechanism for coordinating the rapid dissemination of whole genomes to inform the development of therapeutics",
            "Mechanism for coordinating the rapid dissemination of whole genomes to track variations of the virus over time",
            "Which geographic and temporal diverse sample sets are accessed to understand geographic differences?",
            "Which geographic and temporal diverse sample sets are accessed to understand genomic differences?",
            "Is there more than one strain in circulation?",
            "Is any multi-lateral agreement leveraged such as the Nagoya Protocol?",
            "Is there evidence that livestock could be infected and serve as a reservoir after the epidemic appears to be over?",
            "Has there been any field surveillance to show that livestock could be infected?",
            "Has there been any genetic sequencing to show that livestock could be infected?",
            "Has there been any receptor binding to show that livestock could be infected?",
            "Is there evidence that farmers are infected?",
            "Is there evidence that farmers could have played a role in the origin?",
            "What are the results of the surveillance of mixed wildlife-livestock farms for SARS-CoV-2 and other coronaviruses in Southeast Asia?",
            "What are the results of the experimental infections to test host range for this pathogen?",
            "Which are the animal hosts?",
            "Is there evidence of continued spill-over to humans from animals?",
            "Which are the socioeconomic and behavioral risk factors for the spill-over to humans from animals?",
            "Sustainable risk reduction strategies"
        ]
    },
    {
        'task': "Task 4 - What do we know about vaccines and therapeutics?",
        'questions': [
            "What is known about the effectiveness of drugs being developed to treat COVID-19 patients?",
            "What is known about the effectiveness of drugs tried to treat COVID-19 patients?",
            "Show results of clinical and bench trials to investigate less common viral inhibitors against COVID-19",
            "Show results of clinical and bench trials to investigate naproxen against COVID-19",
            "Show results of clinical and bench trials to investigate clarithromycin against COVID-19",
            "Show results of clinical and bench trials to investigate minocycline against COVID-19",
            "Which are the methods evaluating potential complication of Antibody-Dependent Enhancement (ADE) in vaccine recipients?",
            "What is known about the use of best animal models and their predictive value for a human vaccine?",
            "Capabilities to discover a therapeutic for the disease",
            "Clinical effectiveness studies to discover therapeutics, to include antiviral agents",
            "Which are the models to aid decision makers in determining how to prioritize and distribute scarce, newly proven therapeutics?",
            "Efforts targeted at a universal coronavirus vaccine",
            "Efforts to develop animal models and standardize challenge studies",
            "Efforts to develop prophylaxis clinical studies and prioritize in healthcare workers",
            "Approaches to evaluate risk for enhanced disease after vaccination",
            "Assays to evaluate vaccine immune response",
            "Process development for vaccines, alongside suitable animal models"
        ]
    },
    {
        'task': "Task 5 - What has been published about medical care?",
        'questions': [
            "Resources to support skilled nursing facilities",
            "Resources to support long term care facilities",
            "Mobilization of surge medical staff to address shortages in overwhelmed communities",
            "Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS)",
            "Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) for viral etiologies",
            "What are the outcomes of Extracorporeal membrane oxygenation (ECMO) of COVID-19 patients?",
            "What are the outcomes for COVID-19 after mechanical ventilation adjusted for age?",
            "What is known of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19?",
            "What is known of the frequency, manifestations, and course of cardiomyopathy?",
            "What is known of the frequency, manifestations, and course of cardiac arrest?",
            "Application of regulatory standards (e.g., EUA, CLIA)",
            "Ability to adapt care to crisis standards of care level",
            "Approaches for encouraging and facilitating the production of elastomeric respirators, which can save thousands of N95 masks",
            "Which are the best telemedicine practices?",
            "Which are the facilitators to expand the telemedicine practices?",
            "Which are the specific actions to expand the telemedicine practices?",
            "Guidance on the simple things people can do at home to take care of sick people and manage disease",
            "Which are the oral medications that might potentially work?",
            "Use of artificial intelligence in real-time health care delivery to evaluate interventions",
            "Use of artificial intelligence in real-time health care delivery to evaluate risk factors",
            "Use of artificial intelligence in real-time health care delivery to evaluate outcomes",
            "Which are the challenges, solutions and technologies in hospital flow and organization?",
            "Which are the challenges, solutions and technologies in workforce protection?",
            "Which are the challenges, solutions and technologies in workforce allocation?",
            "Which are the challenges, solutions and technologies in community-based support resources?",
            "Which are the challenges, solutions and technologies in payment?",
            "Which are the challenges, solutions and technologies in supply chain management to enhance capacity, efficiency, and outcomes?",
            "Efforts to define the natural history of disease to inform clinical care, public health interventions, infection prevention control, transmission, and clinical trials",
            "What has been done to develop a core clinical outcome set to maximize usability of data across a range of trials?",
            "Can adjunctive or supportive intervention (e.g. steroids, high flow oxygen)  improve the clinical outcomes of infected patients?"
        ]
    },
    {
        'task': "Task 6 - What do we know about non-pharmaceutical interventions?",
        'questions': [
            "Which is the best way to scale up NPIs in a more coordinated way to give us time to enhance our health care delivery system capacity to respond to an increase in cases?",
            "Which is the best way to mobilize resources to geographic areas where critical shortfalls are identified?",
            "Rapid design and execution of experiments to examine and compare NPIs currently being implemented",
            "What is known about the efficacy of school closures?",
            "What is known about the efficacy of travel bans?",
            "What is known about the efficacy of bans on mass gatherings?",
            "What is known about the efficacy of social distancing approaches?",
            "Which are the methods to control the spread in communities?",
            "Models of potential interventions to predict costs and benefits depending on race",
            "Models of potential interventions to predict costs and benefits depending on income",
            "Models of potential interventions to predict costs and benefits depending on disability",
            "Models of potential interventions to predict costs and benefits depending on age",
            "Models of potential interventions to predict costs and benefits depending on geographic location",
            "Models of potential interventions to predict costs and benefits depending on immigration status",
            "Models of potential interventions to predict costs and benefits depending on housing status",
            "Models of potential interventions to predict costs and benefits depending on employment status",
            "Models of potential interventions to predict costs and benefits depending on health insurance status",
            "Policy changes necessary to enable the compliance of individuals with limited resources and the underserved with NPIs",
            "Why do people fail to comply with public health advice?",
            "Which is the economic impact of any pandemic?",
            "How can we mitigate risks to critical government services in a pandemic?",
            "Alternatives for food distribution and supplies in a pandemic",
            "Alternatives for household supplies in a pandemic",
            "Alternatives for health diagnoses, treatment, and needed care in a pandemic"
        ]
    },
    {
        'task': "Task 7 - Help us understand how geography affects virality",
        'questions': [
            "Are there geographic variations in the rate of COVID-19 spread?",
            "Are there geographic variations in the mortality rate of COVID-19?",
            "Is there any evidence to suggest geographic based virus mutations?"
        ]
    },
    {
        'task': "Task 8 - What do we know about diagnostics and surveillance?",
        'questions': [
            "Which are the sampling methods to determine asymptomatic disease?",
            "What can we do for early detection of disease?",
            "Is the use of screening of neutralizing antibodies such as ELISAs valid for early detection of disease?",
            "Which are the existing diagnostic platforms?",
            "Which are the existing surveillance platforms?",
            "Recruitment, support, and coordination of local expertise and capacity",
            "How states might leverage universities and private laboratories for testing purposes?",
            "Which are the best ways for communications to public health officials and the public?",
            "What is the speed, accessibility, and accuracy of a point-of-care test?",
            "What is the speed, accessibility, and accuracy of rapid bed-side tests?",
            "Rapid design and execution of targeted surveillance experiments calling for all potential testers using PCR in a defined area to start testing and report to a specific entity",
            "Separation of assay development issues from instruments",
            "Which is the role of the private sector to help quickly migrate assays?",
            "What has been done to track the evolution of the virus?",
            "Latency issues and when there is sufficient viral load to detect the pathogen",
            "What is needed in terms of biological and environmental sampling?",
            "Use of diagnostics such as host response markers (e.g., cytokines) to detect early disease or predict severe disease progression",
            "Policies and protocols for screening and testing",
            "Policies to mitigate the effects on supplies associated with mass testing, including swabs and reagents",
            "Technology roadmap for diagnostics",
            "Which are the barriers to developing and scaling up new diagnostic tests?",
            "How future coalition and accelerator models could provide critical funding for diagnostics?",
            "How future coalition and accelerator models could provide critical funding for opportunities for a streamlined regulatory environment?",
            "New platforms and technology (CRISPR) to improve response times",
            "New platforms and technology to employ more holistic approaches",
            "Coupling genomics and diagnostic testing on a large scale",
            "What is needed for rapid sequencing and bioinformatics to target regions of the genome that will allow specificity for a particular variant?",
            "What is needed for sequencing with advanced analytics for unknown pathogens?",
            "What is needed for distinguishing naturally-occurring pathogens from intentional?",
            "What is known about One Health surveillance of humans and potential sources of future spillover or ongoing exposure for this organism and future pathogens?"
        ]
    },
    {
        'task': "Task 9 - What has been published about ethical and social science considerations?",
        'questions': [
            "Articulate and translate existing ethical principles and standards to salient issues in COVID-2019",
            "Embed ethics across all thematic areas, engage with novel ethical issues that arise and coordinate to minimize duplication of oversight",
            "Support sustained education, access, and capacity building in the area of ethics",
            "Establish a team at WHO that will be integrated within multidisciplinary research and operational platforms and that will connect with existing and expanded global networks of social sciences",
            "Develop qualitative assessment frameworks to systematically collect information related to local barriers and enablers for the uptake and adherence to public health measures for prevention and control",
            "How the burden of responding to the outbreak and implementing public health measures affects the physical and psychological health of those providing care for Covid-19 patients?",
            "Identify the underlying drivers of fear, anxiety and stigma that fuel misinformation and rumor, particularly through social media"
        ]
    },
    {
        'task': "Task 10 - What has been published about information sharing and inter-sectoral collaboration?",
        'questions': [
            "Which are the methods for coordinating data-gathering with standardized nomenclature?",
            "Sharing response information among planners, providers, and others",
            "Understanding and mitigating barriers to information-sharing",
            "How to recruit, support, and coordinate local expertise and capacity relevant to public health emergency response?",
            "Integration of federal/state/local public health surveillance systems",
            "Value of investments in baseline public health response infrastructure preparedness",
            "Modes of communicating with target high-risk populations (elderly, health care workers)",
            "Risk communication and guidelines that are easy to understand and follow",
            "Communication that indicates potential risk of disease to all population groups",
            "Misunderstanding around containment and mitigation",
            "Action plan to mitigate gaps and problems of inequity in the Nation's public health capability, capacity, and funding to ensure all citizens in need are supported and can access information, surveillance, and treatment",
            "Measures to reach marginalized and disadvantaged populations",
            "Data systems and research priorities and agendas incorporate attention to the needs and circumstances of disadvantaged populations and underrepresented minorities",
            "Mitigating threats to incarcerated people from COVID-19, assuring access to information, prevention, diagnosis, and treatment",
            "Understanding coverage policies (barriers and opportunities) related to testing, treatment, and care"
        ]
    }
]

Getting the result with Unified QA

In [None]:
# UnifiedQA is a text-to-text model: it receives the question and the context
# in one input string and generates the answer as text.
model = pipeline('text2text-generation', "allenai/unifiedqa-t5-base")

# One record per (question, retrieved passage) pair, so the answers can be
# traced back to their task and source passage after the loop has finished.
qa_results = []

for task in tasks:
    for q in task['questions']:
        # Ask the reader model about each of the 3 best passages for the question.
        for ret in retriever.retrieve(query=q, top_k=3):
            # UnifiedQA expects "<question> \n <context>" where the separator is
            # the two characters backslash + 'n' (NOT a newline character), and
            # it expects lower-cased text.
            qa_input = (q + " \\n " + ret.text).lower()
            result = model(qa_input)
            print(q, result)
            qa_results.append({
                'task': task['task'],
                'question': q,
                'source': (ret.meta or {}).get('name'),
                'answer': result[0]['generated_text'],
            })