# Scientific Literature Retriever Using BERT and CDQA

# **COVID-19 Open Research Dataset (CORD-19) Analysis**

Install the [End-To-End Closed Domain Question Answering System](http://pypi.org/project/cdqa/)

In [None]:
!pip install --upgrade pip

Upgrade the version of Pandas

In [None]:
#!pip3 install --upgrade pandas

Import all necessary modules

In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
import os
import json
import nltk 
from math import log, sqrt
from collections import defaultdict
from copy import deepcopy
import glob
import sys
import gzip, pickle, pickletools
from tabulate import tabulate
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')

# Step 1. Data Pre-processing

Load csv to dataframe.

In [None]:
# Folder that holds the CORD-19 dataset; metadata.csv is read from here.
file_path = "../input/CORD-19-research-challenge/"

# low_memory=False asks pandas to parse the file in one pass instead of in
# internal chunks, which avoids mixed-dtype columns on large CSVs.
my_df =  pd.read_csv(file_path+'metadata.csv',low_memory=False)

# Preview the first rows of the metadata table.
my_df.head()

Counting the number of valid entries for each column.

In [None]:
def display_barchart(my_df):
    """Plot, for every column of ``my_df``, how many non-null entries it has.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame whose per-column non-null counts are charted.
    """
    labels = my_df.columns.tolist()
    # DataFrame.count() yields one non-null count per column, in column order.
    heights = list(my_df.count())
    plt.bar(labels, heights, align="center", width=0.5, alpha=1)
    # Column names are long; rotate them so they do not overlap.
    plt.xticks(rotation=90)


display_barchart(my_df)

Removal of the duplicate entries.

In [None]:

# Drop repeated records, one key column at a time.
#
# ``drop_duplicates`` returns a new frame, so calling it without assigning the
# result (as was done before) removes nothing.  Rows whose key value is
# missing are deliberately never treated as duplicates of one another:
# otherwise every paper without a DOI (or abstract) except the first one
# would be thrown away.
for _key_column in ['title', 'abstract', 'doi']:
    _is_repeat = (
        my_df[_key_column].notna()
        & my_df.duplicated(subset=[_key_column], keep='first')
    )
    my_df = my_df[~_is_repeat]
my_df.count()



The abstract plays an important role in finding related papers because it contains the objective or an overview of each paper.


Here we use a bar chart to visualize how many papers come with an abstract.

In [None]:

# Non-null counts per column (kept under its original name for later cells).
valid_cnt = list(my_df.count())

# Look the numbers up by name rather than by column position, so the chart
# stays correct if the column order of metadata.csv ever changes.
total = len(my_df)
with_abstract = int(my_df['abstract'].count())
without_abstract = total - with_abstract

# One bar per quantity, each annotated with its value just above the bar.
_bars = []
_series = [(total, 'green'), (with_abstract, 'yellow'), (without_abstract, 'red')]
for _position, (_value, _colour) in enumerate(_series, start=1):
    _bars.append(plt.bar(_position, _value, color=_colour, width=1))
    plt.annotate(str(_value), xy=(_position, _value), ha='center', va='bottom')

tot, abstract, lost = _bars

plt.legend((tot, abstract,lost), ('Total literature', 'literatures with abstract','literatures without abstract'))

Removal of the empty rows or in other words the rows without an abstract.

In [None]:
# ``dropna`` returns a new frame unless ``inplace=True``; the first call used
# to discard its result, so it had no effect.  Assign both results back.
my_df = my_df.dropna(how='all')            # rows with no data at all
my_df = my_df.dropna(subset=['abstract'])  # rows without an abstract
my_df.count()


COVID-19 was first identified in 2019,
so there is no need to keep literature that was published before 2019.

Removal of the literatures that were published before the year 2019.

In [None]:
# Keep only the records whose publication year is 2019 or later.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column added to it further down is written to a real frame, not to a view.
my_df = my_df[pd.DatetimeIndex(my_df.publish_time).year>2018].copy()
# https://www.interviewqs.com/ddi_code_snippets/extract_month_year_pandas
# https://stackoverflow.com/questions/13851535/delete-rows-from-a-pandas-dataframe-based-on-a-conditional-expression-involving
my_df.info()

# Keep a second name for the date-filtered frame; the next cell starts from it.
df = my_df

Papers whose abstracts do not contain any of the terms in "covid_terms" must be removed.

Filtering only papers related to Covid-19. 

In [None]:

# Start from the date-filtered frame.
my_df = df

# A single lower-cased alternation over every phrase that marks an abstract
# as COVID-19 related; it is meant to be searched against lower-cased text.
covid_terms = re.compile('|'.join(
    term.lower()
    for term in (
        'covid', 'coronavirus disease 19', 'sars cov 2', '2019 ncov',
        '2019ncov', '2019 n cov', '2019n cov', 'ncov 2019', 'n cov 2019',
        'coronavirus 2019', 'wuhan pneumonia', 'wuhan virus',
        'wuhan coronavirus', 'coronavirus 2', 'covid-19', 'SARS-CoV-2',
        '2019-nCov',
    )
))

def checkYear(date):
    """Return the first four characters of ``date`` converted to an int.

    ``date`` is expected to start with a four-digit year (e.g. "2020-03-15").
    """
    year_text = date[:4]
    return int(year_text)

def checkCovid(row, covid_terms):
    """Tell whether ``row`` describes a COVID-19 paper published after 2019.

    Parameters
    ----------
    row : mapping with 'abstract' and 'publish_time' entries
    covid_terms : compiled regular expression searched in the lower-cased
        abstract

    Returns
    -------
    bool
    """
    # No COVID term in the abstract: reject without looking at the date.
    if not covid_terms.search(row['abstract'].lower()):
        return False
    return checkYear(row['publish_time']) > 2019



In [None]:
# Flag every row with checkCovid: True when the abstract matches covid_terms
# and the publication year is later than 2019.
my_df['is_covid'] = my_df.apply(checkCovid, axis=1, covid_terms=covid_terms)
my_df.head()

In [None]:
# Keep the rows flagged as COVID-19 related and renumber them from zero.
df_covid_only = my_df[my_df['is_covid']==True].reset_index(drop=True)
df_covid_only.info()

Given the topic we chose, we store the terms related to it in the "key_words" list and filter the papers based on these terms.

Filtering the literature based on the terms present in the "key_words" list.

In [None]:
# Topic vocabulary: an abstract is kept when it contains at least one of these.
key_words = [
    'transmission', 'transmitted', 'long', 'symptomatic', 'asymptomatic',
    'infected', 'infection', 'range', 'incubation', 'periods', 'surfaces',
    'prevent', 'protective', 'SARS-CoV-2', 'infectious', 'reported',
    'respiratory', 'secretions', 'saliva', 'droplets', 'short', 'time',
    'fomites', 'sanitation',
]
pattern = '|'.join(key_words)
# Case-insensitive regex match of the alternation against each abstract.
matches_key_word = df_covid_only['abstract'].str.contains(pattern, case=False)
df_covid_only = df_covid_only.loc[matches_key_word]
df_covid_only.info()

Plotting a bar graph of the above result.

In [None]:
# Abstracts of the first rows.
# NOTE(review): this is not the last expression of the cell, so its value is
# presumably not shown by the notebook — confirm whether a display() is wanted.
df_covid_only.head().abstract
# Chart the non-null count of every column of the filtered frame.
display_barchart(df_covid_only)

In [None]:

# Only papers that come with a parsed full-text JSON file can be loaded later.
df_covid_only = df_covid_only.dropna(subset=['pdf_json_files'])
df_covid_only.info()
display_barchart(df_covid_only)

# Step 2. Extraction of data from json files to dataframe format

Path to json files

In [None]:
# Dataset root; the paths in the 'pdf_json_files' column are joined onto it.
base_path = '../input/CORD-19-research-challenge/'
all_selected_json = df_covid_only['pdf_json_files']
# Show one example path.  Positional access (.iloc) is required here: rows
# were filtered out after the index was last reset, so the label 0 is not
# guaranteed to exist any more and ``all_selected_json[0]`` could raise
# KeyError.
if len(all_selected_json):
    print(base_path+all_selected_json.iloc[0])

In [None]:
# This piece of code was adopted from the original source at:
# https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv/notebook 

def format_name(author):
    """Return the author's full name as "first [middle ...] last".

    ``author`` is a mapping with 'first', 'middle' (a sequence of strings)
    and 'last' entries; the middle part is left out when 'middle' is empty.
    """
    middle_name = " ".join(author['middle'])
    name_parts = [author['first']]
    if author['middle']:
        name_parts.append(middle_name)
    name_parts.append(author['last'])
    return " ".join(name_parts)

def format_affiliation(affiliation):
    """Render an affiliation mapping as "institution, location values...".

    Both the 'institution' and the 'location' entries are optional; missing
    or empty ones are skipped.  Returns an empty string when neither is set.
    """
    institution = affiliation.get('institution')
    location = affiliation.get('location')

    pieces = []
    if institution:
        pieces.append(institution)
    if location:
        # Location is a mapping; its values are listed in stored order.
        pieces.extend(location.values())
    return ", ".join(pieces)

def format_authors(authors, with_affiliation=False):
    """Join the formatted names of ``authors`` into one comma-separated string.

    Parameters
    ----------
    authors : iterable of author mappings accepted by ``format_name``
    with_affiliation : bool
        When true, each name is followed by its affiliation in parentheses,
        provided the affiliation formats to a non-empty string.
    """
    entries = []
    for author in authors:
        entry = format_name(author)
        affiliation = (
            format_affiliation(author['affiliation']) if with_affiliation else ""
        )
        if affiliation:
            entry = f"{entry} ({affiliation})"
        entries.append(entry)
    return ", ".join(entries)

def format_body(body_text):
    """Merge paragraph records into one text grouped by section heading.

    ``body_text`` is a sequence of mappings with 'section' and 'text'
    entries.  Paragraphs that share a section are concatenated in order, and
    sections appear in order of first occurrence.  Each section is emitted as
    heading, blank line, text, blank line.
    """
    grouped = {}
    for paragraph in body_text:
        heading = paragraph['section']
        grouped[heading] = grouped.get(heading, "") + paragraph['text']

    return "".join(
        piece
        for heading, content in grouped.items()
        for piece in (heading, "\n\n", content, "\n\n")
    )

def format_bib(bibs):
    """Render bibliography entries as one "; "-separated string.

    Parameters
    ----------
    bibs : dict or iterable of dict
        Bibliography entries, each with 'title', 'authors', 'venue' and
        'year'.  When a dict is given, its values are the entries.

    Returns
    -------
    str
        Each entry rendered as "title, authors, venue, year".

    The entries are only read, never modified, so no defensive copy of the
    input is needed.
    """
    if isinstance(bibs, dict):
        bibs = list(bibs.values())

    formatted = []
    for bib in bibs:
        fields = [
            str(bib['title']),
            format_authors(bib['authors'], with_affiliation=False),
            str(bib['venue']),
            str(bib['year']),
        ]
        formatted.append(", ".join(fields))
    return "; ".join(formatted)

# Marker returned by _try_load_json when a file cannot be read or parsed.
_UNREADABLE = object()


def _try_load_json(path):
    """Return the parsed JSON stored at ``path``, or ``_UNREADABLE`` on failure."""
    try:
        # The context manager closes the handle even when parsing fails.
        with open(path, 'rb') as handle:
            return json.load(handle)
    except (OSError, ValueError, TypeError):
        # Missing/unopenable file, malformed JSON, or a non-path value.
        return _UNREADABLE


def load_files(file_count,filenames_selected = all_selected_json):
    """Load up to ``file_count`` JSON documents from ``filenames_selected``.

    Each name is first tried as given and then relative to ``base_path``.
    Names that cannot be read either way are skipped (best effort), so fewer
    than ``file_count`` documents may be returned.

    Parameters
    ----------
    file_count : int
        Maximum number of documents to load.
    filenames_selected : iterable
        File names to try, in order.

    Returns
    -------
    list
        The parsed documents, in the order they were loaded.
    """
    raw_files = []
    for filename in filenames_selected:
        if len(raw_files) == file_count:
            break

        document = _try_load_json(filename)
        # Only strings can be prefixed with the dataset root.
        if document is _UNREADABLE and isinstance(filename, str):
            document = _try_load_json(base_path + filename)

        if document is not _UNREADABLE:
            raw_files.append(document)
    return raw_files
    

def generate_clean_df(all_files):
    """Turn parsed paper documents into a DataFrame of id, title and texts.

    Parameters
    ----------
    all_files : iterable of dict
        Parsed paper documents with 'paper_id', 'metadata', 'abstract',
        'body_text' and 'bib_entries' entries.

    Returns
    -------
    pandas.DataFrame
        Columns 'paper_id', 'title', 'abstract' and 'text', one row per paper.
    """
    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text',
                 'bibliography','raw_authors','raw_bibliography']

    rows = []
    for paper in all_files:
        metadata = paper['metadata']
        rows.append([
            paper['paper_id'],
            metadata['title'],
            format_authors(metadata['authors']),
            format_authors(metadata['authors'], with_affiliation=True),
            format_body(paper['abstract']),
            format_body(paper['body_text']),
            format_bib(paper['bib_entries']),
            metadata['authors'],
            paper['bib_entries'],
        ])

    full_df = pd.DataFrame(rows, columns=col_names)
    # Author and bibliography details are formatted but not kept downstream.
    unused = ['authors','affiliations','bibliography',
              'raw_authors','raw_bibliography']
    return full_df.drop(columns=unused)

In [None]:
!pip install cdqa 

# Step 3. Corpus formation

In [None]:
def get_corpus(file_count=40000):
    """Load the selected JSON papers and assemble them into the corpus.

    Parameters
    ----------
    file_count : int, optional
        Maximum number of JSON files to read (default 40000).

    Returns
    -------
    tuple
        ``(corpus, num_of_papers)``: ``corpus`` is a DataFrame with columns
        'paper_id', 'title', 'abstract' and 'text'; ``num_of_papers`` is an
        (always empty) dict kept so existing callers can still unpack two
        values.
    """
    num_of_papers = {}
    corpus = pd.DataFrame(columns=['paper_id','title','abstract','text'])

    print('Reading ', file_count, 'json files')
    print('Loading......')
    files = load_files(file_count)
    print('Generating clean dataframe')
    df = generate_clean_df(files)
    print('Generated......')
    print('Forming Corpus.......')
    # Number of papers that were actually loaded.
    print(df.shape[0])
    # ignore_index renumbers the rows 0..n-1; later code indexes by position.
    corpus = pd.concat([corpus, df], ignore_index=True, sort=False)

    print('Corpus includes {0} scientific articles.'.format(len(corpus)))
    return corpus, num_of_papers

corpus, num_of_papers = get_corpus()

# Step 4. Processing of Corpus

In [None]:
# This processing algorithm can originally be found at:
# https://github.com/nilayjain/text-search-engine

# term -> list of positions of the documents that contain it
# (filled by generate_inverted_index).
inverted_index = defaultdict(list)
# Corpus size, captured once; the functions below read this value.
num_of_documents = len(corpus)
# One {term: weight} dict per document, appended in corpus order.
vects_for_docs = [] 
# term -> number of documents in which the term occurs (filled by create_vector).
document_freq_vect = {}
 
def iterate_over_all_docs():
    """Tokenise every corpus document and append its term-count vector.

    Title, abstract and body text of each document are joined with spaces
    before tokenising.  Progress is printed every 1000 documents and once
    more at the end.  Results are appended to the global ``vects_for_docs``.
    """
    print('Processing corpus...')
    width = len(str(num_of_documents))
    for i in range(num_of_documents):
        if i % 1000 == 0:
            print(f'{str(i).zfill(width)} of {num_of_documents}')
        pieces = (corpus['title'][i], corpus['abstract'][i], corpus['text'][i])
        doc_text = pieces[0] + ' ' + pieces[1] + ' ' + pieces[2]
        tokens = get_tokenized_and_normalized_list(doc_text)
        vects_for_docs.append(create_vector(tokens))
    print(f'{num_of_documents} of {num_of_documents}')

def create_vector_from_query(l1):
    """Count the tokens in ``l1``.

    Returns a dict that maps each distinct token to its number of
    occurrences, stored as a float.
    """
    vect = {}
    for token in l1:
        vect[token] = vect.get(token, 0.0) + 1.0
    return vect

def generate_inverted_index():
    """Fill ``inverted_index`` from the per-document vectors.

    For every term of every vector in ``vects_for_docs``, the position of the
    document is appended to the term's posting list.
    """
    for doc_position, vector in enumerate(vects_for_docs):
        for term in vector:
            inverted_index[term].append(doc_position)

def create_tf_idf_vector():
    """Replace raw term counts by unit-length TF-IDF weights, in place.

    Every vector in ``vects_for_docs`` is processed independently: each count
    is replaced by ``calc_tf_idf(term, count)`` and the vector is then
    divided by its Euclidean norm.
    """
    for vect in vects_for_docs:
        # The accumulator must start from zero for every document; carrying
        # it over from the previous one skews the norm of all later vectors.
        squared_norm = 0.0
        for word1, word_freq in vect.items():
            weight = calc_tf_idf(word1, word_freq)
            vect[word1] = weight
            squared_norm += weight ** 2

        vect_length = sqrt(squared_norm)
        if vect_length == 0:
            # Empty document, or all of its weights are zero: there is
            # nothing to scale and dividing would raise ZeroDivisionError.
            continue
        for word1 in vect:
            vect[word1] /= vect_length

def get_tf_idf_from_query_vect(query_vector1):
    """Convert a query's term counts to unit-length TF-IDF weights, in place.

    Terms known to ``document_freq_vect`` are weighted with ``calc_tf_idf``;
    unknown terms get ``log(1 + count) * log(num_of_documents)``.  The vector
    is left unscaled when its norm is zero.  Returns nothing.
    """
    squared_norm = 0.0
    for term, count in list(query_vector1.items()):
        if term in document_freq_vect:
            weight = calc_tf_idf(term, count)
        else:
            weight = log(1 + count) * log(
                num_of_documents)
        query_vector1[term] = weight
        squared_norm += weight ** 2

    norm = sqrt(squared_norm)
    if norm == 0:
        return
    for term in query_vector1:
        query_vector1[term] /= norm

def calc_tf_idf(word1, word_freq):
    """Return the TF-IDF weight of ``word1`` occurring ``word_freq`` times.

    Computed as ``log(1 + word_freq)`` times the log of the ratio between
    ``num_of_documents`` and the document frequency of ``word1``.
    """
    term_weight = log(1 + word_freq)
    inverse_doc_weight = log(num_of_documents / document_freq_vect[word1])
    return term_weight * inverse_doc_weight

def get_dot_product(vector1, vector2):
    """Return the dot product of two sparse vectors stored as dicts.

    Only keys present in both dicts contribute.  The shorter dict is the one
    that gets iterated, so the cost follows the smaller vector.
    """
    if len(vector1) > len(vector2):
        shorter, longer = vector2, vector1
    else:
        shorter, longer = vector1, vector2

    total = 0
    for key, value in shorter.items():
        if key in longer:
            total += value * longer[key]
    return total

def get_tokenized_and_normalized_list(doc_text):
    """Split ``doc_text`` into NLTK word tokens and Porter-stem each one.

    Returns the stemmed tokens as a list, in their original order.
    """
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(doc_text)]

def create_vector(l1):
    """Count the tokens of one document and update the document frequencies.

    Returns a dict mapping each distinct token of ``l1`` to its number of
    occurrences.  As a side effect, the global ``document_freq_vect`` entry
    of every distinct token is incremented once.
    """
    global document_freq_vect
    vect = {}
    for token in l1:
        if token not in vect:
            # First sighting in this document: it counts towards the
            # document frequency exactly once.
            vect[token] = 1
            document_freq_vect[token] = document_freq_vect.get(token, 0) + 1
        else:
            vect[token] += 1
    return vect

def get_result_from_query_vect(query_vector1):
    """Score every document against the query vector.

    Returns a list of ``(document_position, score)`` tuples sorted by
    ascending score, so the most relevant documents are at the end.  Ties
    keep their document order because the sort is stable.
    """
    parsed_list = [
        (i, get_dot_product(query_vector1, vects_for_docs[i]))
        for i in range(num_of_documents)
    ]
    # Sort once, after all scores are known; re-sorting inside the loop gave
    # the same ordering at a much higher cost.
    parsed_list.sort(key=lambda pair: pair[1])
    return parsed_list

# ../input/CORD-19-research-challenge
def pickle_to_file(file_name,object_to_pickle):
    """Write ``object_to_pickle`` as a gzip-compressed pickle under ./data/.

    Parameters
    ----------
    file_name : str
        Name of the file to create inside the ``./data/`` folder.
    object_to_pickle : object
        Any picklable object.
    """
    target_dir = "./data/"
    # gzip.open does not create missing folders; make sure the target exists.
    os.makedirs(target_dir, exist_ok=True)
    filepath = target_dir+file_name
    with gzip.open(filepath, "wb") as f:
        pickled = pickle.dumps(object_to_pickle)
        # pickletools.optimize strips unused opcodes to shrink the payload.
        optimized_pickle = pickletools.optimize(pickled)
        f.write(optimized_pickle)
        
def pickle_all():
    """Persist the three index structures with ``pickle_to_file``."""
    artefacts = (
        ("inverted_index.pkl", inverted_index),
        ("vects_for_docs.pkl", vects_for_docs),
        ("document_freq_vect.pkl", document_freq_vect),
    )
    for file_name, payload in artefacts:
        pickle_to_file(file_name, payload)
    
# Build the search index.  The order matters: iterate_over_all_docs fills
# vects_for_docs and document_freq_vect, which the two later steps read.
iterate_over_all_docs()
generate_inverted_index()
# Turns the raw counts into normalised TF-IDF weights (modifies vects_for_docs).
create_tf_idf_vector()
# pickle_all()

In [None]:
'''Tried pickling for direct access to corpus and document vector''' 
           
# def load_all_from_pickle():
#     print("inverted_index loading...")
#     filepath = "../input/pickle-files/inverted_index.pkl"
#     with gzip.open(filepath, 'rb') as f:
#         p = pickle.Unpickler(f)
#         global inverted_index
#         inverted_index = p.load()
#         print(sys.getsizeof(inverted_index))
#     print("inverted_index loaded.")
    
#     print("vects_for_docs loading...")   
#     filepath = "../input/pickle-files/vects_for_docs.pkl"
#     with gzip.open(filepath, 'rb') as f:
#         global vects_for_docs
#         p = pickle.Unpickler(f)
#         vects_for_docs = p.load()
#         print(len(vects_for_docs))
#     print("vects_for_docs loaded.")
        
#     print("document_freq_vect loading...")    
#     filepath = "../input/pickle-files/document_freq_vect.pkl"
#     with gzip.open(filepath, 'rb') as f:
#         global document_freq_vect
#         p = pickle.Unpickler(f)
#         document_freq_vect = p.load()
#         print(sys.getsizeof(document_freq_vect))
#     print("document_freq_vect loaded.")
        

# load_all_from_pickle()

# Step 5. Using pretrained BERT model

In [None]:
# The End-To-End Closed Domain Question Answering System is used here.
# It is available at: https://pypi.org/project/cdqa/

from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline

download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
download_model(model='bert-squad_1.1', dir='./models')

# Step 6. Search of the most relevant articles and competent answer on the query

First, for each query, the system ranks all the scientific papers in the corpus by relevance.

Second, the system analyzes the texts of the top N most relevant papers to find the best answer to the query.

In [None]:
def add_Each_Details(papers_info,top_n_papers,prediction,query,x,result):
    """Append the ``x``-th prediction as a new row of ``result``.

    Parameters
    ----------
    papers_info : dict
        Must hold parallel 'title' and 'id' lists of the ranked papers.
    top_n_papers : int
        Number of leading entries of ``papers_info`` to search for the title.
    prediction : sequence
        Predictions; entry ``x`` holds (answer, title, paragraph, ...).
    query : str
        The user query (unused here; kept for interface compatibility).
    x : int
        Position of the prediction to add.
    result : pandas.DataFrame
        Table collected so far.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``result`` plus one row.  The "Paper ID" is None when
        the predicted title is not among the searched papers.
    """
    answer, title, highlight = prediction[x][0], prediction[x][1], prediction[x][2]

    # Default for the case that no title matches (pid used to be unbound).
    pid = None
    # Slicing + zip never reads past the end of either list, even when
    # top_n_papers is larger than the number of ranked papers.  The last
    # matching title wins, as before.
    for paper_title, paper_id in zip(papers_info['title'][:top_n_papers], papers_info['id']):
        if paper_title == title:
            pid = paper_id

    temp = pd.DataFrame({"Paper ID":[pid], "Paper Title":[title], "Highlight from Paper":[highlight],"Short answer":[answer]})
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([result, temp], ignore_index=True)
    
    
def find_relevant_articles(query=None, top_n_papers=20, min_n_papers=3):
    """Answer ``query`` from the corpus and display the best matches.

    The corpus is first ranked against the query with the TF-IDF index; the
    ``top_n_papers`` best papers are then handed to the cdQA pipeline, and
    up to ``min_n_papers`` predicted answers are shown as an HTML table.

    Parameters
    ----------
    query : str, optional
        The question; read interactively with ``input`` when None.
    top_n_papers : int
        How many of the best-ranked papers are passed to the reader.  Values
        larger than the corpus are clamped to the corpus size.
    min_n_papers : int
        How many predictions are requested from the pipeline.

    Returns
    -------
    None
        The result table is displayed, not returned.
    """
    if query is None:
        query = input('Please enter your query...')

    query_list = get_tokenized_and_normalized_list(query)
    query_vector = create_vector_from_query(query_list)
    get_tf_idf_from_query_vect(query_vector)
    result_set = get_result_from_query_vect(query_vector)

    # Never ask for more papers than were ranked; indexing past the start of
    # result_set would raise IndexError.
    top_n_papers = max(0, min(top_n_papers, len(result_set)))
    if top_n_papers == 0:
        print('No documents available to answer the query.')
        return

    papers_info = {'query':query, 'query list':query_list, 'query vector':query_vector,
                   'id':[], 'title':[], 'abstract':[], 'text':[], 'weight':[], 'index':[]}

    # result_set is sorted by ascending score: walk its tail backwards to
    # visit the best matches first.
    for doc_index, weight in reversed(result_set[len(result_set) - top_n_papers:]):
        papers_info['id'].append(corpus['paper_id'][doc_index])
        papers_info['title'].append(corpus['title'][doc_index])
        papers_info['abstract'].append(corpus['abstract'][doc_index])
        # Blank-line separators are flattened to single spaces.
        papers_info['text'].append(corpus['text'][doc_index].replace('\n\n', ' '))
        papers_info['weight'].append(weight)
        papers_info['index'].append(doc_index)

    # Build the frame from row dicts.  Going through np.array fails on recent
    # NumPy versions because every row mixes strings with a list.
    colms = ['date','title', 'category', 'link', 'abstract', 'paragraphs']
    rows = [
        {'date': "None", 'title': title, 'category': "None", 'link': "None",
         'abstract': abstract, 'paragraphs': text.split('. ')}
        for title, abstract, text in zip(papers_info['title'],
                                         papers_info['abstract'],
                                         papers_info['text'])
    ]
    df = pd.DataFrame(rows, columns=colms)
    df = filter_paragraphs(df)

    # Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')

    # Drop rows without a title and repeated titles.  The result has to be
    # assigned back to take effect.  'link' is deliberately not used as a
    # key: it holds the same placeholder for every row, so de-duplicating on
    # it would discard all papers but one.
    df = df.dropna(subset=['title'])
    df = df.drop_duplicates(subset=['title'],keep='first').reset_index(drop=True)
    # Fitting the retriever to the list of documents in the dataframe
    cdqa_pipeline.fit_retriever(df=df)

    query = papers_info['query']

    # Sending a question to the pipeline and getting prediction
    prediction = cdqa_pipeline.predict(query=query,n_predictions = min_n_papers)

    #Creating result dataframe
    column_names = ["Paper ID", "Paper Title", "Highlight from Paper","Short answer"]
    result = pd.DataFrame(columns = column_names)

    print('QUERY: {0}\n'.format(query))
    # The pipeline may return fewer predictions than requested.
    for i in range(min(min_n_papers, len(prediction))):
        result = add_Each_Details(papers_info,top_n_papers,prediction,query,i,result)

    display(HTML(result.to_html()))

# Step 7. Getting practical answers and the most relevant papers (query based approach)

Below one can see a list of 10 queries and answers, which have been found by the system due to text mining.

List of queries

# How to use the system

When Steps 1-7 have been completed with a corpus of scientific papers, the system is ready to process your queries. To get an answer to a query, follow two steps: 

1. Input any query in the form of string type variable.

For example,

2. Call the function find_relevant_articles().

For example,

In [None]:
query = 'What is the incubation period for covid19 ?'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'What is range of incubation period for coronavirus SARS-CoV-2 COVID-19 in humans'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'What is optimal quarantine period for coronavirus COVID-19'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'What is effective quarantine period for coronavirus COVID-19'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'What is percentage of death cases for coronavirus SARS-CoV-2 COVID-19'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'What is death rate for coronavirus COVID-19 and air pollution'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'At which temperature coronavirus COVID-19 can survive'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'How long coronavirus SARS-CoV-2 can survive on plastic surface'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'What are risk factors for coronavirus COVID-19'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:
query = 'What is origin of coronavirus COVID-19'
find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)

In [None]:

queries =  ['At which temperature coronavirus cannot survive',
           'What is the range of incubation periods for coronavirus SARS-CoV-2 COVID-19 in humans (and how this varies across age and health status) and how long individuals are contagious, even after recovery',
           'What are the prevalence of asymptomatic shedding and transmission of coronavirus COVID-19(e.g., particularly children)',
           'Mention the seasonality COVID-19 transmission',
           'Explain the physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phobic surfaces, environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding)',
           'What is the period of persistence and stability of coronavirus on a multitude of substrates and sources (e.g., nasal discharge, sputum, urine, fecal matter, blood)',
           'How long does coronavirus persists on surfaces of different materials (e,g., copper, stainless steel, plastic)',
           'Explain the natural history of the coronavirus COVID-19 and shedding of it from an infected person']

In [None]:
# Run every prepared query: rank 500 papers, show the 5 best answers.
for query in queries:
    find_relevant_articles(query=query, top_n_papers=500, min_n_papers=5)