In [None]:
!pip install cdqa 

In [None]:
'''Import all necessary modules'''
!pip install --upgrade pandas
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
import os
import json
import nltk 
from math import log, sqrt
from collections import defaultdict
from copy import deepcopy
import glob

In [None]:
# Load the CORD-19 metadata CSV into a single DataFrame.

# Source file path (change if needed).
file_path = "../input/CORD-19-research-challenge/"

# Some machines struggle to parse the whole file in one go, so it is read in
# chunks of 10,000 rows. The chunks are concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 (the import cell upgrades
# pandas), and appending inside the loop re-copied the accumulated frame for
# every chunk.
metadata_chunks = pd.read_csv(file_path+'metadata.csv', iterator=True, chunksize=10000)
my_df = pd.concat(list(metadata_chunks))

# If memory is not a concern, a single call works as well:
# my_df = pd.read_csv(file_path+"metadata.csv")

my_df.head()

In [None]:
# Summarise the loaded metadata: column dtypes and non-null counts.
my_df.info()

In [None]:
'''Count the valid entries for each column'''
def display_barchart(my_df):
    """Draw a bar chart of the number of non-null entries in each column.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame whose per-column non-null counts are plotted.

    The chart is drawn on the current matplotlib axes; nothing is returned.
    """
    # https://datatofish.com/convert-pandas-dataframe-to-list/
    non_null_counts = list(my_df.count())
    labels = my_df.columns.tolist()
    plt.bar(labels, non_null_counts, align="center", width=0.5, alpha=1)
    # Column names are long; rotate them so they stay readable.
    plt.xticks(rotation=90)

    
# Plot the non-null counts of the raw metadata columns.
display_barchart(my_df)

In [None]:
# Drop repeated records, keeping the first occurrence of each title, abstract
# and DOI. The previous bare drop_duplicates(...) calls returned new frames
# that were never assigned, so my_df was left untouched.
# Rows whose key is missing are kept: pandas treats missing keys as equal to
# each other, which would otherwise collapse e.g. every record without a DOI
# into a single row.
for dedup_column in ['title', 'abstract', 'doi']:
    is_repeat = my_df[dedup_column].notna() & my_df.duplicated(subset=[dedup_column], keep='first')
    my_df = my_df[~is_repeat]
my_df.count()

In [None]:
# The abstract carries the objective / overview of a paper, so it plays an
# important role in finding related literature. Visualise how many records
# have one.

# Look the numbers up by name instead of by position in my_df.count(), so the
# chart stays correct if the column order of the metadata file changes.
total = len(my_df)
with_abstract = int(my_df['abstract'].count())
without_abstract = total - with_abstract

bar_specs = [
    ('Total literature', total, 'green'),
    ('literatures with abstract', with_abstract, 'yellow'),
    ('literatures without abstract', without_abstract, 'red'),
]

bars = []
for position, (label, value, color) in enumerate(bar_specs, start=1):
    bars.append(plt.bar(position, value, color=color, width=1))
    # Print the exact count on top of each bar.
    plt.annotate(str(value), xy=(position, value), ha='center', va='bottom')

plt.legend(bars, [label for label, _, _ in bar_specs]);

In [None]:
# Remove rows in which every column is empty. dropna returns a new frame, so
# the result has to be assigned (the previous bare call was a no-op).
my_df = my_df.dropna(how='all')

# Two options exist for records without an abstract:
#   1. remove them;
#   2. take the first paragraph of the paper as a substitute abstract.
# Option 1 is applied here.
my_df = my_df.dropna(subset=['abstract'])
my_df.count()

In [None]:
# COVID-19 was first identified in 2019, so literature published before 2019
# is discarded.
# https://www.interviewqs.com/ddi_code_snippets/extract_month_year_pandas
# https://stackoverflow.com/questions/13851535/delete-rows-from-a-pandas-dataframe-based-on-a-conditional-expression-involving
publish_year = pd.DatetimeIndex(my_df.publish_time).year
my_df = my_df[publish_year > 2018]
my_df.info()

In [None]:
# Names under which COVID-19 and its virus appear in the literature. They are
# lower-cased and compiled into a single alternation pattern, stored in
# `covid_terms`, which is searched against lower-cased abstracts.
# (List to be updated.)
covid_terms = re.compile('|'.join(
    term.lower()
    for term in [
        'covid', 'coronavirus disease 19', 'sars cov 2', '2019 ncov', '2019ncov', '2019 n cov', '2019n cov',
        'ncov 2019', 'n cov 2019', 'coronavirus 2019', 'wuhan pneumonia', 'wuhan virus', 'wuhan coronavirus',
        'coronavirus 2', 'covid-19', 'SARS-CoV-2', '2019-nCov',
    ]
))

def checkYear(date):
    """Return the year encoded in the first four characters of ``date``.

    ``date`` is sliced, so it must be a string (or other sliceable) whose
    first four characters form an integer literal.
    """
    year_prefix = date[:4]
    return int(year_prefix)

def checkCovid(row, covid_terms):
    """Tell whether a metadata row looks like COVID-19 literature.

    Parameters
    ----------
    row : mapping
        Must provide ``'abstract'`` and ``'publish_time'`` string entries.
    covid_terms : compiled regular expression
        Pattern searched in the lower-cased abstract.

    Returns
    -------
    bool
        True when the abstract matches ``covid_terms`` and the publication
        year is later than 2019.
    """
    abstract_text = row['abstract'].lower()
    mentions_covid = bool(covid_terms.search(abstract_text))
    return mentions_covid and checkYear(row['publish_time']) > 2019

In [None]:
# Flag the COVID-19 records. assign() builds a new frame instead of inserting
# a column into my_df, which at this point is a filtered selection of the
# metadata; in-place insertion on such a selection can trigger pandas'
# SettingWithCopyWarning.
my_df = my_df.assign(is_covid=my_df.apply(checkCovid, axis=1, covid_terms=covid_terms))
my_df.head()

In [None]:
# Keep only the records flagged as COVID-19 literature and renumber the rows
# from 0.
df_covid_only = my_df[my_df['is_covid']==True].reset_index(drop=True)
df_covid_only.info()

In [None]:
# Vocabulary for the task topics (incubation, transmission, environmental
# effects). An abstract is kept when it contains at least one of these words;
# the words are joined into one regular expression and matched
# case-insensitively.
key_words = ['transmission','transmitted','long','symptomatic','asymptomatic','infected','infection','range', 'incubation', 'periods', 'surfaces', 'prevent','protective','SARS-CoV-2','infectious','reported','respiratory', 'secretions', 'saliva', 'droplets','short', 'time', 'fomites','sanitation']
pattern = '|'.join(key_words)
matches_topic = df_covid_only['abstract'].str.contains(pattern, case=False)
df_covid_only = df_covid_only.loc[matches_topic]
df_covid_only.info()

In [None]:
# Check that the result is correct.
pd.options.display.max_colwidth = 100000
display_barchart(df_covid_only)
# A notebook only displays the value of a cell's last expression, so the
# sample abstracts must come last; placed before the chart call they were
# evaluated and silently discarded.
df_covid_only.head().abstract

In [None]:
# Keep only the records that link to a parsed PDF JSON file, because those
# JSON files hold the original content of the article.
df_covid_only = df_covid_only.dropna(subset=['pdf_json_files'])
df_covid_only.info()

In [None]:
base_path = '../input/CORD-19-research-challenge/'
all_selected_json = df_covid_only['pdf_json_files']
# print(all_selected_json)
# Preview the first selected path. Positional access is required here: rows
# were filtered out after the index was last reset, so the label 0 is not
# guaranteed to exist any more and all_selected_json[0] could raise KeyError.
if not all_selected_json.empty:
    print(base_path+all_selected_json.iloc[0])

In [None]:
# This piece of code was adopted from the original source at:
# https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv/notebook 

def format_name(author):
    """Join an author's first, middle and last names with single spaces.

    ``author`` must provide ``'first'``, ``'middle'`` (a sequence of strings)
    and ``'last'``. The middle names are left out when the sequence is empty.
    """
    middle_name = " ".join(author['middle'])
    if author['middle']:
        parts = [author['first'], middle_name, author['last']]
    else:
        parts = [author['first'], author['last']]
    return " ".join(parts)

def format_affiliation(affiliation):
    """Render an affiliation as ``"institution, location value, ..."``.

    ``affiliation`` is a mapping that may carry an ``'institution'`` string
    and a ``'location'`` mapping. Missing or empty entries are left out; the
    result is an empty string when neither is present.
    """
    pieces = []
    institution = affiliation.get('institution')
    if institution:
        pieces.append(institution)
    location = affiliation.get('location')
    if location:
        # Location values follow the institution, in the mapping's own order.
        pieces.extend(location.values())
    return ", ".join(pieces)

def format_authors(authors, with_affiliation=False):
    """Format a list of author records as one comma-separated string.

    Parameters
    ----------
    authors : iterable of mappings
        Author records accepted by ``format_name``; with
        ``with_affiliation=True`` each must also carry ``'affiliation'``.
    with_affiliation : bool
        When true, a non-empty affiliation is appended to the author's name
        in parentheses.

    Returns
    -------
    str
        The formatted authors joined by ``", "``.
    """
    def _label(author):
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join([_label(author) for author in authors])

def format_body(body_text):
    """Flatten a list of ``{'section', 'text'}`` entries into one string.

    Texts that share a section name are concatenated in order of appearance.
    Each section is then emitted as its name, a blank line, its text and
    another blank line, with sections ordered by first occurrence.
    """
    grouped = {}
    for entry in body_text:
        section, text = entry['section'], entry['text']
        grouped[section] = grouped.get(section, "") + text
    return "".join([
        section + "\n\n" + text + "\n\n"
        for section, text in grouped.items()
    ])

def format_bib(bibs):
    """Format bibliography entries as ``"title, authors, venue, year; ..."``.

    ``bibs`` is either a list of entries or a dict whose values are the
    entries. The input is deep-copied first, so the caller's data is not
    modified when the author lists are replaced by their formatted strings.
    """
    entries = deepcopy(list(bibs.values()) if type(bibs) is dict else bibs)
    rendered = []
    for entry in entries:
        entry['authors'] = format_authors(entry['authors'], with_affiliation=False)
        fields = [str(entry[key]) for key in ('title', 'authors', 'venue', 'year')]
        rendered.append(", ".join(fields))
    return "; ".join(rendered)

def load_files(file_count,filenames_selected = all_selected_json):
    """Load up to ``file_count`` parsed-paper JSON files.

    Each entry of ``filenames_selected`` is first opened as given and, if
    that fails, relative to the module-level ``base_path``. Entries that
    cannot be read or parsed either way are skipped; the number of skipped
    entries is printed once at the end instead of being hidden.

    Parameters
    ----------
    file_count : int
        Loading stops as soon as this many files have been read.
    filenames_selected : iterable of str
        Candidate file paths. The default is bound to ``all_selected_json``
        as it exists when this function is defined.

    Returns
    -------
    list
        The decoded JSON documents, in the order they were read.
    """
    # Failures treated as "this entry is unusable": unreadable file, content
    # that is not valid JSON, or an entry that is not a usable path.
    # Anything else, including KeyboardInterrupt, now propagates.
    # NOTE(review): a metadata cell may hold several ';'-separated paths;
    # such an entry would land here as skipped - confirm against the data.
    skippable = (OSError, ValueError, TypeError)

    def _read_json(path):
        # The context manager closes the handle even when parsing fails.
        with open(path, 'rb') as handle:
            return json.load(handle)

    raw_files = []
    skipped = 0
    for filename in filenames_selected:
        if len(raw_files) == file_count:
            break
        try:
            raw_files.append(_read_json(filename))
        except skippable:
            try:
                raw_files.append(_read_json(base_path + filename))
            except skippable:
                skipped += 1
    if skipped:
        print('Skipped', skipped, 'entries that could not be read as JSON')
    return raw_files
    

def generate_clean_df(all_files):
    """Build a DataFrame with the id, title, abstract and body of each paper.

    Parameters
    ----------
    all_files : iterable of dict
        Parsed paper JSON documents providing ``'paper_id'``,
        ``'metadata'['title']``, ``'abstract'`` and ``'body_text'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``paper_id``, ``title``, ``abstract`` and ``text``, one row
        per paper; abstract and body are flattened with ``format_body``.
    """
    # Only the four returned columns are built. The author, affiliation and
    # bibliography columns used to be formatted for every paper (including a
    # deep copy of each bibliography) and then dropped before returning.
    col_names = ['paper_id', 'title', 'abstract', 'text']
    cleaned_files = [
        [
            file['paper_id'],
            file['metadata']['title'],
            format_body(file['abstract']),
            format_body(file['body_text']),
        ]
        for file in all_files
    ]
    return pd.DataFrame(cleaned_files, columns=col_names)

In [None]:
def get_corpus(file_count=40928):
    """Load the selected papers and return them as the search corpus.

    Parameters
    ----------
    file_count : int
        Maximum number of JSON files to load; defaults to the value that
        used to be hard-coded in the body.

    Returns
    -------
    tuple
        ``(corpus, num_of_papers)``. ``corpus`` is a DataFrame with the
        columns ``paper_id``, ``title``, ``abstract`` and ``text``.
        ``num_of_papers`` is an always-empty dict, kept so existing callers
        that unpack two values keep working.
    """
    num_of_papers = {}

    print('Reading ', file_count, 'json files')
    print('Loading......')
    files = load_files(file_count)
    print('Generating clean dataframe')
    df = generate_clean_df(files)
    print('Generated......')
    print('Forming Corpus.......')
    # The clean frame already is the corpus; concatenating it onto an empty
    # frame added nothing. Fix the column order and number the rows from 0.
    corpus = df.reindex(columns=['paper_id','title','abstract','text']).reset_index(drop=True)

    print('Corpus includes {0} scientific articles.'.format(len(corpus)))
    return corpus, num_of_papers

# Build the corpus when this cell runs; num_of_papers comes back as an empty dict.
corpus, num_of_papers = get_corpus()

In [None]:
# This processing algorithm can originally be found at:
# https://github.com/nilayjain/text-search-engine

# Module-level state shared by the indexing and query functions in this cell.
inverted_index = defaultdict(list)  # term -> indices of the documents that contain it
num_of_documents = len(corpus)  # fixed when this cell runs
vects_for_docs = []  # one dictionary per document: term -> count, later replaced by its normalised TF-IDF weight
document_freq_vect = {}  # term -> number of documents that contain the term

# It updates the vects_for_docs variable with vectors of all the documents.
def iterate_over_all_docs():
    """Append one term-count vector per corpus document to ``vects_for_docs``.

    Title, abstract and body text of each document are joined with spaces,
    tokenised and counted. Progress is printed every 1000 documents and once
    more at the end.
    """
    print('Processing corpus...')
    counter_width = len(str(num_of_documents))
    for doc_idx in range(num_of_documents):
        if doc_idx % 1000 == 0:
            print('{0} of {1}'.format(str(doc_idx).zfill(counter_width), num_of_documents))
        doc_text = ' '.join([
            corpus['title'][doc_idx],
            corpus['abstract'][doc_idx],
            corpus['text'][doc_idx],
        ])
        tokens = get_tokenized_and_normalized_list(doc_text)
        vects_for_docs.append(create_vector(tokens))
    print('{0} of {1}'.format(num_of_documents, num_of_documents))

def create_vector_from_query(l1):
    """Count the tokens of a query.

    Returns a dict mapping each distinct token of ``l1`` to its number of
    occurrences as a float. Document frequencies are not touched.
    """
    counts = {}
    for token in l1:
        counts[token] = counts.get(token, 0.0) + 1.0
    return counts

def generate_inverted_index():
    """Record, for every term, the indices of the documents containing it.

    Walks ``vects_for_docs`` in order and appends each document's position to
    the ``inverted_index`` entry of every term in that document's vector.
    """
    for doc_idx, vector in enumerate(vects_for_docs):
        for term in vector:
            inverted_index[term].append(doc_idx)

def create_tf_idf_vector():
    """Turn every document vector into a unit-length TF-IDF vector, in place.

    Each term count in ``vects_for_docs`` is replaced by its TF-IDF weight
    (see ``calc_tf_idf``) and the vector is then divided by its Euclidean
    length.
    """
    for vect in vects_for_docs:
        # The squared length has to start from zero for every document.
        # It used to be initialised once before the loop, so each document's
        # norm also contained the previous document's length and every vector
        # after the first was scaled wrongly.
        squared_length = 0.0
        for word1, word_freq in vect.items():
            weight = calc_tf_idf(word1, word_freq)
            vect[word1] = weight
            squared_length += weight ** 2
        vect_length = sqrt(squared_length)
        # An all-zero vector (e.g. every term occurs in every document, or
        # the document is empty) cannot be normalised; leave it as is, the
        # same way get_tf_idf_from_query_vect guards the query vector.
        if vect_length != 0:
            for word1 in vect:
                vect[word1] /= vect_length

def get_tf_idf_from_query_vect(query_vector1):
    """Convert a query's term counts to unit-length TF-IDF weights, in place.

    Terms known to the corpus are weighted with ``calc_tf_idf``; unknown
    terms get ``log(1 + count) * log(num_of_documents)``. The vector is left
    unnormalised when its length is zero. Nothing is returned.
    """
    squared_length = 0.0
    for term in query_vector1:
        raw_count = query_vector1[term]
        if term not in document_freq_vect:
            weight = log(1 + raw_count) * log(
                num_of_documents)
        else:
            weight = calc_tf_idf(term, raw_count)
        query_vector1[term] = weight
        squared_length += weight ** 2
    length = sqrt(squared_length)
    if length == 0:
        return
    for term in query_vector1:
        query_vector1[term] /= length

def calc_tf_idf(word1, word_freq):
    """Return ``log(1 + word_freq) * log(N / df)`` for a term.

    ``N`` is the module-level ``num_of_documents`` and ``df`` is the term's
    entry in ``document_freq_vect``; the term must be present there.
    """
    damped_tf = log(1 + word_freq)
    idf = log(num_of_documents / document_freq_vect[word1])
    return damped_tf * idf

def get_dot_product(vector1, vector2):
    """Return the dot product of two sparse vectors stored as dicts.

    Only keys present in both dicts contribute. The shorter dict is the one
    iterated, so the cost follows the smaller vector.
    """
    if len(vector1) > len(vector2):
        shorter, longer = vector2, vector1
    else:
        shorter, longer = vector1, vector2
    total = 0
    for key, weight in shorter.items():
        if key in longer:
            total += weight * longer[key]
    return total

def get_tokenized_and_normalized_list(doc_text):
    """Tokenise ``doc_text`` with NLTK and Porter-stem every token.

    Returns the stemmed tokens as a list, in their original order.
    """
    tokens = nltk.word_tokenize(doc_text)
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

def create_vector(l1):
    """Count the tokens of one document and update the document frequencies.

    Returns a dict mapping each distinct token of ``l1`` to its number of
    occurrences. The first time a token is seen in this document, its entry
    in the module-level ``document_freq_vect`` is incremented by one.
    """
    global document_freq_vect
    counts = {}
    for token in l1:
        if token not in counts:
            counts[token] = 1
            # First occurrence in this document: one more document has it.
            document_freq_vect[token] = document_freq_vect.get(token, 0) + 1
        else:
            counts[token] += 1
    return counts

def get_result_from_query_vect(query_vector1):
    """Score every document against a query vector.

    Parameters
    ----------
    query_vector1 : dict
        Query vector as produced by ``get_tf_idf_from_query_vect``.

    Returns
    -------
    list of tuple
        ``(document index, dot product)`` pairs sorted by ascending score, so
        the best matches are at the end of the list. Documents with equal
        scores stay in index order.
    """
    parsed_list = [
        (doc_idx, get_dot_product(query_vector1, vects_for_docs[doc_idx]))
        for doc_idx in range(num_of_documents)
    ]
    # Sort once after scoring. The list used to be re-sorted after every
    # appended document, which is quadratic in the corpus size; a single
    # stable sort yields the same order.
    parsed_list.sort(key=lambda pair: pair[1])
    return parsed_list

# Build the search index. The order matters: create_tf_idf_vector needs the
# document frequencies that iterate_over_all_docs accumulates.
# Re-running these calls without re-running the state initialisation above
# appends every document to vects_for_docs a second time.
iterate_over_all_docs()
generate_inverted_index()
create_tf_idf_vector()

In [None]:
# The End-To-End Closed Domain Question Answering System (cdQA) is used here.
# It is available at: https://pypi.org/project/cdqa/

from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline

# NOTE(review): nothing visible in this notebook reads
# ./data/bnpp_newsroom_v1.1/ - confirm this download is still needed.
download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
# find_relevant_articles loads its reader from 'models/bert_qa.joblib';
# presumably this call is what places that file under ./models - TODO confirm.
download_model(model='bert-squad_1.1', dir='./models')

In [None]:
def find_relevant_articles(query=None, top_n_papers=20, min_n_papers=3):
    """Rank the corpus against a query and extract an answer with cdQA.

    The query is tokenised, converted to a normalised TF-IDF vector and
    scored against every document vector. The best-scoring papers are handed
    to a cdQA pipeline; its prediction is printed together with a short list
    of the highest-ranked papers.

    Parameters
    ----------
    query : str, optional
        Question to answer. When omitted it is read with ``input``.
    top_n_papers : int
        Number of top-ranked papers passed to cdQA. Capped at the number of
        scored documents.
    min_n_papers : int
        Number of top-ranked papers listed in the printed summary (never
        more than ``top_n_papers``).

    Returns
    -------
    tuple
        ``(response, papers_info, prediction, result_set, df)``: a summary
        dict keyed by the query, the details of the selected papers, the raw
        cdQA prediction, the full ranking, and the frame given to cdQA.
    """
    if query is None:
        query = input('Please enter your query...')
    print('\n\n'+'*'*34+' PROCESSING NEW QUERY '+'*'*34+'\n')
    query_list = get_tokenized_and_normalized_list(query)
    query_vector = create_vector_from_query(query_list)
    get_tf_idf_from_query_vect(query_vector)
    result_set = get_result_from_query_vect(query_vector)

    # Asking for more papers than were scored used to raise IndexError on
    # result_set[-i]; use what is available instead.
    top_n_papers = min(top_n_papers, len(result_set))

    papers_info = {'query':query, 'query list':query_list, 'query vector':query_vector,
                   'id':[], 'title':[], 'abstract':[], 'text':[], 'weight':[], 'index':[]}
    # result_set is sorted by ascending score, so walk it from the end.
    for rank in range(1, top_n_papers+1):
        doc_index, weight = result_set[-rank]
        papers_info['id'].append(corpus['paper_id'][doc_index])
        papers_info['title'].append(corpus['title'][doc_index])
        papers_info['abstract'].append(corpus['abstract'][doc_index])
        papers_info['text'].append(corpus['text'][doc_index])
        papers_info['weight'].append(weight)
        papers_info['index'].append(doc_index)

    # Build the frame cdQA expects from plain records, in one go. The rows
    # used to go through np.array([...]) with a nested list of paragraphs,
    # which newer NumPy releases reject as a ragged array, and were
    # concatenated onto the frame one at a time.
    colms = ['date', 'title', 'category', 'link', 'abstract', 'paragraphs']
    rows = []
    for i, text in enumerate(papers_info['text']):
        flattened = text.replace('\n\n', ' ')
        papers_info['text'][i] = flattened
        rows.append({
            'date': "None",
            'title': papers_info['title'][i],
            'category': "None",
            'link': "None",
            'abstract': papers_info['abstract'][i],
            'paragraphs': flattened.split('. '),
        })
    df = pd.DataFrame(rows, columns=colms)
    df = filter_paragraphs(df)

    # Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
    # Fitting the retriever to the list of documents in the dataframe
    cdqa_pipeline.fit_retriever(df=df)
    # Sending a question to the pipeline and getting prediction
    query = papers_info['query']
    prediction = cdqa_pipeline.predict(query=query)

    # Look up the id of the paper the answer came from. It stays None when no
    # selected title equals the predicted one (it used to be left unbound,
    # which crashed while building the response). As before, the last
    # matching title wins.
    pid = None
    for paper_id, title in zip(papers_info['id'], papers_info['title']):
        if title == prediction[1]:
            pid = paper_id

    response = {query:{'id':pid,'title':prediction[1],'answer':prediction[0],'summary':prediction[2],
                       'important papers':{'id':papers_info['id'],'title':papers_info['title']}}}
    print('QUERY: {0}\n'.format(query))
    print('ANSWER MINED FROM PAPER: {0}\n'.format(prediction[0]))
    print('PAPER TITLE: {0}\n'.format(prediction[1]))
    print('PARAGRAPH IN PAPER: {0}\n'.format(prediction[2]))
    show_paper = min(min_n_papers, top_n_papers)
    print('\nTOP {0} MOST RELEVANT PAPERS RELATED TO THE QUERY:\n'.format(show_paper))
    for i in range(show_paper):
        print('PAPER #{0}. \nID: {1} \nTITLE: {2}\n'.format(i+1, papers_info['id'][i], papers_info['title'][i]))
    return response, papers_info, prediction, result_set, df

In [None]:
#List of queries
# Every entry must end with a comma: adjacent string literals are silently
# concatenated, which previously merged the "cannot survive" query with the
# incubation-period query that follows it into one item.
queries = ['What is range of incubation period for coronavirus SARS-CoV-2 COVID-19 in humans',
           'What is optimal quarantine period for coronavirus COVID-19',
           'What is effective quarantine period for coronavirus COVID-19',
           'What is percentage of death cases for coronavirus SARS-CoV-2 COVID-19',
           'What is death rate for coronavirus COVID-19 and air pollution',
           'At which temperature coronavirus COVID-19 can survive',
           'How long coronavirus SARS-CoV-2 can survive on plastic surface',
           'What are risk factors for coronavirus COVID-19',
           'What is origin of coronavirus COVID-19',
           'At which temperature coronavirus cannot survive',
           'What is the range of incubation periods for coronavirus SARS-CoV-2 COVID-19 in humans (and how this varies across age and health status) and how long individuals are contagious, even after recovery',
           'What are the prevalence of asymptomatic shedding and transmission of coronavirus COVID-19(e.g., particularly children)',
           'Mention the seasonality COVID-19 transmission',
           'Explain the physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phobic surfaces, environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding)',
           'What is the period of persistence and stability of coronavirus on a multitude of substrates and sources (e.g., nasal discharge, sputum, urine, fecal matter, blood)',
           'How long does coronavirus persists on surfaces of different materials (e,g., copper, stainless steel, plastic)',
           'Explain the natural history of the coronavirus COVID-19 and shedding of it from an infected person',
           'What is the process of implementation of diagnostics and products to improve clinical processes',
           'What are the disease models for coronavirus COVID-19,including animal models for infection, disease and transmission',
           'Mention the tools and studies to monitor phenotypic change and potential adaptation of the coronavirus COVID-19',
           'What is the immune response and immunity of a person affected by coronavirus COVID-19',
           'What are the effective measures to be taken to prevent COVID-19 secondary transmission in health care and community settings',
           'What is the effectiveness of personal protective equipment (PPE) and its usefulness to reduce risk of transmission of coronavirus COVID-19 in health care and community settings',
           'What is the role of the environment in COVID-19 transmission']

In [None]:
 # Run every query through the pipeline. The five names are rebound on each
 # pass, so after the loop they hold the results of the last query only.
 for query in queries:
     response, papers_info, prediction, result_set, df = find_relevant_articles(query, top_n_papers=10)

In [None]:
# Single ad-hoc question; this rebinds the loop variable left over from the query loop.
query = 'What is the incubation period for covid19 ?'

In [None]:
# Answer the ad-hoc question from the 5 best-ranked papers and list all 5.
# The trailing semicolon keeps the notebook from echoing the returned tuple.
find_relevant_articles(query=query, top_n_papers=5, min_n_papers=5);