In [2]:
#first install the library that would help us use BERT in an easy to use interface
#https://github.com/UKPLab/sentence-transformers/tree/master/sentence_transformers
!pip install -U sentence-transformers

In [19]:
import glob
import json
import pandas as pd
from tqdm import tqdm
all_json = glob.glob('/Users/patsnap/Desktop/Neo4J_and_other_codes/Coronavirus_19/CORD-19-research-challenge/comm_use_subset/comm_use_subset/pdf_json/*.json')
len(all_json)

9524

In [21]:
metadata_path = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Coronavirus_19/CORD-19-research-challenge/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...


In [22]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row = FileReader(all_json[0])
print(first_row)

5e0c586f047ff909c8ed3fe171c8975a90608d08: Background: Porcine epidemic diarrhea virus (PEDV) is emerging as a pathogenic coronavirus that causes a huge economic burden to the swine industry. Interaction of the viral spike (S) surface glycopro... Porcine epidemic diarrhea virus (PEDV), which belongs to the Alphacoronavirus genus of the Coronaviridae family, is an etiological agent of porcine epidemic diarrhea (PED) and causes an enteric diseas...


In [23]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

In [24]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    # also create a column for the summary of abstract to be used in a plot
    if len(content.abstract) == 0: 
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(content.abstract.split(' ')) > 100:
        # abstract provided is too long for plot, take first 300 words append with ...
        info = content.abstract.split(' ')[:100]
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
        
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    try:
        # if more than one author
        authors = meta_data['authors'].values[0].split(';')
        if len(authors) > 2:
            # more than 2 authors, may be problem when plotting, so take first 2 append with ...
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    except Exception as e:
        # if only one author - or Null valie
        dict_['authors'].append(meta_data['authors'].values[0])
    
    # add the title information, add breaks when needed
    try:
        title = get_breaks(meta_data['title'].values[0], 40)
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

Processing index: 0 of 9524
Processing index: 952 of 9524
Processing index: 1904 of 9524
Processing index: 2856 of 9524
Processing index: 3808 of 9524
Processing index: 4760 of 9524
Processing index: 5712 of 9524
Processing index: 6664 of 9524
Processing index: 7616 of 9524
Processing index: 8568 of 9524
Processing index: 9520 of 9524


Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,5e0c586f047ff909c8ed3fe171c8975a90608d08,Background: Porcine epidemic diarrhea virus (P...,"Porcine epidemic diarrhea virus (PEDV), which ...","Gong, Lang. Lin, Ying...",Neutralizing antibodies against porcine<br>ep...,Virol J,Background: Porcine epidemic diarrhea virus<b...
1,1579fbff7af9b156c6f49fee0526e48f852ea460,"Currently, live-attenuated IBV vaccines are us...","Generation of rNDVs expressing S1, S2 or S pro...","Shirvani, Edris. Paldurai, Anandan...",A Recombinant Newcastle Disease Virus (NDV)<b...,Sci Rep,"Currently, live-attenuated IBV vaccines are<b..."
2,e0668c4b793d0cad26639b070819334a94648123,,The incidence of complete Achilles tendon rupt...,"Mughal, Faraz. Chew-Graham, Carolyn A...",‘Hajj: what it means for general practice’,BJGP Open,Not provided.
3,38aa050ad79d8a1d7022c33535255ce9d47914e5,The new world arenavirus Junín virus (JUNV) is...,Arenaviruses are enveloped RNA viruses with bi...,"Huang, Cheng. Walker, Aida G....",Potent Inhibition of Junín Virus Infection by...,PLoS Negl Trop Dis,The new world arenavirus Junín virus (JUNV) i...
4,61722c462b054f36461375e96e502cbf22648c04,and subtropical countries and is a significant...,"In this study, the anti-dengue activity of nic...","Jung, Eunhye. Nam, Sangwoo...",Neutralization of Acidic Intracellular<br>Ves...,Sci Rep,and subtropical countries and is a significan...


In [25]:
df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
df_covid['abstract'].describe(include='all')

count     8672
unique    7984
top           
freq       685
Name: abstract, dtype: object

In [26]:
df_covid['body_text'].describe(include='all')

count                                                  8672
unique                                                 8672
top       Non-typhoidal Salmonella spp. are important hu...
freq                                                      1
Name: body_text, dtype: object

In [27]:
df_covid.head(2)

Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,5e0c586f047ff909c8ed3fe171c8975a90608d08,Background: Porcine epidemic diarrhea virus (P...,"Porcine epidemic diarrhea virus (PEDV), which ...","Gong, Lang. Lin, Ying...",Neutralizing antibodies against porcine<br>ep...,Virol J,Background: Porcine epidemic diarrhea virus<b...
1,1579fbff7af9b156c6f49fee0526e48f852ea460,"Currently, live-attenuated IBV vaccines are us...","Generation of rNDVs expressing S1, S2 or S pro...","Shirvani, Edris. Paldurai, Anandan...",A Recombinant Newcastle Disease Virus (NDV)<b...,Sci Rep,"Currently, live-attenuated IBV vaccines are<b..."


In [28]:
df_covid.describe()

Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
count,8672,8672.0,8672,8651,8672,8597,8672
unique,8672,7984.0,8672,8480,8670,778,7984
top,7f302add8b117514b8393d55f49c3ded276faf94,,Non-typhoidal Salmonella spp. are important hu...,"Bande, Faruku. Arshad, Siti Suri...",Back to the Future: Multiparent Populations<b...,PLoS One,Not provided.
freq,1,685.0,1,5,2,1517,685


In [29]:
df_covid.dropna(inplace=True)
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8576 entries, 0 to 8672
Data columns (total 7 columns):
paper_id            8576 non-null object
abstract            8576 non-null object
body_text           8576 non-null object
authors             8576 non-null object
title               8576 non-null object
journal             8576 non-null object
abstract_summary    8576 non-null object
dtypes: object(7)
memory usage: 536.0+ KB


In [30]:
import re

df_covid['body_text'] = df_covid['body_text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))

In [31]:
def lower_case(input_str):
    input_str = input_str.lower()
    return input_str

df_covid['body_text'] = df_covid['body_text'].apply(lambda x: lower_case(x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: lower_case(x))

In [32]:
df_covid.head(2)

Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,5e0c586f047ff909c8ed3fe171c8975a90608d08,background porcine epidemic diarrhea virus ped...,porcine epidemic diarrhea virus pedv which bel...,"Gong, Lang. Lin, Ying...",Neutralizing antibodies against porcine<br>ep...,Virol J,Background: Porcine epidemic diarrhea virus<b...
1,1579fbff7af9b156c6f49fee0526e48f852ea460,currently liveattenuated ibv vaccines are used...,generation of rndvs expressing s1 s2 or s prot...,"Shirvani, Edris. Paldurai, Anandan...",A Recombinant Newcastle Disease Virus (NDV)<b...,Sci Rep,"Currently, live-attenuated IBV vaccines are<b..."


In [70]:
text = df_covid.drop(["authors", "journal"], axis=1)
text.reset_index(inplace = True, drop = True)

paper_id                     74bedf868bf72275e1d5a60150246aaae93d3be5
abstract            amoebiasis is a parasitic disease that causes ...
body_text           the intestinal parasite entamoeba histolytica ...
title                Development of a Novel Ex-vivo 3D Model to<br...
abstract_summary     Amoebiasis is a parasitic disease that causes...
Name: 18, dtype: object

In [72]:
text_dict = text.to_dict()
len_text = len(text_dict["paper_id"])
len_text

8576

In [73]:
paper_id_list  = []
body_text_list = []

title_list = []
abstract_list = []
abstract_summary_list = []
for i in range(0,len_text):
    paper_id = text_dict["paper_id"][i]
    body_text = text_dict["body_text"][i].split("\n")
    title = text_dict["title"][i]
    abstract = text_dict["abstract"][i]
    abstract_summary = text_dict["abstract_summary"][i]
    for b in body_text:
        paper_id_list.append(paper_id)
        body_text_list.append(b)
        title_list.append(title)
        abstract_list.append(abstract)
        abstract_summary_list.append(abstract_summary)

In [78]:
df_sentences = pd.DataFrame({"paper_id":paper_id_list},index=body_text_list)
df_sentences.head(2)

Unnamed: 0,paper_id
porcine epidemic diarrhea virus pedv which belongs to the alphacoronavirus genus of the coronaviridae family is an etiological agent of porcine epidemic diarrhea ped and causes an enteric disease that affects all ages of swine [1 2] the clinical presentations and complications of infection are characterized by acute vomiting dehydration watery diarrhea and high mortality in sucking piglets [3] and are indistinguishable from those of infection by either transmissible gastroenteritis virus tgev or porcine enteric alphacoronavirus peav [4 5],5e0c586f047ff909c8ed3fe171c8975a90608d08
first detected in the uk in 1971 pedv resulted in mass epidemics within europe in the 1970s and 1980s [6] before 2013 ped was prevalent in asia and europe [1] after spring 2013 however ped outbreaks reached north america which was due to variant pedv strains that researchers revealed might derive from chinese variants [7 8] in spite of widespread immunization with the currently marketed vaccine ped still persists in swine raising countries and resulted in devastating damage to the pork producers [9],5e0c586f047ff909c8ed3fe171c8975a90608d08


In [79]:
df_sentences_full = pd.DataFrame({"paper_id":paper_id_list,"title":title_list,"abstract":abstract_list,"abstract_summary":abstract_summary_list},index=body_text_list)
df_sentences_full.head(2)

Unnamed: 0,paper_id,title,abstract,abstract_summary
porcine epidemic diarrhea virus pedv which belongs to the alphacoronavirus genus of the coronaviridae family is an etiological agent of porcine epidemic diarrhea ped and causes an enteric disease that affects all ages of swine [1 2] the clinical presentations and complications of infection are characterized by acute vomiting dehydration watery diarrhea and high mortality in sucking piglets [3] and are indistinguishable from those of infection by either transmissible gastroenteritis virus tgev or porcine enteric alphacoronavirus peav [4 5],5e0c586f047ff909c8ed3fe171c8975a90608d08,Neutralizing antibodies against porcine<br>ep...,background porcine epidemic diarrhea virus ped...,Background: Porcine epidemic diarrhea virus<b...
first detected in the uk in 1971 pedv resulted in mass epidemics within europe in the 1970s and 1980s [6] before 2013 ped was prevalent in asia and europe [1] after spring 2013 however ped outbreaks reached north america which was due to variant pedv strains that researchers revealed might derive from chinese variants [7 8] in spite of widespread immunization with the currently marketed vaccine ped still persists in swine raising countries and resulted in devastating damage to the pork producers [9],5e0c586f047ff909c8ed3fe171c8975a90608d08,Neutralizing antibodies against porcine<br>ep...,background porcine epidemic diarrhea virus ped...,Background: Porcine epidemic diarrhea virus<b...


In [80]:
df_sentences = df_sentences["paper_id"].to_dict()
df_sentences_list = list(df_sentences.keys())
len(df_sentences_list)

271127

In [82]:
list(df_sentences.keys())[:2]

['porcine epidemic diarrhea virus pedv which belongs to the alphacoronavirus genus of the coronaviridae family is an etiological agent of porcine epidemic diarrhea ped and causes an enteric disease that affects all ages of swine [1 2]  the clinical presentations and complications of infection are characterized by acute vomiting dehydration watery diarrhea and high mortality in sucking piglets [3] and are indistinguishable from those of infection by either transmissible gastroenteritis virus tgev or porcine enteric alphacoronavirus peav [4 5] ',
 'first detected in the uk in 1971 pedv resulted in mass epidemics within europe in the 1970s and 1980s [6]  before 2013 ped was prevalent in asia and europe [1]  after spring 2013 however ped outbreaks reached north america which was due to variant pedv strains that researchers revealed might derive from chinese variants [7 8]  in spite of widespread immunization with the currently marketed vaccine ped still persists in swine raising countries 

In [83]:
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]

100%|██████████| 271127/271127 [00:00<00:00, 1693581.34it/s]


In [88]:
#https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py
"""
This is a simple application for sentence embeddings: semantic search
We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.
This script outputs for various queries the top 5 most similar sentences in the corpus.
"""

from sentence_transformers import SentenceTransformer
import scipy.spatial
import pickle as pkl
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# Corpus with example sentences
corpus = df_sentences_list
corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
# with open("/Users/patsnap/Desktop/Neo4J_and_other_codes/Coronavirus_19/CORD-19-research-challenge/corpus_embeddings.pkl" , "rb") as file_:
#     corpus_embeddings = pkl.load(file_)

# Query sentences:
queries = ['What has been published about medical care?',
           'Knowledge of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19, including, but not limited to, possible cardiomyopathy and cardiac arrest',
           'Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually',
           'Resources to support skilled nursing facilities and long term care facilities.',
           'Mobilization of surge medical staff to address shortages in overwhelmed communities .',
           'Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly for viral etiologies .']
query_embeddings = embedder.encode(queries,show_progress_bar=True)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
closest_n = 5
print("\nTop 5 most similar sentences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n=========================================================")
    print("==========================Query==============================")
    print("===",query,"=====")
    print("=========================================================")


    for idx, distance in results[0:closest_n]:
        print("Score:   ", "(Score: %.4f)" % (1-distance) , "\n" )
        print("Paragraph:   ", corpus[idx].strip(), "\n" )
        row_dict = df_sentences_full.loc[df_sentences_full.index== corpus[idx]].to_dict()
        print("paper_id:  " , row_dict["paper_id"][corpus[idx]] , "\n")
        print("Title:  " , row_dict["title"][corpus[idx]] , "\n")
        print("Abstract:  " , row_dict["abstract"][corpus[idx]] , "\n")
        print("Abstract_Summary:  " , row_dict["abstract_summary"][corpus[idx]] , "\n")
        print("-------------------------------------------")


Top 5 most similar sentences in corpus:


=== What has been published about medical care? =====
Score:    (Score: 0.8220) 

Paragraph:    to identify how one health has been used recently in the medical literature 

paper_id:   f703b510de361a759c9fe3419fa3aeb092c95512 

Title:    One Health and Zoonoses: The Evolution of One<br>Health and Incorporation of Zoonoses 

Abstract:    

Abstract_Summary:   Not provided. 

-------------------------------------------
Score:    (Score: 0.8187) 

Paragraph:    medical questions in the personal questionnaire included the following 

paper_id:   2afa3da371e5495dd55f0b4dde4ae7700eaf99d6 

Title:    Rhinitis, Asthma and Respiratory Infections<br>among Adults in Relation to the Home Environment in<br>Multi-Family Buildings in Sweden 

Abstract:   risk factors for rhinitis asthma and respiratory infections in the home environment were studied by a questionnaire survey totally 5775 occupants 18 years old from a stratified random sample of multifamily 

paper_id:   957ab1fcf790e8dd9d681f1f216287db1177cb78 

Title:    Mathematical epidemiology is not an oxymoron 

Abstract:   a brief description of the importance of communicable diseases in history and the development of mathematical modelling of disease transmission is given this includes reasons for mathematical modelling the history of mathematical modelling from the foundations laid in the late nineteenth century to the present some of the accomplishments of mathematical modelling and some challenges for the future our purpose is to demonstrate the importance of mathematical modelling for the understanding and management of infectious disease transmission 

Abstract_Summary:    A brief description of the importance of<br>communicable diseases in history and the development of<br>mathematical modelling of disease transmission is given.<br>This includes reasons for mathematical modelling,<br>the history of mathematical modelling from the<br>foundations laid in the late nineteenth cen



=== Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually =====
Score:    (Score: 0.8002) 

Paragraph:    conclusion several methods and approaches could be used in the healthcare arena time series is an analytical tool to study diseases and resources management at healthcare institutions the flexibility to follow up and recognize data patterns and provide explanations must not be neglected in studies of healthcare interventions in this study the arima model was introduced without the use of mathematical details or other extensions to the model the investigator or the healthcare organization involved in disease management programs could have great advantages when using analytical methodology in several areas with the ability to perform provisions in many cases despite the analytical possibility by statistical means this approach does not replace investigators common sense and experience in disease in



=== Resources to support skilled nursing facilities and long term care facilities. =====
Score:    (Score: 0.7985) 

Paragraph:    health care workforce capacity building adequacy of health care workforce 

paper_id:   2e87670d524b1b76e2af6090d5cba472a6448f85 

Title:    Capacity building in health care professions<br>within the Gulf cooperation council countries:<br>paving the way forward 

Abstract:   background there is a worldwide shortage of health care workers this problem is particularly severe in the gulf cooperation council gcc countries because of shortages in certain medical disciplines due to a lack of nationallytrained professionals and a less developed educational system compared to other high income countries consequently gcc countries are heavily dependent on an expatriate health care workforce a problem exacerbated by high turnover we discuss challenges and potential strategies for improving and strengthening capacity building efforts in health care professions in th

paper_id:   aecd12a815bf5fd6486b92bc8fb631d00fef541f 

Title:    Using simulation for training and to change<br>protocol during the outbreak of severe acute<br>respiratory syndrome 

Abstract:   introduction during the 2003 severe acute respiratory syndrome sars crisis we proposed and tested a new protocol for cardiac arrest in a patient with sars the protocol was rapidly and effectively instituted by teamwork training using highfidelity simulation
methods phase 1 was a curriculum design of a sarsspecific cardiac arrest protocol in three steps planning the new protocol repeated simulations of this protocol in a classroom and a subsequent simulation of a cardiac arrest on a hospital ward phase 2 was the training of 275 healthcare workers hcws using the new protocol training involved a seminar practice in wearing the mandatory personal protection system pps and cardiac arrest simulations with subsequent debriefing
results simulation provided insights that had not been considered in earli



=== Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly for viral etiologies . =====
Score:    (Score: 0.8562) 

Paragraph:    patients were categorized according to their comorbidity status ie patients without comorbidity with chronic respiratory disease or with other comorbidities and compared using chisquared tests viral and bacterial infection status were compared by age group and comorbidity status using chisquared tests fishers exact tests and chisquared tests for trend inhospital mortality rates were calculated by viral and bacterial infection status and compared with those of the virusnegative group the effects of viral infection on inhospital mortality were expressed as risk ratios with 95 confidence intervals ci and estimated using poisson regression models with robust standard errors age study site comorbidity status duration of symptoms month of diagnosis antibiotic use and presence of bacteria were co

In [None]:
#import pickle as pkl
#with open("/Users/patsnap/Desktop/Neo4J_and_other_codes/Coronavirus_19/CORD-19-research-challenge/corpus_embeddings.pkl" , "wb") as file_:
#  pkl.dump(corpus_embeddings,file_)

#https://github.com/theamrzaki/COVID-19-BERT-ResearchPapers-Semantic-Search/blob/master/COVID_19_BERT_ResearchPapers_Semantic_Search.ipynb