### Load library

In [37]:
import re
import os
import nltk
import json
import glob
import sent2vec
import nltk.data
import pandas as pd
from nltk import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from scipy.spatial import distance
from tqdm import tqdm, tqdm_notebook

#### Load nltk NLP files

In [4]:
#nltk.download('stopwords')
#nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
# instantiate progress bar for notebook
tqdm.pandas(tqdm_notebook)

#### Functions to load articles repository

In [6]:
class PaperLoader():
    """
    Loads, parses and merges metadata for papers
    """
    
    def __init__(self, root_dir, no_bib=True):
        """
        Initializes PaperLoader class to read all .json files from root_directory
            
            no_bib: if true, clean noisy sections with bibliographies
            root_dir: root directory for papers
        """
        self.ROOT_DIR = root_dir
        self.JSON_FILES = glob.glob(f'{root}/**/*.json', recursive=True)
        self.PAPERS_COLUMN = {
            "doc_id": [None],
            "title": [None],
            "abstract": [None],
            "text_body": [None]
        }
        self.PAPERS_DF = None
        self.PAPERS_SECTION_DF = None
        self.NO_BIB = no_bib

    
    def __clean_bib(self, body_text, thres):
        """
        Removes sections with more than 5 URL/DOI/HTTP instances
            
            body_text: array of dictionaries for text_body
            thres: number of hyperlinks tolerated before removal 
        """
        # Sometimes, the text body has duplicate sections consecutively.
        merged_body = []
        for segment in body_text:
            # We will combine these duplicate sections
            if len(merged_body) > 0:
                if merged_body[-1]['section'] == segment['section']:
                    merged_body[-1]['text'] += '\n' + segment['text']
                    continue
            merged_body.append(segment)

        merged_body = [
            segment for segment in merged_body
            if len(re.findall("(http|doi|www)", segment['text'])) <= thres
        ]
        return merged_body
       
    def create_paper_df(self):
        """
        Creates a Pandas DataFrame from all json files in root_directory
        Each json file represents a paper. 
        Features extracted are: doc_id, title, abstract, text_body
        """
        df_list = []
        df_sent = pd.DataFrame()
        df_section = pd.DataFrame()
        for i in tqdm(range(len(self.JSON_FILES))):
            file_name = self.JSON_FILES[i]
            
            #Initialize row for returned df. Each row represents a paper
            row = {x: None for x in self.PAPERS_COLUMN}
            sent_row = {x: None for x in self.PAPERS_COLUMN}
            with open(file_name) as json_data:
                data = json.load(json_data)

                row['doc_id'] = data['paper_id']
                row['title'] = data['metadata']['title']
                
                # If title is empty, we skip the paper
                if len(row['title']) <= 2:
                    continue

                # If a paper does not have an abstract of a body, we will skip it
                if ('abstract' not in data or 'body_text' not in data):
                    continue
                else:
                    # Now need all of the abstract. Put it all in
                    # a list then use str.join() 
                    abstract_list = [abst['text'] for abst in data['abstract']]
                    abstract = "\n ".join(abstract_list)

                # Skip the paper if abstract is empty
                if len(abstract) <= 2:
                    continue

                row['abstract'] = abstract
                    
                # And lastly the body of the text.
                # These clauses check if the user wants to clean up references
                if self.NO_BIB:
                    body_list = self.__clean_bib(data['body_text'], 4)
                else:
                    body_list = [bt for bt in data['body_text']]

                row['text_body'] = body_list
                sent_row['doc_id'] = data['paper_id']
                temp = pd.DataFrame(body_list)
                temp['doc_id'] = data['paper_id']
                df_section= df_section.append(temp)
                df_list.append(row)
        # create final dataframe
        self.PAPERS_DF = pd.DataFrame(df_list)
        self.PAPERS_SECTION_DF =df_section
        
    def merge_metadata(self, metadata = 'metadata.csv'):
        """
            Joins paper information with information on journal for paper,
            authors, doi and published date  
                metadata: path to csv file containing metadata
        """
        metadata_df = pd.read_csv(self.ROOT_DIR + metadata)
        metadata_df = metadata_df.loc[:, ['sha', 'publish_time', 'authors', 'journal', 'doi']]
        self.PAPERS_DF = self.PAPERS_DF.merge(metadata_df,left_on='doc_id',right_on='sha', how='inner')

    def get_df(self):
        """
        Returns processed dataframe
        """
        self.PAPERS_DF = self.PAPERS_DF.dropna(subset=['abstract', 'text_body'])
        self.PAPERS_SECTION_DF = self.PAPERS_SECTION_DF.dropna(subset=['doc_id', 'section','text'])
        self.PAPERS_SECTION_DF = self.PAPERS_SECTION_DF[['doc_id', 'section','text']]
        return self.PAPERS_DF,self.PAPERS_SECTION_DF

#### Functions to generate sentence dataframe

In [7]:
def generate_sentence_df(docid,sec,text):
    global paper_sent_df
    temp_sent = pd.DataFrame(tokenizer.tokenize(text), columns=['sentence'])
    temp_sent['doc_id'] = docid
    temp_sent['section'] = sec
    paper_sent_df= paper_sent_df.append(temp_sent)
    return temp_sent

#df_section= df_section.append(temp)
#for section_row in temp.itertuples():
#    temp_sent = pd.DataFrame(tokenizer.tokenize(section_row.text), columns=['sentence'])
#    temp_sent['doc_id'] = data['paper_id']
#    temp_sent['section'] = section_row.section
#    df_sent= df_sent.append(temp_sent)

#### Load Aricles into pandas dataframe

In [8]:
root = "./data/comm_use_subset/"
#meta="./data/metadata.csv"
paper_loader = PaperLoader(root)
paper_loader.create_paper_df()
paper_loader.merge_metadata()

100%|██████████| 18746/18746 [02:29<00:00, 125.59it/s]


In [9]:
papers_df,paper_sec_df = paper_loader.get_df()

#### Generate sentence dataframe from section data

In [13]:
paper_sent_df = pd.DataFrame()
df= paper_sec_df.progress_apply(lambda x: generate_sentence_df(x['doc_id'], x['section'],x['text']), axis=1)

100%|██████████| 107535/107535 [1:25:35<00:00, 20.94it/s]


0                                                  s...
1                                                 se...
2                                                 se...
3                                                  s...
4                                                 se...
                            ...                        
8                                                 se...
9                                                 se...
10                                                se...
11                                                 s...
12                                                se...
Length: 107535, dtype: object

In [15]:
paper_sec_df.section.nunique()

64492

In [16]:
paper_sent_df.count()

sentence    1619203
doc_id      1619203
section     1619203
dtype: int64

In [17]:
papers_df.count()

doc_id          7862
title           7862
abstract        7862
text_body       7862
sha             7862
publish_time    7862
authors         7854
journal         7793
doi             7811
dtype: int64

In [19]:
# List of keywords for covid-19
cov_list = [
    'novel coronavi',
    'covid',
    'cov_2',
    'cord-19',
    'cord 19',
    '2019-nCoV',
    '2019 ncov',
    '2019 cov',
    'wuhan coronavi',
]

#### Fucntions for filtering relevant articles

In [20]:
class RelevantFilter():
    
    def __init__(self, keywords, year='2019'):
        """
        constructor for RelevantFilter
            keywords: keywords to filter for
            year: papers written before this year will be discarded
        """
        self.KEYWORDS = keywords
        self.YEAR = year

    def extract_recent(self, df):
        """
        extracts documents published on or after self.YEAR
        """
        return df[df['publish_time'] >= self.YEAR]

    def filter_papers(self, df):
        """
        Filters for papers whose title have mention of 
        any of the terms in self.KEYWORDS
        """
        pattern = re.compile('(' + "|".join(self.KEYWORDS) + ')',
                                 re.IGNORECASE)
        # We will filter for rows with one or more matches 
        # for title and covid keywords
        df = df[df['title'].apply(lambda x: 
                                  len(pattern.findall(x)) >= 1
                                  if x else False)]
        
        return df

#### Filter covid19 related articles

In [21]:
covid_filter = RelevantFilter(cov_list, '2019')
covid_df = covid_filter.filter_papers(papers_df)
covid_df = covid_filter.extract_recent(covid_df)

In [22]:
convid_sent_df = paper_sent_df[paper_sent_df.doc_id.isin(covid_df['doc_id'])]

In [23]:
convid_sent_df.doc_id.nunique()

75

### Load Pre-Trained BioSentVec Model

In [24]:
model_path = 'BioSentVec_PubMed_MIMICIII-bigram_d700.bin'
model = sent2vec.Sent2vecModel()
try:
    model.load_model(model_path)
except Exception as e:
    print(e)
print('model successfully loaded')

model successfully loaded


#### Pre process sentence - basic text clean up

In [25]:
stop_words = set(stopwords.words('english'))
def preprocess_sentence(text):
    text = text.replace('/', ' / ')
    text = text.replace('.-', ' .- ')
    text = text.replace('.', ' . ')
    text = text.replace('\'', ' \' ')
    text = text.lower()

    tokens = [token for token in word_tokenize(text) if token not in punctuation and token not in stop_words]

    return ' '.join(tokens)

### Generate BioSentVec embedding vectore for sentence corpus

In [26]:
embs = model.embed_sentences(convid_sent_df['sentence'])

#### Generate embedding vector for question 

In [27]:
emb = model.embed_sentence('Physical science of the coronavirus') 
#embs = model.embed_sentences(["first sentence .", "another sentence"])

###Train KNN model with sentence embedding vecotor

In [28]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(embs)

#### Predict answer sentence for question

In [29]:
distances, indices = nbrs.kneighbors(emb)

In [30]:
indices

array([[6090,  114]])

In [31]:
distances

array([[4.1479283 , 4.29738979]])

In [32]:
convid_sent_df =convid_sent_df.reset_index()

In [33]:
convid_sent_df['doc_id'].iloc[114]

'da81f0d3a12ab7faa09148acb6564271474e9e02'

In [34]:
convid_sent_df['doc_id'].iloc[6090]

'5ba8056230c17ec133169d79aacf61ed7d4b458b'

### Answers as output from KNN model

In [35]:
print("Title:",(covid_df[covid_df.doc_id =='5ba8056230c17ec133169d79aacf61ed7d4b458b']['title']).to_string(index=False))
print("Abstract:",(covid_df[covid_df.doc_id =='5ba8056230c17ec133169d79aacf61ed7d4b458b']['abstract']).to_string(index=False))
print("Section of the article:",(convid_sent_df[['section']].iloc[6091]).to_string(index=False))
print("Answer:",(convid_sent_df[['sentence']].iloc[[6089,6091]]).to_string(index=False))

Title:  The novel coronavirus outbreak in Wuhan, China
Abstract:  The novel coronavirus (2019-nCoV, or COVID-19)...
Section of the article:  Introduction
Answer:                                                                                                                                                                                                sentence
 But most pressingly as the global outbreak continues to grow, can we develop effective vaccine and therapeutic strategies to treat not only this epidemic but any future coronavirus spillover events?
                                                                                                                                  The COVID-19 has then rapidly spread to all over China and the world.


In [36]:
print("Title:",(covid_df[covid_df.doc_id =='da81f0d3a12ab7faa09148acb6564271474e9e02']['title']).to_string(index=False))
print("Abstract:",(covid_df[covid_df.doc_id =='da81f0d3a12ab7faa09148acb6564271474e9e02']['abstract']).to_string(index=False))
print("Section of the article:",(convid_sent_df[['section']].iloc[114]).to_string(index=False))
print("Answer:",(convid_sent_df[['sentence']].iloc[[113,114,115]]).to_string(index=False))

Title:  Molecular and serological investigation of 201...
Abstract:  In December 2019, a novel coronavirus (2019-nC...
Section of the article:  Introduction
Answer:                                                                                                                          sentence
           Coronaviruses (CoVs) belong to the subfamily Orthocoronavirinae in the family Coronaviridae and the order Nidovirales.
                 A human coronavirus (SARS-CoV) caused the severe acute respiratory syndrome coronavirus (SARS) outbreak in 2003.
 Most recently, an SARS-related CoV was implicated as the etiological agent responsible for the outbreak in Wuhan, central China.
