<a href="https://colab.research.google.com/github/samrat-halder/covid-19-recommendation-NLP-BERT-ELMO/blob/master/notebook/Covid_19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [0]:
%pip install bert-extractive-summarizer
%pip install spacy
%pip install transformers==2.2.0

In [0]:
from google.colab import drive
from google.colab import files

import os
import json
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import tensorflow_hub as hub
from summarizer import Summarizer
import spacy
from spacy.lang.en import English
from spacy import displacy

from IPython.display import HTML
import logging

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from IPython.display import HTML

logging.getLogger('tensorflow').disabled = True
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/Colab Notebooks/COVID-2020-03-13")

nlp = spacy.load('en_core_web_sm')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Load Data

In [0]:
data_c = pd.read_csv('all_sources_metadata_2020-03-13.csv')
data_c = data_c[['sha', 'Microsoft Academic Paper ID', 
                 'WHO #Covidence', 'title', 'abstract']]

subdirs = ['./biorxiv_medrxiv/biorxiv_medrxiv/', './comm_use_subset/comm_use_subset/', './noncomm_use_subset/noncomm_use_subset/',
           './pmc_custom_license/pmc_custom_license/']

## Keep articles that is related to corona virus

In [0]:
keywords = ['corona', 'covid', 'covid-19', 'covid-2019', 
            'coronavirus', 'ncov', 'sars-cov-2', 'ncov-2019', '2019-ncov']
data_c = data_c[data_c['title'].notna()]
data_c['abstract'] = data_c['abstract'].str.lower()
data_c['flagCol'] = np.where(data_c.abstract.str.contains('|'.join(keywords)),1,0)
data_c = data_c[data_c['flagCol'] == 1]
sentences = data_c['title'].tolist()
data_c['title'][0:5].tolist()

['Angiotensin-converting enzyme 2 (ACE2) as a SARS-CoV-2 receptor: molecular mechanisms and potential therapeutic target',
 'Comparative genetic analysis of the novel coronavirus (2019-nCoV/SARS-CoV-2) receptor ACE2 in different populations',
 'Incubation Period and Other Epidemiological Characteristics of 2019 Novel Coronavirus Infections with Right Truncation: A Statistical Analysis of Publicly Available Case Data',
 'Characteristics of and Public Health Responses to the Coronavirus Disease 2019 Outbreak in China',
 'Imaging changes in severe COVID-19 pneumonia']

We find the number of papers related to COVID-19 are actually 30% of all the documents

In [0]:
print(f'Number of Corona Virus related documents {len(sentences)}')

Number of Corona Virus related documents 8061


## Import ELMO from tensorhub

In [0]:
url = "https://tfhub.dev/google/elmo/2"
embed = hub.Module(url)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/elmo/2'.
INFO:absl:Downloaded https://tfhub.dev/google/elmo/2, Total size: 357.40MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/elmo/2'.


## Clean text

In [0]:
cleaned_sent = []
for text in sentences:
  text = text.lower().replace('\n', ' ').replace('\t', ' ')
  text = ' '.join(text.split())
  cleaned_sent.append(text)

In [0]:
cleaned_sent[0:5]

['angiotensin-converting enzyme 2 (ace2) as a sars-cov-2 receptor: molecular mechanisms and potential therapeutic target',
 'comparative genetic analysis of the novel coronavirus (2019-ncov/sars-cov-2) receptor ace2 in different populations',
 'incubation period and other epidemiological characteristics of 2019 novel coronavirus infections with right truncation: a statistical analysis of publicly available case data',
 'characteristics of and public health responses to the coronavirus disease 2019 outbreak in china',
 'imaging changes in severe covid-19 pneumonia']

In [0]:
f = open('topic_embeddings.pkl','rb')
#pickle.dump(x, f)
x = pickle.load(open('topic_embeddings.pkl','rb'))
f.close()

EOFError: ignored

## Creating embeddings for topics with ELMO

In [0]:
if not x:
  x = []
  for i in range(0, len(sentences), 400): 
    print(f'Creating embeddings for article {i} to {i+400}')
    embeddings = embed(
        cleaned_sent[i:i+400],
        signature="default",
        as_dict=True)["default"]
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      sess.run(tf.tables_initializer())
      x.append(sess.run(embeddings))

## PCA and t-SNE to visualize the document similarities

In [0]:
x = np.vstack(x)
pca = PCA(n_components=50)
y = pca.fit_transform(x)
y = TSNE(n_components=2).fit_transform(y)

In [0]:
init_notebook_mode(connected=True)
data = [
    go.Scatter(
        x=[i[0] for i in y],
        y=[i[1] for i in y],
        mode='markers',
        text=[i for i in sentences],
    marker=dict(
        size=5,
        color = [len(i) for i in sentences],
        opacity= 0.2,
        colorscale='Viridis',
        showscale=False
    )
    )
]
layout = go.Layout()
layout = dict(
              yaxis = dict(zeroline = True),
              xaxis = dict(zeroline = True)
             )
fig = go.Figure(data=data, layout=layout)
file = plot(fig, filename='COVID Paper encode.html')

files.download('COVID Paper encode.html') 

## Enter search queries to find most related papers

In [0]:
#@title Find most related articles
#@markdown Enter a query to find matching topics. 'results_returned' can be used to modify the number of matching topics. 
query = "What has been published about ethical and social science considerations?" #@param {type:"string"}
results_returned = "10" #@param [1, 2, 3, 4, 5, 10]

data_c.reset_index(drop=True, inplace=True)
embeddings2 = embed(
    [query],
    signature="default",
    as_dict=True)["default"]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(tf.tables_initializer())
  search_vect = sess.run(embeddings2)
  
cosine_similarities = pd.Series(cosine_similarity(search_vect, x).flatten())
doc_id = []
output =""
for k,j in cosine_similarities.nlargest(int(results_returned)).iteritems():
  output +='<p style="font-family:verdana; font-size:110%;"> '
  for i in sentences[k].split():
    if i.lower() in query:
      output += " <b>"+str(i)+"</b>"
    else:
      output += " "+str(i)
  output += "  || SHA Doc Id " + str(data_c['sha'][k]) #+ str(k)
  output += "</p><hr>"
  doc_id.append(k)
    
output = '<h3>Results:</h3>'+output
display(HTML(output))

## Read paper from json file

In [0]:
def read_paper(shaid, body_text):
  #if len(shaids) > 0:
  #  for id in shaids:
  for dir in subdirs:
    fname = dir + str(shaid) + '.json'
    if os.path.isfile(fname):
      f = open(fname)
      data_text = json.load(f)
      for par in data_text['body_text']:
        body_text += par['text']
      break
  return body_text

## Use BERT based summarizer to summarize related documents (abstract and full text)

In [0]:
def summary_text(doc_id):
  model = Summarizer()
  full_body = ''
  shaids = []
  for id in doc_id:
    body = data_c['abstract'][id]
    if not pd.isnull(body):
      full_body += body
    shaid = data_c['sha'][id]
    #if not pd.isnull(shaid):
    #  print('Paper found for shaid :', shaid)
    #  shaids.append(data_c['sha'][id])
    #  full_body += read_paper(shaid, '')
  result = model(full_body, min_length=50, max_length=300)
  #print('Summary from all the papers available to your search:')
  return result

Q1. what do we know about corona virus diagnostics and surveillance?

In [0]:
display(HTML('''
<style>
  pre {
      white-space: normal;
  }
</style>
'''))
ans = summary_text(doc_id)
print(ans)

background: middle east respiratory syndrome coronavirus (mers-cov) was first identified in humans in 2012. a systematic literature review was conducted to synthesize current knowledge and identify critical knowledge gaps. dromedary camels remain the only documented zoonotic source of human infection, but mers-like covs have been detected in bat species globally, as well as in dromedary camels throughout the middle east and africa. there have been few rigorous studies of baseline prevalence, transmission, and spectrum of disease. terms such as “camel exposure” and the epidemiological relationships of cases should be clearly defined and standardized. polymerase chain reaction-based testing has allowed detection of newer agents (e.g. human metapneumovirus, coronavirus hku1 and nl63) as well as improved the ability to detect “old” viral infections such as influenza virus and rhinovirus. they are diligent in treating patients, at the same time, they constantly summarize experience and comb