### Install sentence transformers library

In [None]:
# Colab setup: install the sentence-transformers package into the runtime.
# NOTE: the install is unpinned (-U fetches the newest release); the run recorded
# in the output below resolved to sentence-transformers 0.4.1.2.
!pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/6a/e2/84d6acfcee2d83164149778a33b6bdd1a74e1bcb59b2b2cd1b861359b339/sentence-transformers-0.4.1.2.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 5.4MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 9.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 31.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     

### Useful imports

In [None]:
import json,glob,nltk,copy,torch,time,sentence_transformers,pickle
import numpy as np
from scipy import spatial
from queue import PriorityQueue
from sentence_transformers import SentenceTransformer,util
# Fetch NLTK's 'punkt' tokenizer data
# (presumably what nltk.sent_tokenize in a later cell relies on -- confirm).
nltk.download('punkt')
# Colab-only: mount Google Drive at /content/drive. The commented-out pickle
# save/load snippets further down point at /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /content/drive


### Retrieve dataset

In [None]:
# Download the 2020-03-13 historical release archive of CORD-19.
!wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-03-13.tar.gz
# Unpack the release archive, then the comm_use_subset archive found under 2020-03-13/.
# The JSON-reading cell later globs 'comm_use_subset/*'.
!tar -xf cord-19_2020-03-13.tar.gz
!tar -xf 2020-03-13/comm_use_subset.tar.gz

--2021-02-03 15:16:42--  https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-03-13.tar.gz
Resolving ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com (ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com)... 52.218.234.233
Connecting to ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com (ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com)|52.218.234.233|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 278921140 (266M) [application/x-tar]
Saving to: ‘cord-19_2020-03-13.tar.gz’


2021-02-03 15:16:47 (57.9 MB/s) - ‘cord-19_2020-03-13.tar.gz’ saved [278921140/278921140]



### Select the compute device (CUDA GPU if available, otherwise CPU)

In [None]:
# Prefer the CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Report which device the rest of the notebook will run on.
print("Device available for running: ")
print(device)

Device available for running: 
cuda


### Read the JSON files and store the title, abstract, and body text of each article in a list

In [None]:
# Parsed articles: one [title, abstracts, texts] entry per JSON file, where
# abstracts and texts are lists of paragraph strings.
data = []

# Sort the paths so article indices are stable between runs: glob returns files
# in arbitrary order, and embeddings saved to disk are only meaningful if the
# articles are re-read in the same order.
files = sorted(glob.glob('comm_use_subset/*'))
number_of_articles = len(files)

# Maximum number of articles to process, clamped to what is actually on disk.
# Later cells iterate over range(bound) and index `data`, so bound must never
# exceed the number of articles loaded.
bound = min(9000, number_of_articles)

for single_file in files[:bound]:
  # Keep the file open only while parsing; the JSON is read as UTF-8
  # regardless of the platform's default encoding.
  with open(single_file, 'r', encoding='utf-8') as f:
    json_file = json.load(f)

  # Retrieve title
  title = json_file["metadata"]["title"]

  # Retrieve abstract paragraphs (an empty list when the article has none)
  abstracts = [abstract["text"] for abstract in json_file["abstract"]]

  # Retrieve body-text paragraphs
  texts = [text["text"] for text in json_file["body_text"]]

  data.append([title, abstracts, texts])

### Split the corpus into sentences with the help of the nltk library

In [None]:
# Split every abstract and body-text paragraph of each article into sentences.
# `data` is updated in place: each paragraph string is replaced by the list of
# its sentences, so data[i][1] and data[i][2] become lists of sentence lists.
#
# Iterating over `data` itself (instead of range(bound)) keeps the cell safe
# when fewer than `bound` articles were loaded, and the isinstance guard makes
# the cell idempotent: re-running it skips paragraphs that are already split
# instead of passing a list to nltk.sent_tokenize.
for article_entry in data:
  # Index 1 holds the abstract paragraphs, index 2 the body-text paragraphs
  for section in (article_entry[1], article_entry[2]):
    for position, paragraph in enumerate(section):
      if isinstance(paragraph, str):
        section[position] = nltk.sent_tokenize(paragraph)

### Transform sentences to embeddings with the first sentence-transformer model, 'stsb-bert-base'

In [None]:
# Measure the time spent computing sentence embeddings (model loading included)
fist_start_time = time.time()

# Declare our first sentence transformer model and move it to the selected device
first_model = SentenceTransformer('stsb-bert-base').to(device)

# first_sentence_embeddings[i]: 2D tensor with one row per sentence of article i
first_sentence_embeddings = []

# flatten_data[i]: all abstract sentences followed by all body-text sentences
# of article i, in the same order as the rows of first_sentence_embeddings[i]
flatten_data = []

# Iterate over the articles actually loaded rather than range(bound), so the
# cell cannot index past the end of `data` when fewer files were read.
for article in range(len(data)):

  # Sentence-split abstract and body paragraphs of this article
  abstract_ = data[article][1]
  body_ = data[article][2]

  # Flatten paragraphs -> sentences, abstract first, then body text
  sentences = [sentence for paragraph in abstract_ for sentence in paragraph]
  sentences.extend(sentence for paragraph in body_ for sentence in paragraph)
  flatten_data.append(sentences)

  # encode(..., convert_to_tensor=True) already returns a single 2D tensor,
  # so it is stored directly (no list wrapping followed by torch.cat).
  first_sentence_embeddings.append(first_model.encode(sentences,convert_to_tensor=True))

# Check elapsed time of first model
first_model_time = (time.time() - fist_start_time)/60
print("Elapsed time: %s minutes" % (round(first_model_time,1)))

# # Save first sentence embeddings
# with open('/content/drive/MyDrive/first_sentence_embeddings.pkl', 'wb') as f:
#   pickle.dump(first_sentence_embeddings,f)

Elapsed time: 110.3 minutes


### Sanity-check prints of the embeddings and the flattened sentences

In [None]:
# Sanity checks: container sizes/types, then a look at the first article.

# Embeddings: outer list, then the first article's tensor and its shape
print(len(first_sentence_embeddings), type(first_sentence_embeddings))
first_article_embeddings = first_sentence_embeddings[0]
print(len(first_article_embeddings), type(first_article_embeddings))
print(first_article_embeddings.shape)

# Sentences: outer list, the first article's sentence list, and its first sentence
print(len(flatten_data), type(flatten_data))
first_article_sentences = flatten_data[0]
print(len(first_article_sentences), type(first_article_sentences))
first_sentence = first_article_sentences[0]
print(len(first_sentence), type(first_sentence))
print(first_sentence)

9000 <class 'list'>
140 <class 'torch.Tensor'>
torch.Size([140, 768])
9000 <class 'list'>
140 <class 'list'>
150 <class 'str'>
The essential requirement of the lymphotoxin beta receptor (LTβR) in the development and maintenance of peripheral lymphoid organs is well recognized.


### Declare our queries and transform them to embeddings based on our two models.

In [None]:
# Questions to answer against the corpus, one per line.
# NOTE(review): a few queries contain misspellings ("coronoviruses", "Wuhuan",
# "Coronovirus"). They are kept verbatim because the recorded outputs below
# were produced with exactly these strings -- confirm before correcting them.
queries = [
  'What are the coronoviruses?',
  'What was discovered in Wuhuan in December 2019?',
  'What is Coronovirus Disease 2019?',
  'What is COVID-19?',
  'What is caused by SARS-COV2?',
  'How is COVID-19 spread?',
  'Where was COVID-19 discovered?',
  'How does coronavirus spread?',
]

# Embed all queries at once with the first model (one row per query)
first_queries_embeddings = first_model.encode(queries, convert_to_tensor=True)

# Report how many queries were embedded and the length of one embedding
num_queries = len(first_queries_embeddings)
query_embedding_length = len(first_queries_embeddings[0])
print("For 1st model... Number Of Queries:", num_queries, " Query Embedding's Length:", query_embedding_length)

For 1st model... Number Of Queries: 8  Query Embedding's Length: 768


### Test our first model

In [None]:
# # Load first sentence embeddings
# with open('/content/drive/MyDrive/first_sentence_embeddings.pkl', 'rb') as f:
#   first_sentence_embeddings = pickle.load(f)

# Number of best-matching sentences printed per query
top_answers = 4

# Measure the time spent finding the best answers
fist_start_time = time.time()

# For each query
for qindex,query in enumerate(first_queries_embeddings):
  # Best sentence of every article, as (-score, article index, sentence index).
  # The score is negated so that an ascending sort puts the most similar first.
  candidates = []
  # For each article's sentence embeddings
  for index,embed in enumerate(first_sentence_embeddings):
    # Find the single most similar sentence of this article
    first_results = sentence_transformers.util.semantic_search(query,embed,top_k=1)
    for res in first_results:
      # Skip articles that produced no hit instead of failing on res[0]
      if res:
        candidates.append((-res[0]['score'],index,res[0]['corpus_id']))
  # Print the sentences with the best similarity scores (which are our answers).
  # A sorted slice replaces PriorityQueue.get(), which blocks forever once the
  # queue is empty, i.e. whenever fewer than `top_answers` candidates exist.
  print("Query:",queries[qindex],"\n")
  for idx,res in enumerate(sorted(candidates)[:top_answers]):
    print("Answer",idx+1,":",flatten_data[res[1]][res[2]],"\nFrom article:",data[res[1]][0],"\nScore:",-res[0],"\n")
  print("------------------------------------------------------------------------------------------------------------------------------")

first_model_time = (time.time() - fist_start_time)/60
print("Elapsed time: %s minutes" % (round(first_model_time,1)))

Query: What are the coronoviruses? 

Answer 1 : in orthomyxoviruses (e.g. 
From article: Mapping overlapping functional elements embedded within the protein-coding regions of RNA viruses 
Score: 0.7579688 

Answer 2 : C) Parechoviruses. 
From article: A viral metagenomic survey identifies known and novel mammalian viruses in bats from Saudi Arabia 
Score: 0.7296622 

Answer 3 : reoviruses and orthomyxoviruses). 
From article: Non-canonical translation in RNA viruses 
Score: 0.6961699 

Answer 4 : Adenoviruses. 
From article: Virus-induced exacerbations in asthma and COPD 
Score: 0.6928731 

------------------------------------------------------------------------------------------------------------------------------
Query: What was discovered in Wuhuan in December 2019? 

Answer 1 : In December 2019 
From article: Clinical Medicine Characteristics of and Public Health Responses to the Coronavirus Disease 2019 Outbreak in China 
Score: 0.7151003 

Answer 2 : (2019c) , Zhao et al. 
From a