# Notebook for Inference



In [8]:
# imports

import pandas as pd
import scipy.spatial
import pickle as pkl

from sentence_transformers import SentenceTransformer

#### Read sentence data

In [9]:
# Both CSV files are expected to sit in the same folder as this notebook.
# The first column of each file is used as the DataFrame index.
df_sentences, df = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("covid_sentences.csv", "covid_full_sentences.csv")
)

In [11]:
# Map each index entry of the sentences table to its paper_id. The mapping gets
# its own name rather than overwriting df_sentences, so this cell stays
# re-runnable: once df_sentences is replaced by a dict, a second run of
# df_sentences["paper_id"].to_dict() can no longer work.
sentence_to_paper_id = df_sentences["paper_id"].to_dict()

# Dict keys are the unique index entries, in first-occurrence order.
df_sentences_list = list(sentence_to_paper_id)

# Convert everything to string and that's our sentence text corpus.
# NOTE(review): positions in corpus are presumably aligned with the rows of the
# precomputed corpus embeddings — confirm against the notebook that built them.
corpus = [str(sentence) for sentence in df_sentences_list]

### Load precomputed BERT corpus embeddings and the SentenceTransformer model

In [12]:
# Read the stored corpus embeddings back from the pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load a file that
# comes from a trusted source.
with open("corpus_embeddings_base_2.pkl", "rb") as embeddings_file:
    corpus_embeddings = pkl.load(embeddings_file)

In [13]:
# Sentence encoder used below to embed the user questions.
# NOTE(review): presumably this has to be the same model that produced
# corpus_embeddings, otherwise the distances are not comparable — confirm.
embedder = SentenceTransformer('bert-base-nli-stsb-mean-tokens')


In [14]:
### User questions enter the pipeline here. A list is used, so several
### questions can be encoded and answered in a single run.

# A typical example query
questions = [
    'Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually',
]

# One embedding per question, in the same order as `questions`
question_embeddings = embedder.encode(questions, show_progress_bar=True)

Batches: 100%|██████████| 1/1 [00:00<00:00,  7.35it/s]


### Generated results

In [15]:
### For every question, find the closest_n corpus sentences in terms of cosine
### similarity and display them along with the other metadata

closest_n = 5

# (printed label, df column) pairs shown for every matching sentence
metadata_fields = (
    ("paper_id", "paper_id"),
    ("Title", "title"),
    ("Abstract", "abstract"),
    ("Abstract_summary", "abstract_summary"),
)

# The banner follows closest_n, so it stays correct when the value is changed
print(f"\n Displaying the top {closest_n} closest set of sentences to the question:")
for question, question_embedding in zip(questions, question_embeddings):

    # cdist returns the cosine *distance* to every corpus embedding;
    # the similarity score shown below is 1 - distance
    distances = scipy.spatial.distance.cdist([question_embedding], corpus_embeddings, "cosine")[0]

    # Stable argsort: ties keep corpus order and NaN distances (if any) sort
    # last, without building and sorting a Python list of tuples
    top_indices = distances.argsort(kind="stable")[:closest_n]

    print(question)
    print("\n\n")

    for idx in top_indices:
        sentence = corpus[idx]

        print("Score:  ", "(Score: %.4f)" % (1 - distances[idx]), "\n")
        print("Set of Sentences:  ", sentence.strip(), "\n")

        # Metadata rows are looked up by the sentence text stored in df's index
        matches = df.loc[df.index == sentence]
        if matches.empty:
            # Report the gap and keep going instead of failing with a KeyError
            print("No metadata found in df for this sentence.", "\n")
            continue

        for label, column in metadata_fields:
            # With duplicated index entries the last row wins, which is what
            # the dict-based lookup (DataFrame.to_dict) did as well
            print(label + ":  ", matches[column].iloc[-1], "\n")


 Displaying the top 5 closest set of sentences to the question:
Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually



Score:   (Score: 0.7080) 

Set of Sentences:   while augmented intelligence ai in healthcare has been widely cited as an important approach to aid in the detection of disease and making clinical diagnosis this recent outbreak emphasizes the need and opportunity to utilize ai to predict outbreaks while the use of expert epidemiologists and public health officials cannot be replaced ai can serve to compile rapidly evolving information to assist public health experts in complex decisionmaking aggregation of social media news media rapidly evolving health reports and other disparate data is a daunting task which ai is poised to overcome during prior outbreaks such as severe acute respiratory syndrome sars in china in 2003 little realtime data was available [5]  now there is an explosion