In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm





# Load Data

In [2]:
# Read the processed arXiv DataFrame (per the file name) back from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
processed_data_path = '../data/processed_arxiv_data.pkl'
df = pd.read_pickle(processed_data_path)

# Load Pre-trained SBERT Model

In [3]:
# all-MiniLM-L6-v2 embeds each sentence as a 384-dimensional vector.
sbert_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(sbert_model_name)

In [4]:
# Encoding titles
# Encoding the whole corpus took about an hour in the recorded run, so reuse the
# saved array when it exists and still has one row per title; otherwise encode
# and save. Delete the .npy file to force a fresh encoding.
TITLE_EMBEDDINGS_PATH = '../data/title_embeddings.npy'
titles = df['title'].tolist()

try:
    title_embeddings = np.load(TITLE_EMBEDDINGS_PATH)
except FileNotFoundError:
    title_embeddings = None  # nothing cached yet

# A row-count mismatch means the cache was built from a different DataFrame.
if title_embeddings is None or len(title_embeddings) != len(titles):
    title_embeddings = model.encode(titles, batch_size=100, show_progress_bar=True)
    np.save(TITLE_EMBEDDINGS_PATH, title_embeddings)

Batches: 100%|██████████| 9988/9988 [59:38<00:00,  2.79it/s]  


In [None]:
# Encoding abstracts
# Encoding every abstract is expensive, so reuse the saved array when it exists
# and still has one row per abstract; otherwise encode and save. Delete the
# .npy file to force a fresh encoding.
ABSTRACT_EMBEDDINGS_PATH = '../data/abstract_embeddings.npy'
abstracts = df['abstract'].tolist()

try:
    embeddings = np.load(ABSTRACT_EMBEDDINGS_PATH)
except FileNotFoundError:
    embeddings = None  # nothing cached yet

# A row-count mismatch means the cache was built from a different DataFrame.
if embeddings is None or len(embeddings) != len(abstracts):
    embeddings = model.encode(abstracts, batch_size=100, show_progress_bar=True)
    np.save(ABSTRACT_EMBEDDINGS_PATH, embeddings)

In [None]:
# Load saved embeddings
embeddings = np.load('../data/abstract_embeddings.npy')
title_embeddings = np.load('../data/title_embeddings.npy')

# semantic_search selects rows with df.iloc, so each array must have exactly one
# row per DataFrame row. Fail here rather than return mismatched papers later.
assert len(embeddings) == len(df), (
    f"abstract embeddings have {len(embeddings)} rows but df has {len(df)}; re-run the encoding cell"
)
assert len(title_embeddings) == len(df), (
    f"title embeddings have {len(title_embeddings)} rows but df has {len(df)}; re-run the encoding cell"
)

# Search Function

In [6]:
def semantic_search(query, model, embeddings, df, top_k=10):
    """Return the rows of `df` whose embeddings are most similar to `query`.

    Parameters
    ----------
    query : str
        Raw user query text.
    model : object
        Encoder exposing ``encode(list_of_texts)`` (a SentenceTransformer in
        this notebook).
    embeddings : array-like, 2-D
        Precomputed embeddings. Rows are matched to `df` by position, so row
        ``i`` must correspond to ``df.iloc[i]``.
    df : pandas.DataFrame
        Must contain the 'id', 'title', 'authors' and 'abstract' columns.
    top_k : int, default 10
        Maximum number of results. ``0`` yields an empty result; a value
        larger than the number of embeddings returns every row.

    Returns
    -------
    pandas.DataFrame
        Copy of the best-matching rows, ordered from most to least similar,
        with columns 'id', 'title', 'authors', 'abstract', 'similarity_score'.

    Raises
    ------
    ValueError
        If `top_k` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query_embedding = model.encode([query])  # Encode raw user query to embedding

    # Cosine similarity between the query and every stored embedding.
    cosine_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Rank best-first, then truncate. Slicing with [-top_k:] would select the
    # whole array when top_k == 0, because -0 == 0.
    top_indices = cosine_scores.argsort()[::-1][:top_k]

    results = df.iloc[top_indices].copy()
    results['similarity_score'] = cosine_scores[top_indices]
    return results[['id', 'title', 'authors', 'abstract', 'similarity_score']]

# Example

In [7]:
# Search against the abstract embeddings
abstract_query = "using machine learning to treat illnesses"
semantic_search(abstract_query, model, embeddings, df, top_k=5)

Unnamed: 0,id,title,authors,abstract,similarity_score
250491,1103.3223,Using Soft Computer Techniques on Smart Device...,"Piero Giacomelli, Giulia Munaro and Roberto Rosso","CHRONIOUS is an Open, Ubiquitous and Adaptiv...",0.5614
594722,1502.00062,A New Intelligence Based Approach for Computer...,Vadrevu Sree Hari Rao and Mallenahalli Naresh ...,Identification of the influential clinical s...,0.550519
443139,1307.1411,Discovering Sequential Patterns in a UK Genera...,"Jenna Reps, Jonathan M. Garibaldi, Uwe Aickeli...",The wealth of computerised medical informati...,0.549737
560478,1409.8053,Medical diagnosis as pattern recognition in a ...,J. Gerard Wolff,This paper describes a novel approach to med...,0.543002
686642,1512.0399,Cloud-based Electronic Health Records for Real...,"Mauricio Santillana, Andre Nguyen, Tamara Loui...",Accurate real-time monitoring systems of inf...,0.533153


In [8]:
# Search against the title embeddings
title_query = "deep learning in medical imaging"
semantic_search(title_query, model, title_embeddings, df, top_k=5)

Unnamed: 0,id,title,authors,abstract,similarity_score
883999,1708.08987,Deep Learning for Medical Image Analysis,"Mina Rezaei, Haojin Yang and Christoph Meinel",This report describes my research activities...,0.878883
820341,1702.05747,A Survey on Deep Learning in Medical Image Ana...,"Geert Litjens, Thijs Kooi, Babak Ehteshami Bej...","Deep learning algorithms, in particular conv...",0.788628
841337,1704.06825,Deep Learning for Medical Image Processing: Ov...,"Muhammad Imran Razzak, Saeeda Naz and Ahmad Zaib",Healthcare sector is totally different from ...,0.788516
622272,1505.02,Deep Learning for Medical Image Segmentation,Matthew Lai,This report provides an overview of the curr...,0.786511
888351,1709.03485,NiftyNet: a deep-learning platform for medical...,"Eli Gibson, Wenqi Li, Carole Sudre, Lucas Fido...",Medical image analysis and computer-assisted...,0.77333
