In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm





# Load Data

In [2]:
# Load the dataset from a pickle on disk (presumably the preprocessed arXiv
# metadata, per the filename). Later cells read its 'id', 'title', 'authors'
# and 'abstract' columns.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
df = pd.read_pickle('../data/processed_arxiv_data.pkl')

# Load Pre-trained SBERT Model

In [3]:
# Instantiate the pre-trained SBERT checkpoint by name. This single `model`
# object is used below to encode both the abstracts and the search queries.
# This model maps sentences to a 384-dimensional vector space.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Create embeddings for all abstracts.
# Encoding the full corpus takes hours (see the progress bar output), so reuse
# the .npy cache when it exists and still has one row per abstract; otherwise
# encode from scratch.
# NOTE: cache validity is only checked by row count — delete the .npy file
# after changing the model or the abstract text, or stale vectors are reused.
abstracts = df['abstract'].tolist()
try:
    embeddings = np.load('../data/abstract_embeddings.npy')
except FileNotFoundError:
    embeddings = None  # no cache on disk yet

if embeddings is None or len(embeddings) != len(abstracts):
    embeddings = model.encode(abstracts, batch_size=100, show_progress_bar=True)

Batches:   6%|▋         | 632/9988 [23:40<5:04:44,  1.95s/it] 

In [None]:
# Save embeddings to disk for reuse, so a later session can np.load this .npy
# file instead of re-running the encoding cell.
np.save('../data/abstract_embeddings.npy', embeddings)

In [None]:
# Reload the embeddings from the same path the np.save cell writes to; in a
# fresh session this can stand in for the encoding cell.
embeddings = np.load('../data/abstract_embeddings.npy')

# Search Function

In [None]:
def semantic_search(query, model, embeddings, df, top_k=10):
    """Return the rows of `df` whose embeddings are most similar to `query`.

    The query is encoded with `model`, compared against every row of
    `embeddings` by cosine similarity, and the best-scoring rows of `df` are
    returned in descending order of similarity.

    Parameters
    ----------
    query : str
        Raw user query text; passed to `model.encode` as a one-element list.
    model : object
        Encoder exposing `encode(list_of_str)`. It should be the same model
        that produced `embeddings`, otherwise the scores are meaningless.
    embeddings : array-like, 2-D
        One embedding per row of `df`, in the same row order (row i of
        `embeddings` is looked up as `df.iloc[i]`).
    df : pandas.DataFrame
        Must contain the columns 'id', 'title', 'authors' and 'abstract'.
    top_k : int, default 10
        Maximum number of results. 0 yields an empty frame; a value larger
        than the number of rows yields every row.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with columns 'id', 'title', 'authors',
        'abstract' and 'similarity_score', best match first.

    Raises
    ------
    ValueError
        If `top_k` is negative, or if `embeddings` and `df` do not have the
        same number of rows.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    if len(embeddings) != len(df):
        # Positional lookup below would silently pair scores with the wrong
        # rows (or raise an opaque IndexError) if the two are misaligned.
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but df has {len(df)} rows"
        )

    query_embedding = model.encode([query])  # shape (1, dim): a single query

    # Row 0 holds the scores of our single query against every embedding.
    cosine_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Reverse first, then truncate. The more obvious `argsort()[-top_k:]`
    # degenerates to `[-0:]` (the whole array) when top_k == 0.
    top_indices = cosine_scores.argsort()[::-1][:top_k]

    results = df.iloc[top_indices].copy()  # copy so the caller's df is untouched
    results['similarity_score'] = cosine_scores[top_indices]
    return results[['id', 'title', 'authors', 'abstract', 'similarity_score']]

# Example

In [None]:
# Demo query: fetch the five abstracts closest in meaning to a free-text prompt.
results = semantic_search(
    query="using machine learning to treat illnesses",
    model=model,
    embeddings=embeddings,
    df=df,
    top_k=5,
)

In [None]:
# Bare last expression so the notebook renders the result table.
results