# Semantic Search on ArXiv Data (Titles & Abstracts)

### Dataset: ArXiv Abstracts from Papers With Code
### Database: Milvus-lite
### Embedding Model: Cohere's co.embed

In [1]:
#! pip install pymilvus gdown milvus

In [9]:
#!pip install cohere
#!pip install langchain[all]
#! python -m pip install --upgrade langchain

In [1]:
#Import libraries
import pandas as pd
import zipfile
import cohere
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection
#from sentence_transformers import SentenceTransformer
from tqdm import tqdm
#from langchain.vectorstores import Milvus

In [2]:
#Extract the zip file that has ArXiv abstracts and titles collection
# Ref for dataset: https://paperswithcode.com/dataset/arxiv-10
# with zipfile.ZipFile("./ArXiv-10.zip","r") as zip_ref:
#     zip_ref.extractall("./ArXiv-10")

In [2]:
# Load the ArXiv CSV, turn the row index into a regular "index" column,
# and preview the first rows.
arxiv = pd.read_csv("./ArXiv-10/arxiv100.csv").reset_index()
arxiv.head()

Unnamed: 0,index,title,abstract,label
0,0,The Pre-He White Dwarfs in Eclipsing Binaries....,We report the first $BV$ light curves and hi...,astro-ph
1,1,A Possible Origin of kHZ QPOs in Low-Mass X-ra...,A possible origin of kHz QPOs in low-mass X-...,astro-ph
2,2,The effects of driving time scales on heating ...,Context. The relative importance of AC and D...,astro-ph
3,3,A new hard X-ray selected sample of extreme hi...,Extreme high-energy peaked BL Lac objects (E...,astro-ph
4,4,The baryon cycle of Seven Dwarfs with superbub...,"We present results from a high-resolution, c...",astro-ph


In [3]:
# Use "id" as the name of the column that reset_index() created as "index".
arxiv = arxiv.rename(columns=dict(index='id'))

In [4]:
# Summary statistics of the numeric columns (only `id` at this point).
arxiv.describe()

Unnamed: 0,id
count,100000.0
mean,49999.5
std,28867.657797
min,0.0
25%,24999.75
50%,49999.5
75%,74999.25
max,99999.0


In [4]:
# Discard every row that has a missing value in any column.
arxiv = arxiv.dropna(axis=0, how='any')

In [5]:
# List the distinct category labels present in the dataset.
arxiv['label'].unique()

array(['astro-ph', 'cond-mat', 'cs', 'eess', 'hep-ph', 'hep-th', 'math',
       'physics', 'quant-ph', 'stat'], dtype=object)

In [6]:
# Settings shared by the Milvus collection, the embedding/insert loop and the search.
COLLECTION_NAME = "arxiv_10000"  # name of the Milvus collection
DIMENSION = 1024                 # `dim` of the "embedding" vector field
BATCH_SIZE = 128                 # batch-size setting used by the insert loop
TOPK = 5                         # `limit` passed to collection.search
COUNT = 10_000                   # row-count setting used by the insert loop

In [7]:
from milvus import default_server
from pymilvus import connections, utility

# Start the local Milvus Lite server first, then connect to it on the port
# it is listening on.
default_server.start()
connections.connect(host = "127.0.0.1", port = default_server.listen_port)

# Display the version reported by the server.
utility.get_server_version()

'v2.2.14-lite'

In [8]:
# Remove any existing collection with this name before it is (re)created.
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

In [10]:
# Inspect the DataFrame's column names.
arxiv.columns

Index(['id', 'title', 'abstract', 'label'], dtype='object')

In [9]:
# Collection layout: an auto-generated INT64 primary key, three VARCHAR
# columns (title, abstract, label) and a DIMENSION-wide float vector.
fields = [
    # Primary key; auto_id=True, so no id values are supplied on insert.
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    # Text columns, each with its own maximum length.
    *(
        FieldSchema(name=column, dtype=DataType.VARCHAR, max_length=limit)
        for column, limit in (("title", 800), ("abstract", 9000), ("label", 20))
    ),
    # Vector column holding the embeddings.
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=DIMENSION),
]
schema = CollectionSchema(fields=fields)
collection = Collection(name=COLLECTION_NAME, schema=schema)

In [10]:
# Build an IVF_FLAT index (L2 metric, nlist=100) on the "embedding" field,
# then load the collection.
index_params = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=100),
)
collection.create_index(field_name="embedding", index_params=index_params)
collection.load()

## Create Vector Embeddings

In [20]:
# Create the Cohere client used for all embedding calls.
# The API key is read from the COHERE_API_KEY environment variable, with an
# interactive prompt as a fallback, so that no secret is stored in the notebook.
import os
from getpass import getpass

cohere_client = cohere.Client(os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: "))


In [21]:
def embed(texts):
    """Embed documents with Cohere so they can be stored in the collection.

    Args:
        texts: List of strings to embed.

    Returns:
        The ``embeddings`` attribute of the Cohere response.
    """
    # `texts` is passed by keyword: newer Cohere SDK releases declare the
    # arguments of `embed` as keyword-only, and older ones accept the keyword too.
    res = cohere_client.embed(texts = texts, model = "embed-english-v3.0", input_type = "search_document")
    return res.embeddings

In [22]:
import numpy as np

# Embed and insert only the first COUNT rows, BATCH_SIZE rows at a time.
# (Splitting the whole frame into COUNT/BATCH_SIZE + 1 sections ignored COUNT
# and produced chunks far larger than BATCH_SIZE.)
arxiv_subset = arxiv.head(COUNT)
for offset in tqdm(range(0, len(arxiv_subset), BATCH_SIZE)):
    batch = arxiv_subset.iloc[offset:offset + BATCH_SIZE]
    abstracts = batch['abstract'].tolist()
    # Column-ordered data matching the schema; the auto-generated id is omitted.
    data = [
        batch['title'].tolist(),
        abstracts,
        batch['label'].tolist(),
        embed(abstracts),  # the abstracts are what gets embedded
    ]

    collection.insert(data)

# Flush at end to make sure all rows are sent for indexing
collection.flush()

100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [07:51<00:00,  5.97s/it]


In [27]:
import time
# Natural-language queries to run against the collection.
search_terms = ["What papers talk about astrophysics?", "What are the papers on that discuss computer architecture?"]

# Search the database based on input text
def embed_search(data):
    """Embed search queries with Cohere.

    Args:
        data: List of query strings.

    Returns:
        A new list holding the vectors from the response's ``embeddings``
        attribute, in the form passed to ``collection.search``.
    """
    embeds = cohere_client.embed(texts = data, model = "embed-english-v3.0", input_type = "search_query")
    # Read the vectors from `.embeddings` (as `embed` does) instead of
    # iterating over the response object itself.
    return list(embeds.embeddings)

# Embed the queries up front so that only the Milvus search call is timed.
search_data = embed_search(search_terms)

search_kwargs = dict(
    data=search_data,                                      # query vectors
    anns_field="embedding",                                # vector field to search
    param=dict(metric_type="L2", params=dict(nprobe=20)),
    limit=TOPK,                                            # hits returned per query
    output_fields=["title", "abstract"],                   # fields returned with each hit
)

start = time.time()
res = collection.search(**search_kwargs)
end = time.time()

# Print every query followed by its hits: title, then abstract, each with the
# hit's distance rounded to three decimals.
# The reported time is that of the single batched search call, so the same
# value is printed for every query.
elapsed = end - start
for query, hits in zip(search_terms, res):
    print("Query:", query)
    print("Search Time:", elapsed)
    print("Results:\n")
    for hit in hits:
        distance = round(hit.distance, 3)
        for field in ("title", "abstract"):
            print(hit.entity.get(field), "----", distance)
            print()
    print()

Query: What papers talk about astrophysics?
Search Time: 0.08154153823852539
Results:

Cosmic Magnetism ---- 0.888

  Magnetic fields are involved in every astrophysical process on every scale:
from planetary and stellar interiors to neutron stars, stellar wind bubbles and
supernova remnants; from the interstellar medium in galactic disks, nuclei,
spiral arms and halos to the intracluster and intergalactic media. They are
involved in essentially every particle acceleration process and are thus
fundamental to non-thermal physics in the Universe. Key questions include the
origin of magnetic fields, their evolution over cosmic time, the amplification
and decay processes that modify their strength, and their impact on other
processes such as star formation and galaxy evolution. Astrophysical plasmas
provide a unique laboratory for testing magnetic dynamo theory. The study of
magnetic fields requires observations that span the wavelength range from radio
through infrared, optical, UV, X-ray