In [1]:
from sparql_llm import SparqlExamplesLoader, SparqlInfoLoader, SparqlVoidShapesLoader
from langchain_core.documents import Document


In [2]:
endpoints = [
    "https://sparql.uniprot.org/sparql/",
    "https://sparql.rhea-db.org/sparql/",
    "https://beta.sparql.swisslipids.org/",
    #"https://biosoda.unil.ch/emi/sparql/"
]

embedding_model = "BAAI/bge-large-en-v1.5"
#embedding_model = "BAAI/bge-small-en-v1.5"

collection_name = "biomedical_examples_collection_v1.0"

vector_size = 384

In [3]:
from typing import List, Dict, Optional
from qdrant_client import QdrantClient
from qdrant_client.http.models import SearchRequest, NamedVector, NamedSparseVector, SparseIndexParams, SparseVector
from langchain_qdrant.fastembed_sparse import FastEmbedSparse
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
import gc
import time
from langchain_qdrant import QdrantVectorStore
from qdrant_client.http import models



def init_endpoint_examples(endpoints: List[str], collection_name: str = "biomedical_examples_collection_v1.0", embedding_model: str = "BAAI/bge-base-en-v1.5", parallel: int = 4, vector_size: int = 768) -> None:

    client = QdrantClient(
            host="localhost",
            grpc_port=6334,
            prefer_grpc=True,
            timeout=60
        )


    collections = client.get_collections()

    if collection_name in [c.name for c in collections.collections]:
        print(f"Collection '{collection_name}' exists, deleting...")
        client.delete_collection(collection_name)

    client.create_collection(
        collection_name=collection_name,
        vectors_config={
            "dense": models.VectorParams(
                size= vector_size,
                distance=models.Distance.COSINE
            )
        },
        hnsw_config=models.HnswConfigDiff(
            on_disk=True
        ),
    )

    
    docs = []
 
    
    for endpoint in endpoints:
        docs += SparqlExamplesLoader(endpoint_url=endpoint).load()
    

    print(f"Loaded {len(docs)} documents from {len(endpoints)} endpoints")

    start_time = time.time()

    QdrantVectorStore.from_documents(
        docs,
        host="localhost",
        grpc_port=6334,
        prefer_grpc=True,
        collection_name=collection_name,
        force_recreate=True,
        embedding= FastEmbedEmbeddings(model_name=embedding_model, parallel=parallel),
        vector_name="dense"
    )

    print(f"Done generating and indexing {len(docs)} documents into the vectordb in {time.time() - start_time} seconds")

In [4]:
init_endpoint_examples(endpoints= endpoints, embedding_model= embedding_model, collection_name= collection_name, vector_size= vector_size)
    

Collection 'biomedical_examples_collection_v1.0' exists, deleting...


Found 111 examples queries for https://sparql.uniprot.org/sparql/
Found 121 examples queries for https://sparql.rhea-db.org/sparql/
Found 23 examples queries for https://beta.sparql.swisslipids.org/


Loaded 255 documents from 3 endpoints


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Done generating and indexing 255 documents into the vectordb in 137.44155192375183 seconds


In [5]:
examples = []


for endpoint in endpoints:
    examples += SparqlExamplesLoader(endpoint_url=endpoint).load()

Found 111 examples queries for https://sparql.uniprot.org/sparql/
Found 121 examples queries for https://sparql.rhea-db.org/sparql/
Found 23 examples queries for https://beta.sparql.swisslipids.org/


In [36]:
print(examples[0].metadata.keys())

dict_keys(['question', 'answer', 'endpoint_url', 'query_type', 'doc_type'])


In [39]:
print(examples[0])

page_content='Select all taxa from the UniProt taxonomy' metadata={'question': 'Select all taxa from the UniProt taxonomy', 'answer': 'PREFIX up: <http://purl.uniprot.org/core/>\n\nSELECT ?taxon\nFROM <http://sparql.uniprot.org/taxonomy>\nWHERE\n{\n    ?taxon a up:Taxon .\n}', 'endpoint_url': 'https://sparql.uniprot.org/sparql/', 'query_type': 'SelectQuery', 'doc_type': 'SPARQL endpoints query examples'}


In [6]:
import numpy as np
import pandas as pd

rows = []

client = QdrantClient(
    host="localhost",
    grpc_port=6334,
    prefer_grpc=True,
    timeout=60
)

dense_embeddings = FastEmbedEmbeddings(
    model_name=embedding_model
)

for q_idx, question in enumerate(examples): 
    dense_vector = dense_embeddings.embed_query(question.page_content)

    if dense_vector is not None:
        results = client.query_points(
            collection_name=collection_name,
            query=dense_vector,
            using="dense",
            with_payload=True,
            limit=10,
        )

        for r_idx, point in enumerate(results.points):

            if question.page_content in point.payload["metadata"]["question"]:
                rows.append({
                    "question": question.page_content,
                    "rank": r_idx,
                    "similarity_score": point.score,
                })

df = pd.DataFrame(rows)

In [8]:
df.head(500)

Unnamed: 0,question,rank,similarity_score
0,Select all taxa from the UniProt taxonomy,0,1.0
1,Select all human UniProtKB entries with a sequ...,0,1.0
2,List the proteins encoded by a gene that is lo...,0,1.0
3,List all human UniProtKB entries and their com...,0,1.0
4,Extracting an UniProtKB primary accession from...,0,1.0
...,...,...,...
266,"For a given lipid (SWISSLIPID:000399814, Ceram...",0,1.0
267,"For a given lipid (SWISSLIPID:000399814, Ceram...",0,1.0
268,"For a given list of lipid Species, return a li...",0,1.0
269,Retrieve mapping between SwissLipids lipid ide...,0,1.0


In [11]:
print(df["similarity_score"].where(df["similarity_score"] < 1.0))

0      1.0
1      NaN
2      NaN
3      1.0
4      NaN
      ... 
266    1.0
267    NaN
268    NaN
269    NaN
270    1.0
Name: similarity_score, Length: 271, dtype: float64


In [12]:
print(df["similarity_score"].where(df["similarity_score"] < 1.0).count())

89


In [49]:
dense_vector = dense_embeddings.embed_query(examples[0].page_content)

matches = []

if dense_vector is not None:
    results = client.query_points(
        collection_name=collection_name,
        query=dense_vector,
        using="dense",
        with_payload=True,
        limit=10,
    )

for r_idx, point in enumerate(results.points):

    if point.score < 0.9:
        doc = Document(
            page_content=point.payload.get("page_content", ""),
            metadata=point.payload.get("metadata", ""),
        )
        matches.append(doc)

In [45]:
print(matches[0].page_content.keys())

dict_keys(['endpoint_url', 'doc_type', 'answer', 'question', 'query_type'])


In [46]:
print(matches[0])

page_content='Select all Rhea reactions used in UniProtKB/Swiss-Prot for a given organism (NCBI taxonomy ID).' metadata={'endpoint_url': 'https://sparql.rhea-db.org/sparql/', 'doc_type': 'SPARQL endpoints query examples', 'answer': 'PREFIX rh: <http://rdf.rhea-db.org/>\nPREFIX taxon: <http://purl.uniprot.org/taxonomy/>\nPREFIX up: <http://purl.uniprot.org/core/>\n\n# Query 13\n# Select all Rhea reactions used to annotate Escherichia coli (taxid=83333) in UniProtKB/Swiss-Prot\n# return the number of UniProtKB entries\n# \n# Federated query using a service to UniProt SPARQL endpoint\n#\n# This query cannot be performed using the Rhea search website\nSELECT ?uniprot ?mnemo ?rhea ?accession ?equation \nWHERE {\n  SERVICE <https://sparql.uniprot.org/sparql> { \n    VALUES (?taxid) { (taxon:83333) }\n    GRAPH <http://sparql.uniprot.org/uniprot> {\n      ?uniprot up:reviewed true . \n      ?uniprot up:mnemonic ?mnemo . \n      ?uniprot up:organism ?taxid .\n      ?uniprot up:annotation/up:ca